实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)
数据集链接:淘宝用户购物行为数据集_数据集-阿里云天池
因原数据集过大,电脑和MySQL跑不动,故截取前一百万条数据进行实战演练。
1、导入数据
利用navicat软件直接导入下载好的Excel文件(较大数据集可用kettle导入)
2、数据预处理
-- Select the working schema and take a first look at the imported table.
use aori;
desc userbehavior;
select * from userbehavior limit 5;

-- Rename the placeholder columns (f1..f5) left by the import.
-- The five renames are issued as one ALTER instead of five separate ones.
alter table userbehavior
    change f1 user_id int,
    change f2 item_id int,
    change f3 category_id int,
    change f4 behavior_type varchar(5),
    change f5 timestamps int;

-- Check every column for NULLs; each query is expected to return no rows.
select * from userbehavior where user_id is null;
select * from userbehavior where item_id is null;
select * from userbehavior where category_id is null;
select * from userbehavior where behavior_type is null;
select * from userbehavior where timestamps is null;

-- Check for duplicate values
-- Rows sharing the same (user_id, item_id, timestamps) are treated as duplicates.
select
    user_id,
    item_id,
    timestamps
from userbehavior
group by user_id, item_id, timestamps
having count(*) > 1;

-- Deduplicate, step 1: add a surrogate primary key so that otherwise
-- identical rows can be told apart.
alter table userbehavior add id int first;
alter table userbehavior modify id int primary key auto_increment;
select * from userbehavior limit 5;

-- Deduplicate, step 2: within each duplicated group keep the row with the
-- smallest id and delete every row with a larger id.
delete ub
from userbehavior as ub
inner join (
    select
        user_id,
        item_id,
        timestamps,
        min(id) as min_id
    from userbehavior
    group by user_id, item_id, timestamps
    having count(*) > 1
) as first_seen
    on ub.user_id = first_seen.user_id
    and ub.item_id = first_seen.item_id
    and ub.timestamps = first_seen.timestamps
where ub.id > first_seen.min_id;

-- Add date columns
-- datetime: convert the Unix epoch seconds in `timestamps` to a TIMESTAMP.
alter table userbehavior add datetimes timestamp(0);
update userbehavior set datetimes = from_unixtime(timestamps);
select * from userbehavior limit 5;

-- date / time / hour: split the datetime into text parts used for grouping.
alter table userbehavior
    add dates char(10),
    add times char(8),
    add hours char(2);

-- 'YYYY-MM-DD hh:mm:ss' -> chars 1-10 are the date, 12-19 the time,
-- 12-13 the hour. All three columns are filled in a single pass.
update userbehavior
set dates = substring(datetimes, 1, 10),
    times = substring(datetimes, 12, 8),
    hours = substring(datetimes, 12, 2);
select * from userbehavior limit 5;

-- Remove outliers: inspect the observed range, then keep only rows inside
-- 2017-11-25 .. 2017-12-03.
select max(datetimes), min(datetimes) from userbehavior;

-- A NULL datetimes fails both range comparisons, so it is removed explicitly
-- (NOTE(review): NULL would arise if from_unixtime() could not convert a
-- value; confirm whether such rows exist in the data).
delete from userbehavior
where datetimes < '2017-11-25 00:00:00'
    or datetimes > '2017-12-03 23:59:59'
    or datetimes is null;

-- Data overview
desc userbehavior;
select * from userbehavior limit 5;
select count(1) from userbehavior; -- the original write-up reports 999530 rows here
3、时间序列分析
-- Count each behaviour type per date and hour (preview).
select
    dates,
    hours,
    count(case when behavior_type = 'pv' then 1 end) as pv,
    count(case when behavior_type = 'cart' then 1 end) as cart,
    count(case when behavior_type = 'fav' then 1 end) as fav,
    count(case when behavior_type = 'buy' then 1 end) as buy
from userbehavior
group by dates, hours
order by dates, hours;

-- Result table holding the same per-date, per-hour counts.
create table date_hour_behavior (
    dates char(10),
    hours char(2),
    pv int,
    cart int,
    fav int,
    buy int
);

insert into date_hour_behavior (dates, hours, pv, cart, fav, buy)
select
    dates,
    hours,
    count(case when behavior_type = 'pv' then 1 end) as pv,
    count(case when behavior_type = 'cart' then 1 end) as cart,
    count(case when behavior_type = 'fav' then 1 end) as fav,
    count(case when behavior_type = 'buy' then 1 end) as buy
from userbehavior
group by dates, hours
order by dates, hours;

select * from date_hour_behavior;
4、获客情况
-- Create a scratch table with the same structure as userbehavior.
create table temp_behaviors like userbehavior;

-- Copy a 100,000-row sample into it to prototype the queries on.
insert into temp_behaviors
select * from userbehavior limit 100000;

select * from temp_behaviors;

-- PV (Page View): number of page views per day.
select
    dates,
    count(behavior_type) as pv
from temp_behaviors
where behavior_type = 'pv'
group by dates;

-- UV (Unique Visitor): number of distinct visiting users per day.
select
    dates,
    count(distinct user_id) as uv
from temp_behaviors
where behavior_type = 'pv'
group by dates;

-- Both metrics in one statement, plus average page views per visitor.
select
    dates,
    count(behavior_type) as pv,
    count(distinct user_id) as uv,
    round(count(behavior_type) / count(distinct user_id), 1) as `pv/uv`
from temp_behaviors
where behavior_type = 'pv'
group by dates;

-- Process the real data
-- Daily page views (pv), unique visitors (uv) and views per visitor (puv)
-- computed over the full userbehavior table.
-- Plain INT is used: integer display widths such as int(9) are deprecated
-- in MySQL 8 and do not affect storage or range.
create table pv_uv_puv (
    dates char(10),
    pv int,
    uv int,
    puv decimal(10, 1)
);

insert into pv_uv_puv (dates, pv, uv, puv)
select
    dates,
    count(behavior_type) as pv,
    count(distinct user_id) as uv,
    round(count(behavior_type) / count(distinct user_id), 1) as puv
from userbehavior
where behavior_type = 'pv'
group by dates;

select * from pv_uv_puv;
5、留存情况
①留存率计算
-- One row per user per day on which the user has any recorded behaviour.
select
    user_id,
    dates
from userbehavior
group by user_id, dates;

-- Self-join: pair each active day of a user (a) with the same user's
-- active days on or after it (b).
with user_active_days as (
    select distinct
        user_id,
        dates
    from userbehavior
)
select
    a.user_id,
    a.dates,
    b.user_id,
    b.dates
from user_active_days as a
inner join user_active_days as b
    on a.user_id = b.user_id
    and a.dates <= b.dates;

-- Retained-user counts
-- For each base date a.dates: users active that day (retention_0), users
-- active again 1 and 3 days later, and the next-day retention rate
-- (retention_1 / retention_0).
with user_active_days as (
    select distinct
        user_id,
        dates
    from userbehavior
)
select
    a.dates,
    count(case when datediff(b.dates, a.dates) = 0 then b.user_id end) as retention_0,
    count(case when datediff(b.dates, a.dates) = 1 then b.user_id end) as retention_1,
    count(case when datediff(b.dates, a.dates) = 3 then b.user_id end) as retention_3,
    count(case when datediff(b.dates, a.dates) = 1 then b.user_id end)
        / count(case when datediff(b.dates, a.dates) = 0 then b.user_id end) as retention_rate_1
from user_active_days as a
inner join user_active_days as b
    on a.user_id = b.user_id
    and a.dates <= b.dates
group by a.dates;

-- Save the result
-- Next-day retention rate per base date.
create table retention_rate (
    dates char(10),
    retention_1 float
);

-- retention_1 = users active again one day later / users active on the base date.
insert into retention_rate (dates, retention_1)
with user_active_days as (
    select distinct
        user_id,
        dates
    from userbehavior
)
select
    a.dates,
    count(case when datediff(b.dates, a.dates) = 1 then b.user_id end)
        / count(case when datediff(b.dates, a.dates) = 0 then b.user_id end) as retention_rate_1
from user_active_days as a
inner join user_active_days as b
    on a.user_id = b.user_id
    and a.dates <= b.dates
group by a.dates;

select * from retention_rate;
②跳失用户计算
-- Bounce rate
-- Bounce users: users with exactly one recorded behaviour in total.
select count(*)
from (
    select user_id
    from userbehavior
    group by user_id
    having count(behavior_type) = 1
) as single_action_users;

-- Total page views across all days
-- (NOTE(review): presumably the denominator of the bounce rate; confirm).
select sum(pv) from pv_uv_puv;
6、行为路径分析
-- Per (user, item): how many times each behaviour type occurred.
create view user_behavior_view as
select
    user_id,
    item_id,
    count(case when behavior_type = 'pv' then 1 end) as `pv`,
    count(case when behavior_type = 'fav' then 1 end) as `fav`,
    count(case when behavior_type = 'cart' then 1 end) as `cart`,
    count(case when behavior_type = 'buy' then 1 end) as `buy`
from userbehavior
group by user_id, item_id;

-- Standardise behaviours: reduce each count to a 0/1 flag
-- (浏览 = viewed, 收藏 = favourited, 加购 = added to cart, 购买 = bought).
create view user_behavior_standard as
select
    user_id,
    item_id,
    (case when pv > 0 then 1 else 0 end) as `浏览`,
    (case when fav > 0 then 1 else 0 end) as `收藏`,
    (case when cart > 0 then 1 else 0 end) as `加购`,
    (case when buy > 0 then 1 else 0 end) as `购买`
from user_behavior_view;

-- Path type: for (user, item) pairs that ended in a purchase, concatenate the
-- four flags into a 4-character code, e.g. '1011' = viewed, carted, bought.
create view user_behavior_path as
select
    a.user_id,
    a.item_id,
    a.浏览,
    a.收藏,
    a.加购,
    a.购买,
    concat(a.浏览, a.收藏, a.加购, a.购买) as path_type
from user_behavior_standard as a
where a.购买 > 0;

-- Number of purchasing (user, item) pairs per path type, most common first.
create view path_count as
select
    path_type,
    count(*) as path_type_num
from user_behavior_path
group by path_type
order by path_type_num desc;

select * from path_count;
-- Lookup table mapping each 4-character path code to a readable description.
create table change_name (
    path_type char(4),
    description varchar(40)
);

insert into change_name (path_type, description)
values
    ('0001', '购买'),
    ('1001', '浏览购买'),
    ('0011', '加购购买'),
    ('1011', '浏览加购购买'),
    ('0101', '收藏购买'),
    ('1101', '浏览收藏购买'),
    ('0111', '收藏加购购买'),
    ('1111', '浏览收藏加购购买');

select * from change_name;

-- Path counts labelled with their description.
create table path_result (
    description varchar(40),
    path_type_num int
);

insert into path_result (description, path_type_num)
select
    change_name.description,
    path_count.path_type_num
from path_count
inner join change_name
    on path_count.path_type = change_name.path_type;

select * from path_result;
7、用户转化率分析
-- Number of distinct users per behaviour type (preview).
select
    behavior_type,
    count(distinct user_id) as user_num
from userbehavior
group by behavior_type
order by behavior_type desc;

-- Persist the per-behaviour user counts.
create table behavior_user_num (
    behavior_type varchar(5),
    user_num int
);

insert into behavior_user_num (behavior_type, user_num)
select
    behavior_type,
    count(distinct user_id) as user_num
from userbehavior
group by behavior_type
order by behavior_type desc;

select * from behavior_user_num;
-- Number of recorded events per behaviour type (preview).
select
    behavior_type,
    count(*) as behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc;

-- Persist the per-behaviour event counts.
create table behavior_num (
    behavior_type varchar(5),
    behavior_count_num int
);

insert into behavior_num (behavior_type, behavior_count_num)
select
    behavior_type,
    count(*) as behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc;

select * from behavior_num;
8、TOP商品
-- Top 10 categories by page views.
create table popular_categories (
    category_id int,
    pv int
);

insert into popular_categories (category_id, pv)
select
    category_id,
    count(case when behavior_type = 'pv' then 1 end) as category_pv
from userbehavior
group by category_id
order by category_pv desc
limit 10;

select * from popular_categories;
-- Top 10 items by page views.
create table popular_items (
    item_id int,
    pv int
);

insert into popular_items (item_id, pv)
select
    item_id,
    count(case when behavior_type = 'pv' then 1 end) as item_pv
from userbehavior
group by item_id
order by item_pv desc
limit 10;

select * from popular_items;
-- Most-viewed item of each category, then the 10 with the most page views.
create table popular_cateitems (
    category_id int,
    item_id int,
    pv int
);

-- rank() gives 1 to the most-viewed item(s) within each category; items tied
-- for first place in a category all get rank 1 and are all kept.
-- The inner query needs no ORDER BY: the outer query does the final ordering.
insert into popular_cateitems (category_id, item_id, pv)
select
    ranked.category_id,
    ranked.item_id,
    ranked.item_pv
from (
    select
        category_id,
        item_id,
        count(case when behavior_type = 'pv' then 1 end) as item_pv,
        rank() over (
            partition by category_id
            order by count(case when behavior_type = 'pv' then 1 end) desc
        ) as r
    from userbehavior
    group by category_id, item_id
) as ranked
where ranked.r = 1
order by ranked.item_pv desc
limit 10;

select * from popular_cateitems;
9、RFM模型
-- Recency: the latest date on which each user made a purchase.
select
    user_id,
    max(dates) as '最近购买时间'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by max(dates) desc;

-- Frequency: how many purchase events each user has.
select
    user_id,
    count(user_id) as '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by count(user_id) desc;

-- Combined
-- Recency and frequency per purchasing user in one query (preview).
select
    user_id,
    max(dates) as `最近购买时间`,
    count(user_id) as `购买次数`
from userbehavior
where behavior_type = 'buy'
group by user_id
order by max(dates) desc, count(user_id) desc;

-- Store the result.
drop table if exists rfm_model;
create table rfm_model (
    user_id int,
    recently char(10),
    frequency int
);

insert into rfm_model (user_id, recently, frequency)
select
    user_id,
    max(dates) as recently,
    count(user_id) as frequency
from userbehavior
where behavior_type = 'buy'
group by user_id
order by max(dates) desc, count(user_id) desc;

-- Frequency score: bucket users by purchase count (5 = most frequent).
-- The ranges are written without overlap; the resulting scores are the same
-- as with the original overlapping bounds, because CASE takes the first
-- matching branch.
alter table rfm_model add column fscore int;

update rfm_model
set fscore =
    case
        when frequency >= 20 then 5
        when frequency between 15 and 19 then 4
        when frequency between 10 and 14 then 3
        when frequency between 5 and 9 then 2
        else 1
    end;

-- Recency score: bucket users by last purchase date (5 = most recent).
alter table rfm_model add column rscore int;

update rfm_model
set rscore =
    case
        when recently = '2017-12-03' then 5
        when recently in ('2017-12-01', '2017-12-02') then 4
        when recently in ('2017-11-30', '2017-11-29') then 3
        when recently in ('2017-11-28', '2017-11-27') then 2
        else 1
    end;

-- Segmentation: set variables
-- Average frequency and recency scores; these are the segment thresholds.
set @f_avg = null;
set @r_avg = null;
select avg(fscore), avg(rscore) into @f_avg, @r_avg from rfm_model;

-- Preview the segmentation. A score equal to the average counts as "high"
-- (>=), so every user falls into exactly one of the four segments; with
-- strict > / < on both sides such users would get a NULL class.
select
    user_id,
    recently,
    frequency,
    fscore,
    rscore,
    (case
        when fscore >= @f_avg and rscore >= @r_avg then '价值用户'
        when fscore >= @f_avg and rscore < @r_avg then '保持用户'
        when fscore < @f_avg and rscore >= @r_avg then '发展用户'
        when fscore < @f_avg and rscore < @r_avg then '挽留用户'
    end) as class
from rfm_model;

-- Persist the segment on the table.
alter table rfm_model add column class varchar(40);

update rfm_model
set class =
    case
        when fscore >= @f_avg and rscore >= @r_avg then '价值用户'
        when fscore >= @f_avg and rscore < @r_avg then '保持用户'
        when fscore < @f_avg and rscore >= @r_avg then '发展用户'
        when fscore < @f_avg and rscore < @r_avg then '挽留用户'
    end;

select * from rfm_model;

-- Number of users in each segment.
select
    class,
    count(user_id) as class_num
from rfm_model
group by class;
实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)相关推荐
- 基于天池淘宝用户100万条行为数据分析——SQL、Tableau
目录 一.项目背景和目的 1.1项目背景 1.2项目目的 二.数据来源和数据清洗 2.1数据介绍 2.2数据清洗 2.2.1观察数据添加需要的字段 2.2.2检查是否存在重复值 2.2.3检查是否存在 ...
- MySQL项目-淘宝用户购物行为数据可视化分析
一.项目背景与目的 1.1 项目背景 UserBehavior是阿里巴巴提供的一个淘宝用户行为数据集,用于隐式反馈推荐问题的研究.数据集包含了2017年11月25日至2017年12月3日之间,有行为的 ...
- 淘宝用户购物行为分析
目录 一.项目介绍 1.1 数据集 1.2 分析思路 二.数据预处理 2.1 数据抽样.导入数据 2.2 数据清洗 2.3 导出数据 三.数据分析 3.1 从网站维度分析用户行为 3.1.1 UV.P ...
- 【TIANCHI】天池大数据竞赛(学习赛)--- 淘宝用户购物行为数据可视化分析
目录 前言 一.数据集的来源和各个字段的意义 二.数据分析 1.引入库 2.读入数据 3.查看数据数量级 4.PV(Page View)/UV访问量 5.漏斗模型 6.用户购买商品的频次分析. 7.A ...
- 天池赛:淘宝用户购物行为数据可视化分析
目录 前言 一.赛题介绍 二.数据清洗.特征构建.特征可视化 1.数据缺失值及重复值处理 2.日期分离,PV及UV构建 3.PV及UV可视化 4.用户行为可视化 4.1 各个行为的面积图(以UV为例) ...
- 天池-淘宝用户行为数据分析(python+Tableau)
天池-淘宝用户行为数据分析(python+Tableau) 一.背景 用户行为分析可以让产品更加详细.清楚地了解用户的行为习惯,从而找出网站.app.推广渠道等产品存在的问题,有助于产品发掘高转化 ...
- python开发跟淘宝有关联微_基于Python的Apriori和FP-growth关联分析算法分析淘宝用户购物关联度...
关联分析用于发现用户购买不同的商品之间存在关联和相关联系,比如A商品和B商品存在很强的相关性,常用于实体商店或在线电商的推荐系统,例如某一客户购买A商品,那么他很有可能会购买B商品,通过大量销售数据找 ...
- 阿里天池——淘宝母婴销售项目分析
数据来源:淘宝母婴购物数据集_数据集-阿里云天池 --------------------------------------------------------------------------- ...
- 【Hive+MySQL+Python】淘宝用户购物行为数据分析项目
目录 一.数据集介绍 二.数据处理 1. 数据导入 2. 数据清洗 三.数据分析可视化 1. 用户流量及购物情况 (1)总访问量PV,总用户量UV (2)日均访问量,日均用户量 (3)每个用户的购物情 ...
最新文章
- 这些 Python 不为人知的「坑」,躲都躲不开
- 浅谈Android中Lifecycle
- why the SalesOrder header note is read only
- centos7 修改为任意网卡名_VirtualBox虚拟机双网卡配置实现与本机互通并上网
- JLink v8固件丢失修复教程
- 移动端判断触摸的方向
- linux firefox 检查组件是否加载,利用火狐浏览器查看网站加载速度
- 云计算的下一个时代——“容器时代”
- bzoj 1668: [Usaco2006 Oct]Cow Pie Treasures 馅饼里的财富(DP)
- 10.Configure One-to-Many(配置一对多关系)【Code-First系列】
- Firefox扩展推荐
- oracle 12cora 03113,Oracle12.2 ORA-03113
- keyshot场景素材导入_Keyshot环境贴图大合集 KEYSHOT CLOUD ALL ENVIRONMENTS
- Excel数据透视表教程小结
- rup软件测试案例,胖子说RUP - 软件测试网 _领测软件测试网站-中国软件测试技术第一门户...
- 人工智能 感情 自我意识
- 中国科学院大学计算机研究所2019,中科院计算所2019年夏令营名单
- 用matlab做音乐仿真,Matlab课程设计报告--MATLAB GUI的音乐键盘仿真
- Excel中vlookup模糊查找的妙用(模糊匹配)
- bat批处理笔记(一)