数据集链接:淘宝用户购物行为数据集_数据集-阿里云天池

因原数据集过大,电脑和MySQL跑不动,故截取前一百万条数据进行实战演练。

1、导入数据

利用navicat软件直接导入下载好的Excel文件(较大数据集可用kettle导入)

2、数据预处理

-- Data preprocessing for the userbehavior table (MySQL).
-- Steps: rename columns, check NULLs, remove duplicates, derive date/time
-- columns from the unix timestamp, drop rows outside the study window.
use aori;
desc userbehavior;
select * from userbehavior limit 5;

-- Rename the imported columns f1..f5.
alter table userbehavior change f1 user_id int;
alter table userbehavior change f2 item_id int;
alter table userbehavior change f3 category_id int;
alter table userbehavior change f4 behavior_type varchar(5);
alter table userbehavior change f5 timestamps int;

-- Check every column for NULLs.
select * from userbehavior where user_id is null;
select * from userbehavior where item_id is null;
select * from userbehavior where category_id is null;
select * from userbehavior where behavior_type is null;
select * from userbehavior where timestamps is null;

-- Check for duplicates: rows sharing (user_id, item_id, timestamps).
select user_id, item_id, timestamps
from userbehavior
group by user_id, item_id, timestamps
having count(*) > 1;

-- Deduplicate, step 1: add an auto-increment primary key so that
-- otherwise identical rows can be told apart.
alter table userbehavior add id int first;
alter table userbehavior modify id int primary key auto_increment;
select * from userbehavior limit 5;

-- Deduplicate, step 2: within each duplicate group keep the row with the
-- smallest id and delete the rest.
delete userbehavior
from userbehavior
inner join (
    select user_id, item_id, timestamps, min(id) as id
    from userbehavior
    group by user_id, item_id, timestamps
    having count(*) > 1
) t2
    on  userbehavior.user_id    = t2.user_id
    and userbehavior.item_id    = t2.item_id
    and userbehavior.timestamps = t2.timestamps
where userbehavior.id > t2.id;

-- Add a datetime column converted from the unix timestamp.
alter table userbehavior add datetimes TIMESTAMP(0);
update userbehavior set datetimes = FROM_UNIXTIME(timestamps);
select * from userbehavior limit 5;

-- Split the datetime into date (YYYY-MM-DD), time (HH:MM:SS) and hour (HH)
-- string columns.
alter table userbehavior add dates char(10);
alter table userbehavior add times char(8);
alter table userbehavior add hours char(2);

update userbehavior set dates = substring(datetimes, 1, 10);
update userbehavior set times = substring(datetimes, 12, 8);
update userbehavior set hours = substring(datetimes, 12, 2);
select * from userbehavior limit 5;

-- Remove outliers: inspect the time range, then delete every row outside
-- 2017-11-25 .. 2017-12-03 (upper bound written half-open; datetimes has
-- second precision, so this matches "> '2017-12-03 23:59:59'").
select max(datetimes), min(datetimes) from userbehavior;

delete from userbehavior
where datetimes <  '2017-11-25 00:00:00'
   or datetimes >= '2017-12-04 00:00:00';

-- Data overview.
desc userbehavior;
select * from userbehavior limit 5;
select count(1) from userbehavior; -- the original author reported 999530 rows

3、时间序列分析

-- Count each behavior type per (date, hour).
select dates, hours
,count(if(behavior_type='pv',behavior_type,null)) as pv
,count(if(behavior_type='cart',behavior_type,null)) as cart
,count(if(behavior_type='fav',behavior_type,null)) as fav
,count(if(behavior_type='buy',behavior_type,null)) as buy
from userbehavior
group by dates, hours
order by dates, hours;

-- Result table for the per-date, per-hour behavior counts.
create table date_hour_behavior(
dates char(10),
hours char(2),
pv int,
cart int,
fav int,
buy int
);

-- Store the result of the query above.
insert into date_hour_behavior (dates, hours, pv, cart, fav, buy)
select dates, hours
,count(if(behavior_type='pv',behavior_type,null)) as pv
,count(if(behavior_type='cart',behavior_type,null)) as cart
,count(if(behavior_type='fav',behavior_type,null)) as fav
,count(if(behavior_type='buy',behavior_type,null)) as buy
from userbehavior
group by dates, hours
order by dates, hours;

select * from date_hour_behavior;

4、获客情况

-- Create a scratch table with the same structure as userbehavior.
create table temp_behaviors like userbehavior;

-- Copy a 100000-row sample into it for trying out the queries.
insert into temp_behaviors
select * from userbehavior limit 100000;

select * from temp_behaviors;

-- PV (page views) per date.
select dates
,count(behavior_type) as pv
from temp_behaviors
where behavior_type = 'pv'
group by dates;

-- UV (unique visitors) per date.
select dates
,count(distinct user_id) as uv
from temp_behaviors
where behavior_type = 'pv'
group by dates;

-- PV, UV and PV per visitor in a single statement.
select dates
,count(behavior_type) as pv
,count(distinct user_id) as uv
,round(count(behavior_type)/count(distinct user_id),1) as `pv/uv`
from temp_behaviors
where behavior_type = 'pv'
group by dates;

-- Run the same aggregation on the full table and store the result.
create table pv_uv_puv(
dates char(10),
pv int(9),
uv int(9),
puv decimal(10,1)
);

insert into pv_uv_puv (dates, pv, uv, puv)
select dates
,count(behavior_type) as pv
,count(distinct user_id) as uv
,round(count(behavior_type)/count(distinct user_id),1) as puv
from userbehavior
where behavior_type = 'pv'
group by dates;

select * from pv_uv_puv;

5、留存情况

①留存率计算

-- Distinct (user, active date) pairs.
select user_id, dates
from userbehavior
group by user_id, dates;

-- Self-join: pair every active date of a user (a) with the same or any
-- later active date of that user (b).
select *
from (
    select user_id, dates
    from userbehavior
    group by user_id, dates
) a
inner join (
    select user_id, dates
    from userbehavior
    group by user_id, dates
) b
    on a.user_id = b.user_id
where a.dates <= b.dates;

-- Retention counts per start date: users active on the start date
-- (retention_0), also active 1 day later (retention_1) and 3 days later
-- (retention_3), plus the 1-day retention rate.
-- retention_0 is never zero here: each (user, date) pair always matches
-- itself in the self-join, so the division is safe.
select a.dates
,count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) as retention_0
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null)) as retention_1
,count(if(datediff(b.dates,a.dates)=3,b.user_id,null)) as retention_3
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) as retention_rate_1
from (
    select user_id, dates
    from userbehavior
    group by user_id, dates
) a
inner join (
    select user_id, dates
    from userbehavior
    group by user_id, dates
) b
    on a.user_id = b.user_id
where a.dates <= b.dates
group by a.dates;

-- Store the 1-day retention rate per date.
create table retention_rate(
dates char(10),
retention_1 float
);

insert into retention_rate (dates, retention_1)
select a.dates
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) as retention_rate_1
from (
    select user_id, dates
    from userbehavior
    group by user_id, dates
) a
inner join (
    select user_id, dates
    from userbehavior
    group by user_id, dates
) b
    on a.user_id = b.user_id
where a.dates <= b.dates
group by a.dates;

select * from retention_rate;

②跳失用户计算

-- Bounce rate inputs.
-- Bounce users: users with exactly one behavior record in the whole table.
select count(*)
from
(
select user_id from userbehavior
group by user_id
having count(behavior_type)=1
) a;

-- Total page views, taken from the stored daily PV table.
-- NOTE(review): presumably the bounce rate is the first result divided by
-- this total - confirm the intended denominator.
select sum(pv) from pv_uv_puv;

6、行为路径分析


-- Per (user, item): how many times each behavior type occurred.
create view user_behavior_view as
select user_id, item_id
,count(if(behavior_type='pv',behavior_type,null)) as pv
,count(if(behavior_type='fav',behavior_type,null)) as fav
,count(if(behavior_type='cart',behavior_type,null)) as cart
,count(if(behavior_type='buy',behavior_type,null)) as buy
from userbehavior
group by user_id, item_id;

-- Normalize the counts to 0/1 flags (did the behavior happen at all?).
create view user_behavior_standard as
select user_id, item_id
,(case when pv>0 then 1 else 0 end) as `浏览`
,(case when fav>0 then 1 else 0 end) as `收藏`
,(case when cart>0 then 1 else 0 end) as `加购`
,(case when buy>0 then 1 else 0 end) as `购买`
from user_behavior_view;

-- Path type: the four flags concatenated in the order view, favorite,
-- cart, buy (e.g. '1011'); only (user, item) pairs that ended in a
-- purchase are kept.
create view user_behavior_path as
select a.user_id, a.item_id, a.`浏览`, a.`收藏`, a.`加购`, a.`购买`,
concat(a.`浏览`, a.`收藏`, a.`加购`, a.`购买`) as path_type
from user_behavior_standard as a
where a.`购买` > 0;

-- Number of purchases per path type.
create view path_count as
select path_type
,count(*) as path_type_num
from user_behavior_path
group by path_type
order by path_type_num desc;

select * from path_count;

-- Lookup table mapping each 4-digit path type to a readable description.
create table change_name(
path_type char(4),
description varchar(40));

insert into change_name (path_type, description)
values('0001','购买'),
('1001','浏览购买'),
('0011','加购购买'),
('1011','浏览加购购买'),
('0101','收藏购买'),
('1101','浏览收藏购买'),
('0111','收藏加购购买'),
('1111','浏览收藏加购购买');

select * from change_name;

-- Purchase counts per path, labelled with the readable description.
create table path_result
(description varchar(40)
,path_type_num int);

insert into path_result (description, path_type_num)
select change_name.description,
path_count.path_type_num
from path_count
inner join change_name
on path_count.path_type = change_name.path_type;

select * from path_result;

7、用户转化率分析


-- Number of distinct users per behavior type.
select behavior_type
,count(DISTINCT user_id) as user_num
from userbehavior
group by behavior_type
order by behavior_type desc;

-- Store the result.
create table behavior_user_num(
behavior_type varchar(5),
user_num int
);

insert into behavior_user_num (behavior_type, user_num)
select behavior_type
,count(DISTINCT user_id) as user_num
from userbehavior
group by behavior_type
order by behavior_type desc;

select * from behavior_user_num;

-- Number of records per behavior type.
select behavior_type
,count(*) as behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc;

-- Store the result.
create table behavior_num(
behavior_type varchar(5),
behavior_count_num int
);

insert into behavior_num (behavior_type, behavior_count_num)
select behavior_type
,count(*) as behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc;

select * from behavior_num;

8、TOP商品

-- Top 10 categories by page views.
create table popular_categories(
category_id int,
pv int
);

insert into popular_categories (category_id, pv)
select category_id
,count(if(behavior_type='pv',behavior_type,null)) as pv
from userbehavior
group by category_id
order by pv desc
limit 10;

select * from popular_categories;

-- Top 10 items by page views.
create table popular_items(
item_id int,
pv int
);

insert into popular_items (item_id, pv)
select item_id
,count(if(behavior_type='pv',behavior_type,null)) as pv
from userbehavior
group by item_id
order by pv desc
limit 10;

select * from popular_items;

-- Most-viewed item of each category, then the top 10 of those by page views.
create table popular_cateitems(
category_id int,
item_id int,
pv int
);

insert into popular_cateitems (category_id, item_id, pv)
select a.category_id, a.item_id, a.pv
from
(
-- Page views per (category, item) with the item's rank inside its category.
-- rank() gives tied items the same rank, so a category may contribute more
-- than one row with r = 1.
select category_id
,item_id
,count(if(behavior_type='pv',behavior_type,null)) as pv
,rank() over(partition by category_id order by count(if(behavior_type='pv',behavior_type,null)) desc) as r
from userbehavior
group by category_id, item_id
) a
where a.r = 1
order by a.pv desc
limit 10;

select * from popular_cateitems;

9、RFM模型


-- RFM model (recency and frequency only) built from purchase records.

-- Recency: most recent purchase date per user.
select user_id
,max(dates) as `最近购买时间`
from userbehavior
where behavior_type = 'buy'
group by user_id
order by `最近购买时间` desc;

-- Frequency: number of purchases per user.
select user_id
,count(user_id) as `购买次数`
from userbehavior
where behavior_type = 'buy'
group by user_id
order by `购买次数` desc;

-- Both in one query.
select user_id
,max(dates) as `最近购买时间`
,count(user_id) as `购买次数`
from userbehavior
where behavior_type = 'buy'
group by user_id
order by `最近购买时间` desc, `购买次数` desc;

-- Store the per-user recency and frequency.
drop table if exists rfm_model;
create table rfm_model(
user_id int,
recently char(10),
frequency int
);

insert into rfm_model (user_id, recently, frequency)
select user_id
,max(dates) as recently
,count(user_id) as frequency
from userbehavior
where behavior_type = 'buy'
group by user_id
order by recently desc, frequency desc;

-- Frequency score (1-5) from the purchase count.
-- Ranges are written without overlap; the resulting scores are the same as
-- with the previous overlapping bounds because the first matching branch wins.
alter table rfm_model add column fscore int;

update rfm_model
set fscore =
case when frequency >= 20 then 5
when frequency between 15 and 19 then 4
when frequency between 10 and 14 then 3
when frequency between 5 and 9 then 2
else 1
end;

-- Recency score (1-5) from the last purchase date; the most recent date in
-- the study window (2017-12-03) scores highest.
alter table rfm_model add column rscore int;

update rfm_model
set rscore =
case when recently = '2017-12-03' then 5
when recently in ('2017-12-01','2017-12-02') then 4
when recently in ('2017-11-30','2017-11-29') then 3
when recently in ('2017-11-28','2017-11-27') then 2
else 1
end;

-- Segmentation: compare each score with the average score.
set @f_avg = null;
set @r_avg = null;
select avg(fscore) into @f_avg from rfm_model;
select avg(rscore) into @r_avg from rfm_model;

-- Preview the segments. A score equal to the average counts as "not above
-- average" (<=); with strict < on both sides such rows matched no branch
-- and received a NULL class.
select *
,(case
when fscore >  @f_avg and rscore >  @r_avg then '价值用户'
when fscore >  @f_avg and rscore <= @r_avg then '保持用户'
when fscore <= @f_avg and rscore >  @r_avg then '发展用户'
when fscore <= @f_avg and rscore <= @r_avg then '挽留用户'
end) as class
from rfm_model;

-- Persist the segment on the table, using the same rules as the preview.
alter table rfm_model add column class varchar(40);

update rfm_model
set class=
case
when fscore >  @f_avg and rscore >  @r_avg then '价值用户'
when fscore >  @f_avg and rscore <= @r_avg then '保持用户'
when fscore <= @f_avg and rscore >  @r_avg then '发展用户'
when fscore <= @f_avg and rscore <= @r_avg then '挽留用户'
end;

select * from rfm_model;

-- Number of users in each RFM segment.
select
    class,
    count(user_id) as class_num
from rfm_model
group by class;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)相关推荐

  1. 基于天池淘宝用户100万条行为数据分析——SQL、Tableau

    目录 一.项目背景和目的 1.1项目背景 1.2项目目的 二.数据来源和数据清洗 2.1数据介绍 2.2数据清洗 2.2.1观察数据添加需要的字段 2.2.2检查是否存在重复值 2.2.3检查是否存在 ...

  2. MySQL项目-淘宝用户购物行为数据可视化分析

    一.项目背景与目的 1.1 项目背景 UserBehavior是阿里巴巴提供的一个淘宝用户行为数据集,用于隐式反馈推荐问题的研究.数据集包含了2017年11月25日至2017年12月3日之间,有行为的 ...

  3. 淘宝用户购物行为分析

    目录 一.项目介绍 1.1 数据集 1.2 分析思路 二.数据预处理 2.1 数据抽样.导入数据 2.2 数据清洗 2.3 导出数据 三.数据分析 3.1 从网站维度分析用户行为 3.1.1 UV.P ...

  4. 【TIANCHI】天池大数据竞赛(学习赛)--- 淘宝用户购物行为数据可视化分析

    目录 前言 一.数据集的来源和各个字段的意义 二.数据分析 1.引入库 2.读入数据 3.查看数据数量级 4.PV(Page View)/UV访问量 5.漏斗模型 6.用户购买商品的频次分析. 7.A ...

  5. 天池赛:淘宝用户购物行为数据可视化分析

    目录 前言 一.赛题介绍 二.数据清洗.特征构建.特征可视化 1.数据缺失值及重复值处理 2.日期分离,PV及UV构建 3.PV及UV可视化 4.用户行为可视化 4.1 各个行为的面积图(以UV为例) ...

  6. 天池-淘宝用户行为数据分析(python+Tableau)

    天池-淘宝用户行为数据分析(python+Tableau) 一.背景 ​ 用户行为分析可以让产品更加详细.清楚地了解用户的行为习惯,从而找出网站.app.推广渠道等产品存在的问题,有助于产品发掘高转化 ...

  7. python开发跟淘宝有关联微_基于Python的Apriori和FP-growth关联分析算法分析淘宝用户购物关联度...

    关联分析用于发现用户购买不同的商品之间存在关联和相关联系,比如A商品和B商品存在很强的相关性,常用于实体商店或在线电商的推荐系统,例如某一客户购买A商品,那么他很有可能会购买B商品,通过大量销售数据找 ...

  8. 阿里天池——淘宝母婴销售项目分析

    数据来源:淘宝母婴购物数据集_数据集-阿里云天池 --------------------------------------------------------------------------- ...

  9. 【Hive+MySQL+Python】淘宝用户购物行为数据分析项目

    目录 一.数据集介绍 二.数据处理 1. 数据导入 2. 数据清洗 三.数据分析可视化 1. 用户流量及购物情况 (1)总访问量PV,总用户量UV (2)日均访问量,日均用户量 (3)每个用户的购物情 ...

最新文章

  1. 这些 Python 不为人知的「坑」,躲都躲不开
  2. 浅谈Android中Lifecycle
  3. why the SalesOrder header note is read only
  4. centos7 修改为任意网卡名_VirtualBox虚拟机双网卡配置实现与本机互通并上网
  5. JLink v8固件丢失修复教程
  6. 移动端判断触摸的方向
  7. linux firefox 检查组件是否加载,利用火狐浏览器查看网站加载速度
  8. 云计算的下一个时代——“容器时代”
  9. bzoj 1668: [Usaco2006 Oct]Cow Pie Treasures 馅饼里的财富(DP)
  10. 10.Configure One-to-Many(配置一对多关系)【Code-First系列】
  11. Firefox扩展推荐
  12. oracle 12cora 03113,Oracle12.2 ORA-03113
  13. keyshot场景素材导入_Keyshot环境贴图大合集 KEYSHOT CLOUD ALL ENVIRONMENTS
  14. Excel数据透视表教程小结
  15. rup软件测试案例,胖子说RUP - 软件测试网 _领测软件测试网站-中国软件测试技术第一门户...
  16. 人工智能 感情 自我意识
  17. 中国科学院大学计算机研究所2019,中科院计算所2019年夏令营名单
  18. 用matlab做音乐仿真,Matlab课程设计报告--MATLAB GUI的音乐键盘仿真
  19. Excel中vlookup模糊查找的妙用(模糊匹配)
  20. bat批处理笔记(一)

热门文章

  1. 拒绝B站邀约,从月薪3k到年薪47W,我的经验值得每一个测试人借鉴
  2. Linux 系统 CPU 占用率较高问题排查思路
  3. STM32单片机的启动模式 三种BOOT模式介绍
  4. windows 7软件
  5. 新春伊始:从CHAT-GPT到生成式AI,人工智能新范式
  6. 树莓派-微信-网易云音乐播放器
  7. 武汉创业者声讨网易:占用我们LOGO 还大张旗鼓做起销售
  8. 教育之星 计算机,冉冉升起的教育之星
  9. PS练习6——文字特效处理
  10. 物理隔离与数据交换-网闸中的核心技术