WITH
-- exchange: every column of the v_mafia1_exchange view. The original note
-- says it carries item and us_price information; within this statement only
-- giftbag_id is read (by filter_record).
exchange AS (
    SELECT *
    FROM `heidao-market.recommend_algorithm.v_mafia1_exchange`
),
-- charge_item: every column of the v_mafia1_charge_item view.
-- NOTE(review): no other CTE in this statement references charge_item.
charge_item AS (
    SELECT *
    FROM `heidao-market.recommend_algorithm.v_mafia1_charge_item`
),
-- Original note for the next CTE: read one year of active users, de-duplicated.
-- sdk_player: distinct player_ids that have at least one gift-bag tracking
-- row with '2020-07-18' <= timestamp < '2020-08-25 05:00:00'.
-- NOTE(review): the note preceding this CTE speaks of "one year" of active
-- users, but the window below spans roughly five weeks -- confirm.
sdk_player AS (
    SELECT DISTINCT
        player_id
    FROM `heidao-market.mafia1_ods.sdk_gift_bag_tracking`
    WHERE timestamp >= '2020-07-18'
      AND timestamp < '2020-08-25 05:00:00'
    -- Disabled hash-bucket sampling predicate, kept for reference:
    -- and MOD(CAST(CONCAT('0x', SUBSTR(TO_HEX(MD5(CONCAT('0j46o', CAST(player_id AS STRING)))),0, 8)) AS INT64), 100) between 30 and 44
),
-- Original note for the next CTE: gift-bag purchase records.
-- purchase: gift-bag purchase rows with '2020-11-01' <= timestamp < '2020-11-04'
-- made by players listed in sdk_player. giftbag_id is cast to STRING.
-- Sequence numbers attached to every row:
--   rn              position within the player, newest purchase first
--   purchase_rn     position within the player, oldest purchase first
--   giftbag_rn      position within the gift bag, oldest purchase first
--   max_giftbag_rn  largest giftbag_rn of the same gift bag
-- NOTE(review): this table name carries no project prefix, unlike the other
-- tables in the statement, so it resolves against the default project.
purchase AS (
    SELECT
        *,
        MAX(giftbag_rn) OVER (PARTITION BY giftbag_id) AS max_giftbag_rn
    FROM (
        SELECT
            player_id,
            timestamp,
            CAST(giftbag_id AS STRING) AS giftbag_id,
            ROW_NUMBER() OVER (PARTITION BY player_id ORDER BY timestamp DESC) AS rn,
            ROW_NUMBER() OVER (PARTITION BY player_id ORDER BY timestamp) AS purchase_rn,
            ROW_NUMBER() OVER (PARTITION BY giftbag_id ORDER BY timestamp) AS giftbag_rn
        FROM `mafia1_ods.game_log_giftbag`
        -- Templated form of the window. Original annotation: the lower bound
        -- should be the purchase start time; '{4}' is purchase_end_time.
        -- where timestamp >= '2020-01-01' and timestamp < '{4}'
        WHERE timestamp >= '2020-11-01'
          AND timestamp < '2020-11-04'
          AND player_id IN (SELECT player_id FROM sdk_player)
    )
),
-- Original note for the next CTE: gift-bag click data; only clicks whose dwell
-- time exceeds one second are kept.
-- filter_record: gift-bag tracking rows (type = 70, action = 1) of sdk_player
-- players, for gift bags present in exchange, whose view_time is at least one
-- second. view_time is the gap in seconds between a row and the next tracking
-- row of the same (player_id, event_id, value_1); the LEAD runs over all
-- type-70 rows in the window, before the action = 1 filter is applied.
-- is_purchase is the constant 0.
-- NOTE(review): the outer WHERE tests the raw view_time, so rows with a NULL
-- view_time (no following row) are discarded and IFNULL(view_time, 1) never
-- substitutes a value -- confirm whether such rows were meant to be kept.
filter_record AS (
    SELECT
        player_id,
        giftbag_id,
        timestamp,
        IFNULL(view_time, 1) AS view_time,
        0 AS is_purchase
    FROM (
        SELECT
            player_id,
            value_1 AS giftbag_id,
            timestamp,
            action,
            TIMESTAMP_DIFF(
                LEAD(timestamp) OVER (
                    PARTITION BY player_id, event_id, value_1
                    ORDER BY timestamp ASC
                ),
                timestamp,
                MILLISECOND
            ) / 1000 AS view_time
        FROM `heidao-market.mafia1_ods.sdk_gift_bag_tracking`
        -- Templated form of the window (original annotation: '{2}' is
        -- record_start_time, '{1}' is end_time):
        -- where timestamp >= '{2}' and timestamp < '{1}' and type = 70
        WHERE timestamp >= '2020-10-26'
          AND timestamp < '2020-11-03 14:00:00'
          AND type = 70
          AND player_id IN (SELECT player_id FROM sdk_player)
    )
    WHERE action = 1
      AND view_time >= 1
      AND giftbag_id IN (SELECT CAST(giftbag_id AS STRING) FROM exchange)
),
-- Original note for the next CTE: attach the purchase records that precede
-- each exposure; between two purchases keep only the last click on a gift bag.
-- filter_record1: de-duplicated views.
--   1. Each filter_record row is paired with the same player's latest purchase
--      strictly earlier than the view (rn = 1); views without such a purchase
--      keep NULL purchase columns.
--   2. purchase_rn of a view without an earlier purchase becomes 0.
--   3. Within every (player_id, giftbag_id, purchase_rn) only the newest view
--      survives (rn1 = 1); view_count is the number of views in that group.
filter_record1 AS (
    SELECT * EXCEPT (rn1)
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY player_id, giftbag_id, purchase_rn
                ORDER BY timestamp DESC
            ) AS rn1,
            COUNT(*) OVER (PARTITION BY player_id, giftbag_id, purchase_rn) AS view_count
        FROM (
            SELECT
                * EXCEPT (rn, purchase_rn),
                IFNULL(purchase_rn, 0) AS purchase_rn
            FROM (
                SELECT
                    v.*,
                    p.purchase_rn,
                    p.giftbag_rn,
                    p.max_giftbag_rn,
                    p.giftbag_id AS purchase_giftbag,
                    p.timestamp AS purchase_time,
                    -- Original note: between two purchases only the last
                    -- exposure of a gift bag is used, for de-duplication.
                    ROW_NUMBER() OVER (
                        PARTITION BY v.player_id, v.giftbag_id, v.timestamp
                        ORDER BY p.timestamp DESC
                    ) AS rn
                FROM filter_record AS v
                LEFT JOIN purchase AS p
                    ON v.player_id = p.player_id
                   AND v.timestamp > p.timestamp
            )
            -- rn = 1 selects the purchase closest in time before the view.
            WHERE rn = 1
        )
    )
    WHERE rn1 = 1
    -- order by player_id desc, timestamp desc
),
-- Original note for the next CTE: this SQL aggregates the list of recently
-- purchased gift bags.
-- record_1: one row per filter_record1 view, extended with three
-- comma-separated lists describing up to 50 retained views of the same player
-- that lie between 7 days and 5 minutes before this view:
--   recent_view        gift bag ids of those views
--   recent_view_times  whole hours between the two views, plus one
--   recent_view_count  view_count of the earlier view
-- Every STRING_AGG is ordered by rn (newest earlier view first). Without an
-- explicit ORDER BY the element order of an aggregate is not guaranteed, so
-- the three lists could not be relied on to line up position by position.
-- Original note: grouping by player_id, giftbag_id, timestamp alone would be
-- enough; the remaining GROUP BY keys are redundant and only carry columns.
record_1 AS (
    SELECT
        player_id,
        giftbag_id,
        timestamp,
        is_purchase,
        giftbag_rn,
        max_giftbag_rn,
        purchase_giftbag,
        purchase_time,
        purchase_rn,
        STRING_AGG(view_giftbag, ',' ORDER BY rn) AS recent_view,
        STRING_AGG(CAST(view_delta_time AS STRING), ',' ORDER BY rn) AS recent_view_times,
        STRING_AGG(CAST(view_count AS STRING), ',' ORDER BY rn) AS recent_view_count
    FROM (
        SELECT
            t1.* EXCEPT (view_count),
            t2.view_count,
            t2.giftbag_id AS view_giftbag,
            TIMESTAMP_DIFF(t1.timestamp, t2.timestamp, HOUR) + 1 AS view_delta_time,
            -- rn ranks the earlier views newest first; it caps the lists at 50
            -- entries and fixes their element order.
            ROW_NUMBER() OVER (
                PARTITION BY t1.player_id, t1.giftbag_id, t1.timestamp
                ORDER BY t2.timestamp DESC
            ) AS rn
        FROM filter_record1 AS t1
        -- Self-join: each view against the same player's views of the
        -- preceding seven days (original note).
        LEFT JOIN filter_record1 AS t2
            ON t1.player_id = t2.player_id
           AND t2.timestamp BETWEEN TIMESTAMP_SUB(t1.timestamp, INTERVAL 7*24 HOUR)
                                AND TIMESTAMP_SUB(t1.timestamp, INTERVAL 60*5 SECOND)
    )
    WHERE rn <= 50
    GROUP BY
        player_id, giftbag_id, timestamp, is_purchase, giftbag_rn,
        max_giftbag_rn, purchase_giftbag, purchase_time, purchase_rn
),
-- record_2: record_1 plus two aligned, newest-first lists over up to 10
-- purchases of the same player made between 30 days and 5 minutes before the
-- view:
--   recent_giftbags       purchased gift bag ids
--   recent_giftbag_times  whole hours between purchase and view, plus one
record_2 AS (
    SELECT
        player_id,
        giftbag_id,
        timestamp,
        is_purchase,
        giftbag_rn,
        max_giftbag_rn,
        purchase_giftbag,
        purchase_time,
        purchase_rn,
        recent_view,
        recent_view_times,
        recent_view_count,
        STRING_AGG(purchase_giftbag_id, ',' ORDER BY rn) AS recent_giftbags,
        STRING_AGG(CAST(purchase_delta_time AS STRING), ',' ORDER BY rn) AS recent_giftbag_times
    FROM (
        SELECT
            record_1.*,
            purchase.giftbag_id AS purchase_giftbag_id,
            TIMESTAMP_DIFF(record_1.timestamp, purchase.timestamp, HOUR) + 1 AS purchase_delta_time,
            ROW_NUMBER() OVER (
                PARTITION BY record_1.player_id, record_1.giftbag_id, record_1.timestamp
                ORDER BY purchase.timestamp DESC
            ) AS rn
        FROM record_1
        -- Original note: join the prepared click rows with the purchases of
        -- the most recent 30 days.
        LEFT JOIN purchase
            ON record_1.player_id = purchase.player_id
           AND purchase.timestamp BETWEEN TIMESTAMP_SUB(record_1.timestamp, INTERVAL 30*24 HOUR)
                                      AND TIMESTAMP_SUB(record_1.timestamp, INTERVAL 60*5 SECOND)
    )
    WHERE rn <= 10
    GROUP BY
        player_id, giftbag_id, timestamp, is_purchase, giftbag_rn,
        max_giftbag_rn, purchase_giftbag, purchase_time, purchase_rn,
        recent_view, recent_view_times, recent_view_count
),
-- record: record_2 rows whose preceding purchase is among the last 2000
-- purchases of its gift bag, or whose view timestamp is on or after
-- 2020-11-04 00:00:00.
-- NOTE(review): filter_record only reads rows before 2020-11-03 14:00:00, so
-- with the literals in this statement the timestamp predicate cannot match,
-- and views with a NULL giftbag_rn (no earlier purchase) are all dropped.
record AS (
    SELECT *
    FROM record_2
    WHERE giftbag_rn > max_giftbag_rn - 2000
       -- or giftbag_rn is null and timestamp >= '{5}'
       OR timestamp >= '2020-11-04 00:00:00'
),
-- item_map: every column of the item mapping table; later CTEs read
-- table_id, map_id, exchange_val and use_log from it.
item_map AS (
    SELECT *
    FROM `heidao-market.recommend_algorithm.mafia1_binary_v20200713_item_map`
),
-- Original note for the next CTE: extract a player's item changes over a
-- period of time.
-- change_info: one row per army-log or item-log entry with
-- '2020-10-29' <= timestamp < '2020-11-04 17:00:00' whose id appears in
-- item_map.table_id.
--   table_id        item_map.map_id multiplied by the sign of the raw change
--   change_num      raw change multiplied by item_map.exchange_val
--   timestamp_hour  log timestamp truncated to the hour
--   timestamp_day   log timestamp truncated to the day
--   use_log         carried over from item_map
change_info AS (
    SELECT
        player_id,
        item_map.map_id * SIGN(change_num) AS table_id,
        change_num * item_map.exchange_val AS change_num,
        TIMESTAMP_TRUNC(timestamp, HOUR) AS timestamp_hour,
        TIMESTAMP_TRUNC(timestamp, DAY) AS timestamp_day,
        item_map.use_log
    FROM (
        -- Army log, renamed to the item-log column names.
        SELECT
            player_id,
            army_id AS table_id,
            army_num AS change_num,
            timestamp
        FROM `heidao-market.mafia1_ods.game_log_army`
        WHERE timestamp >= '2020-10-29'
          AND timestamp < '2020-11-04 17:00:00'
          AND army_id IN (SELECT table_id FROM item_map)

        UNION ALL

        -- Item log.
        SELECT
            player_id,
            table_id,
            change_num,
            timestamp
        FROM `heidao-market.mafia1_ods.game_log_item`
        WHERE timestamp >= '2020-10-29'
          AND timestamp < '2020-11-04 17:00:00'
          AND table_id IN (SELECT table_id FROM item_map)
    ) AS raw_change
    LEFT JOIN item_map
        ON raw_change.table_id = item_map.table_id
),
-- item_change_hour: item changes bucketed by hour. change_info is grouped by
-- (player_id, table_id, timestamp_hour, use_log) and the changes are summed.
-- The absolute sum is emitted unchanged when use_log <= 0; otherwise it is
-- compressed to ROUND(LOG10(1 + |sum|) * 10).
item_change_hour AS (
    SELECT
        player_id,
        table_id,
        timestamp_hour AS timestamp,
        CASE
            WHEN use_log <= 0 THEN ABS(SUM(change_num))
            ELSE ROUND(LOG10(1 + ABS(SUM(change_num))) * 10)
        END AS change_num
    FROM change_info
    GROUP BY
        player_id,
        table_id,
        timestamp_hour,
        use_log
),
-- Original note for the next CTE: item changes bucketed by day.
-- item_change_day: item changes bucketed by day. change_info is grouped by
-- (player_id, table_id, timestamp_day, use_log) and the changes are summed.
-- The absolute sum is emitted unchanged when use_log <= 0; otherwise it is
-- compressed to ROUND(LOG10(1 + |sum|) * 10).
item_change_day AS (
    SELECT
        player_id,
        table_id,
        timestamp_day AS timestamp,
        CASE
            WHEN use_log <= 0 THEN ABS(SUM(change_num))
            ELSE ROUND(LOG10(1 + ABS(SUM(change_num))) * 10)
        END AS change_num
    FROM change_info
    GROUP BY player_id, table_id, timestamp_day, use_log
),
-- data: labelled samples. Each record row is paired with the earliest purchase
-- of the same gift bag by the same player made 0 to 60 minutes after the view
-- (rn = 1); rows without such a purchase keep NULL purchase columns.
--   purchase_time    timestamp of the paired purchase
--   share_count      joined rows per (player, gift bag, purchase), counted
--                    before the rn = 1 filter
--   purchase_weight  EXP(-minutes / 30), then divided by the sum of that value
--                    over the surviving rows sharing the same purchase
-- The inner ORDER BY of the original was removed: the ordering of a subquery
-- does not carry into the enclosing query, so it only added a sort.
data AS (
    SELECT
        * EXCEPT (purchase_weight, purchase_delta_time, purchase_rn, rn),
        purchase_weight
            / SUM(purchase_weight) OVER (PARTITION BY player_id, giftbag_id, purchase_rn)
            AS purchase_weight
    FROM (
        SELECT
            record.* EXCEPT (purchase_rn, purchase_time),
            purchase.timestamp AS purchase_time,
            purchase.rn AS purchase_rn,
            TIMESTAMP_DIFF(purchase.timestamp, record.timestamp, MINUTE) AS purchase_delta_time,
            EXP(-TIMESTAMP_DIFF(purchase.timestamp, record.timestamp, MINUTE) / 30 + 0.0) AS purchase_weight,
            COUNT(*) OVER (PARTITION BY record.player_id, record.giftbag_id, purchase.rn) AS share_count,
            ROW_NUMBER() OVER (
                PARTITION BY record.player_id, record.giftbag_id, record.timestamp
                ORDER BY purchase.timestamp
            ) AS rn
        FROM record
        -- Original note: join clicks to purchases, keeping only purchases made
        -- within one hour after the click.
        LEFT JOIN purchase
            ON record.player_id = purchase.player_id
           AND record.giftbag_id = purchase.giftbag_id
           AND TIMESTAMP_DIFF(purchase.timestamp, record.timestamp, MINUTE) BETWEEN 0 AND 60
    )
    WHERE timestamp >= '2020-10-05'
      AND rn = 1
    -- order by player_id, timestamp
),
-- data1: per sample, three aligned comma-separated lists built from the
-- player's hourly item changes whose hour difference to the sample is 1..24:
--   item_hour_id    table_id
--   item_hour_time  hour difference to the sample
--   item_hour_num   change_num
-- All three aggregates share one ORDER BY so that position i of each list
-- refers to the same joined row; without it the element order is not
-- guaranteed.
data1 AS (
    SELECT
        player_id,
        giftbag_id,
        timestamp,
        STRING_AGG(CAST(table_id AS STRING), ',' ORDER BY delta_time, table_id, change_num) AS item_hour_id,
        STRING_AGG(CAST(delta_time AS STRING), ',' ORDER BY delta_time, table_id, change_num) AS item_hour_time,
        STRING_AGG(CAST(change_num AS STRING), ',' ORDER BY delta_time, table_id, change_num) AS item_hour_num
    FROM (
        SELECT
            data.*,
            item_change_hour.table_id,
            item_change_hour.change_num,
            TIMESTAMP_DIFF(data.timestamp, item_change_hour.timestamp, HOUR) AS delta_time
        FROM data
        LEFT JOIN item_change_hour
            ON data.player_id = item_change_hour.player_id
           AND TIMESTAMP_DIFF(data.timestamp, item_change_hour.timestamp, HOUR) BETWEEN 1 AND 24
    )
    GROUP BY player_id, giftbag_id, timestamp
),
-- data2: per sample, three aligned comma-separated lists built from the
-- player's daily item changes whose day difference to the sample is 1..7:
--   item_day_id    table_id
--   item_day_time  day difference to the sample
--   item_day_num   change_num of the daily bucket
-- The shared ORDER BY keeps the lists aligned, as in the hourly lists.
data2 AS (
    SELECT
        player_id,
        giftbag_id,
        timestamp,
        STRING_AGG(CAST(table_id AS STRING), ',' ORDER BY delta_time, table_id, item_num) AS item_day_id,
        STRING_AGG(CAST(delta_time AS STRING), ',' ORDER BY delta_time, table_id, item_num) AS item_day_time,
        STRING_AGG(CAST(item_num AS STRING), ',' ORDER BY delta_time, table_id, item_num) AS item_day_num
    FROM (
        SELECT
            data.*,
            item_change_day.table_id,
            item_change_day.change_num AS item_num,
            TIMESTAMP_DIFF(data.timestamp, item_change_day.timestamp, DAY) AS delta_time
        FROM data
        LEFT JOIN item_change_day
            ON data.player_id = item_change_day.player_id
           AND TIMESTAMP_DIFF(data.timestamp, item_change_day.timestamp, DAY) BETWEEN 1 AND 7
    )
    GROUP BY player_id, giftbag_id, timestamp
),
-- data3: data enriched with the hourly (data1) and daily (data2) item-change
-- lists, matched on (player_id, giftbag_id, timestamp). giftbag_rn,
-- max_giftbag_rn and purchase_giftbag are dropped, and rand_num is added as
-- FLOOR(100 * RAND()) cast to INT64. Only samples with
-- '2020-10-05' <= timestamp < '2020-11-04 17:00:00' are kept.
data3 AS (
    SELECT
        * EXCEPT (giftbag_rn, max_giftbag_rn, purchase_giftbag),
        CAST(FLOOR(100*RAND()) AS INT64) AS rand_num
    FROM (
        SELECT
            sample.*,
            hourly.* EXCEPT (player_id, giftbag_id, timestamp),
            daily.* EXCEPT (player_id, giftbag_id, timestamp)
        FROM data AS sample
        LEFT JOIN data1 AS hourly
            ON sample.player_id = hourly.player_id
           AND sample.giftbag_id = hourly.giftbag_id
           AND sample.timestamp = hourly.timestamp
        LEFT JOIN data2 AS daily
            ON sample.player_id = daily.player_id
           AND sample.giftbag_id = daily.giftbag_id
           AND sample.timestamp = daily.timestamp
    ) AS t1
    WHERE timestamp >= '2020-10-05'
      AND timestamp < '2020-11-04 17:00:00'
    -- Disabled filter excluding internal players:
    -- and player_id not in (select player_id from mafia1_pf.v_internal_player)
),
-- ratio: per gift bag, the summed purchase weight (positive), the sample count
-- (total), the mean of is_purchase (purchase_ratio) and the mean purchase
-- weight (positive_ratio, 0 when no sample has a weight).
-- The original ORDER BY was removed: ratio is only read through the NOT IN
-- subquery in data4, where row order has no effect.
-- NOTE(review): filter_record sets is_purchase to the constant 0, so
-- purchase_ratio is 0 for every gift bag and the ">= 0.9" test in data4
-- cannot match -- confirm whether a real purchase flag was intended.
ratio AS (
    SELECT
        giftbag_id,
        IFNULL(SUM(purchase_weight), 0) AS positive,
        COUNT(*) AS total,
        SUM(is_purchase) / COUNT(*) AS purchase_ratio,
        IFNULL(SUM(purchase_weight) / COUNT(*), 0) AS positive_ratio
    FROM data3
    GROUP BY giftbag_id
),
-- data4: data3 without the gift bags whose purchase_ratio is at least 0.9 or
-- whose positive_ratio is not above 0.
data4 AS (
    SELECT *
    FROM data3
    WHERE giftbag_id NOT IN (
        SELECT giftbag_id
        FROM ratio
        WHERE purchase_ratio >= 0.9
           OR positive_ratio <= 0.0
    )
),
-- status1: player status snapshots taken on or after 2020-11-01.
status1 AS (
    SELECT
        player_id,
        timestamp,
        status
    FROM `heidao-market.mafia1_ods.game_log_status_snapshot`
    WHERE timestamp >= '2020-11-01'
)
-- Final result: each data4 sample with the player's most recent status
-- snapshot taken within the 48 hours up to the sample time; samples without
-- such a snapshot are dropped.
-- The ranking is partitioned by the full sample key
-- (player_id, giftbag_id, timestamp). Partitioning by player_id and timestamp
-- only, as before, kept a single row when one player had samples for several
-- gift bags at the same instant and discarded the others.
-- NOTE(review): LIMIT without ORDER BY returns an arbitrary 1000 rows.
SELECT * EXCEPT (status_rank, status_time)
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY player_id, giftbag_id, timestamp
            ORDER BY status_time DESC
        ) AS status_rank
    FROM (
        SELECT
            data4.*,
            status1.timestamp AS status_time,
            status1.status
        FROM data4
        LEFT JOIN status1
            ON data4.player_id = status1.player_id
           AND status1.timestamp BETWEEN TIMESTAMP_SUB(data4.timestamp, INTERVAL 48 HOUR)
                                     AND data4.timestamp
    )
)
WHERE status_rank = 1
  AND status IS NOT NULL
LIMIT 1000

数据预处理 参考sql相关推荐

  1. 数据预处理 | 机器学习之特征工程

    点击"阅读原文"直接打开[北京站 | GPU CUDA 进阶课程]报名链接 作者:苏小保(jacksu) 华为工程师 擅长分布式系统、大数据、机器学习。github地址:https ...

  2. 云原生架构下日志服务数据预处理

    简介:本篇实践将以某家国际教育机构为例,为大家详细介绍云原生架构下日志服务数据预处理以及对应的解决方案和最佳实践操作手册,方便用户快速对号入座,解决云原生架构下的常见日志难题。直达最佳实践:[htt ...

  3. [Python从零到壹] 十五.文本挖掘之数据预处理、Jieba工具和文本聚类万字详解

    欢迎大家来到"Python从零到壹",在这里我将分享约200篇Python系列文章,带大家一起去学习和玩耍,看看Python这个有趣的世界。所有文章都将结合案例、代码和作者的经验讲 ...

  4. python大数据挖掘系列之淘宝商城数据预处理实战

    在上一章节https://blog.csdn.net/qq_60168783/article/details/121824746 我们聊了python大数据分析的基本模块,下面就说说2个项目吧,第一个 ...

  5. Python数据预处理——格式转换及抽取数据文本信息

    课程地址:https://www.imooc.com/learn/1105 1. 数据预处理简介 1.1 什么是数据预处理 数据预处理简单来说就是:将原始数据装进一个预处理的黑匣子之后,产生出高质量数 ...

  6. 数据预处理+数据清理

    1.概述 实际的数据库极易受噪声、缺失值和不一致数据的侵扰,因为数据库太大,并且多半来自多个异种数据源。低质量的数据将会导致低质量的挖掘结果。有大量的数据预处理技术: - - 数据清理:可以用来清除数 ...

  7. R语言数据预处理——离散化(分箱)

    R语言数据预处理--离散化(分箱) 一.项目环境 开发工具:RStudio R:3.5.2 相关包:infotheo,discretization,smbinning,dplyr,sqldf 二.导入 ...

  8. R 多变量数据预处理_数据科学 | 第3讲 数据清洗与预处理

    点击上方蓝字,记得关注我们! 在实际数据挖掘过程中,我们拿到的初始数据,往往存在缺失值、重复值、异常值或者错误值,通常这类数据被称为"脏数据",需要对其进行清洗。另外有时数据的原始 ...

  9. 基于Python的海量豆瓣电影、数据获取、数据预处理、数据分析、可视化、大屏设计项目(含数据库)

    目录 项目介绍 研究背景 国内外研究现状分析 研究目的 研究意义 研究总体设计 网络爬虫介绍 豆瓣电影数据的采集 数据预处理 大数据分析及可视化 豆瓣影评结构化分析 大屏可视化 文本可视化 总结 每文 ...

最新文章

  1. python入门:工欲善其事,必先利其器
  2. PHP HashTable总结
  3. 前端学习(661):逻辑运算符
  4. pat乙级相当于什么水平_英语四六级/专四/专八相当于美国人什么水平?
  5. Atitit 管理plus 的概念,为什么要留长发与管理思想的结合 目录 1.1. 孝道的体现 身体发肤 受之发肤 不敢毁伤 出自 1 1.2. 著作介绍 1 1.3. 传统国学文化的复兴 中国
  6. python 密码输入显示星号_[145]python实现控制台密码星号输入
  7. 二次线性插值实现图像放大(计算机视觉)
  8. 新手播音小白,想学播音从什么地方开始?
  9. SuiteScritp 2.0开发实例 自定义工单+领料单 单据流转 打印
  10. android 短信字体,安卓短信字体 安卓短信字体大小设置
  11. cm parcels无法在centos7.x版本下分发
  12. 用python实现分段函数_python:集成分段函数
  13. HTML5 Canvas鼠标与键盘事件
  14. ajax前后端通信的头部消息之请求头与响应头
  15. 计算机论文指导记录范本,论文指导内容记录怎么写 3篇 论文指导记录20篇
  16. 软件工程文档——步骤流程图
  17. GitHub 的 2021 年度报告,全球程序员好像都在卷呐!
  18. 【国产之光】:龙芯1B(嵌入式方向)
  19. java获取文件大小_Java实现获取文件大小的几种方法
  20. 使用JSSDK实现网站的QQ登录

热门文章

  1. 每日文献:2018-01-11
  2. SpringMVC 中整合JSON、XML视图一
  3. 免费图标字体:一套圣诞节相关的图标字体
  4. Object Builder Application Block (2)
  5. 太阳能工程联箱知识_暖通设计|太阳能热水系统设计参考手册(供参考)
  6. leetcode 5. 最长回文子串 暴力法、中心扩展算法、动态规划,马拉车算法(Manacher Algorithm)
  7. source insight搜不到关联代码
  8. spring boot第二讲
  9. java线程的cancel_多线程-Cancel详解
  10. java游戏2333整合包,我的世界魔法荣耀