数仓SQL面试题(持续更新中!!!)
nvl空字段赋值 nvl(comm, -1)
count(*) sum(1)
LATERAL VIEW udtf(expression) tableAlias AS columnAlias
split() 按分隔符切分字符串,返回数组
explode()将 hive 一列中复杂的 Array 或者 Map 结构拆分成多行。
CAST('1' AS INT) 字符串'1'转换成整数1;如果强制类型转换失败,如执行
CAST('X' AS INT),表达式返回空值 NULL
concat 拼接函数
collect_list 将某字段的值进行汇总,产生array类型字段。
collect_set 将某字段的值进行去重汇总,产生array类型字段。
concat_ws(separator,[string|array(string)]) separator是分隔符,数组按照分隔符进行拼接
CURRENT ROW:当前行
n PRECEDING:往前 n 行数据
n FOLLOWING:往后 n 行数据
UNBOUNDED:起点,
UNBOUNDED PRECEDING 表示从前面的起点,
UNBOUNDED FOLLOWING 表示到后面的终点
LAG(col,n,default_val):往前第 n 行数据
LEAD(col,n, default_val):往后第 n 行数据
NTILE(n):把有序窗口的行分发到指定数量(n 个)的组中,各个组有编号,编号从 1 开始,对
于每一行,NTILE 返回此行所属的组的编号。注意:n 必须为 int 类型。
datediff(CURRENT_DATE(), "1990-06-04");
date_add(current_date(), 90);
current_date 返回当前日期
next_day 返回指定日期之后一周中特定的日期
date_format
last_day 获取每个月的最后一天
get_json_object
get_json_object(json_txt, path): 从一个JSON字符串中取出指定路径对应的数据!
核心:path怎么写?$: 代表根对象
. : 获取子元素的操作符
[] : 获取一个数组中子元素的操作符
round(x, d) 保留 d 位小数
--各种聚合
-- Side-by-side demo of SUM(cost) under different window definitions on `business`.
select
    name,
    orderdate,
    cost,
    -- no partition, no order: total over every row
    sum(cost) over () as sample1,
    -- total per name
    sum(cost) over (partition by name) as sample2,
    -- running total per name in orderdate order (default frame)
    sum(cost) over (partition by name order by orderdate) as sample3,
    -- running total per name with an explicit ROWS frame: start of partition .. current row
    -- (gives the same numbers as sample3 when orderdate has no ties within a name)
    sum(cost) over (
        partition by name order by orderdate
        rows between UNBOUNDED PRECEDING and current row
    ) as sample4,
    -- previous row + current row
    sum(cost) over (
        partition by name order by orderdate
        rows between 1 PRECEDING and current row
    ) as sample5,
    -- previous row + current row + next row
    sum(cost) over (
        partition by name order by orderdate
        rows between 1 PRECEDING AND 1 FOLLOWING
    ) as sample6,
    -- current row .. end of partition
    sum(cost) over (
        partition by name order by orderdate
        rows between current row and UNBOUNDED FOLLOWING
    ) as sample7
from business;
1.最多连胜次数
透过题目看本质:开窗函数算是一个打标记,然后我通过打的标记,去找对应的规律。得出最终的结果。
题目解析:
2.连胜的最大天数
3.直播间访客峰值
4.相互关注
5.统计累积访问次数
需求:我们有如下的用户访问数据
要求:使用SQL统计出每个用户的累积访问次数,如下表所示:
-- Setup for question 5: working database plus the raw visit table.
-- IF NOT EXISTS keeps the script re-runnable instead of failing on the second run.
create database if not exists test_sql;
use test_sql;
-- Question 1 table: one row per (user, visit date) with that day's visit count.
create table if not exists test_sql.test1 (
    userId     string,
    visitDate  string,  -- stored as text, e.g. '2017/1/21'
    visitCount int
)
row format delimited fields terminated by "\t";
-- Sample visits for question 5. Dates are unpadded 'yyyy/M/d' strings.
-- The row ('u02', '2017/1/23', 6) appears twice on purpose in the sample data.
insert into table test_sql.test1
values
    ('u01', '2017/1/21', 5),
    ('u02', '2017/1/23', 6),
    ('u03', '2017/1/22', 8),
    ('u04', '2017/1/20', 3),
    ('u01', '2017/1/23', 6),
    ('u01', '2017/2/21', 8),
    ('u02', '2017/1/23', 6),
    ('u01', '2017/2/22', 4);
set spark.sql.shuffle.partitions=4;
-- Per user: monthly visit subtotal (sum1) and running total across months (sum2).
-- The month key is built from the '/'-separated parts of visitDate and the month
-- is zero-padded. A fixed-width substr(visitDate, 0, 6) would turn '2017/11/2'
-- into '2017/1', and unpadded month strings would sort '2017/11' before '2017/2',
-- which breaks the ORDER BY that drives the running total.
select
    *,
    sum(sum1) over (
        partition by userId
        order by visitMonth
        rows between unbounded preceding and current row
    ) as sum2  -- running total
from (
    select
        userId,
        concat(split(visitDate, '/')[0], '-', lpad(split(visitDate, '/')[1], 2, '0')) as visitMonth,
        sum(visitCount) as sum1  -- monthly subtotal
    from test1
    group by
        userId,
        concat(split(visitDate, '/')[0], '-', lpad(split(visitDate, '/')[1], 2, '0'))
) t
order by userId, visitMonth;
-- Optimization 1 (next query): derive the month with date_format instead.
-- Per user: monthly subtotal (sum1) and running total (sum2).
-- visitDate is rewritten from '/' to '-' separators so date_format can render 'yyyy-MM'.
with monthly as (
    select
        userId,
        date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') as visitMonth,
        sum(visitCount) as sum1  -- monthly subtotal
    from test1
    group by
        userId,
        date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM')
)
select
    *,
    sum(sum1) over (
        partition by userId
        order by visitMonth
        rows between unbounded preceding and current row
    ) as sum2  -- running total
from monthly
order by userId, visitMonth;
6.2017年11月的新客数
数据准备
-- Order table for question 6: one row per order.
create table test_sql.test3 (
    dt       string,            -- order date as text, e.g. '2017-01-01'
    order_id string,
    user_id  string,
    amount   decimal(10, 2)
)
row format delimited fields terminated by '\t';
-- Sample orders for question 6, loaded with one multi-row insert so the engine
-- runs a single write instead of nine separate single-row writes.
insert into table test_sql.test3
values
    ('2017-01-01', '10029028',  '1000003251', 33.57),
    ('2017-01-01', '10029029',  '1000003251', 33.57),
    ('2017-01-01', '100290288', '1000003252', 33.57),
    ('2017-02-02', '10029088',  '1000003251', 33.57),
    ('2017-02-02', '10028888',  '1000008888', 33.57),
    ('2017-02-02', '100290281', '1000003251', 33.57),
    ('2017-02-02', '100290282', '1000003253', 33.57),
    ('2017-11-02', '10290282',  '100003253',  234),
    ('2018-11-02', '10290284',  '100003243',  234);
答案:
select * from test3;
-- (1) For every month of 2017: number of orders, number of users, total amount.
-- Each column sits on its own line so the trailing comments cannot swallow
-- the columns that follow them.
select
    date_format(dt, 'yyyy-MM') as month1,
    count(order_id)            as cnt_orders,  -- orders
    count(distinct user_id)    as cnt_users,   -- distinct users
    sum(amount)                as sum_amt      -- total amount
from test3
where year(dt) = 2017
group by date_format(dt, 'yyyy-MM');
-- Syntax 2 (next query): filter the year with date_format instead of year().
-- Same monthly statistics, filtering the year through date_format.
-- date_format returns a string, so it is compared with the string '2017'
-- rather than relying on implicit string-to-number coercion.
select
    date_format(dt, 'yyyy-MM') as month1,
    count(order_id)            as cnt_orders,  -- orders
    count(distinct user_id)    as cnt_users,   -- distinct users
    sum(amount)                as sum_amt      -- total amount
from test3
where date_format(dt, 'yyyy') = '2017'
group by date_format(dt, 'yyyy-MM');
-- (2) Number of new customers in November 2017 (first order placed in that month).
-- A user's first order month is min(ym); users whose first month is '2017-11'
-- are counted as new customers of that month.
with t1 as (
    -- tag every order with its 'yyyy-MM' month
    select
        *,
        date_format(dt,'yyyy-MM') as ym
    from test3
),
t2 as (
    -- earliest order month per user
    select
        user_id,
        min(ym) as min_ym
    from t1
    group by user_id
)
select count(user_id) as cnt
from t2
where min_ym='2017-11';
7.
-- Approach 1: visitors per shop, counted with count(distinct ...).
select
    shop,
    count(distinct user_id) as uv
from test2
group by shop;
-- Approach 2: de-duplicate (user_id, shop) pairs with GROUP BY first,
-- then count the remaining rows per shop.
select
    shop,
    count(user_id) as uv
from (
    select
        user_id,
        shop
    from test2
    group by user_id, shop
) t
group by shop;
-- (2) Top 3 visitors per shop by visit count: shop, visitor id, visit count.
-- Syntax 1: nested subqueries.
-- The outer derived table now has an alias (Hive rejects an unnamed one), and
-- the ORDER BY inside the innermost subquery was dropped because row_number()
-- defines its own ordering.
select *
from (
    select
        shop,
        user_id,
        cnt,
        row_number() over (partition by shop order by cnt desc) rn
    from (
        select
            user_id,
            shop,
            count(*) as cnt
        from test2
        group by user_id, shop
    ) t
) ranked
where rn<=3;
-- Syntax 2 (next query): the same logic written with WITH ... AS.
-- Top 3 visitors per shop by visit count, written with CTEs.
with t1 as (
    -- visits per (user, shop); the ORDER BY that used to sit here was dropped
    -- because the final query does its own ordering
    select
        user_id,
        shop,
        count(*) as cnt
    from test2
    group by user_id, shop
),
t2 as (
    -- rank visitors inside each shop, most visits first
    select
        shop,
        user_id,
        cnt,
        row_number() over (partition by shop order by cnt desc) rn
    from t1
)
select *
from t2
where rn<=3
order by shop, cnt desc;
8.
-- Total url count per age band, with the bands spelled out in a CASE.
with t1 as (
    -- number of urls logged per user
    select
        user_id,
        count(url) as cnt
    from test4log
    group by user_id
),
t2 as (
    -- label each user with a 10-year band; there is no ELSE, so ages outside
    -- 0..70 (or NULL) get a NULL age_phase
    select
        user_id,
        age,
        case
            when age>=0 and age<=10 then '0-10'
            when age>10 and age<=20 then '10-20'
            when age>20 and age<=30 then '20-30'
            when age>30 and age<=40 then '30-40'
            when age>40 and age<=50 then '40-50'
            when age>50 and age<=60 then '50-60'
            when age>60 and age<=70 then '60-70'
        end age_phase
    from test4user
),
t3 as (
    select
        age_phase,
        sum(cnt) sum1
    from t1
    join t2 on t1.user_id=t2.user_id
    group by t2.age_phase
)
select * from t3;
-- Approach 2: derive the age band arithmetically instead of listing every range.
with t1 as (
    -- number of urls logged per user
    select
        user_id,
        count(url) as cnt
    from test4log
    group by user_id
),
t2 as (
    -- band = [floor(age/10)*10, (floor(age/10)+1)*10]; an age that is an exact
    -- multiple of 10 lands in the band that starts at it (20 -> '20-30')
    select
        user_id,
        age,
        concat( floor(age/10)*10,'-',(floor(age/10)+1)*10) as age_phase
    from test4user
),
t3 as (
    select
        age_phase,
        sum(cnt) sum1
    from t1
    join t2 on t1.user_id=t2.user_id
    group by t2.age_phase
)
select * from t3;
-- Scratch check of the lower bound for age 15.
select floor(15/10)*10 as x;
-- Scratch checks for the age-band arithmetic.
-- Upper bound for age 15 via ceil.
select ceil(15/10)*10 as x;

-- floor/ceil pair for ages that are not a multiple of 10.
select concat( floor(15/10)*10,'-',ceil(15/10)*10) as x;
select concat( floor(25/10)*10,'-',ceil(25/10)*10) as x;

-- For an exact multiple of 10, ceil and floor coincide, so the upper bound
-- is taken as floor + 1 instead.
select concat( floor(20/10)*10,'-',(floor(20/10)+1)*10) as x;
9.
-- Earliest payment of each user within October 2017.
select
    userid,
    money,
    paymenttime
from (
    -- number each user's October payments from oldest to newest
    select
        *,
        row_number() over (partition by userid order by paymenttime) rn
    from test6
    where date_format(paymenttime, 'yyyy-MM') = '2017-10'
) t
where rn = 1;
10.
-- Count and average age for (a) all users and (b) active users, where an
-- active user has visit records on two consecutive days.
with t1 as (
    -- step 1: one row per user
    select distinct
        user_id,
        age
    from test5
),
t2 as (
    select
        '所有用户' as type,
        count(user_id) as cnt,
        avg(age) as avg_age
    from t1
),
-- step 2: active users
t3 as (
    -- one row per (day, user)
    select distinct
        dt,
        user_id,
        age
    from test5
),
t4 as (
    -- number each user's visit days in date order
    select
        dt,
        user_id,
        age,
        row_number() over (partition by user_id order by dt) as rn
    from t3
),
t5 as (
    -- date minus sequence number: consecutive days share the same date2
    select
        *,
        date_sub(dt,rn) as date2
    from t4
),
t6 as (
    -- a date2 seen at least twice means at least two consecutive visit days
    select
        user_id,
        date2,
        max(age) age,
        count(1) as cnt
    from t5
    group by user_id, date2
    having count(1)>=2
),
t7 as (
    select distinct
        user_id,
        age
    from t6
),
t8 as (
    select
        '活跃用户' as type,
        count(user_id) as cnt,
        avg(age) as avg_age
    from t7
)
select * from t2
union all
select * from t8;
11.
-- (1) Table definitions for the library schema: book, reader and borrow log.

-- Book table.
create table test_sql.book (
    book_id   string,
    `SORT`    string,
    book_name string,
    writer    string,
    OUTPUT    string,
    price     decimal(10, 2)
);

-- Reader table.
create table test_sql.reader (
    reader_id string,
    company   string,
    name      string,
    sex       string,
    grade     string,
    addr      string
);

-- Borrow log: which reader borrowed which book, and when.
create table test_sql.borrow_log (
    reader_id   string,
    book_id     string,
    borrow_date string
);
-- (2) Name and company of every reader whose name starts with '李'.
select
    name,
    company
from reader
where name like '李%';
-- (3) Book name and price of every book published by '科学出版社',
--     sorted by price, highest first (the sort is part of the requirement).
select
    book_name,
    price
from book
where OUTPUT='科学出版社'
order by price desc;
-- (4) Category (SORT), publisher (OUTPUT) and price of books priced from 10 to 20
--     inclusive, sorted ascending by publisher and then by price.
select
    SORT,
    OUTPUT,
    price
from book
where price between 10 and 20
order by OUTPUT, price;
-- (5) Name and company of every reader who has borrowed at least one book.
-- A semi join keeps each reader once; a plain inner join would repeat the
-- reader once per borrow_log row.
select
    r.name,
    r.company
from reader r
left semi join borrow_log l
    on r.reader_id = l.reader_id;
-- (6) Highest, lowest and average price of books published by '科学出版社'.
select
    max(price),
    min(price),
    avg(price)
from book
where OUTPUT='科学出版社';
-- (7) Readers with at least two rows in borrow_log, with name and company.
select
    b.reader_id,
    b.name,
    b.company
from reader b
join borrow_log a
    on a.reader_id=b.reader_id
group by
    b.reader_id,
    b.name,
    b.company
having count(*)>=2;
-- (8) Backup requirement: with a single statement, create BORROW_LOG_BAK under
--     the backup user `bak` with the same columns as borrow_log, and copy all
--     existing rows into it.
-- The target is qualified with `bak` because the requirement asks for the table
-- to live under that user; an unqualified name would create it in the current schema.
-- NOTE(review): assumes a schema/database named bak already exists — confirm.
create table bak.BORROW_LOG_BAK as
select *
from borrow_log;
-- (9) Hive DDL for the book table migrated from Oracle.
--     Hints from the question: column separator is '|', data is loaded from
--     outside, partitions are named month_part and day_part.
-- The delimiter is written as a plain '|': it is a literal character, not a
-- regex, so '\\|' would be read as starting with a backslash.
-- `SORT` is back-quoted to match the book table definition.
create table book_hive (
    book_id   string,
    `SORT`    string,
    book_name string,
    writer    string,
    OUTPUT    string,
    price     decimal(10, 2)
)
partitioned by (month_part string, day_part string)
row format delimited fields terminated by '|'
stored as textfile;
-- (10) Hive table A: in month partition 201505, set user_dinner to 'bonc8920'
--      for user_id 20000 and leave every other user's user_dinner unchanged.
--      Hint from the question: there is no UPDATE, so rewrite the data instead.
-- Approach: overwrite the table with the changed rows UNION ALL the untouched rows.
-- Both branches list the same two columns so the UNION ALL lines up.
-- NOTE(review): assumes A's data columns are exactly (user_id, user_dinner), and
-- no partition is named here because A's partition column is not visible —
-- add the matching PARTITION (...) clause and partition filter before running.
insert overwrite table A
select user_id, 'bonc8920' as user_dinner from A where user_id = 20000
union all
select user_id, user_dinner from A where user_id != 20000;
12.
-- Top 10 IPs by number of calls to '/api/user/login' between 14:00 (inclusive)
-- and 15:00 (exclusive) on 2016-11-09.
-- `date` is back-quoted because DATE is a reserved word in Hive.
select
    ip,
    count(*)
from test8
where `date` >= '2016-11-09 14:00:00'
  and `date` <  '2016-11-09 15:00:00'
  and interface='/api/user/login'
group by ip
order by count(*) desc
limit 10;
13.
-- For rows created on 2019-01-02: the row with the highest money in each dist_id.
-- The window uses the standard PARTITION BY ... ORDER BY pair, and the stray
-- empty statement that followed the query was removed.
select *
from (
    select
        *,
        row_number() over (partition by dist_id order by money desc) as rn
    from test9
    where to_date(create_time) = '2019-01-02'
) t
where rn = 1;
14.
-- Top 10 accounts by gold within each dist_id.
select
    dist_id,
    account,
    gold
from (
    select
        *,
        row_number() over (partition by dist_id order by gold desc) rn
    from test_sql.test10
) t
where rn <= 10;
15.行转列
-- Pivot: one output row per a, with c spread into col_A / col_B by the value of b.
select
    a,
    max(case when b="A" then c end) col_A,
    max(case when b="B" then c end) col_B
from t1
group by a;
16.列转行
-- Unpivot: turn col_a / col_b into one row each, labelled "A" / "B" in column b.
select
    a,
    b,
    c
from (
    select a, "A" as b, col_a as c from t1_2
    union all
    select a, "B" as b, col_b as c from t1_2
) tmp;
17.行转列
-- Pivot where one (a, b) pair can carry several c values: the distinct values
-- are first joined into a comma-separated string, then pivoted by b.
select
    a,
    max(case when b="A" then c end) col_A,
    max(case when b="B" then c end) col_B
from (
    select
        a,
        b,
        concat_ws(",",collect_set(cast(c as string))) as c
    from t1
    group by a, b
) tmp
group by a;
18.按a分组取b字段最小时对应的c字段
-- Per a: the c value on the row with the smallest b.
-- The derived table is aliased `ranked` so it no longer shares a name with column a.
select
    a,
    c as min_c
from (
    select
        a,
        b,
        c,
        row_number() over(partition by a order by b) as rn
    from t2
) ranked
where rn = 1;
19.按a分组取b字段排第二时对应的c字段
-- Per a: the c value on the row whose b ranks second (ascending).
-- The derived table is aliased `ranked` so it no longer shares a name with column a.
select
    a,
    c as second_c
from (
    select
        a,
        b,
        c,
        row_number() over(partition by a order by b) as rn
    from t2
) ranked
where rn = 2;
20.按a分组取b字段最小和最大时对应的c字段
-- Per a: the c value at the smallest b (min_c) and at the largest b (max_c).
-- Rows are numbered in both directions, then the two "first" rows are folded
-- into a single output row per a.
select
    a,
    min(if(asc_rn = 1, c, null)) as min_c,
    max(if(desc_rn = 1, c, null)) as max_c
from (
    select
        a,
        b,
        c,
        row_number() over(partition by a order by b) as asc_rn,
        row_number() over(partition by a order by b desc) as desc_rn
    from t2
) ranked
where asc_rn = 1 or desc_rn = 1
group by a;
21.按a分组取b字段第二小和第二大时对应的c字段
-- Per a: the c value at the second-smallest b (min_c) and second-largest b (max_c).
select
    ret.a,
    max(case when ret.rn_min = 2 then ret.c else null end) as min_c,
    max(case when ret.rn_max = 2 then ret.c else null end) as max_c
from (
    select
        *,
        row_number() over(partition by t2.a order by t2.b) as rn_min,
        row_number() over(partition by t2.a order by t2.b desc) as rn_max
    from t2
) as ret
where ret.rn_min = 2
   or ret.rn_max = 2
group by ret.a;
22.按a分组取b字段前两小和前两大时对应的c字段
-- Per a: comma-joined c values for the two smallest b (min_c) and the two
-- largest b (max_c). Each side is computed separately and joined on a.
select
    tmp1.a as a,
    min_c,
    max_c
from (
    -- c values of the two rows with the smallest b
    select
        a,
        concat_ws(',', collect_list(c)) as min_c
    from (
        select
            a,
            b,
            c,
            row_number() over(partition by a order by b) as asc_rn
        from t2
    ) lo
    where asc_rn <= 2
    group by a
) tmp1
join (
    -- c values of the two rows with the largest b
    select
        a,
        concat_ws(',', collect_list(c)) as max_c
    from (
        select
            a,
            b,
            c,
            row_number() over(partition by a order by b desc) as desc_rn
        from t2
    ) hi
    where desc_rn <= 2
    group by a
) tmp2
on tmp1.a = tmp2.a;
23. 按a分组按b字段排序,对c累计求和
24.按a分组按b字段排序,对c取累计平均值
25.按a分组按b字段排序,对b取累计排名比例
-- Per a, ordered by b: position of the row divided by the number of non-null c
-- in the group, rounded to 2 decimals.
select
    a,
    b,
    c,
    round(
        row_number() over(partition by a order by b)
            / (count(c) over(partition by a)),
        2
    ) as ratio_c
from t3
order by a, b;
26.按a分组按b字段排序,对b取累计求和比例
-- Per a, ordered by b: running sum of c divided by the group's total c,
-- rounded to 2 decimals.
select
    a,
    b,
    c,
    round(
        sum(c) over(partition by a order by b)
            / (sum(c) over(partition by a)),
        2
    ) as ratio_c
from t3
order by a, b;
27.按a分组按b字段排序,对c取前后各一行的和
不包含当前行
-- Per a, ordered by b: previous row's c plus next row's c (current row excluded).
-- A missing neighbour at either end of the group contributes 0.
select
    a,
    b,
    lag(c,1,0) over(partition by a order by b)
        + lead(c,1,0) over(partition by a order by b) as sum_c
from t4;
28.按a分组按b字段排序,对c取平均值
看题。
-- Per a, ordered by b: average of the current c and the previous row's c.
-- The first row of each group has no predecessor and keeps its own c.
select
    a,
    b,
    case
        when lag_c is null then c
        else (c+lag_c)/2
    end as avg_c
from (
    select
        a,
        b,
        c,
        lag(c,1) over(partition by a order by b) as lag_c
    from t4
) temp;
29.产生连续数值
语法: space(int n)
返回值: string
说明:返回长度为n的空格字符串
举例:
hive> select space(10) from dual;
hive> select length(space(10)) from dual;
10
space函数与split函数结合,可以得到空格字符串数组
hive> select split(space(10), '');
[" "," "," "," "," "," "," "," "," "," ",""]
-- Generate consecutive ids: space(99) split on ' ' yields a 100-element array,
-- explode turns it into 100 rows, and row_number() numbers them.
select
    row_number() over() as id
from (select split(space(99), ' ') as x) t
lateral view
explode(x) ex;
那如何产生1至1000000连续数值?
-- Same trick scaled up: space(999999) split on ' ' yields 1,000,000 elements,
-- so the ids run from 1 to 1000000.
select
    row_number() over() as id
from (select split(space(999999), ' ') as x) t
lateral view
explode(x) ex;
30.数据扩充
31.数据扩充,排除偶数
32.如何处理字符串累计拼接
33.如果a字段有重复,如何实现字符串累计拼接
-- Cumulative string concatenation over t6.a when a contains duplicates.
-- Every row gets a sequence number; each row is paired with all rows that are
-- not greater in both value and sequence, and the paired values are joined with '、'.
select
    a,
    b
from (
    select
        t.a,
        t.rn,
        concat_ws('、',collect_list(cast(t.a1 as string))) as b
    from (
        select
            a.a,
            a.rn,
            b.a1
        from (
            select a, row_number() over(order by a ) as rn from t6
        ) a
        left join (
            select a as a1, row_number() over(order by a ) as rn from t6
        ) b
        on 1 = 1
        where a.a >= b.a1
          and a.rn >= b.rn
        order by a.a, b.a1
    ) t
    group by t.a, t.rn
    order by t.a, t.rn
) tt;
34.数据展开
如何将字符串"1-5,16,11-13,9"扩展成"1,2,3,4,5,16,11,12,13,9"?注意顺序不变。
-- Expand '1-5,16,11-13,9' into its individual numbers, keeping the item order.
-- A generated number series is matched against every comma-separated item:
-- either the number lies inside an 'x-y' range or it equals a single value.
select concat_ws(',',collect_list(cast(rn as string)))
from (
    select
        a.rn,
        b.num,
        b.pos
    from (
        -- number series; space(20) can be enlarged to cover bigger values
        select
            row_number() over() as rn
        from (select split(space(20), ' ') as x) t
        lateral view explode(x) pe
    ) a
    lateral view outer posexplode(split('1-5,16,11-13,9', ',')) b as pos, num
    where a.rn between cast(split(num, '-')[0] as int) and cast(split(num, '-')[1] as int)
       -- explicit cast: a range item such as '1-5' becomes NULL and never matches here
       or a.rn = cast(num as int)
    order by pos, rn
) t;
35.合并与拆分
36.合并与拆分
37.如何将字符'1'的位置提取出来
-- For each a, list the 1-based positions at which the character "1" occurs.
-- The string is taken apart as characters 1, 2, 3 and the last character
-- (presumably a is 4 characters long — confirm), then exploded with its index.
select
    a,
    concat_ws(",",collect_list(cast(index as string))) as res
from (
    select
        a,
        index+1 as index,  -- posexplode positions start at 0
        chr
    from (
        select
            a,
            concat_ws(",",substr(a,1,1),substr(a,2,1),substr(a,3,1),substr(a,-1)) str
        from t8
    ) tmp1
    lateral view posexplode(split(str,",")) t as index, chr
    where chr = "1"
) tmp2
group by a;
38.不使用distinct或group by去重
-- De-duplicate (year, num) pairs without DISTINCT or GROUP BY: stack columns
-- a, b, c as `year`, number the rows inside each (year, num) pair, keep row 1.
select
    t2.year,
    t2.num
from (
    select
        *,
        row_number() over (partition by t1.year,t1.num) as rank_1
    from (
        select a as year, d as num from t9
        union all
        select b as year, d as num from t9
        union all
        select c as year, d as num from t9
    ) t1
) t2
where rank_1=1
order by num;
39.反转逗号分隔的数据:改变顺序,内容不变
-- Reverse the order of comma-separated items while keeping each item's content:
-- reverse the whole string, split it, then reverse every piece back.
select
    a,
    concat_ws(",",collect_list(reverse(str)))
from (
    select
        a,
        str
    from t10
    lateral view explode(split(reverse(a),",")) t as str
) tmp1
group by a;
40.反转逗号分隔的数据:改变内容,顺序不变
-- Keep the order of comma-separated items but reverse the content of each item:
-- split the string, reverse every piece, join the pieces again.
select
    a,
    concat_ws(",",collect_list(reverse(str)))
from (
    select
        a,
        str
    from t10
    lateral view explode(split(a,",")) t as str
) tmp1
group by a;
41.成对提取数据,字段一一对应
-- Pair up the '/'-separated items of a and b by position.
-- Both columns are exploded with their positions; only rows where the two
-- positions match are kept. The two lateral views carry distinct aliases
-- (ta / tb) instead of sharing one.
select
    a_inx,
    b_inx
from (
    select
        a,
        b,
        a_id,
        a_inx,
        b_id,
        b_inx
    from t11
    lateral view posexplode(split(a,'/')) ta as a_id, a_inx
    lateral view posexplode(split(b,'/')) tb as b_id, b_inx
) tmp
where a_id=b_id;
数仓SQL面试题(持续更新中!!!)相关推荐
- db2dual_DB2常用SQL的写法(持续更新中...)
DB2常用SQL的写法(持续更新中...) -- Author: lavasoft -- Date : 2006-12-14 -- 创建一个自定义单值类型 create distinct typ ...
- psid mysql_DB2常用SQL的写法(持续更新中...)
DB2常用SQL的写法(持续更新中...) -- Author: lavasoft -- Date : 2006-12-14 -- 创建一个自定义单值类型 create distinct typ ...
- 史上最全的spark面试题——持续更新中
1.spark中的RDD是什么,有哪些特性? 答:RDD(Resilient Distributed Dataset)叫做分布式数据集,是spark中最基本的数据抽象,它代表一个不可变,可分区,里面的 ...
- java史上最全面试题--持续更新中(一)
1.面向对象的特征有哪些方面? 抽象:将同类对象的共同特征提取出来构造类. 继承:基于基类创建新类. 封装:将数据隐藏起来,对数据的访问只能通过特定接口. 多态性:不同子类型对象对相同消息作出不同响应 ...
- 2020年Java面试题及答案_Java面试宝典_Java笔试题(持续更新中)
推荐面试视频教程 2019年最新Java互联网大厂面试精粹 前言 前言: 少年易老学难成,一寸光阴不可轻.未觉池塘春草梦,阶前梧叶已秋声 .-朱熹<劝学诗> 勤奋才是改变你命运的唯一捷径. ...
- Dubbo面试题及答案,2021年Dubbo面试题-持续更新中
2021最新Dubbo面试题[附答案解析]Dubbo面试题及答案2021,Dubbo最新面试题及答案,Dubbo面试题新答案已经全部更新完了,有些答案是自己总结的,也有些答案是在网上搜集整理的.这些答 ...
- SpringBoot面试题大汇总附答案,SpringBoot面试题-持续更新中
2021最新SpringBoot面试题[附答案解析]SpringBoot面试题及答案2021,SpringBoot2021最新面试题及答案,SpringBoot面试题新答案已经全部更新完了,有些答案是 ...
- mysql查询更新优化_mysql查询优化(持续更新中)
1.索引不会包含有NULL值的列 (1) 应尽量避免在where子句中对字段进行null值判断,否则将导致引擎放弃使用索引而进行全表扫描 (2) 数据库设计时不要让字段的默认值为null,可以 ...
- 面试1:Java、微服务、架构常见面试题(持续更新中)
Java.微服务.架构常见面试题(持续更新中) 文章目录 Java.微服务.架构常见面试题(持续更新中) ==**Java**== 1.Java概述 (1)JVM.JRE和JDK (2)Java特点 ...
- 2020年拼多多校招面试题及答案-最全最新-持续更新中(2)
大家好我是好好学习天天编程的天天 一个整天在互联网上种菜和砍柴的程序员~ 2020年拼多多校招面试题及答案-最全最新-持续更新中(2) 2020年拼多多校招面试题一面 2020年拼多多校招面试题一面- ...
最新文章
- Exchange 2010与Exchange Online混合部署PART 5:配置边缘
- Android之Fragment使用简介
- oracle区号,Oracle 存儲過程
- 51单片机——LCD1602
- mongodb常用操作命令(待续)
- 计算机网络原理应用题/计算题
- 苹果Mac视频特效合成神器:Blackmagic Fusion Studio
- 一维和二维傅里叶变换的图片直观理解
- B站视频下载助手使用教程
- PC控制台使用-素材管理
- 彩色蟒蛇绘制。对 Python 蟒蛇的每个部分采用不同颜色,绘制一条彩色蟒蛇。
- 掌握IDEA的这两款插件,写“破解补丁”分分钟
- 基于FPGA的GV7600驱动
- 一起来“泡博”[--老沙]
- CSS反爬虫 大众点评
- 全面理解云上网络技术
- 阿里云轻量应用服务器基于CentOS系统镜像快速部署Apache服务
- 项目变更管理、项目集合管理、流程管理、知识管理、战略管理真题
- 联想z485在ubuntu13.04系统下发热量大的解决办法
- 软件学习史上最强攻略之--如何选择软件测试培训学校