《Hive编程指南》阅读笔记

首先搭建Hive编程环境，点我查看！

第 2 章 基础操作

2.7 命令行界面

2.7.1 CLI 选项

hive --help
-- 查看 cli 服务的使用帮助
hive --service cli --help

2.7.2 变量和属性

-- Show the variables and properties of all the namespaces above
set;
-- Same as above, plus every property defined by Hadoop
set -v;

-- Define a custom property (it must be its own statement, otherwise foo is never set)
set foo = bar;
-- Show the custom property; equivalent to "set hiveconf:foo;"
set foo;
-- Show the custom property (explicit hiveconf: prefix)
set hiveconf:foo;
-- Reference the custom property: its value is substituted as the column name
create table t3(i int, ${hiveconf:foo} string);

-- Define a custom variable
set hivevar:foo2 = bar2;
-- Show the custom variable
set hivevar:foo2;
-- Show the custom variable (hivevar: prefix omitted)
set foo2;
-- Reference the custom variable (hivevar: prefix omitted)
create table t1(i int, ${foo2} string);
-- Reference the custom variable (explicit hivevar: prefix)
create table t2(i int, ${hivevar:foo2} string);

-- Set this configuration property when launching hive so the current database is displayed; note: no spaces around "="
hive --hiveconf hive.cli.print.current.db=true;
-- Change the configuration property
set hive.cli.print.current.db= false;
-- Change the configuration property (explicit hiveconf: prefix)
set hiveconf:hive.cli.print.current.db = true;

-- Show a system variable (the system: prefix cannot be omitted)
set system:user.name;
-- Show an environment variable (the env: prefix cannot be omitted)
set env:HOME;

2.7.7 查看操作命令历史

$HOME/.hivehistory

2.7.8 执行shell命令

-- Only simple shell commands can be executed this way
! /bin/echo ${hiveconf:foo};

2.7.9 在Hive中使用Hadoop的dfs命令

 -- Run the Hadoop dfs "-ls" command on the root path from inside Hive
 dfs -ls /;

2.7.10 Hive脚本中如何进行注释

-- Hive注释

略

具体直接看书，随用随查，没必要浪费时间手动做一遍

2.7.3 Hive中的“一次使用”命令
2.7.4 从文件中执行Hive查询
2.7.5 hiverc文件

第 3 章 数据类型和文件格式

3.1 基本数据类型

所有这些类型都是对Java接口的实现，这些类型的具体行为细节和Java中对应的类型是完全一致的。

3.2 集合数据类型

3.3 文本文件数据编码

-- Table demonstrating the collection types with explicit text-file delimiters.
-- Remove Tab characters before pasting this into the Hive CLI
-- (presumably because the CLI treats Tab as auto-completion; confirm).
create table employees (
    name         string,
    salary       float,
    subordinates array<string>,
    deductions   map<string, float>,
    address      struct<street:string, city:string, state:string, zip:int>
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003'
lines terminated by '\n'
stored as textfile;

3.4 读时模式

第 4 章 HiveQL：数据定义

4.1 Hive中的数据库

Hive 数据库本质仅仅是一个目录或者命名空间。

-- Create the database only if it does not exist; no error if it already exists
create database if not exists mydb;

-- Create the database; fails with an error if it already exists
create database mydb;

show databases like 'm.*';
use mydb;
show tables like 'e.*';

-- Create a database with an explicit HDFS path (location), a description (comment)
-- and properties (dbproperties).
-- The path of the default database "default" is the one configured by hive.metastore.warehouse.dir.
create database d4
comment 'about something'
location '/d4'
with dbproperties ('creator' = 'Tom', 'date' = '2021-04-29');

-- Display the current database
set hive.cli.print.current.db=true;

-- Query the current database
select current_database();

-- "extended" also shows the property information
desc database extended d4;

-- Modify or add properties; properties cannot be removed
alter database d4 set dbproperties ('creator' = 'Jack');

-- No error if d1 does not exist; fails if d1 exists and contains one or more tables
drop database if exists d1;

-- If d1 exists, drop all of its tables first (cascade), then drop d1
drop database if exists d1 cascade;

4.3 创建表

-- Create the table with column comments, a table comment and table properties.
-- Remove Tab characters before pasting this into the Hive CLI
-- (presumably because the CLI treats Tab as auto-completion; confirm).
create table if not exists d4.employees (
    name         string comment 'Employee name',
    salary       float comment 'Employee salary',
    subordinates array<string> comment 'Names of subordinates',
    deductions   map<string, float> comment 'Keys are deductions names, values are percentages',
    address      struct<street:string, city:string, state:string, zip:int> comment 'Home address'
)
comment 'Description of the table'
tblproperties ('creator'='Tom', 'created_time'='2021-04-29');

-- Show table information
describe extended d4.employees;
desc formatted d4.employees;

-- Create a new table employees3 with the same structure as employees
create table if not exists d4.employees3 like d4.employees;

4.3.1 内部表

删除内部表，会删除数据。

4.3.2 外部表

删除外部表，不会删除数据，但元数据信息会删除。

-- Create an external table
create external table if not exists external_t1 (col string)
row format delimited fields terminated by ','
location '/data/external_t1';

-- Create an external table external_t2 with the same structure as external_t1
create external table if not exists external_t2 like external_t1
location '/path/to/data';

4.4 分区表

-- Partitioned table: country and state are partition columns, not regular columns.
-- Remove Tab characters before pasting this into the Hive CLI
-- (presumably because the CLI treats Tab as auto-completion; confirm).
create table if not exists d4.employees100 (
    name         string comment 'Employee name',
    salary       float comment 'Employee salary',
    subordinates array<string> comment 'Names of subordinates',
    deductions   map<string, float> comment 'Keys are deductions names, values are percentages',
    address      struct<street:string, city:string, state:string, zip:int> comment 'Home address'
)
partitioned by (country string, state string);

-- Strict mode: queries must specify a partition, otherwise they fail
set hive.mapred.mode = strict;

-- Non-strict mode: queries do not have to specify a partition
set hive.mapred.mode = nonstrict;

-- List partitions
show partitions employees100;
show partitions employees100 partition(country='US');

-- Show the information of one partition
desc formatted employees100 partition(country='China', state = 'Beijing');

4.5 删除表

-- Drop the table; "if exists" suppresses the error when it does not exist
drop table if exists external_t1;

<!-- core-site.xml: trash configuration. Deleted files are moved to the trash and are removed from it once they are older than 1440 (minutes, per the description below) -->
<property>
  <name>fs.trash.interval</name>
  <value>1440</value>
  <description>Number of minutes between trash checkpoints. If zero, the trash feature is disabled.</description>
</property>

4.6 修改表

-- Rename the table
alter table employees rename to emps;

-- Add a partition; path: /d4/employees100/country=China/state=Beijing
alter table employees100 add if not exists partition(country = 'China', state = 'Beijing');

-- Add a partition at an explicit path: /d4/employees100/jap/d
alter table employees100 add if not exists partition(country = 'JAP', state = 'D') location '/d4/employees100/jap/d';

-- Change the path of a partition; the data under the old path is not deleted
alter table employees100 partition(country = 'US', state = 'NBA') set location '/d4/employees100/country=US/state=NBA';

-- Drop a partition
alter table employees100 drop if exists partition(country = 'US', state = 'NBA');

-- Change a column; for a column that is already in the first position, use "first" instead of "after name"
alter table employees100 change column salary pay float comment 'salary' after name;

-- Add columns
alter table employees100 add columns(app_name string comment 'apps');

-- Replace columns: removes all existing columns (except the partition columns) and uses these new ones
alter table employees100 replace columns(
    name         string comment 'Employee name',
    salary       float comment 'Employee salary',
    subordinates array<string> comment 'Names of subordinates',
    deductions   map<string, float> comment 'Keys are deductions names, values are percentages',
    address      struct<street:string, city:string, state:string, zip:int> comment 'Home address'
);

-- Set table properties
alter table employees100 set tblproperties('prop' = 'prop');

-- Changing the storage properties of a table (e.g. the file format): omitted

-- Archive a partition into a single HAR file: fewer files and less NameNode pressure, but no storage savings
alter table employees100 archive partition (country="China",state="Beijing");

-- Restore an archived partition to a regular partition
alter table employees100 unarchive partition (country="China",state="Beijing");

-- Protect the partition from being dropped
alter table employees100 partition (country="China",state="Beijing") enable no_drop;

-- Protect the partition from being queried
alter table employees100 partition (country="China",state="Beijing") enable offline;

-- Allow the partition to be dropped and queried again
alter table employees100 partition (country="China",state="Beijing") disable no_drop;
alter table employees100 partition (country="China",state="Beijing") disable offline;

第 5 章 HiveQL：数据操作

-- 创建表
create table mydb.employees (
    -- Use spaces, not Tabs, for this indentation: remove Tab characters before pasting into the
    -- Hive CLI (presumably because the CLI treats Tab as auto-completion; confirm).
    name         string comment 'Employee name',
    salary       float comment 'Employee salary',
    subordinates array<string> comment 'Names of subordinates',
    deductions   map<string, float> comment 'Keys are deductions names, values are percentages',
    address      struct<street:string, city:string, state:string, zip:int> comment 'Home address'
)
row format delimited
fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

-- 数据准备，/home/mi2/env/data/data_type_t1.txt
Tom,5000.0,Tom_sub1#Tom_sub2#Tom_sub3,deduction1:120.0#deduction2:50.0#deduction3:200.0,Tom_street#Tom_city#Tom_state#123456
Jack,6000.0,Jack_sub1#Jack_sub2#Jack_sub3,deduction1:120.0#deduction2:50.0#deduction3:200.0,Jack_street#Jack_city#Jack_state#123456

导入数据

-- Load a local file into a Hive table
load data local inpath '/home/mi2/env/data/data_type_t1.txt'
overwrite into table employees;

-- Notes on the optional keywords:
--   local     : load a local file; omit it to load a file from HDFS
--   overwrite : overwrite existing data; omit it to append
--   partition : target partition of a partitioned table; omit it for a non-partitioned table
load data local inpath '/home/mi2/env/data/data_type_t1.txt'
overwrite into table employees
partition (country = 'US', state = 'CA');

-- Insert from a query, overwriting existing data
insert overwrite table employees select * from employees;

-- Insert from a query, appending to existing data
insert into table employees select * from employees;

-- Insert from a query, overwriting one specific partition
insert overwrite table employees partition(country = 'US', state = 'CA') select * from employees;

-- Scan staged_employees once and insert different rows into different partitions
-- (partition values written out statically). The source table is aliased "se"
-- because every where clause below refers to it by that alias.
from staged_employees se
insert overwrite table employees
    partition(country = 'US', state = 'OR')
    select * where se.country = 'US' and se.state = 'OR'
insert overwrite table employees
    partition(country = 'US', state = 'CA')
    select * where se.country = 'US' and se.state = 'CA'
insert overwrite table employees
    partition(country = 'US', state = 'IL')
    select * where se.country = 'US' and se.state = 'IL';

-- Dynamic partition insert: the values of country/state are taken from the last two
-- columns of the select list, matched by position. "..." stands for the regular columns.
insert overwrite table employees partition (country, state)
select ...,se.country,se.state from staged_employees se;

-- Mixing static and dynamic partitions: static partition keys must come before dynamic ones.
-- NOTE(review): with country fixed statically, only state is dynamic here; confirm whether
-- se.country should still appear in the select list.
insert overwrite table employees partition (country = 'US', state)
select ...,se.country,se.state from staged_employees se where se.country = 'US';

-- Configuration properties can be set before running the HiveQL statements
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=100;

导出数据

-- overwrite/into mean overwrite/append; local means the local filesystem
insert overwrite local directory '/home/mi2/env/data/export_data_1'
select name, salary, address from employees;

第 6 章 HiveQL：查询

6.1 SELECT…FROM

select * from employees limit 1;
-- Query result
-- Tom 5000.0  ["Tom_sub1","Tom_sub2","Tom_sub3"]    {"deduction1":120.0,"deduction2":50.0,"deduction3":200.0} {"street":"Tom_street","city":"Tom_city","state":"Tom_state","zip":123456}

-- Access an array element by index, a map value by key and a struct field by name
select name, salary, subordinates[0], deductions['deduction1'], address.city from employees;
-- Query result
-- Tom 5000.0  Tom_sub1    120.0   Tom_city
-- Jack    6000.0  Jack_sub1   120.0   Jack_city

-- Function calls and arithmetic in the select list
select upper(name), salary, deductions['deduction1'],round(salary - deductions['deduction1']) from employees;
-- Query result
-- TOM 5000.0  120.0   4880.0
-- JACK    6000.0  120.0   5880.0

算术运算符

Hive遵循的是底层Java中数据类型的规则。

函数

-- List all Hive functions
show functions;
-- Show the description of a function
desc function when;
-- Show the detailed description of a function
desc function extended abs;

-- Fails: the select list may contain only the single column explode(subordinates), no other columns
select name, explode(subordinates) from employees;
-- FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions

-- How to select other columns together with the exploded values
select name, sub from employees lateral view explode(subordinates) subView as sub;

说明：用户自定义函数需要编写 Java 类并打成 JAR 包后引入（第13章）。

嵌套查询

-- Query over a subquery in the from clause, aliased as t
select t.name
from (
    select *
    from mydb.employees
) t;

CASE…WHEN…THEN

-- Classify each salary into a named bracket
select
    name,
    salary,
    case
        when salary < 3000.0 then 'low'
        when salary >= 3000.0 and salary < 5000.0 then 'middle'
        when salary >= 5000.0 and salary < 7000.0 then 'high'
        else 'very high'
    end as bracket
from employees;

什么情况下Hive可以避免MapReduce

-- In the author's experiments, simple queries (no aggregation such as count) never launch MapReduce, whether or not local mode (hive.exec.mode.local.auto) is enabled
select name, salary*1.1 from employees where salary < 10000.0;

6.2 WHERE语句

注意
1）浮点比较误差，尽量不要进行浮点比较，若非要比较，得避免float自动转double情况
2）LIKE只有%和_两个匹配字符，RLIKE/REGEXP后是真的正则表达式

6.3 GROUP BY

-- Average salary per name: where filters rows before grouping, having filters the groups afterwards.
-- The literals are cast to float before being compared with the salary values
-- (see the float comparison note in section 6.2).
select name, avg(salary)
from employees
where salary >= cast(5000.0 as float)
group by name
having avg(salary) >= cast(3000.0 as float);

6.4 JOIN

只支持等值连接（且on后多个等值条件不支持or，只能是and），不支持自然连接（要有相同属性的列，连接后去除重复列），不支持非等值连接（on后的条件不是=，而是<等）。

-- Equi-join: self-join of employees on name
select e1.name, e1.salary
from employees e1 join employees e2
on e1.name = e2.name
where e1.name = 'Tom';

-- Non-equi join condition (uses < instead of =)
select e1.name, e1.salary
from employees e1 join employees e2
on e1.salary < e2.salary -- not supported
where e1.name = 'Tom';

注意
1）三个及以上表join，如果每个on都使用相同的连接键，只会产生一个MapReduce job，Hive假定最后一个表是最大的表，用户要保证连续查询的表从左到右是依次增大的（如果只有一个小表，可以设置hive.auto.convert.join=true，把小表放在内存里，加速！也可以通过hive.mapjoin.smalltable.filesize设置小表大小）。
2）连接类型
[INNER] JOIN
LEFT [OUTER] JOIN
RIGHT [OUTER] JOIN
FULL [OUTER] JOIN
LEFT SEMI JOIN（查询结果只显示左表的）
笛卡尔积（如：select * from a join b，设置hive.mapred.mode=strict可以禁用笛卡尔积）

6.5 ORDER BY 和 SORT BY

order by：全局排序（可能很耗时）
sort by：只是每个reducer有序

-- Sort by name ascending, then by salary descending
select name, salary from employees order by name asc, salary desc;

6.6 含有SORT BY的DISTRIBUTE BY

默认情况下，MapReduce计算框架会根据map输入的键计算相应的哈希值，然后按照哈希值把键值对均匀地分发到多个reducer。而distribute by col可以把相同的col分布到同一个reducer，然后再sort by排序。

-- Author's note: when distribute by and sort by use the same columns, the pair is equivalent to cluster by (which cannot sort descending!) and can be used to get a global ordering
select name, salary from employees distribute by name sort by name asc, salary desc;

6.8 类型转换

Hive默认隐式类型转换，底层就是Java，窄类型自动转宽类型。而强制类型转换可以用cast，cast可嵌套使用。

6.9 抽样查询

略

6.10 UNION ALL

将两个及以上表合并，每个表有相同的列，且列的数据类型一样。

第 7 章 HiveQL：视图

-- Views are read-only
create view view_01 as
select name, salary from employees;

-- Run in the MySQL database "hive" that stores the Hive metadata: list all views
select TBL_NAME from TBLS where TBL_TYPE = 'VIRTUAL_VIEW';

第 8 章 HiveQL：索引

Hive的索引其实是一张索引表（Hive的物理表），在表里面存储索引列的值，该值对应的HDFS的文件路径，该值在数据文件中的偏移量。

-- Create the partitioned table employees10.
-- Remove Tab characters before pasting this into the Hive CLI
-- (presumably because the CLI treats Tab as auto-completion; confirm).
create table mydb.employees10 (
    name         string comment 'Employee name',
    salary       float comment 'Employee salary',
    subordinates array<string> comment 'Names of subordinates',
    deductions   map<string, float> comment 'Keys are deductions names, values are percentages',
    address      struct<street:string, city:string, state:string, zip:int> comment 'Home address'
)
partitioned by (country string, city string)
row format delimited
fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

-- Load data
load data local inpath '/home/mi2/env/data/data_type_t1.txt' overwrite into table employees10  partition (country = 'US', city = 'CA');

-- Create the index; "with deferred rebuild" leaves it empty until it is rebuilt below
create index employees10_index
on table employees10 (name)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
with deferred rebuild
idxproperties ('creator' = 'creator', 'ct' = '2021-04-30')
in table employees10_index_tb
comment 'Employees indexed by name';

-- Show the index
show formatted index on employees10;

-- Build the index (author's note: the book's example has an extra "table" after "on", which is wrong)
alter index employees10_index on employees10 rebuild;

-- Drop the index
drop index employees10_index on employees10;

第 9 章 模式设计