Doris 数据模型及自动分区使用案例代码说明

简介

Doris数据模型使用案例演示。

使用

Duplicate 模型

-- Duplicate model demo: every loaded row is stored as-is, with no de-duplication.
-- Drop the table this script creates (not an unrelated one) so the demo can be re-run.
DROP TABLE IF EXISTS test_stu_duplicate;

CREATE TABLE test_stu_duplicate (
    id     INT,
    name   VARCHAR(100),
    gender TINYINT,
    score  TINYINT
)
DUPLICATE KEY (id, name)
DISTRIBUTED BY HASH (name);

-- Explicit column lists keep the inserts valid if the column order ever changes.
INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);

INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (2, 'shenliang', 1, 20);

-- Intentionally identical to the first row, to show that duplicates are kept.
INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);

INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (3, 'liangshen', 0, 20);

id	name	gender	score
3	liangshen	0	20
1	shenliang	1	10
2	shenliang	1	20
1	shenliang	1	10

#第3条和第1条完全重复，但 Duplicate 模型在数据写入时不会过滤，即不会对重复数据去重。

Unique模型

-- Unique model demo: rows that share the same UNIQUE KEY (id, name) collapse into one row.
CREATE TABLE test_stu_unique (
    id     INT,
    name   VARCHAR(100),
    gender TINYINT,
    score  TINYINT
)
UNIQUE KEY (id, name)
DISTRIBUTED BY HASH (id)
PROPERTIES ("replication_num" = "1");

-- Explicit column lists keep the inserts valid if the column order ever changes.
INSERT INTO test_stu_unique (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);

INSERT INTO test_stu_unique (id, name, gender, score) VALUES (2, 'shenliang', 1, 20);

-- Same key (1, 'shenliang') as the first row, so the table still holds one row for it.
INSERT INTO test_stu_unique (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);

INSERT INTO test_stu_unique (id, name, gender, score) VALUES (3, 'liangshen', 0, 20);

id	name	gender	score
3	liangshen	0	20
1	shenliang	1	10
2	shenliang	1	20

Aggr模型

-- Aggregate model demo: rows sharing the same AGGREGATE KEY (name, id, gender)
-- are merged, and each value column is combined with its declared aggregation.
CREATE TABLE test_stu_aggr (
    name           VARCHAR(100),
    id             INT,
    gender         TINYINT,
    score          TINYINT REPLACE,  -- the later load replaces the earlier value
    acc_classtimes INT SUM,          -- accumulated across loads
    max_classtimes INT MAX,          -- largest value seen
    min_classtimes INT MIN           -- smallest value seen
)
AGGREGATE KEY (name, id, gender)
DISTRIBUTED BY HASH (id)
PROPERTIES ("replication_num" = "1");

-- Both rows share the key ('shenliang', 1, 1), so they merge into a single row:
-- score = 40, acc_classtimes = 5 + 8 = 13, max_classtimes = 12, min_classtimes = 1.
-- Column lists are spelled out because the seven positional values are easy to misalign.
INSERT INTO test_stu_aggr
    (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 1, 1, 30, 5, 10, 1);

INSERT INTO test_stu_aggr
    (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 1, 1, 40, 8, 12, 3);

name	id	gender	score	acc_classtimes	max_classtimes	min_classtimes
shenliang	1	1	40	13	12	1

这里插入了2条记录，但由于 AGGREGATE KEY(name,id,gender) 相同，会按各指标列定义的聚合方式（REPLACE/SUM/MAX/MIN）合并为1条记录。再插入 key 不同的记录时，则会生成新的行：

-- These rows differ from each other in gender, so each gets its own aggregate key
-- and is stored as a separate row. Column lists are explicit to avoid mixing up
-- the positional id/gender values.
INSERT INTO test_stu_aggr
    (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 2, 1, 10, 2, 15, 5);

INSERT INTO test_stu_aggr
    (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 2, 0, 20, 6, 7, 7);

name	id	gender	score	acc_classtimes	max_classtimes	min_classtimes
shenliang	1	1	40	13	12	1
shenliang	2	1	10	2	15	5
shenliang	2	0	20	6	7	7

物化视图

物化视图是将预先计算（根据定义好的 SELECT 语句）好的数据集，存储在 Doris 中的一个特殊的表。

-- Materialize the per-gender maximum of max_classtimes on top of test_stu_aggr.
CREATE MATERIALIZED VIEW test_stu_maxclasstimes AS
SELECT
    gender,
    MAX(max_classtimes)
FROM test_stu_aggr
GROUP BY gender;

-- Run the same aggregation through EXPLAIN; the `rollup:` entry of the scan node
-- in the plan shows which index was chosen to serve the query.
EXPLAIN
SELECT
    gender,
    MAX(max_classtimes)
FROM test_stu_aggr
GROUP BY gender;

Explain String

PLAN FRAGMENT 0

OUTPUT EXPRS:<slot 2> `gender` | <slot 3> max(`max_classtimes`)

PARTITION: UNPARTITIONED

RESULT SINK

4:EXCHANGE

PLAN FRAGMENT 1

OUTPUT EXPRS:

PARTITION: HASH_PARTITIONED: <slot 2> `gender`

STREAM DATA SINK

EXCHANGE ID: 04

UNPARTITIONED

3:AGGREGATE (merge finalize)

| output: max(<slot 3> max(`max_classtimes`))

| group by: <slot 2> `gender`

| cardinality=-1

2:EXCHANGE

PLAN FRAGMENT 2

OUTPUT EXPRS:

PARTITION: HASH_PARTITIONED: `default_cluster:dw_incubate`.`test_stu_aggr`.`id`

STREAM DATA SINK

EXCHANGE ID: 02

HASH_PARTITIONED: <slot 2> `gender`

1:AGGREGATE (update serialize)

| STREAMING

| output: max(`max_classtimes`)

| group by: `gender`

| cardinality=-1

0:OlapScanNode

TABLE: test_stu_aggr

PREAGGREGATION: ON

partitions=1/1

rollup: test_stu_maxclasstimes

tabletRatio=10/10

tabletList=292497,292499,292501,292503,292505,292507,292509,292511,292513,292515

cardinality=0

avgRowSize=5.0

numNodes=1

Rollup

Rollup可认为是物化视图的一个子集。

-- Add a rollup on test_stu_aggr that keeps only id and acc_classtimes.
ALTER TABLE test_stu_aggr ADD ROLLUP rollup_id (id, acc_classtimes);

-- Check the plan of a query that only touches the rollup's columns; the `rollup:`
-- entry of the scan node shows which index was chosen.
EXPLAIN
SELECT
    id,
    SUM(acc_classtimes)
FROM test_stu_aggr
GROUP BY id;

Explain String

PLAN FRAGMENT 0

OUTPUT EXPRS:<slot 2> `id` | <slot 3> sum(`acc_classtimes`)

PARTITION: UNPARTITIONED

RESULT SINK

2:EXCHANGE

PLAN FRAGMENT 1

OUTPUT EXPRS:

PARTITION: HASH_PARTITIONED: `default_cluster:dw_incubate`.`test_stu_aggr`.`id`

STREAM DATA SINK

EXCHANGE ID: 02

UNPARTITIONED

1:AGGREGATE (update finalize)

| output: sum(`acc_classtimes`)

| group by: `id`

| cardinality=-1

0:OlapScanNode

TABLE: test_stu_aggr

PREAGGREGATION: ON

partitions=1/1

rollup: rollup_id

tabletRatio=10/10

tabletList=292475,292477,292479,292481,292483,292485,292487,292489,292491,292493

cardinality=0

avgRowSize=8.0

numNodes=1

均值

Doris里不支持Avg聚合模型；在指标类数据不为NULL的情况下，可通过追加计数器字段来计算均值。详见#2里指标不为NULL的情况（id=1、id=3）。

#1定义常量字段cnt，通过REPLACE方式使用。

-- Attempt #1: a constant counter column aggregated with REPLACE.
-- cnt is never supplied by the inserts, so it takes its default of 1 on every load
-- and REPLACE leaves it at 1; it therefore cannot serve as a row count.
CREATE TABLE t_temp (
    id    INT,
    score INT SUM,
    cnt   INT REPLACE DEFAULT '1'
)
AGGREGATE KEY (id)
DISTRIBUTED BY HASH (id);

INSERT INTO t_temp (id, score) VALUES (1, 80);

INSERT INTO t_temp (id, score) VALUES (1, 90);

INSERT INTO t_temp (id, score) VALUES (1, 70);

INSERT INTO t_temp (id, score) VALUES (2, 80);

-- NULL score: contributes nothing to the SUM for id = 2.
INSERT INTO t_temp (id, score) VALUES (2, NULL);

SELECT
    id,
    score,
    cnt
FROM t_temp;

id score cnt

1 240 1

2 80 1

#2 定义累计字段cnt并求和

-- Attempt #2: a counter column aggregated with SUM.
-- Each insert omits cnt, so it defaults to 1 and SUM turns it into the number of
-- loaded rows per id.
CREATE TABLE t_temp_2 (
    id    INT,
    score INT SUM,
    cnt   INT SUM DEFAULT '1'
)
AGGREGATE KEY (id)
DISTRIBUTED BY HASH (id);

INSERT INTO t_temp_2 (id, score) VALUES (1, 80);

INSERT INTO t_temp_2 (id, score) VALUES (1, 90);

INSERT INTO t_temp_2 (id, score) VALUES (1, 70);

INSERT INTO t_temp_2 (id, score) VALUES (2, 80);

-- NULL score still increments cnt, so score / cnt is not a true average for id = 2.
INSERT INTO t_temp_2 (id, score) VALUES (2, NULL);

INSERT INTO t_temp_2 (id, score) VALUES (3, 70);

INSERT INTO t_temp_2 (id, score) VALUES (3, 50);

SELECT
    id,
    score,
    cnt
FROM t_temp_2;

id score cnt

1 240 3

3 120 2

2 80 2

均值avg可通过score/cnt得到。

分区简介

Doris字段分区支持手动建分区和动态加分区并追加历史分区信息。

手动指定分区

# 建表时手动指定分区信息，这里核心为

-- Excerpt of the CREATE TABLE statement: the manually declared range partitions.
-- The partition column is named collectdate, matching the full script.
-- Each partition holds the rows whose collectdate is below its upper bound.
PARTITION BY RANGE (collectdate)
(
    PARTITION p20211031 VALUES LESS THAN ("2021-11-01"),
    PARTITION p20211101 VALUES LESS THAN ("2021-11-02"),
    PARTITION p20211102 VALUES LESS THAN ("2021-11-03")
)

注：分区字段必须是key列，且key列必须排在建表语句列定义的最前面。

详细见如下脚本：

-- Recreate the device metrics table with manually declared daily range partitions.
-- IF EXISTS lets the script run even when the table has not been created yet.
DROP TABLE IF EXISTS t_deviceinfo;
CREATE TABLE t_deviceinfo (
    collectdate DATE   COMMENT '采集时间',
    deviceid    INT    COMMENT '设备ID',
    value       BIGINT COMMENT '指标值'
)
UNIQUE KEY (collectdate, deviceid)
PARTITION BY RANGE (collectdate)
(
    PARTITION p20211031 VALUES LESS THAN ("2021-11-01"),
    PARTITION p20211101 VALUES LESS THAN ("2021-11-02"),
    PARTITION p20211102 VALUES LESS THAN ("2021-11-03")
)
DISTRIBUTED BY HASH (collectdate, deviceid) BUCKETS 20;

动态生成分区

# 建表时指定动态分区信息，这里主要是指定动态分区相关参数，详细解释见下：

"dynamic_partition.enable" = "true", -- turn dynamic partitioning on

"dynamic_partition.create_history_partition" = "true", -- also create partitions for historical data

"dynamic_partition.history_partition_num" = "200", -- how far back history partitions go (in days here)

"dynamic_partition.time_unit" = "DAY", -- partition granularity; day, week, month and year are supported

-- "dynamic_partition.start" = "-100", -- drop partitions older than 100 days (other time units work the same way); when unset, history partitions are not dropped

"dynamic_partition.end" = "7", -- create partitions 7 days ahead (other time units work the same way)

"dynamic_partition.prefix" = "p", -- partition name prefix; names start with "p"

"dynamic_partition.buckets" = "32" -- number of buckets per partition

详细建表语句见下：

-- Recreate the device metrics table with dynamic partitioning: no partitions are
-- declared up front (empty list after PARTITION BY RANGE); they are driven by the
-- dynamic_partition.* properties below.
-- IF EXISTS lets the script run even when the table has not been created yet.
DROP TABLE IF EXISTS t_deviceinfo;
CREATE TABLE t_deviceinfo (
    collectdate DATE   COMMENT '采集时间',
    deviceid    INT    COMMENT '设备ID',
    value       BIGINT COMMENT '指标值'
)
UNIQUE KEY (collectdate, deviceid)
PARTITION BY RANGE (collectdate) ()
DISTRIBUTED BY HASH (collectdate, deviceid) BUCKETS 20
PROPERTIES
(
    "dynamic_partition.enable" = "true",
    "dynamic_partition.create_history_partition" = "true",
    "dynamic_partition.history_partition_num" = "200",
    "dynamic_partition.time_unit" = "DAY",
    -- Left disabled on purpose so that old partitions are kept:
    -- "dynamic_partition.start" = "-100",
    "dynamic_partition.end" = "7",
    "dynamic_partition.prefix" = "p",
    "dynamic_partition.buckets" = "32"
);

Doris 数据模型及自动分区使用案例相关推荐

关于oracle 11g自动分区+分区改名+定时任务综合使用实现自动分区后可以进行分区查询
一直想用oracle 11g 的interval分区(间断分区),这需要配合分区改名才能让我们可以使用分区查询,因为自动分区出来的分区名会是SYSPxxx这样.. 下面直接代码 drop tables ...
Timestamp 与 Date 变量绑定与Oracle的自动分区
2019独角兽企业重金招聘Python工程师标准>>> 好久没有更新博客了,其实是工作中遇到的很多问题在Google上都能找到答案,也就没有记录下来的必要了.今天主要想聊一下在实际的 ...
python 装机配置_Python实现自动装机功能案例分析
前言提示:在管理服务器的过程中,发现有很多服务器在启动的过程中默认以PXE方式启动,这就导致我们无法将PXE装机程序放开到所有的交换机端口中,本文是以Python对dell服务器进行了一些控制,更多 ...
oracle 定时器时间分区_Oracle数据库之oracle按时间分区以及自动分区
本文主要向大家介绍了Oracle数据库之oracle按时间分区以及自动分区,通过具体的内容向大家展现,希望对大家学习Oracle数据库有所帮助. (1) --- 创建按时间分区的表 create ta ...
爬虫三（Bs4搜索、Selenium基本使用、无界面浏览器、Selenium自动登录百度案例、自动获取12306登录验证码案例、切换选项卡、浏览器前进后退、登录Cnblogs获取Cookie自动点赞）
文章标题一.Bs4搜索文档树二.CSS选择器三.selenium基本使用四.无界面浏览器五.selenium其他使用 1)自动登录百度案例 2)获取位置属性大小.文本 3)自动获取12306 ...
Linux 磁盘自动分区脚本
Linux 运维以及脚本干货: 欢迎分享,欢迎folk https://unnunique.github.io/AADocs/skill-docs/linux/ https://github.com/ ...
mysql自动分区自动清理
1. 概述 mysql分区表功能特别有用,其中一个应用就是保存固定时间的数据信息,自动分区自动purge,不用担心数据量越积累越多. 比较实用的一个实现方式是表一天一个分区,保持固定天数的数据. 2. ...
【MapReduce】分区（分区实战案例）、Combiner、Shuffer
分区(分区实战案例).Combiner.Shuffer 1 分区 2 根据部门号建立分区 3 Combiner 4 Shuffer 手动反爬虫,禁止转载: 原博地址 https://blog.csdn ...

Doris 数据模型及自动分区使用案例

简介

使用

Duplicate 模型

Unique模型

Aggr模型

物化视图

Rollup

均值

分区简介

手动指定分区

动态生成分区

Doris 数据模型及自动分区使用案例相关推荐

最新文章

热门文章