这里使用Mysql保存最终数据,Apache Doris保存处理过程中的数据
Apache Doris
https://github.com/apache/doris
https://doris.apache.org/zh-CN/docs/dev/get-starting/what-is-apache-doris/
Doris的SQL函数
https://doris.apache.org/zh-CN/docs/dev/sql-manual/sql-functions/string-functions/split-part
https://doris.apache.org/zh-CN/docs/dev/sql-manual/sql-functions/string-functions/regexp/
create database zdb;
创建表
CREATE TABLE `t_student` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(255) DEFAULT NULL,
`subject` varchar(255) DEFAULT NULL,
`score` double NULL DEFAULT NULL,
PRIMARY KEY (`id`)
);
插入数据
INSERT INTO t_student(name,subject,score) VALUES ('张三','语文',60);
INSERT INTO t_student(name,subject,score) VALUES ('张三','数学',100);
INSERT INTO t_student(name,subject,score) VALUES ('张三','英语',30);
INSERT INTO t_student(name,subject,score) VALUES ('李四','语文',80);
INSERT INTO t_student(name,subject,score) VALUES ('李四','数学',70);
INSERT INTO t_student(name,subject,score) VALUES ('李四','英语',60);
INSERT INTO t_student(name,subject,score) VALUES ('王五','语文',30);
INSERT INTO t_student(name,subject,score) VALUES ('王五','数学',80);
INSERT INTO t_student(name,subject,score) VALUES ('王五','英语',60);
准备数据
create table t_student2 as (
select name,
max(case subject when '语文' then score else null end) as '语文',
max(case subject when '数学' then score else null end) as '数学',
max(case subject when '英语' then score else null end) as '英语'
from t_student group by name
);
结果如下
创建表
CREATE TABLE `t_student_detail` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(255) DEFAULT NULL,
`phone` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
);
插入数据
INSERT INTO t_student_detail(name,phone) VALUES ('张三','15225178321');
INSERT INTO t_student_detail(name,phone) VALUES ('李四','13607686585');
INSERT INTO t_student_detail(name,phone) VALUES ('王五','13837857835');
INSERT INTO t_student_detail(name,phone) VALUES ('赵六',null);
create database zdb;
CREATE TABLE IF NOT EXISTS t_student (
`id` BIGINT NOT NULL COMMENT "用户id",
`name` VARCHAR(255) COMMENT "用户名",
`subject` VARCHAR(255) COMMENT "科目",
`score` DOUBLE COMMENT "分数"
) DISTRIBUTED BY HASH(`name`) BUCKETS auto PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
插入数据
INSERT INTO t_student(id,name,subject,score) VALUES (1,'张三','语文',60);
INSERT INTO t_student(id,name,subject,score) VALUES (2,'张三','数学',100);
INSERT INTO t_student(id,name,subject,score) VALUES (3,'张三','英语',30);
INSERT INTO t_student(id,name,subject,score) VALUES (4,'李四','语文',80);
INSERT INTO t_student(id,name,subject,score) VALUES (5,'李四','数学',70);
INSERT INTO t_student(id,name,subject,score) VALUES (6,'李四','英语',60);
INSERT INTO t_student(id,name,subject,score) VALUES (7,'王五','语文',30);
INSERT INTO t_student(id,name,subject,score) VALUES (8,'王五','数学',80);
INSERT INTO t_student(id,name,subject,score) VALUES (9,'王五','英语',60);
创建透视后的表
CREATE TABLE IF NOT EXISTS t_student2 (
`name` VARCHAR(255) COMMENT "用户名",
`chinese` DOUBLE COMMENT "科目",
`math` DOUBLE COMMENT "科目",
`english` DOUBLE COMMENT "分数"
) DISTRIBUTED BY HASH(`name`) BUCKETS auto PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
注意
字段不支持中文。 括号的最后不能有逗号。
透视查询
select name,
sum(case subject when '语文' then score else 0 end) chinese,
sum(case subject when '数学' then score else 0 end) math,
sum(case subject when '英语' then score else 0 end) english
from t_student group by name;
结果
导入到透视后的表中
insert into t_student2(name,chinese,math,english) select name,
sum(case subject when '语文' then score else 0 end) chinese,
sum(case subject when '数学' then score else 0 end) math,
sum(case subject when '英语' then score else 0 end) english
from t_student group by name;
查询数据
select * from t_student2;
列转行
select name,'chinese' as subject,chinese as 'score' from t_student2
union all
select name,'math' as subject,math as 'score' from t_student2
union all
select name,'english' as subject,english as 'score' from t_student2
order by name;
结果
CREATE TABLE IF NOT EXISTS t_user_hobby (
`id` BIGINT NOT NULL COMMENT "用户id",
`name` VARCHAR(255) COMMENT "用户名",
`hobby` VARCHAR(255) COMMENT "爱好"
) DISTRIBUTED BY HASH(`name`) BUCKETS auto PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
插入数据
INSERT INTO t_user_hobby(id,name,hobby) VALUES (1,'张三','唱歌-跳舞-打篮球');
where
示例参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_filter",
currNodeId:"wqgvwyerty",
paras:[
{
"filterSql":"name like '张%' and age between 5 and 10",
},
]
}
COALESCE()
null替换为设置的字符
select name,COALESCE(phone,'-') as phone from t_student_detail;
处理策略
示例参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_null",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"name",
"dealType":"replace_value",
"value":"小明"
},
{
"field":"age",
"dealType":"replace_max",
"value":""
}
]
}
TopN
select * FROM t_student_detail LIMIT 2;
BottomN
select * from (select * FROM t_student_detail ORDER BY id desc LIMIT 2) t1 ORDER BY t1.id asc ;
区间
select * FROM t_student_detail LIMIT 2 OFFSET 2;
dealType
示例参数
top_n
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_select",
currNodeId:"wqgvwyerty",
paras:[
{
"dealType":"top_n",
"value":"10"
}
]
}
range
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_select",
currNodeId:"wqgvwyerty",
paras:[
{
"dealType":"range",
"min":"5",
"max":"10"
}
]
}
filter
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_select",
currNodeId:"wqgvwyerty",
paras:[
{
"dealType":"filter",
"filterSql":"name like '张%' and age between 5 and 10",
}
]
}
SELECT DISTINCT name,subject,score FROM t_student;
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_distinct",
currNodeId:"wqgvwyerty",
paras:[
{
"selectFields":["name","age"],
},
]
}
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_order",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"name",
"value":"asc"
},
{
"field":"age",
"value":"desc"
}
]
}
SELECT (@i:= @i + 1) AS rownum, name, subject,score FROM t_student, (SELECT @i:=0) AS rownum ORDER BY score desc;
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_addnum",
currNodeId:"wqgvwyerty",
paras:[
{
"addnumName": "addnum",
},
]
}
dealType
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_replace",
currNodeId:"wqgvwyerty",
paras:[
{
"dealType":"str_replace",
"field":"name",
"valueOld":"小",
"valueNew":"张"
},
]
}
replace()
#直接替换字符串中的部分字符
SELECT name,replace(phone,substring(phone,4,4),'****') as phone FROM t_student_detail;
case when
select name,case name when '张三' then '-' ELSE phone END as phone from t_student_detail;
注意对null无效
select name,case phone when null then '-' ELSE phone END as phone from t_student_detail;
dealType 清洗方式
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_clean",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"name",
"dealType":"remove_space_all",
}
]
}
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_row_tm",
currNodeId:"asdfg",
paras:[
{
"field":"name",
"min":"3",
"max":"7"
}
]
}
脱敏
select sub_replace("15225178666","****",3,4);
或者
select sub_replace("15225178666",repeat("*",4),3,4);
SELECT name,MD5(phone)as phone FROM t_student_detail; -- 进行MD5加密
使用AES加解密
SELECT AES_DECRYPT(AES_ENCRYPT('码客说', '1234567891011121'),'1234567891011121');
{
tableName:"t_user",
fields:["name","age"],
type:"deal_col_select",
currNodeId:"wqgvwyerty",
paras:[
{
"selectFields":["name","age"],
},
]
}
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_col_derive",
currNodeId:"wqgvwyerty",
paras:[
{
"expression":"year(mdate)",
"fieldNew":"myear"
},
]
}
派生列的名称不能和已有列重复
目前派生列支持的函数有:
函数名称 | 函数及示例 | 说明 |
---|---|---|
累积 | sum(<column>) over(partition by <columns> order by <columns>) | 对数字型字段进行累计计算。Sum():用于设置求和计算字段;Partition by():用于设置分组的字段;Order by():用于设置排序的字段; |
字符拼接 (拼接符) | concat_ws("sep",str1,str2,...,strN) | 通过sep连接符将字符串拼接。sep由用户自定义。str:表示连接的字符。 |
字符拼接 | concat(str1, str2, ..., strN) | 不需要连接符,直接将字符串连接。str:表示连接的字符。 |
字符截取 | substring(<column>, pos[, len]) | 从字段中截取指定位置的len个字符。<column>:表示截取的字段;pos:表示从第几个位置开始截取,设置为正整数;len:表示一共取多少个字符,设置为正整数; |
小数点保留 | format_number(<column>, num) | 保留浮点型字段小数点位数。<column>:表示设置小数点位数的字段;num:表示小数点后保留的位数,设置为0和正整数; |
内容提取(年) | year(<column>) | 截取时间字段中的年信息。<column>:表示截取的字段; |
内容提取(月) | month(<column>) | 截取时间字段中的月信息。<column>:表示截取的字段; |
内容提取(日) | day(<column>) | 截取时间字段中的日信息。<column>:表示截取的字段; |
内容提取(时) | hour(<column>) | 截取时间字段中的时信息。<column>:表示截取的字段; |
内容提取(分) | minute(<column>) | 截取时间字段中的分信息。 <column>:表示截取的字段; |
内容提取(秒) | second(<column>) | 截取时间字段中的秒信息。<column>:表示截取的字段; |
向上取整 | ceiling(<column>) | 获取大于或等于取整字段的最小整数。 <column>:表示取整的字段。 |
向下取整 | floor(<column>) | 获取小于或等于取整字段的最大整数。 <column>:表示取整的字段。 |
四舍五入 | round(<column>) | 获取四舍五入后的整数。 <column>:表示四舍五入的字段。 |
行最小值 | least(<column>,<column>,<column>) | 获取指定行中的最小值。 <column>:表示用来比较最小值的字段,默认三个,可以有多个字段。 |
行最大值 | greatest(<column>,<column>,<column>) | 获取指定行中的最大值。 <column>:表示用来比较最大值的字段,默认三个,可以有多个字段。 |
示例
select name ,least(`语文`,`数学`,`英语`) as min_score FROM t_student2;
select name ,greatest(`语文`,`数学`,`英语`) as max_score FROM t_student2;
分隔符:逗号、分号、空格、tab、-
拆分后的列将按照新字段名称_数字序号
的格式按顺序依次命名,数字序号从1开始
参数
{
tableName:"t_user",
fields:["name","age"],
type:"deal_col_split",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"name",
"expression":";",
"fieldNew":"name",
"colNum":"3"
},
]
}
拆分列
SELECT SUBSTRING_INDEX(hobby, '-', 1) AS hobby1,
SUBSTRING_INDEX(SUBSTRING_INDEX(hobby, '-', 2), '-', -1) AS hobby2,
SUBSTRING_INDEX(SUBSTRING_INDEX(hobby, '-', 3), '-', -1) AS hobby3
FROM t_user_hobby;
或者
SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(hobby, '-', 1), '-', -1) AS hobby1,
SUBSTRING_INDEX(SUBSTRING_INDEX(hobby, '-', 2), '-', -1) AS hobby2,
SUBSTRING_INDEX(SUBSTRING_INDEX(hobby, '-', 3), '-', -1) AS hobby3
FROM t_user_hobby;
SELECT regexp_extract(hobby, '([^-]+)-([^-]+)-([^-]+)', 1) AS hobby1,
regexp_extract(hobby, '([^-]+)-([^-]+)-([^-]+)', 2) AS hobby2,
regexp_extract(hobby, '([^-]+)-([^-]+)-([^-]+)', 3) AS hobby3
FROM t_user_hobby;
SELECT split_part(hobby, '-', 1) AS hobby1,
split_part(hobby, '-', 2) AS hobby2,
split_part(hobby, '-', 3) AS hobby3
FROM t_user_hobby;
合并一个表的多个字段为一个新字段,和Smart不一致
{
tableName:"t_user",
fields:["name","age"],
type:"deal_col_merge",
currNodeId:"wqgvwyerty",
paras:[
{
"selectFields": ["myear","month","mday"],
"expression": "-",
"fieldNew": "mdate"
},
]
}
示例
SELECT concat( name,':',hobby) AS name_hobby FROM t_user_hobby;
{
tableName:"t_user",
fields:["name","age"],
type:"deal_col_datetime",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"datestr",
"formatSource":"%Y-%m-%d %H:%i:%s",
"fieldNew":"datestr2",
"formatTarget":"%Y-%m-%",
},
]
}
字符传转日期
select str_to_date('2014-12-21 12:34:56', '%Y-%m-%d %H:%i:%s');
日期转字符串
select DATE_FORMAT(now(),'%Y%m%d%H%i%s');
两者结合
select DATE_FORMAT(str_to_date('2014-12-21 12:34:56','%Y-%m-%d %H:%i:%s'), '%Y%m%d%H%i%s');
{
tableName:"t_user",
fields:["name","age"],
type:"deal_sql",
currNodeId:"wqgvwyerty",
paras:[
{
"selectSql":"select * from t_user group by name",
},
]
}
joinType支持
参数
{
tableName:"",
fields:[],
type:"deal_morejoin",
currNodeId:"wqgvwyerty",
tableAll:[
{
tableName:"t_user",
fields:["name","age"],
},
{
tableName:"t_user_detail",
fields:["job"],
}
],
paras:[
{
"tableLeft":"t_user",
"joinType":"left join"
"tableRight":"t_user_detail",
"fieldLeft":"id",
"fieldRight":"userid",
},
]
}
编辑页面显示 原字段、新字段、别名、数据类型、顺序调整
{
tableName:"t_user",
fields:["name","age"],
type:"deal_metadata",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"name",
"fieldNew":"username",
"remarks":"用户名",
"dataType":"string"
},
]
}
参数示例
{
tableName:"t_user",
fields:["name","age"],
type:"deal_aggregation",
currNodeId:"wqgvwyerty",
paras:[
{
"field":"name",
"fieldNew":"name",
"dealType":"group",
},
{
"field":"age",
"fieldNew":"maxage",
"dealType":"max",
},
]
}
dealType支持
常见聚合函数有:
select name,max(score) max_score FROM t_student GROUP BY name;
select name,min(score) min_score FROM t_student GROUP BY name;
select name,avg(score) avg_score FROM t_student GROUP BY name;
select name,sum(score) total_score FROM t_student GROUP BY name;
此外,MySQL还提供一些高级聚合函数:
示例
select name,STD(score) std_score FROM t_student GROUP BY name;
select name,VARIANCE(score) var_score FROM t_student GROUP BY name;
select name,STDDEV_POP(score) std_pop_score FROM t_student GROUP BY name;
select name,STDDEV_SAMP(score) std_samp_score FROM t_student GROUP BY name;
select name,VAR_POP(score) var_pop_score FROM t_student GROUP BY name;
select name,VAR_SAMP(score) var_samp_score FROM t_student GROUP BY name;
参数示例
{
tableName:"t_student",
fields:["name","subject","score"],
type:"deal_row2col",
currNodeId:"wqgvwyerty",
paras:[
{
"fieldName":"name",
"fieldPivot":"subject",
"fieldValue":"score",
"dealType":"sum",
},
]
}
属性说明
dealType
先查出要进行行转列的列属性的值
select DISTINCT subject FROM t_student;
再生成SQL
select name 姓名,
sum(case subject when '语文' then score else 0 end) 语文,
sum(case subject when '数学' then score else 0 end) 数学,
sum(case subject when '英语' then score else 0 end) 英语
from t_student group by name;
SELECT name as 姓名,
SUM(IF(subject = '语文', score, 0)) AS 语文,
SUM(IF(subject = '数学', score, 0)) AS 数学,
SUM(IF(subject = '英语', score, 0)) AS 英语
FROM t_student
GROUP BY name;
一种说法是Mysql8开始支持povit,但是
这里测试的Mysql8也不支持povit关键字。
示例
SELECT pivot_table.*
FROM t_student
PIVOT(
SUM(score)
FOR subject IN ('语文','数学','英语')
) AS pivot_table;
查询Mysql版本
SELECT @@version;
SELECT VERSION();
参数示例
{
tableName:"t_student",
fields:["name","语文","数学","英语"],
type:"deal_col2row",
currNodeId:"wqgvwyerty",
paras:[
{
"fieldName":"name",
"fieldPivotList":["语文","数学","英语"],
"fieldValue":"score",
},
]
}
属性说明
实现列转行
select name,'语文' as subject,语文 as 'score' from t_student2
union all
select name,'数学' as subject,数学 as 'score' from t_student2
union all
select name,'英语' as subject,英语 as 'score' from t_student2
order by name;