描述
GROUP BY 子句用于根据一组指定的分组表达式对行进行分组,并根据一个或多个聚合函数对行组进行聚合计算。Spark 还支持通过 GROUPING SETS、CUBE、ROLLUP 子句进行高级聚合。语法
GROUP BY group_expression [ , ... ] [ WITH { ROLLUP | CUBE } ]GROUP BY { group_expression | { ROLLUP | CUBE | GROUPING SETS } (grouping_set [ , ...]) } [ , ... ]
参数
子句/关键字 | 说明 |
group_expression | 分组表达式,可以是列名、列位置或表达式 |
GROUPING SETS | 为每个指定的分组集进行分组 |
ROLLUP | 在单个语句中指定多个级别的聚合 |
CUBE | 基于分组列的组合进行聚合 |
FILTER | 仅将匹配行传递给聚合函数 |
示例
-- 创建测试表CREATE EXTERNAL TABLE gb_dealer (id INT, city STRING, car_model STRING, quantity INT)USING PARQUET LOCATION 'cosn://<your_cos_bucket>/test_group_by/gb_dealer';INSERT INTO gb_dealer VALUES(100,'Fremont','Honda Civic',10),(100,'Fremont','Honda Accord',15),(200,'Dublin','Honda Civic',20),(300,'San Jose','Honda Accord',8);-- GROUP BY idSELECT id, sum(quantity) FROM gb_dealer GROUP BY id ORDER BY id;-- GROUP BY 列位置SELECT id, sum(quantity) FROM gb_dealer GROUP BY 1 ORDER BY 1;-- 多个聚合SELECT id, sum(quantity) AS s, max(quantity) AS m FROM gb_dealer GROUP BY id ORDER BY id;-- GROUP BY + DISTINCTSELECT car_model, count(DISTINCT city) AS cnt FROM gb_dealer GROUP BY car_model;-- GROUPING SETSSELECT city, car_model, sum(quantity) FROM gb_dealerGROUP BY GROUPING SETS ((city, car_model), (city), ());-- ROLLUPSELECT city, car_model, sum(quantity) FROM gb_dealerGROUP BY city, car_model WITH ROLLUP ORDER BY city;-- CUBESELECT city, car_model, sum(quantity) FROM gb_dealerGROUP BY city, car_model WITH CUBE ORDER BY city;-- FILTERSELECT id, sum(quantity) FILTER (WHERE car_model = 'Honda Civic') FROM gb_dealer GROUP BY id ORDER BY id;