-- Rows per join key above which Hive treats the key as skewed and applies its
-- skew-join handling.
-- NOTE(review): per the Hive configuration docs this threshold is only consulted
-- when hive.optimize.skewjoin=true -- confirm it is enabled for the session.
set hive.skewjoin.key=100000;
mapjoin
简单总结下,mapjoin的使用场景:关联操作中有一张表非常小(可以整体加载进内存),或者需要做不等值的连接操作。
Bucket join
两个表以相同方式划分桶
两个表的桶个数是倍数关系
-- Bucket join example: both tables are bucketed on their join column with the
-- same bucket count (32).
-- `order` is a reserved word in HiveQL, so the table name is back-quoted.
create table `order`(cid int, price float) clustered by (cid) into 32 buckets;
create table customer(id int, first string) clustered by (id) into 32 buckets;

select t.price
from `order` t
join customer s
  on t.cid = s.id;
join 优化前
-- Before optimization: the dt filter is written in the outer WHERE clause of
-- the join query.
-- `order` is a reserved word in HiveQL, so the table name is back-quoted.
select m.cid, u.id
from `order` m
join customer u
  on m.cid = u.id
where m.dt = '2013-12-12';
join优化后
-- After optimization: `order` is filtered by dt inside a subquery, so only the
-- selected rows (and only the cid column) feed the join.
-- `order` is a reserved word in HiveQL, so the table name is back-quoted.
select m.cid, u.id
from (
    select cid
    from `order`
    where dt = '2013-12-12'
) m
join customer u
  on m.cid = u.id;
group by 优化
-- Set to true when a GROUP BY stage suffers from data skew.
set hive.groupby.skewindata=true;
-- NOTE(review): the original note described this value as a per-group-key row
-- count that triggers optimization; the Hive configuration docs describe it as
-- the number of rows after which the map-side aggregation size check is
-- performed -- confirm the intended meaning.
set hive.groupby.mapaggr.checkinterval=100000;
count distinct 优化
优化前
-- Before optimization: one global count(distinct ...) over the whole table.
select count(distinct id)
from tablename;
优化后
-- After optimization: de-duplicate ids in a subquery, then count the result.
-- count(id) is used instead of count(1): the subquery yields one row for NULL
-- ids, which count(distinct id) does not count, so count(1) would be off by one
-- whenever the table contains a NULL id.
select count(id) from (select distinct id from tablename) tmp;
select count(id) from (select id from tablename group by id) tmp;
优化前
-- Before optimization: two count(distinct ...) aggregates in one GROUP BY.
select a, sum(b), count(distinct c), count(distinct d)
from test
group by a;
优化后
-- After optimization: each distinct count becomes a GROUP BY branch, combined
-- with UNION ALL and aggregated once more per a.
--   branch 1: one row per (a, c)  -> count(c) equals count(distinct c)
--   branch 2: one row per (a, d)  -> count(d) equals count(distinct d)
--   branch 3: every source row    -> supplies b for sum(b)
-- Branches 1 and 2 fill b with NULL rather than 0, so sum(b) stays NULL for a
-- group whose b values are all NULL, matching the single-pass query.
select
    a,
    sum(b)   as b,
    count(c) as c,
    count(d) as d
from (
    select a, null as b, c, null as d from test group by a, c
    union all
    select a, null as b, null as c, d from test group by a, d
    union all
    select a, b, null as c, null as d from test
) tmp1
group by a;
文件存储(Cloud File Storage,CFS)为您提供安全可靠、可扩展的共享文件存储服务。文件存储可与腾讯云服务器、容器服务、批量计算等服务搭配使用,为多个计算节点提供容量和性能可弹性扩展的高性能共享存储。腾讯云文件存储的管理界面简单、易使用,可实现对现有应用的无缝集成;按实际用量付费,为您节约成本,简化 IT 运维工作。