下表为分区增量数据表(userinfo)
创建表加载数据 用户信息
create table test.userinfo(
userid string comment '用户编号',
mobile string comment '手机号码',
regdate string comment '注册日期'
)comment '用户信息'
partitioned by(dt string)
row format delimited fields terminated by ',';
拉链表(存放用户历史信息) 拉链表不是分区表:多了两个字段start_date,end_date
create table test.userhis(
userid string comment '用户编号',
mobile string comment '手机号码',
regdate string comment '注册日期',
start_date string,
end_date string
)comment '用户信息拉链表'
row format delimited fields terminated by ',';
数据准备
001,13551111111,2020-03-01,2020-06-20
002,13561111111,2020-04-01,2020-06-20
003,13571111111,2020-05-01,2020-06-20
004,13581111111,2020-06-01,2020-06-20
002,13562222222,2020-04-01,2020-06-21
004,13582222222,2020-06-01,2020-06-21
005,13552222222,2020-06-21,2020-06-21
004,13333333333,2020-06-01,2020-06-22
005,13533333333,2020-06-21,2020-06-22
006,13733333333,2020-06-22,2020-06-22
001,13554444444,2020-03-01,2020-06-23
003,13557444444,2020-05-01,2020-06-23
005,13555555444,2020-06-21,2020-06-23
007,18674444444,2020-06-23,2020-06-23
008,13699844444,2020-06-23,2020-06-23
法一: 静态分区数据加载
/data/userinfo.dat
001,13551111111,2020-03-01
002,13561111111,2020-04-01
003,13571111111,2020-05-01
004,13581111111,2020-06-01
load data local inpath '/data/userinfo.dat' into table test.userinfo partition(dt='2020-06-20');
法二: 动态分区数据的加载:分区的值不是固定的,由输入的数据决定 创建中间表
create table test.tmp1 as
select * from test.userinfo;
tmp1 非分区表,使用系统默认的字段分割符’\001’
alter table test.tmp1 set serdeproperties('field.delim'=',');
向中间表中加入数据
load data local inpath '/data/userinfo.dat' overwrite into table test.tmp1;
从中间表向分区表加载数据
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table test.userinfo
partition(dt)
select * from test.tmp1;
将数据成功插入到了分区表中,动态加载更加能应对分区多,数据多情况
hive (test)> select * from test.userinfo;
OK
userinfo.userid userinfo.mobile userinfo.regdate userinfo.dt
001 13551111111 2020-03-01 2020-06-20
002 13561111111 2020-04-01 2020-06-20
003 13571111111 2020-05-01 2020-06-20
004 13581111111 2020-06-01 2020-06-20
002 13562222222 2020-04-01 2020-06-21
004 13582222222 2020-06-01 2020-06-21
005 13552222222 2020-06-21 2020-06-21
004 13333333333 2020-06-01 2020-06-22
005 13533333333 2020-06-21 2020-06-22
006 13733333333 2020-06-22 2020-06-22
001 13554444444 2020-03-01 2020-06-23
003 13557444444 2020-05-01 2020-06-23
005 13555555444 2020-06-21 2020-06-23
007 18674444444 2020-06-23 2020-06-23
008 13699844444 2020-06-23 2020-06-23
Time taken: 0.398 seconds, Fetched: 15 row(s)
我们的数据来源是来自test.userinfo分区表,每日更新为一个分区(包括新增+修改) 拉链表增加两个字段start_date,end_date(两字段区间表示该数据有效时间区间,否则无效) 拉链表第一天为历史数据
insert overwrite table test.userhis
select userid,mobile,regdate,dt as start_date,'9999-12-31' as end_date
from test.userinfo
where dt = '2020-06-20';
拉链表次日新增和修改数据 第二天的拉链表为今日数据+历史数据 今日数据包括新增数据+历史数据修改 因此此时当天拉链表=历史数据有效期整合后的数据+新增的数据 即拉链表userhis(2020-06-22) <= userinfo(2020-06-22) + userhis
select userid,mobile,regdate,dt as start_date,'9999-12-31' as end_date
from test.userinfo
where dt = '2020-06-21';
变化的数据我们要更新有效区间,未变化的数据我们要保留(使用表的外连接)
select A.userid,A.mobile,A.regdate,A.start_date,
case when A.end_date='9999-12-31' and B.userid is not null
then date_add(B.dt,-1)
else A.end_date end as end_date
from test.userhis A left join (select * from test.userinfo where dt = '2020-06-21') B on A.userid = B.userid;
综上2020-06-21当日的拉链表是:
insert overwrite table test.userhis
select userid,mobile,regdate,dt as start_date,'9999-12-31' as end_date
from test.userinfo
where dt = '2020-06-21'
union all
select A.userid,A.mobile,A.regdate,A.start_date,
case when A.end_date='9999-12-31' and B.userid is not null
then date_add(B.dt,-1)
else A.end_date end as end_date
from test.userhis A left join (select * from test.userinfo where dt = '2020-06-21') B on A.userid = B.userid;
查询当前拉链表数据
hive (test)> select * from userhis cluster by userid,start_date;
userhis.userid userhis.mobile userhis.regdate userhis.start_date userhis.end_date
001 13551111111 2020-03-01 2020-06-20 9999-12-31
002 13561111111 2020-04-01 2020-06-20 2020-06-20
002 13562222222 2020-04-01 2020-06-21 9999-12-31
003 13571111111 2020-05-01 2020-06-20 9999-12-31
004 13581111111 2020-06-01 2020-06-20 2020-06-20
004 13582222222 2020-06-01 2020-06-21 9999-12-31
005 13552222222 2020-06-21 2020-06-21 9999-12-31
Time taken: 3.148 seconds, Fetched: 7 row(s)
以后每日的拉链表就是修改对应的dt信息 通过以上方式我们每日拉链表可以通过脚本来更新
#!/bin/bash
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert overwrite table test.userhis
select userid,mobile,regdate,dt as start_date,'9999-12-31' as end_date
from test.userinfo
where dt = '$do_date'
union all
select A.userid,A.mobile,A.regdate,A.start_date,
case when A.end_date='9999-12-31' and B.userid is not null
then date_add(B.dt,-1)
else A.end_date end as end_date
from test.userhis A left join (select * from test.userinfo where dt = '$do_date') B on A.userid = B.userid;"
hive -e "$sql"
通过执行脚本
sh userTest.sh 2020-06-22
userhis.userid userhis.mobile userhis.regdate userhis.start_date userhis.end_date
001 13551111111 2020-03-01 2020-06-20 9999-12-31
002 13561111111 2020-04-01 2020-06-20 2020-06-20
002 13562222222 2020-04-01 2020-06-21 9999-12-31
003 13571111111 2020-05-01 2020-06-20 9999-12-31
004 13581111111 2020-06-01 2020-06-20 2020-06-20
004 13582222222 2020-06-01 2020-06-21 2020-06-21
004 13333333333 2020-06-01 2020-06-22 9999-12-31
005 13552222222 2020-06-21 2020-06-21 2020-06-21
005 13533333333 2020-06-21 2020-06-22 9999-12-31
006 13733333333 2020-06-22 2020-06-22 9999-12-31
Time taken: 3.037 seconds, Fetched: 10 row(s)
sh userTest.sh 2020-06-23
userhis.userid userhis.mobile userhis.regdate userhis.start_date userhis.end_date
001 13551111111 2020-03-01 2020-06-20 2020-06-22
001 13554444444 2020-03-01 2020-06-23 9999-12-31
002 13561111111 2020-04-01 2020-06-20 2020-06-20
002 13562222222 2020-04-01 2020-06-21 9999-12-31
003 13571111111 2020-05-01 2020-06-20 2020-06-22
003 13557444444 2020-05-01 2020-06-23 9999-12-31
004 13581111111 2020-06-01 2020-06-20 2020-06-20
004 13582222222 2020-06-01 2020-06-21 2020-06-21
004 13333333333 2020-06-01 2020-06-22 9999-12-31
005 13552222222 2020-06-21 2020-06-21 2020-06-21
005 13533333333 2020-06-21 2020-06-22 2020-06-22
005 13555555444 2020-06-21 2020-06-23 9999-12-31
006 13733333333 2020-06-22 2020-06-22 9999-12-31
007 18674444444 2020-06-23 2020-06-23 9999-12-31
008 13699844444 2020-06-23 2020-06-23 9999-12-31
Time taken: 1.442 seconds, Fetched: 15 row(s)
hive (test)> select * from userhis where end_date='9999-12-31';
OK
userhis.userid userhis.mobile userhis.regdate userhis.start_date userhis.end_date
002 13562222222 2020-04-01 2020-06-21 9999-12-31
004 13333333333 2020-06-01 2020-06-22 9999-12-31
006 13733333333 2020-06-22 2020-06-22 9999-12-31
001 13554444444 2020-03-01 2020-06-23 9999-12-31
003 13557444444 2020-05-01 2020-06-23 9999-12-31
005 13555555444 2020-06-21 2020-06-23 9999-12-31
007 18674444444 2020-06-23 2020-06-23 9999-12-31
008 13699844444 2020-06-23 2020-06-23 9999-12-31
Time taken: 0.517 seconds, Fetched: 8 row(s)
例如:2020-06-22
hive (test)> select * from userhis where start_date <= '2020-06-22' and end_date >= '2020-06-22';
OK
userhis.userid userhis.mobile userhis.regdate userhis.start_date userhis.end_date
001 13551111111 2020-03-01 2020-06-20 2020-06-22
003 13571111111 2020-05-01 2020-06-20 2020-06-22
002 13562222222 2020-04-01 2020-06-21 9999-12-31
004 13333333333 2020-06-01 2020-06-22 9999-12-31
005 13533333333 2020-06-21 2020-06-22 2020-06-22
006 13733333333 2020-06-22 2020-06-22 9999-12-31
Time taken: 0.244 seconds, Fetched: 6 row(s)
由于种种原因,我们需要对拉链表进行回滚操作,将拉链表回滚到rollback_date那天的数据 此时我们对拉链表中的数据进行分类
按照以上方式对2020-06-22拉链表回滚进行sql编码
1.end_date<rollback_date,保留
select userid,mobile,regdate,start_date,end_date,'1' as tag
from test.userhis
where end_date < '2020-06-22';
2.start_date<= rollback_date <=end_date 修改end_date为”9999-12-31″
select userid,mobile,regdate,start_date,'9999-12-31' as end_date,'2' as tag
from test.userhis
where start_date <= '2020-06-22' and '2020-06-22' <= end_date;
将以上汇总的数据写入临时表tmp(拉链表)
drop table test.tmp;
create table test.tmp as
select userid,mobile,regdate,start_date,end_date,'1' as tag
from test.userhis
where end_date < '2020-06-22'
union all
select userid,mobile,regdate,start_date,'9999-12-31' as end_date,'2' as tag
from test.userhis
where start_date <= '2020-06-22' and '2020-06-22' <= end_date;
脚本执行
#!/bin/bash
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table test.tmp;
create table test.tmp as
select userid,mobile,regdate,start_date,end_date
from test.userhis
where end_date < '$do_date'
union all
select userid,mobile,regdate,start_date,'9999-12-31' as end_date
from test.userhis
where start_date <= '$do_date' and '$do_date' <= end_date;
select * from test.tmp;"
hive -e "$sql"
我们这里以临时表来测试数据正确性,实际可以替换真实拉链表插入更新
保存一段时间增量数据(userinfo),定期对拉链表做备份(10天或者一个月);如需回滚,直接在备份的拉链表上向后跑增量数据(类似向后更新拉链表)
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
发布者:全栈程序员栈长,转载请注明出处:https://javaforall.cn/181097.html原文链接:https://javaforall.cn