目录
先更新到需求2,后续需求我会继续更新。。。。。。敬请期待!!!!!
如果不考虑2019-02-11和2019-02-12的新增用户:2019-02-10新增100人,一天后它的留存率是30%,2天12号它的留存率是25%,3天后留存率32%;
站在2019-02-12号看02-11的留存率:新增200人,12号的留存率是20%;
站在2019-02-13号看02-12的留存率:新增100人,13号即一天后留存率是25%;
用户留存率的分析:昨日的新增且今天是活跃的 / 昨日的新增用户量
如今天11日,要统计10日的 用户留存率---->10日的新设备且是11日活跃的 / 10日新增设备 分母:10日的新增设备(每日活跃 left join 以往新增设备表(nm) nm.mid_id is null ) 分子:每日活跃表(ud) join 每日新增表(nm) where ud.dt='今天' and nm.create_date = '昨天'
① DWS层(每日留存用户明细表dws_user_retention_day) 用户1天留存的分析:===>>
留存用户=前一天新增 join 今天活跃
用户留存率=留存用户/前一天新增
创建表:dws_user_retention_day
hive (gmall)>
drop table if exists `dws_user_retention_day`;
create table `dws_user_retention_day`
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`create_date` string comment '设备新增时间',
`retention_day` int comment '截止当前日期留存天数'
) COMMENT '每日用户留存情况'
PARTITIONED BY ( `dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_retention_day/'
;
导入数据(每天计算前1天的新用户访问留存明细)
from dws_uv_detail_day每日活跃设备 ud join dws_new_mid_day每日新增设备 nm on ud.mid_id =nm.mid_id where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-1);
hive (gmall)>
insert overwrite table dws_user_retention_day partition(dt="2019-02-11")
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-1);
查询导入数据(每天计算前1天的新用户访问留存明细)
hive (gmall)> select count(*) from dws_user_retention_day;
② DWS层(1,2,3,n天留存用户明细表)直接插入数据:dws_user_retention_day 用union all连接起来,汇总到一个表中;1)直接导入数据(每天计算前1,2,3,n天的新用户访问留存明细) 直接改变这个即可以,date_add('2019-02-11',-3); -1是一天的留存率;-2是两天的留存率、-3是三天的留存率
hive (gmall)>
insert overwrite table dws_user_retention_day partition(dt="2019-02-11")
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-1)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
2 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-2)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
3 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-02-11' and nm.create_date=date_add('2019-02-11',-3);
2)查询导入数据(每天计算前1,2,3天的新用户访问留存明细)
hive (gmall)> select retention_day , count(*) from dws_user_retention_day group by retention_day;
③ ADS层 留存用户数 ads_user_retention_day_count 直接count( * )即可 1)创建 ads_user_retention_day_count表:
hive (gmall)>
drop table if exists `ads_user_retention_day_count`;
create table `ads_user_retention_day_count`
(
`create_date` string comment '设备新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量'
) COMMENT '每日用户留存情况'
stored as parquet
location '/warehouse/gmall/ads/ads_user_retention_day_count/';
导入数据 按创建日期create_date 和 留存天数retention_day进行分组group by;
hive (gmall)>
insert into table ads_user_retention_day_count
select
create_date,
retention_day,
count(*) retention_count
from dws_user_retention_day
where dt='2019-02-11'
group by create_date,retention_day;
查询导入数据
hive (gmall)> select * from ads_user_retention_day_count;
---> 2019-02-10 1 112
④ 留存用户比率 retention_count / new_mid_count 即留存个数 / 新增个数 创建表 ads_user_retention_day_rate
hive (gmall)>
drop table if exists `ads_user_retention_day_rate`;
create table `ads_user_retention_day_rate`
(
`stat_date` string comment '统计日期',
`create_date` string comment '设备新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量',
`new_mid_count` string comment '当日设备新增数量',
`retention_ratio` decimal(10,2) comment '留存率'
) COMMENT '每日用户留存情况'
stored as parquet
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
导入数据
join ads_new_mid_countt --->每日新增设备表
hive (gmall)>
insert into table ads_user_retention_day_rate
select
'2019-02-11' ,
ur.create_date,
ur.retention_day,
ur.retention_count ,
nc.new_mid_count,
ur.retention_count/nc.new_mid_count*100
from
(
select
create_date,
retention_day,
count(*) retention_count
from `dws_user_retention_day`
where dt='2019-02-11'
group by create_date,retention_day
) ur join ads_new_mid_count nc on nc.create_date=ur.create_date;
查询导入数据
hive (gmall)>select * from ads_user_retention_day_rate;
2019-02-11 2019-02-10 1 112 442 25.34
沉默用户:指的是只在安装当天启动过,且启动时间是在一周前
使用日活明细表dws_uv_detail_day作为DWS层数据
建表语句
hive (gmall)>
drop table if exists ads_slient_count;
create external table ads_slient_count(
`dt` string COMMENT '统计日期',
`slient_count` bigint COMMENT '沉默设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_slient_count';
导入数据
hive (gmall)>
insert into table ads_slient_count
select
'2019-02-20' dt,
count(*) slient_count
from
(
select mid_id
from dws_uv_detail_day
where dt<='2019-02-20'
group by mid_id
having count(*)=1 and min(dt)<date_add('2019-02-20',-7)
) t1;
本周回流=本周活跃-本周新增-上周活跃
使用日活明细表dws_uv_detail_day作为DWS层数据
本周回流(上周以前活跃过,上周没活跃,本周活跃了)=本周活跃-本周新增-上周活跃 本周回流=本周活跃left join 本周新增 left join 上周活跃,且本周新增id为null,上周活跃id为null;
建表:
hive (gmall)>
drop table if exists ads_back_count;
create external table ads_back_count(
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '统计日期所在周',
`wastage_count` bigint COMMENT '回流设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_back_count';
导入数据
hive (gmall)>
insert into table ads_back_count
select
'2019-02-20' dt,
concat(date_add(next_day('2019-02-20','MO'),-7),'_',date_add(next_day('2019-02-20','MO'),-1)) wk_dt,
count(*)
from
(
select t1.mid_id
from
(
select mid_id
from dws_uv_detail_wk
where wk_dt=concat(date_add(next_day('2019-02-20','MO'),-7),'_',date_add(next_day('2019-02-20','MO'),-1))
)t1
left join
(
select mid_id
from dws_new_mid_day
where create_date<=date_add(next_day('2019-02-20','MO'),-1) and create_date>=date_add(next_day('2019-02-20','MO'),-7)
)t2
on t1.mid_id=t2.mid_id
left join
(
select mid_id
from dws_uv_detail_wk
where wk_dt=concat(date_add(next_day('2019-02-20','MO'),-7*2),'_',date_add(next_day('2019-02-20','MO'),-7-1))
)t3
on t1.mid_id=t3.mid_id
where t2.mid_id is null and t3.mid_id is null
)t4;
流失用户:最近7天未登录我们称之为流失用户
使用日活明细表dws_uv_detail_day作为DWS层数据
建表语句
hive (gmall)>
drop table if exists ads_wastage_count;
create external table ads_wastage_count(
`dt` string COMMENT '统计日期',
`wastage_count` bigint COMMENT '流失设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_wastage_count';
导入数据
hive (gmall)>
insert into table ads_wastage_count
select
'2019-02-20',
count(*)
from
(
select mid_id
from dws_uv_detail_day
group by mid_id
having max(dt)<=date_add('2019-02-20',-7)
)t1;
最近3周连续活跃的用户:通常是周一对前3周的数据做统计,该数据一周计算一次。
使用周活明细表dws_uv_detail_wk作为DWS层数据
建表语句
hive (gmall)>
drop table if exists ads_continuity_wk_count;
create external table ads_continuity_wk_count(
`dt` string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日期',
`wk_dt` string COMMENT '持续时间',
`continuity_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_wk_count';
导入数据
hive (gmall)>
insert into table ads_continuity_wk_count
select
'2019-02-20',
concat(date_add(next_day('2019-02-20','MO'),-7*3),'_',date_add(next_day('2019-02-20','MO'),-1)),
count(*)
from
(
select mid_id
from dws_uv_detail_wk
where wk_dt>=concat(date_add(next_day('2019-02-20','MO'),-7*3),'_',date_add(next_day('2019-02-20','MO'),-7*2-1))
and wk_dt<=concat(date_add(next_day('2019-02-20','MO'),-7),'_',date_add(next_day('2019-02-20','MO'),-1))
group by mid_id
having count(*)=3
)t1;
说明:最近7天内连续3天活跃用户数
使用日活明细表dws_uv_detail_day作为DWS层数据
建表
hive (gmall)>
drop table if exists ads_continuity_uv_count;
create external table ads_continuity_uv_count(
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '最近7天日期',
`continuity_count` bigint
) COMMENT '连续活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_uv_count';
导入数据
hive (gmall)>
insert into table ads_continuity_uv_count
select
'2019-02-12',
concat(date_add('2019-02-12',-6),'_','2019-02-12'),
count(*)
from
(
select mid_id
from
(
select mid_id
from
(
select
mid_id,
date_sub(dt,rank) date_dif
from
(
select
mid_id,
dt,
rank() over(partition by mid_id order by dt) rank
from dws_uv_detail_day
where dt>=date_add('2019-02-12',-6) and dt<='2019-02-12'
)t1
)t2
group by mid_id,date_dif
having count(*)>=3
)t3
group by mid_id
)t4;
ODS层跟原始字段要一模一样;
DWD层 dwd_order_info订单表 dwd_order_detail订单详情(订单和商品) dwd_user_info用户表 dwd_payment_info支付流水 dwd_sku_info商品表(增加分类)
每日用户行为宽表 dws_user_action
字段:user_id、order_count、order_amount、payment_count、payment_amount 、comment_count
drop table if exists dws_user_action;
create external table dws_user_action(
user_id string comment '用户id',
order_count bigint comment '用户下单数',
order_amount decimal(16, 2) comment '下单金额',
payment_count bigint comment '支付次数',
payment_amount decimal(16, 2) comment '支付金额',
comment_count bigint comment '评论次数'
)comment '每日用户行为宽表'
partitioned by(`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action/'
tblproperties("parquet.compression"="snappy");
导入数据
0占位符,第一个字段要有别名
with tmp_order as(
select user_id, count(*) order_count, sum(oi.total_amount) order_amount from dwd_order_info oi
where date_format(oi.create_time, 'yyyy-MM-dd')='2019-02-10' group by user_id
),
tmp_payment as(
select user_id, count(*) payment_count, sum(pi.total_amount) payment_amount from dwd_payment_info pi
where date_format(pi.payment_time, 'yyyy-MM-dd')='2019-02-10' group by user_id
),
tmp_comment as(
select user_id, count(*) comment_count from dwd_comment_log c
where date_format(c.dt, 'yyyy-MM-dd')='2019-02-10' group by user_id
)
insert overwrite table dws_user_action partition(dt='2019-02-10')
select user_actions.user_id, sum(user_actions.order_count), sum(user_actions.order_amount),
sum(user_actions.payment_count),
sum(user_actions.payment_amount),
sum(user_actions.comment_count) from(
select user_id, order_count, order_amount, 0 payment_count, 0 payment_amount, 0 comment_count from tmp_order
union all select user_id, 0, 0, payment_count, payment_amount, 0 from tmp_payment
union all select user_id, 0, 0, 0, 0, comment_count from tmp_comment
) user_actions group by user_id;