# 当天新用户
hive -e \'select count(1) from hm2.daily_helper \
where guid not in (select guid from hm2.history_helper);\' > %s'%(resultPath)
(status,result) = execHive(cmd)
# 次日活跃留存
hive -e \'select count(1) from\
(select guid from hm2.helper where dt = "%s" group by guid) yes\
inner join\
(select guid from hm2.helper where dt = "%s" group by guid) today\
where yes.guid = today.guid;\'
SQL语句统计连续登陆的三天数和以上的用户案例分析
这个问题可以扩展到很多相似的问题:连续几个月充值会员、连续天数有商品卖出、连续打滴滴、连续逾期。
测试数据:用户ID、登入日期 uid,dt guid01,2018-02-28 guid01,2018-03-01 guid01,2018-03-02 guid01,2018-03-04 guid01,2018-03-05 guid01,2018-03-06 guid01,2018-03-07 guid02,2018-03-01 guid02,2018-03-02 guid02,2018-03-03 guid02,2018-03-06 目标表格:
+---------+--------+-------------+-------------+--+ | uid | times | start_date | end_date | +---------+--------+-------------+-------------+--+ | guid01 | 4 | 2018-03-04 | 2018-03-07 | | guid02 | 3 | 2018-03-01 | 2018-03-03 | +---------+--------+-------------+-------------+--+
思路: 写sql呗 1.分组,排序,打行号,2.让时间戳-行号根据差值检查是否为连续 整体的答案:
SELECT uid, min(dt), max(dt), count(1) AS counts
FROM
(SELECT uid , dt, date_sub(dt, rn) AS dis
FROM
(SELECT uid , dt, row_number()over (partition by uid ORDER BY dt)rn
FROM continuous
)t1
)t2
GROUP BY uid ,dis HAVING counts>2
答案解析: 1.分组 排序 打行号
select
uid ,dt,row_number()over (partition by uid order by dt)rn
from continuous
表格实现
+------+-------------------+-----+ | uid | dt | rn | +------+-------------------+-----+ |guid02|2018-03-01 00:00:00| 1| |guid02|2018-03-02 00:00:00| 2| |guid02|2018-03-03 00:00:00| 3| |guid02|2018-03-06 00:00:00| 4| |guid01|2018-02-28 00:00:00| 1| |guid01|2018-03-01 00:00:00| 2| |guid01|2018-03-02 00:00:00| 3| |guid01|2018-03-04 00:00:00| 4| |guid01|2018-03-05 00:00:00| 5| |guid01|2018-03-06 00:00:00| 6| |guid01|2018-03-07 00:00:00| 7| +---------+----------------------+-------+
2…让时间戳-行号根据差值检查是否为连续
select
uid ,dt, date_sub(dt,rn) as dis
from
(
select
uid ,dt,row_number()over (partition by uid order by dt)rn
from continuous
)t1
表格实现
+------+-------------------+----------+ | uid| dt | dis | +------+-------------------+----------+ |guid02|2018-03-01 00:00:00|2018-02-28| |guid02|2018-03-02 00:00:00|2018-02-28| |guid02|2018-03-03 00:00:00|2018-02-28| |guid02|2018-03-06 00:00:00|2018-03-02| |guid01|2018-02-28 00:00:00|2018-02-27| |guid01|2018-03-01 00:00:00|2018-02-27| |guid01|2018-03-02 00:00:00|2018-02-27| |guid01|2018-03-04 00:00:00|2018-02-28| |guid01|2018-03-05 00:00:00|2018-02-28| |guid01|2018-03-06 00:00:00|2018-02-28| |guid01|2018-03-07 00:00:00|2018-02-28| +------+-------------------+----------+ 3.为连续的接果为一样的用count(1)函数计算行数总数
select
uid,min(dt),max(dt),count(1) as counts
from
(
select
uid ,dt, date_sub(dt,rn) as dis
from
(
select
uid ,dt,row_number()over (partition by uid order by dt)rn
from continuous
)t1
)t2
group by uid
表格实现
+------+-------------------+-------------------+------+ | uid| min(dt)| max(dt)|counts| +------+-------------------+-------------------+------+ |guid02|2018-03-01 00:00:00|2018-03-03 00:00:00| 3| |guid01|2018-02-28 00:00:00|2018-03-02 00:00:00| 3| |guid01|2018-03-04 00:00:00|2018-03-07 00:00:00| 4| +------+-------------------+-------------------+------+ 4.这样的结果店铺有重复为了只显示收入最好的一个店铺在在这个基础上包两层select
SELECT *
FROM
(SELECT *, row_number() over(partition by uid ORDER BY counts desc) aa
FROM
(SELECT uid, min(dt), max(dt), count(1) AS counts
FROM
(SELECT uid , dt, date_sub(dt, rn) AS dis
FROM
(SELECT uid , dt, row_number()over (partition by uid ORDER BY dt)rn
FROM continuous
)t1
)t2
GROUP BY uid )t3
)t4 WHERE aa = 1
表格实现:
+------+-------------------+-------------------+--------+ | uid| min(dt)| max(dt) | counts| +------+-------------------+-------------------+---------+ |guid02|2018-03-01 00:00:00|2018-03-03 00:00:00| 3| |guid01|2018-02-28 00:00:00|2018-03-02 00:00:00| 3| |guid01|2018-03-04 00:00:00|2018-03-07 00:00:00| 4| +------+-------------------+-------------------+------+ 原文链接:https://blog.csdn.net/weixin_45896475/article/details/103879887 |