1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
| (1)按照日期以及用户分组,按照日期排序并给出排名 select dt, user_id, min(age) age, rank() over(partition by user_id order by dt) rk from user_age group by dt,user_id;t1
(2)计算日期及排名的差值 select user_id, age, date_sub(dt,rk) flag from t1;t2 这里注意一下:DATE_SUB(dt, INTERVAL rk DAY) AS flag: 使用 DATE_SUB 函数从日期 dt 中减去 rk 天数,生成新的字段 flag。
(3)过滤出差值大于等于 2 的,即为连续两天活跃的用户 select user_id, min(age) age from t2 group by user_id,flag having count(*)>=2;t3
(4)对数据进行去重处理(一个用户可以在两个不同的时间点连续登录),例如:a 用户在 1 月10号1月11号以及1月20号和1月21号4天登录。 select user_id, min(age) age from t3 group by user_id;t4
(5)计算活跃用户(两天连续有访问)的人数以及平均年龄 select count(*) ct, cast(sum(age)/count(*) as decimal(10,2)) from t4;
(6) 对全量数据集进行按照用户去重 select user_id, min(age) age from user_age group by user_id;t5
(7)计算所有用户的数量以及平均年龄 select count(*) user_count, cast((sum(age)/count(*)) as decimal(10,1)) from t5;
(8) 将第 5 步以及第 7 步两个数据集进行 union all 操作 select 0 user_total_count, 0 user_total_avg_age, count(*) twice_count, cast(sum(age)/count(*) twice_count_avg_age from ( select user_id, min(age) age as decimal(10,2)) from (select user_id, min(age) age from ( select user_id, age, date_sub(dt,rk) flag from ( select dt, user_id, min(age) age, rank() over(partition by user_id order by dt) rk from user_age group by dt,user_id )t1 )t2 group by user_id,flag having count(*)>=2)t3 group by user_id )t4 union all select count(*) user_total_count, cast((sum(age)/count(*)) as decimal(10,1)), 0 twice_count, 0 twice_count_avg_age from ( user_id )t5;t6
(9) 求和并拼接为最终SQL select sum(user_total_count), sum(user_total_avg_age), sum(twice_count), sum(twice_count_avg_age) from (select 0 user_total_count, 0 user_total_avg_age, count(*) twice_count, cast(sum(age)/count(*) as twice_count_avg_age from ( select user_id, min(age) age from (select user_id, min(age) age from ( select user_id, age, date_sub(dt,rk) flag decimal(10,2)) from ( select dt, user_id, min(age) age, rank() over(partition by user_id order by dt) rk from user_age group by dt,user_id )t1 )t2 group by user_id,flag having count(*)>=2)t3 group by user_id )t4 union all select count(*) user_total_count, cast((sum(age)/count(*)) as decimal(10,1)), 0 twice_count, 0 twice_count_avg_age from ( user_id )t5)t6;
|