当前位置:   article > 正文

hive sql练习题(想起来就随便找几道HQL练习练习,持续更新)_hivesql计算题

hivesql计算题

2023.3.13

  1. --题目1:找出所有科目成绩都大于某一学科平均成绩的学生
  2. --表结构:score(uid,subject_id,score)
  3. --数据例:1001 01 90
  4. select t.uid
  5. from (select uid,
  6. count(1) as cnt,
  7. count(`if`(tab.score > avg_sco, tab.score, null)) as avg_ant
  8. from (select uid,
  9. subject_id,
  10. score,
  11. avg(score) over (partition by subject_id) as avg_sco
  12. from score) tab
  13. group by uid) t
  14. where t.cnt = t.avg_ant
  15. --题目2:统计出每个用户每个月的访问数和累积访问次数
  16. --表结构:action(userId string,visitDate string,visitCount)
  17. --数据例:u01 2017/2/22 4
  18. select userid,
  19. dt,
  20. cnt current_cnt,
  21. sum(cnt) over (partition by userid order by dt) acc_cnt
  22. from (select userid,
  23. dt,
  24. sum(visitcount) as cnt
  25. from (select userId,
  26. date_format(regexp_replace(visitdate, '/', '-'), 'yyyy-MM') as dt,
  27. visitcount
  28. from action) t
  29. group by userid, dt) tab
  30. --题目3:有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计
  31. --数据例:u1 a
  32. --1)每个店铺的UV(访客数)
  33. --2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数数据集
  34. 1:
  35. select shop, count(distinct user_id)
  36. from visit
  37. group by shop;
  38. 2
  39. select shop,
  40. user_id,
  41. cnt,
  42. rn
  43. from (select shop,
  44. user_id,
  45. cnt,
  46. rank() over (partition by shop order by t.cnt desc ) as rn
  47. from (select shop,
  48. user_id,
  49. count(1) as cnt
  50. from visit
  51. group by shop, user_id) t) temp
  52. where temp.rn <= 3
  53. --题目4
  54. --order_tab(dt,order_id,user_id,amount)
  55. --数据样例:2017-01-01,10029028,1000003251,33.57
  56. --1)给出 2017年每个月的订单数、用户数、总成交金额。
  57. --2)给出2017年11月的新客数(指在11月才有第一笔订单)
  58. 1:
  59. select substr(dt, 1, 7),
  60. count(order_id),
  61. count(distinct user_id),
  62. sum(amount)
  63. from order_tab
  64. where substr(dt, 1, 4) = '2017'
  65. group by substr(dt, 1, 7);
  66. 2.
  67. select count(1) as cnt
  68. from (select user_id,
  69. min(dt)
  70. from order_tab
  71. group by user_id
  72. having substr(min(dt), 1, 7) = '2017-11') t
  73. --题目5
  74. --请写出代码求得所有用户和活跃用户的总数及平均年龄,活跃用户指连续两天都有访问记录的用户
  75. --user_age(dt string,user_id string,age int)
  76. --数据样例:2019-02-11,test_1,23
  77. select count(user_id) as user_cnt,
  78. sum(t.age) / count(user_id) user_avg_age,
  79. 'total' is_active
  80. from (select user_id, min(user_age.age) as age
  81. from user_age
  82. group by user_id) t
  83. union all
  84. select count(user_id) as act_user,
  85. sum(age) / count(user_id) act_user_avg_age,
  86. 'active'
  87. from (select user_id,
  88. min(age) as age
  89. from (select user_id,
  90. dt,
  91. age,
  92. datediff(nvl(lead_dt, '9999-12-31'), dt) as diff_num
  93. from (select user_id,
  94. dt,
  95. age,
  96. lead(dt, 1) over (partition by user_id order by dt) as lead_dt
  97. from (select user_id,
  98. dt,
  99. min(age) as age
  100. from user_age
  101. group by user_id, dt) t) t1) t2
  102. where diff_num = 1
  103. group by user_id) t3
  104. --题目6
  105. --用一条SQL语句查询出每门课都大于80分的学生姓名
  106. --name kecheng fenshu
  107. --数据示例:张三 语文 81
  108. select name from score group by name having min(fenshu)>80
  109. --题目7
  110. --怎么把这样一个
  111. year month amount
  112. 1991 1 1.1
  113. 1991 2 1.2
  114. 1991 3 1.3
  115. 1991 4 1.4
  116. 1992 1 2.1
  117. 1992 2 2.2
  118. 1992 3 2.3
  119. 1992 4 2.4
  120. 查成这样一个结果
  121. year m1 m2 m3 m4
  122. 1991 1.1 1.2 1.3 1.4
  123. 1992 2.1 2.2 2.3 2.4
  124. select year,
  125. max(case when month = 1 then amount else null end) as m1,
  126. max(case when month = 2 then amount else null end) as m2,
  127. max(case when month = 3 then amount else null end) as m3,
  128. max(case when month = 4 then amount else null end) as m4
  129. from m_am
  130. group by year
  131. --题目8 有一个订单表order。已知字段有:order_id(订单ID), user_id(用户ID),amount(金额), pay_datetime(付费时间),channel_id(渠道ID),dt(分区字段)。
  132. --order_a(order_id int,user_id int,amount double,pay_datatime timestamp,channel_id int)
  133. --1)查询dt=‘2018-09-01‘里每个渠道的订单数,下单人数(去重),总金额
  134. select channel_id,
  135. count(order_id),
  136. count(distinct user_id),
  137. sum(amount)
  138. from order_a
  139. where dt = '2018-09-01'
  140. group by channel_id;
  141. --2)查询dt=‘2018-09-01‘里每个渠道的金额最大3笔订单。
  142. select channel_id,
  143. order_id,
  144. rn
  145. from (select channel_id,
  146. order_id,
  147. rank() over (partition by channel_id order by amount desc ) as rn
  148. from order_a
  149. where dt = '2018-09-01') t
  150. where t.rn <= 3;
  151. --4) 有一天发现订单数据重复,请分析原因
  152. 1.首先关系型数据库不会发生重复
  153. 2.hive ods层接受到数据有重复很正常,前提数据通道是kafka数据源
  154. 3.那么很有可能后端查询mysql业务数据重复查询,然后重复打到kafka了
  155. 4.也有可能手动使用datax等工具同步数据的时候,数据拉取限制条件在凌晨12点的情况下,>>=没写清楚

2023.3.15

  1. --题目1
  2. --找出连续3天及以上减少碳排放量在100以上的用户
  3. --id dt lowcarbon
  4. --1001 2021-12-12 123
  5. --1002 2021-12-12 45
  6. --1001 2021-12-13 43
  7. --1001 2021-12-13 45
  8. --1001 2021-12-13 23
  9. --1002 2021-12-14 45
  10. --1001 2021-12-14 230
  11. --1002 2021-12-15 45
  12. --1001 2021-12-15 23
  13. select id
  14. from (select id,
  15. dt,
  16. cnt,
  17. date_sub(dt, rn) as tem_dt
  18. from (select id,
  19. dt,
  20. cnt,
  21. rank() over (partition by id,dt order by cnt) as rn
  22. from (select id, dt, sum(lowcarbon) as cnt
  23. from table
  24. group by id, dt
  25. having sum(lowcarbon) > 100) t1) t2) t3
  26. group by id, tem_dt
  27. having count(1) >= 3
  28. --题目2
  29. --计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录
  30. --id dt
  31. --1001 2021-12-12
  32. --1001 2021-12-12
  33. --1001 2021-12-13
  34. --1001 2021-12-14
  35. --1001 2021-12-16
  36. --1001 2021-12-19
  37. --1001 2021-12-20
  38. --1002 2021-12-12
  39. --1002 2021-12-16
  40. --1002 2021-12-17

2023.3.16

  1. --第 4 题 打折日期交叉问题
  2. --如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
  3. --id stt edt
  4. --oppo 2021-06-05 2021-06-09
  5. --oppo 2021-06-11 2021-06-21
  6. --vivo 2021-06-05 2021-06-15
  7. --vivo 2021-06-09 2021-06-21
  8. --redmi 2021-06-05 2021-06-21
  9. --redmi 2021-06-09 2021-06-15
  10. --redmi 2021-06-17 2021-06-26
  11. --huawei 2021-06-05 2021-06-26
  12. --huawei 2021-06-09 2021-06-15
  13. --huawei 2021-06-17 2021-06-21
  14. 计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-052021-06-15,第二次活动时间为 2021-06-092021-06-21 其中 9 号到 15 号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-052021-06-21 共计 17
  15. select id,
  16. sum(if(days>=0,days+1,0)) days
  17. from
  18. ( select id,
  19. datediff(edt,stt) days
  20. from
  21. (
  22. select id,
  23. if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
  24. edt
  25. from
  26. ( select id,
  27. stt,
  28. edt,
  29. max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
  30. from test_b
  31. )t1
  32. )t2
  33. )t3
  34. group by
  35. id;

2023.3.17

  1. --如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数。
  2. --id stt edt
  3. --1001 2021-06-14 12:12:12 2021-06-14 18:12:12
  4. --1003 2021-06-14 13:12:12 2021-06-14 16:12:12
  5. --1004 2021-06-14 13:15:12 2021-06-14 20:12:12
  6. --1002 2021-06-14 15:12:12 2021-06-14 16:12:12
  7. --1005 2021-06-14 15:18:12 2021-06-14 20:12:12
  8. --1001 2021-06-14 20:12:12 2021-06-14 23:12:12
  9. --1006 2021-06-14 21:12:12 2021-06-14 23:15:12
  10. --1007 2021-06-14 22:12:12 2021-06-14 23:10:12
  11. select max(cnt)
  12. from (select id,
  13. t,
  14. sum(flag) over (order by t) as cnt
  15. from (select id, stt as t, 1 as flag
  16. from test_c
  17. union all
  18. select id, edt as t, -1 as flag
  19. from test_c) t1) t2

2023.3.17

  1. --有一个5000万的用户文件(user_id,name,age),一个2亿记录的用户看电影的记录文件(user_id,url),根据年龄段观看电影的次数进行排序?
  2. select age_phase,
  3. count(1) as cnt
  4. from (select u.user_id,
  5. name,
  6. age,
  7. age_phase,
  8. url
  9. from (SELECT user_id,
  10. name,
  11. age,
  12. CASE
  13. WHEN age <= 10 AND age > 0 THEN '0-10'
  14. WHEN age <= 20 AND age > 10 THEN '10-20'
  15. WHEN age > 20 AND age <= 30 THEN '20-30'
  16. WHEN age > 30 AND age <= 40 THEN '30-40'
  17. WHEN age > 40 AND age <= 50 THEN '40-50'
  18. WHEN age > 50 AND age <= 60 THEN '50-60'
  19. WHEN age > 60 AND age <= 70 THEN '60-70'
  20. ELSE '70以上' END as age_phase
  21. from test4user) u
  22. join test4log l on u.user_id = l.user_id) t
  23. group by age_phase order by count(1) desc;

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/711234
推荐阅读
相关标签
  

闽ICP备14008679号