当前位置:   article > 正文

Spark算子综合案例 - Scala篇_第1关:wordcount词频统计

第1关:wordcount词频统计

第1关:WordCount - 词频统计

  import org.apache.spark.rdd.RDD
  import org.apache.spark.{SparkConf, SparkContext}

  /**
   * Word-frequency count over a space-separated text file.
   *
   * Reads the file at a fixed local path, counts how often each word occurs,
   * sorts the (word, count) pairs by count in descending order and prints
   * them on the driver, one pair per line.
   */
  object WordCount {

    /**
     * Program entry point.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setMaster("local").setAppName("WordCount")
      val sc = new SparkContext(conf)
      val path = "file:///root/files/wordcount.txt"
      try {
        /********* Begin *********/
        val counts: RDD[(String, Int)] = sc.textFile(path)
          // Split every line on single spaces and flatten into one RDD of tokens.
          .flatMap(_.split(" "))
          // Blank lines and repeated spaces yield empty tokens; they are not words.
          .filter(_.nonEmpty)
          // Pair each word with an initial count of 1.
          .map(word => (word, 1))
          // Sum the counts per word.
          .reduceByKey(_ + _)
          // Most frequent words first.
          .sortBy(_._2, ascending = false)

        // Bring the result to the driver before printing: RDD.foreach runs on the
        // executors, so printing there neither guarantees the sorted order nor
        // shows up on the driver console outside local mode.
        counts.collect().foreach(println)
        /********* End *********/
      } finally {
        // Release the context even when the job fails.
        sc.stop()
      }
    }
  }

第2关:friend recommendation - 好友推荐

  import org.apache.spark.rdd.RDD
  import org.apache.spark.{SparkConf, SparkContext}

  /**
   * Friend recommendation: lists pairs of people who are not direct friends
   * together with the number of friends they have in common.
   *
   * Every input line is read as "person friend1 friend2 ..." separated by
   * single spaces. For each line the job emits
   *   - (person_friend, 0) for every direct friendship, and
   *   - (friendA_friendB, 1) for every two friends of the same person,
   *     i.e. one shared friend between friendA and friendB.
   * After grouping by pair, any pair that carries a 0 flag is a direct
   * friendship and is dropped; for the rest, the number of flags is the number
   * of common friends.
   */
  object Friend {

    /**
     * Builds the canonical "x_y" key for an unordered pair of names.
     *
     * The name with the larger hashCode comes first. Distinct names can share
     * a hashCode, so ties are broken lexicographically; without the tie-break
     * the same pair could produce two different keys depending on which line
     * it came from.
     */
    private def pairKey(a: String, b: String): String = {
      val aFirst =
        if (a.hashCode != b.hashCode) a.hashCode > b.hashCode
        else a.compareTo(b) >= 0
      if (aFirst) s"${a}_${b}" else s"${b}_${a}"
    }

    /**
     * Program entry point.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setMaster("local").setAppName("friend")
      val sc = new SparkContext(conf)
      val path = "file:///root/files/friend.txt"
      try {
        /********* Begin *********/
        // 1. Create the RDD of raw lines.
        val lines: RDD[String] = sc.textFile(path)

        // 2. Expand each line into flagged pairs (0 = direct, 1 = shares a friend).
        val flaggedPairs: RDD[(String, Int)] = lines.flatMap { line =>
          // Repeated spaces yield empty tokens; they are not names.
          val tokens = line.split(" ").filter(_.nonEmpty)
          // A blank line has no owner and therefore contributes nothing.
          tokens.headOption.toList.flatMap { me =>
            val friends = tokens.tail
            val generated = for {
              i <- friends.indices
              entry <- (pairKey(me, friends(i)), 0) +:
                (i + 1 until friends.length).map(j => (pairKey(friends(j), friends(i)), 1))
            } yield entry
            // Reversed to keep the emission order of the prepend-built list this
            // replaces, so the order of the printed result does not shift.
            generated.reverse
          }
        }

        // 3. Group all flags of the same pair.
        val grouped: RDD[(String, Iterable[Int])] = flaggedPairs.groupByKey()

        // 4. Keep only pairs that never appear as direct friends; the number of
        //    remaining flags (all 1) is the number of common friends.
        val indirect: RDD[(String, Int)] = grouped
          .filter { case (_, flags) => !flags.exists(_ == 0) }
          .map { case (pair, flags) => (pair, flags.size) }

        // 5. Print on the driver; RDD.foreach would print on the executors.
        indirect.collect().foreach(println)
        /********* End *********/
      } finally {
        // Release the context even when the job fails.
        sc.stop()
      }
    }
  }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/花生_TL007/article/detail/388245
推荐阅读
相关标签
  

闽ICP备14008679号