
Java (Spark Case Analysis)


1. Requirement: compute the top three pages by visit count
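The post does not show the layout of e:/access.log; the code below only assumes a tab-separated line whose second field (index 1) is the URL. A minimal sketch with a hypothetical log line (the timestamp field is an assumption, not taken from the original data):

// Hypothetical sample line: only the tab-separated layout with the URL
// in column index 1 matters to the jobs below.
val line = "20161123101523\thttp://java.xxxxxx.com/java/course/cloud.shtml"
val fields: Array[String] = line.split("\t")
println(fields(1))  // prints the URL that the job counts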

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Requirement: compute the top three pages by visit count.
  * Users prefer video and live-streaming content;
  * this helps the business with operations and decision-making.
  *
  * Based on the access-log data.
  */
object UrlCount {
  def main(args: Array[String]): Unit = {
    // 1. Load the data
    val conf: SparkConf = new SparkConf().setAppName("UrlCount").setMaster("local[2]")
    // Spark program entry point
    val sc: SparkContext = new SparkContext(conf)
    // read the log file
    val rdd1: RDD[String] = sc.textFile("e:/access.log")
    // 2. Map each line to a (url, 1) pair
    val rdd2: RDD[(String, Int)] = rdd1.map(line => {
      val s: Array[String] = line.split("\t")
      // mark one occurrence
      (s(1), 1)
    })
    // 3. Sum the counts for identical URLs, e.g. (url, 201)
    val rdd3: RDD[(String, Int)] = rdd2.reduceByKey(_ + _)
    // 4. Sort by count descending and take the top three
    val rdd4: Array[(String, Int)] = rdd3.sortBy(_._2, false).take(3)
    // 5. Print the results
    rdd4.foreach(x => {
      println("URL: " + x._1 + "  visits: " + x._2)
    })
    // 6. Release resources
    sc.stop()
  }
}

Result: the top three URLs and their visit counts are printed to the console.

2. Requirement: for each department, find its most-visited URL
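Here the "department" is taken from the first label of the URL's host name. A minimal sketch, assuming URLs follow the http://<department>.xxxxxx.com/... pattern seen in the results further below (the sample URL is hypothetical):

import java.net.URL

// "java.xxxxxx.com" -> "java"; String.split takes a regex, so the dot is escaped as [.]
val host: String = new URL("http://java.xxxxxx.com/java/video.shtml").getHost
println(host.split("[.]")(0))  // java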

import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Requirement: for each department, find its most-visited URL, e.g.
  *   bigdata: video (live streaming)
  *   java:    video
  *   python:  teacher
  */
object UrlGroupCount {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkContext
    val conf: SparkConf = new SparkConf().setAppName("UrlGroupCount").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    // 2. Load the data
    val rdd1: RDD[String] = sc.textFile("e:/access.log")
    // 3. Split each line into (url, 1)
    val rdd2: RDD[(String, Int)] = rdd1.map(line => {
      val s: Array[String] = line.split("\t")
      (s(1), 1)
    })
    // 4. Total visits per URL: (url, total)
    val rdd3: RDD[(String, Int)] = rdd2.reduceByKey(_ + _)
    // 5. Extract the department
    val rdd4: RDD[(String, String, Int)] = rdd3.map(x => {
      // take the URL
      val url: String = x._1
      // use java.net.URL to get the host name, then keep its first label
      val host: String = new URL(url).getHost.split("[.]")(0)
      // emit a tuple
      (host, url, x._2)
    })
    // 6. Group by department and keep the most-visited URL in each group
    val rdd5: RDD[(String, List[(String, String, Int)])] = rdd4.groupBy(_._1).mapValues(it => {
      // sort descending by count and take the first entry
      it.toList.sortBy(_._3).reverse.take(1)
    })
    // 7. Print the results
    rdd5.foreach(x => {
      println("Department: " + x._1 + ", top URL: " + x._2)
    })
    // 8. Release resources
    sc.stop()
  }
}
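Note that groupBy collects every record of a department before sorting. For a top-1 query the same answer can be obtained without materializing the whole group; a hedged alternative sketch (not from the original post), reusing the rdd4 defined above:

// Keep only the per-department maximum with reduceByKey; values are
// combined map-side, so only one (url, count) pair per department is shuffled.
val top1: RDD[(String, (String, Int))] = rdd4
  .map { case (host, url, cnt) => (host, (url, cnt)) }
  .reduceByKey((a, b) => if (a._2 >= b._2) a else b)
top1.foreach(println)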

 

Result: each department and its most-visited URL are printed to the console.

3. Requirement: add a custom partitioner that partitions by department, so that all records of one department end up in one result file

import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable

/**
  * Requirement: add a custom partitioner.
  * Partition by department, so that all records of one department
  * land in one result file.
  */
object UrlParCount {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkContext
    val conf: SparkConf = new SparkConf().setAppName("UrlParCount").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    // 2. Load the data and map each line to (url, 1)
    val rdd1 = sc.textFile("e:/access.log").map(line => {
      val s: Array[String] = line.split("\t")
      (s(1), 1)
    })
    // 3. Aggregate the visit counts per URL
    val rdd2: RDD[(String, Int)] = rdd1.reduceByKey(_ + _)
    // 4. Re-key by department: (department, (url, count))
    val rdd3: RDD[(String, (String, Int))] = rdd2.map(t => {
      val url = t._1
      val host = new URL(url).getHost
      val xHost: String = host.split("[.]")(0)
      (xHost, (url, t._2))
    })
    // 5. Build the custom partitioner from the distinct departments
    val xueyuan: Array[String] = rdd3.map(_._1).distinct().collect
    val xueYuanPartitioner: XueYuanPartitioner = new XueYuanPartitioner(xueyuan)
    // 6. Apply the partitioning rule, then keep the top URL inside each partition
    val rdd4: RDD[(String, (String, Int))] = rdd3.partitionBy(xueYuanPartitioner).mapPartitions(it => {
      it.toList.sortBy(_._2._2).reverse.take(1).iterator
    })
    // 7. Write one result file per partition
    rdd4.saveAsTextFile("e://pout")
    // 8. Release resources
    sc.stop()
  }
}

class XueYuanPartitioner(xy: Array[String]) extends Partitioner {
  // custom rule: department -> partition number
  val rules: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
  var number = 0
  // assign an increasing partition number to each department
  for (i <- xy) {
    rules += (i -> number)
    number += 1
  }
  // total number of partitions
  override def numPartitions: Int = xy.length
  // resolve the partition for a key
  override def getPartition(key: Any): Int = {
    rules.getOrElse(key.toString, 0)
  }
}
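To check that the partitioner routes each department to its own partition before writing the files, a quick sketch (assuming the same rdd3 and xueYuanPartitioner as above):

// Print which department key lands in which partition index.
rdd3.partitionBy(xueYuanPartitioner)
  .mapPartitionsWithIndex((idx, it) => it.map(t => s"partition " + idx + " -> " + t._1))
  .collect()
  .foreach(println)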

 

Result:

1. part-00000

(bigdata,(http://bigdata.xxxxxx.com/bigdata/video.shtml,503))

2. part-00001

(java,(http://java.xxxxxx.com/java/course/cloud.shtml,1028))

3. part-00002

(net,(http://net.xxxxxx.com/net/video.shtml,525))

4. The pom.xml file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.demo.spark</groupId>
    <artifactId>SparkWC</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.8.4</hadoop.version>
        <encoding>UTF-8</encoding>
    </properties>

    <dependencies>
        <!-- Scala library -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Spark core -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Hadoop client API -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.39</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement>
            <plugins>
                <!-- Scala compiler plugin -->
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                </plugin>
                <!-- Java compiler plugin -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Fat-jar (shade) plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
