package clustering.garbage

import java.io.PrintWriter

import org.apache.spark.SparkContext

/**
 * Removes words that occur exactly once across an entire corpus from
 * every document in that corpus, rewriting each document in place.
 *
 * Created by fhqplzj on 17-1-12 at 8:40 PM.
 */
object Lines {

  def main(args: Array[String]): Unit = {
    // App name is the object's simple class name minus the trailing "$"
    // that the Scala compiler appends to singleton objects.
    val sc = new SparkContext("local[*]", s"${getClass.getSimpleName.stripSuffix("$")}")
    try {
      removeRareWords(sc, "/tmp/fuck")
    } finally {
      sc.stop() // always release Spark resources, even on failure
    }
  }

  /**
   * Removes words whose corpus-wide frequency is exactly 1.
   * Words are separated by whitespace; lines by newline characters.
   * Each cleaned document is written back over its original file
   * (the URI scheme prefix returned by wholeTextFiles is stripped).
   *
   * NOTE(review): writing local files from a `foreach` running on
   * executors only works in local mode, where executors share the
   * driver's filesystem — confirm before running on a cluster.
   *
   * @param sc   active SparkContext
   * @param path directory containing the input text files
   */
  def removeRareWords(sc: SparkContext, path: String): Unit = {
    // Cache: the RDD is traversed twice (word counting, then rewriting).
    // Without it, the rewrite pass would re-read input files that this
    // very pass is overwriting.
    val docs = sc.wholeTextFiles(path).cache()
    // Set of words whose total frequency across all documents is 1;
    // collected to the driver so it can be captured in the closure below.
    val rareWords = docs
      .values
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .reduceByKey(_ + _)
      .filter(_._2 == 1)
      .keys
      .collect()
      .toSet
    docs.foreach { case (name, doc) =>
      // `name` is a URI like "file:/tmp/fuck/a.txt"; substring from the
      // first '/' drops the scheme and yields the local filesystem path.
      val printer = new PrintWriter(name.substring(name.indexOf("/")))
      try {
        val content = doc
          .split("\n")
          .map(_.split(" ").filterNot(rareWords.contains).mkString(" "))
          .mkString("\n")
        printer.print(content)
      } finally {
        printer.close() // close the handle even if processing throws
      }
    }
  }
}