当前位置:   article > 正文

文本预处理:去除词频数为1的单词_freqlist类怎么去除词频为1的词

freqlist类怎么去除词频为1的词
  1. package clustering.garbage
  2. import java.io.PrintWriter
  3. import org.apache.spark.SparkContext
  /**
   * Text-preprocessing job that strips every word occurring exactly once in a
   * corpus and rewrites each document in place.
   *
   * Created by fhqplzj on 17-1-12 at 8:40 PM.
   */
  object Lines {

    /**
     * Entry point: runs [[removeRareWords]] on a local Spark context.
     *
     * @param args optional; `args(0)` overrides the default input directory
     */
    def main(args: Array[String]): Unit = {
      val sc = new SparkContext("local[*]", getClass.getSimpleName.stripSuffix("$"))
      // The historical hard-coded directory stays as the default.
      val path = args.headOption.getOrElse("/tmp/fuck")
      // Always release the context, even when the job fails.
      try removeRareWords(sc, path)
      finally sc.stop()
    }

    /**
     * Removes the words whose corpus-wide frequency is 1. Words are expected to
     * be separated by single spaces and lines by newline characters.
     *
     * Every file under `path` is overwritten with its filtered content (UTF-8).
     * The file is written through `java.nio.file`, i.e. to the local filesystem
     * of the JVM running the task, at the path component of the file's URI.
     *
     * NOTE(review): frequencies are counted on tokens split by `\s+`, while the
     * filtering splits on "\n" and " " only. Input containing tabs or "\r\n"
     * is therefore tokenised differently in the two passes; this mirrors the
     * original behaviour and is left unchanged.
     *
     * @param sc   Spark context used to read and process the corpus
     * @param path directory (or glob) containing the documents
     */
    def removeRareWords(sc: SparkContext, path: String): Unit = {
      // The corpus is traversed twice (count, then rewrite); cache to avoid re-reading.
      val docs = sc.wholeTextFiles(path).cache()
      try {
        /* Set of words whose corpus-wide frequency is 1. */
        val rareWords = docs.values
          .flatMap(_.split("\\s+"))
          .map((_, 1))
          .reduceByKey(_ + _)
          .filter(_._2 == 1)
          .keys
          .collect()
          .toSet
        // Broadcast once instead of shipping the set inside every task closure.
        val rareWordsBc = sc.broadcast(rareWords)

        docs.foreach { case (name, doc) =>
          val cleaned = stripRareWords(doc, rareWordsBc.value)
          // Files.write closes the file itself, surfaces I/O errors as exceptions
          // (PrintWriter swallows them), and the charset is explicit so the output
          // matches the UTF-8 decoding used when the file was read.
          java.nio.file.Files.write(
            java.nio.file.Paths.get(localPath(name)),
            cleaned.getBytes(java.nio.charset.StandardCharsets.UTF_8)
          )
        }
      } finally {
        docs.unpersist()
      }
    }

    /** Drops every rare word from `doc`, keeping the line structure. */
    private def stripRareWords(doc: String, rareWords: Set[String]): String =
      doc
        .split("\n")
        .map(_.split(" ").filterNot(rareWords.contains).mkString(" "))
        .mkString("\n")

    /**
     * Extracts the filesystem path from a file name such as `file:/tmp/a.txt`.
     *
     * The raw (undecoded) URI path is used so that a literal `%` in a file name
     * is preserved. Names that are not valid URIs (e.g. containing spaces) fall
     * back to "everything from the first slash", and names without any slash
     * are returned unchanged instead of throwing.
     */
    private def localPath(name: String): String =
      scala.util.Try(new java.net.URI(name).getRawPath).toOption
        .filter(p => p != null && p.nonEmpty)
        .getOrElse {
          val slash = name.indexOf('/')
          if (slash >= 0) name.substring(slash) else name
        }
  }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/109766
推荐阅读