当前位置:   article > 正文

spark的jieba分词_spark jieba

spark jieba
import com.huaban.analysis.jieba.{JiebaSegmenter, SegToken}
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Spark job that tokenizes a text column with the jieba segmenter.
 *
 * Reads `content` and `label` from the Hive table `badou.new_no_seg`, appends a
 * `seg` column holding the tokens of `content`, prints a sample and overwrites
 * the Hive table `badou.news_jieba` with the result.
 */
object JiebaKry {
  def main(args: Array[String]): Unit = {
    // Register JiebaSegmenter with Kryo so the segmenter instance can be serialized.
    val conf = new SparkConf()
      .registerKryoClasses(Array(classOf[JiebaSegmenter]))
      .set("spark.rpc.message.maxSize","800")
    // Build the SparkSession with the configuration defined above.
    val spark = SparkSession
      .builder()
      .appName("Jieba Test")
      .enableHiveSupport()
      .config(conf)
      .getOrCreate()

    /**
     * Appends a `seg` column containing the jieba tokens of `colname`.
     *
     * @param df      input DataFrame
     * @param colname name of the string column to tokenize
     * @return `df` plus a `seg` column of type Array[String]; tokens of length 1
     *         are dropped, and a NULL input value yields an empty array
     */
    def jieba_seg(df:DataFrame,colname:String): DataFrame ={
      val segmenter = new JiebaSegmenter()
      // Ship one segmenter instance to the executors through a broadcast variable.
      val seg = spark.sparkContext.broadcast(segmenter)
      val jieba_udf = udf{(sentence:String)=>
        // Spark passes SQL NULL to the UDF as a null String; guard against it
        // instead of letting the task die with a NullPointerException.
        Option(sentence).fold(Array.empty[String]) { text =>
          seg.value
            .process(text,SegMode.INDEX)
            .toArray()
            // Typed pattern replaces the unchecked cast and filters in one pass.
            .collect { case token: SegToken if token.word.length>1 => token.word }
        }
      }
      // Every value in the seg column is an Array[String].
      df.withColumn("seg",jieba_udf(col(colname)))
    }

    try {
      // Fetch the news data from Hive.
      val df = spark.sql("select content,label from badou.new_no_seg limit 300")
      val df_seg = jieba_seg(df,"content")
      df_seg.show()
      df_seg.write.mode("overwrite").saveAsTable("badou.news_jieba")
    } finally {
      // Release the session even when the query or the write fails.
      spark.stop()
    }
  }

}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/564731
推荐阅读
相关标签
  

闽ICP备14008679号