Word count on the 避暑山庄评论.csv comment file with PySpark:

```python
# Imports
from pyspark import SparkContext

# Calling the class with parentheses instantiates the context
sc = SparkContext()

# Read the file with textFile
data = sc.textFile('/home/spark/Downloads/避暑山庄评论.csv')

# Inspect the data
# data.collect()
data.take(4)

# Tokenize with flatMap
import jieba

def cut(x):
    return jieba.cut(x)

cut_result = data.flatMap(lambda x: cut(x))
# cut_result.collect()
cut_result.take(4)

# Count with map: pair each word with a 1
count_result = cut_result.map(lambda x: (x, 1))
count_result.take(3)

# Group and sum
reduce_result = count_result.reduceByKey(lambda x, y: x + y)
reduce_result.take(3)

# collectAsMap() is an action that returns the pair RDD as a Python dict
words = reduce_result.collectAsMap()
```
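To make the chain concrete, here is a minimal self-contained sketch of the same flatMap → map → reduceByKey → collectAsMap pipeline on a tiny in-memory RDD; the three sample strings are made up for illustration:

```python
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Hypothetical three-line corpus standing in for the CSV
lines = sc.parallelize(["spark spark hadoop", "hadoop spark", "hive"])

counts = (lines
          .flatMap(lambda line: line.split())  # one record per word
          .map(lambda word: (word, 1))         # (word, 1) pairs
          .reduceByKey(lambda x, y: x + y)     # sum the 1s per word
          .collectAsMap())                     # plain Python dict

print(counts)  # {'spark': 3, 'hadoop': 2, 'hive': 1}
```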
Render the frequencies as a word cloud; the CJK font path matters, since the library's default font cannot draw Chinese glyphs:

```python
import matplotlib.pyplot as plt
from wordcloud import WordCloud

wc = WordCloud(
    background_color="white",
    max_words=2000,
    max_font_size=150,
    random_state=50,
    # a CJK font so that Chinese glyphs render
    font_path='/home/spark/Downloads/SourceHanSansCN-Normal.ttf'
)
wc.generate_from_frequencies(words)
plt.imshow(wc)
plt.axis("off")
plt.show()  # needed when running as a script rather than in a notebook
```
The raw cloud is dominated by stop words, so the next version loads a stop-word list and filters it out during tokenization:

```python
# Read the stop-word list so stop words can be filtered out.
# open() opens stopWords.txt under /home/spark/Downloads/;
# encoding='UTF-8' names the file's encoding (a common Unicode encoding).
# readlines() returns a list with one element per line of the file, and
# line.strip() removes leading/trailing whitespace (spaces, tabs, newlines).
stopwords = [line.strip() for line in
             open('/home/spark/Downloads/stopWords.txt', encoding='UTF-8').readlines()]
stopwords[:3]
```
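A large stop-word list gets serialized into every task closure; a common variant (not in the original listing) is to ship it once per executor as a broadcast variable and use a set for O(1) membership tests. A minimal sketch, assuming the `sc` and `stopwords` defined above:

```python
import jieba

# Broadcast the stop-word set once per executor instead of once per task
stop_bc = sc.broadcast(set(stopwords))

def cut_filtered(line):
    # stop_bc.value is the broadcast set, read locally on each executor
    return [w for w in jieba.cut(line) if w not in stop_bc.value]
```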
```python
def cut(x):
    tmp = jieba.cut(x)
    result = []
    for word in tmp:
        if word not in stopwords:
            result.append(word)
    return result

# Re-tokenize with the filtering cut(); reusing the old cut_result here
# would leave the stop words in place
cut_result = data.flatMap(lambda x: cut(x))
count_result = cut_result.map(lambda x: (x, 1))
reduce_result = count_result.reduceByKey(lambda x, y: x + y)
# collectAsMap() is an action that returns the pair RDD as a Python dict
words = reduce_result.collectAsMap()

from wordcloud import WordCloud

wc = WordCloud(
    background_color="white",  # background color
    max_words=2000,            # maximum number of words shown
    max_font_size=150,         # maximum font size
    random_state=50,
    font_path='/home/spark/Downloads/SourceHanSansCN-Normal.ttf'
)
wc.generate_from_frequencies(words)

# Display the image
plt.imshow(wc)
plt.axis("off")
plt.show()
```
An alternative version of the same job, written as a standalone script. It first previews jieba's segmentation on the file read locally:

```python
# Imports
from pyspark import SparkContext

sc = SparkContext()

# Read the file with textFile
rdd = sc.textFile("/home/spark/Downloads/避暑山庄评论.csv")

# Inspect the data
# rdd.collect()

# Tokenize
import jieba

# Read the Chinese text file locally to preview the segmentation
with open('/home/spark/Downloads/避暑山庄评论.csv', 'r', encoding='utf-8') as f:
    text = f.read()

# Segment with jieba
words = jieba.cut(text)
word_list = " ".join(words)

# Print the segmentation result
print(word_list)
```
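For reference, jieba has more than one segmentation mode; this sketch (with a made-up sample sentence) contrasts the default precise mode with full mode, which lists every dictionary word it can find, overlaps included:

```python
import jieba

sentence = "避暑山庄风景很美"  # made-up sample sentence

# Precise mode (the default): a non-overlapping segmentation
print("/".join(jieba.cut(sentence, cut_all=False)))

# Full mode: every dictionary word found in the sentence
print("/".join(jieba.cut(sentence, cut_all=True)))
```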
```python
# Load the stop-word list
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local") \
    .config("spark.hadoop.mapreduce.job.run-local", "true").getOrCreate()
context = spark.sparkContext

stop_word_rdd = context.textFile("/home/spark/Downloads/stopWords.txt")
stop_words = set(stop_word_rdd.collect())

# Segment each line and drop stop words with a set difference.
# Note: set() deduplicates words within a line, so the final numbers
# count lines containing a word rather than raw occurrences.
def get_word(line):
    return set(jieba.cut(line, cut_all=False)) - stop_words

# The final records are (word, num) pairs; sort_result is a sort key on num
def sort_result(elem):
    return elem[1]

new_rdd = rdd.flatMap(lambda line: get_word(line))
result = new_rdd.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)
words = result.collectAsMap()
```
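sort_result is defined above but never applied in the original listing; one way to put it to use, given the result RDD from the previous step, is to preview the most frequent words:

```python
# Sort the (word, num) pairs by num, descending, and take the top 10
top10 = result.sortBy(sort_result, ascending=False).take(10)
print(top10)
```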
```python
import matplotlib.pyplot as plt
from wordcloud import WordCloud

wc = WordCloud(
    background_color="white",  # background color
    max_words=2000,            # maximum number of words shown
    max_font_size=150,         # maximum font size
    random_state=50,
    font_path='/home/spark/Downloads/SourceHanSansCN-Normal.ttf'
)
wc.generate_from_frequencies(words)

# Display the image
plt.imshow(wc)
plt.axis("off")
plt.show()
```