
Natural Language Processing | (8) Chinese Text Analysis, Visualization, and News Keyword Extraction

Contents

1. Chinese Text Analysis and Visualization in Python

2. News Keyword Extraction


1. Chinese Text Analysis and Visualization in Python

  • Load the data
#!pip install wordcloud  # install the wordcloud package if it is missing
import warnings
warnings.filterwarnings("ignore")
import jieba  # Chinese word segmentation
import numpy  # numerical computing
import codecs  # codecs.open lets you specify a file encoding and decode to unicode on read
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud rendering
df = pd.read_csv('./data/entertainment_news.csv', encoding='utf-8')
print(df.head())
print(len(df))
df = df.dropna()  # drop the rows that contain missing values
print(len(df))

 

  • Word segmentation
# convert the content column to a list
contents = df['content'].values.tolist()
# segment every article with jieba, keeping tokens longer than one character
segment = []
for content in contents:
    try:
        segs = jieba.lcut(content)
        for seg in segs:
            if len(seg) > 1 and seg != '\r\n':
                segment.append(seg)
    except:
        print(content)
        continue
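
For reference, besides the default accurate mode used above, jieba also offers a full mode and a search-engine mode. A minimal sketch on a toy sentence (the sentence is only an illustration, not taken from the dataset):

import jieba

sentence = '我来到北京清华大学'                 # toy sentence for illustration only
print(jieba.lcut(sentence))                    # accurate mode (default), as used above
print(jieba.lcut(sentence, cut_all=True))      # full mode: every possible word
print(jieba.lcut_for_search(sentence))         # search-engine mode: finer-grained splits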

 

  • Remove stopwords
# build a DataFrame from the segmentation results
words_df = pd.DataFrame({'segment': segment})
print(len(words_df))
stopwords = pd.read_csv("./data/stopwords.txt", index_col=False, quoting=3, sep='\t', names=['stopword'])
words_df = words_df[~words_df['segment'].isin(stopwords['stopword'])]  # drop rows whose word is a stopword
print(len(words_df))
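
The isin filter above is just a vectorized membership test; the same idea in plain Python with a set lookup, as a quick sketch:

stop_set = set(stopwords['stopword'])                 # O(1) membership checks
filtered = [w for w in segment if w not in stop_set]  # the same words that survive the isin filter
print(len(filtered))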

  • Count word frequencies
# count how many times each word appears
# (dict-style renaming in .agg() was removed in newer pandas, so use named aggregation instead)
words_stat = words_df.groupby('segment').agg(词频=('segment', 'size'))
words_stat = words_stat.reset_index().sort_values(by='词频', ascending=False)  # sort by frequency, descending
print(words_stat.head())
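
As a quick sanity check, collections.Counter should give the same counts as the groupby above; a minimal sketch:

from collections import Counter

word_counts = Counter(words_df['segment'])   # count the already-filtered words
print(word_counts.most_common(5))            # top 5 (word, count) pairs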

  • Build a word cloud
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
# set a Chinese font, the background color, and the maximum font size
wordcloud = WordCloud(font_path='./data/simhei.ttf', background_color='black', max_font_size=80)
# dict comprehension: keep the 1000 most frequent words, (word, count) -> {word: count}
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)  # render the top 1000 words; the larger the font, the higher the frequency
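
To clean up the figure and keep the result, matplotlib's axis toggle and WordCloud.to_file can be added; the output filename below is an arbitrary assumption:

plt.imshow(wordcloud)
plt.axis('off')                                      # hide the tick marks around the cloud
plt.show()
wordcloud.to_file('./entertainment_wordcloud.png')   # save the rendered cloud as an image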

 

  • Custom background image
# scipy.misc.imread was removed in SciPy >= 1.2; imageio.imread is used as a drop-in replacement here
import imageio
matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
from wordcloud import WordCloud, ImageColorGenerator
bimg = imageio.imread('./image/entertainment.jpeg')  # read the background image
plt.imshow(bimg)  # show the background image

# use the image above as a custom mask
# set the background color, font, and mask image
wordcloud = WordCloud(background_color='white', mask=bimg, font_path='./data/simhei.ttf', max_font_size=200)
# dict comprehension: keep the 1000 most frequent words, (word, count) -> {word: count}
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
bimgColors = ImageColorGenerator(bimg)  # sample word colors from the background image
plt.imshow(wordcloud.recolor(color_func=bimgColors))

 

2. News Keyword Extraction

  • Keyword extraction with TF-IDF
import jieba.analyse as analyse
import pandas as pd
# technology news
df = pd.read_csv('./data/technology_news.csv', encoding='utf-8').dropna()
print(len(df))  # number of articles
# convert to a list and join all articles into one long string
contents = df['content'].values.tolist()
contents = ' '.join(contents)
print(analyse.extract_tags(contents, topK=30, withWeight=False, allowPOS=()))
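
To inspect the scores rather than just the ranked words, extract_tags can be called with withWeight=True, which returns (keyword, weight) pairs; a short sketch using the technology-news text joined above:

for keyword, weight in analyse.extract_tags(contents, topK=10, withWeight=True):
    print(keyword, weight)   # weight is the TF-IDF score jieba assigns to the keyword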

import jieba.analyse as analyse
import pandas as pd
# military news
df = pd.read_csv("./data/military_news.csv", encoding='utf-8').dropna()
print(len(df))  # number of articles
# convert to a list and join all articles into one long string
contents = df.content.values.tolist()
contents = " ".join(contents)
print(analyse.extract_tags(contents, topK=30, withWeight=False, allowPOS=()))
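
jieba's TF-IDF extractor ships with a built-in IDF dictionary and stopword list, and both can be swapped out, for example to reuse the stopword file from section 1 (assuming one word per line). A sketch; the custom IDF path is a hypothetical placeholder:

analyse.set_stop_words('./data/stopwords.txt')   # reuse the stopword file from section 1
# analyse.set_idf_path('./data/my_idf.txt')      # optionally point to a domain-specific IDF file (hypothetical path)
print(analyse.extract_tags(contents, topK=30))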

 

  • Keyword extraction with TextRank
import jieba.analyse as analyse
import pandas as pd
# military news
df = pd.read_csv('./data/military_news.csv', encoding='utf-8')
df = df.dropna()
print(len(df))  # number of articles
# convert to a list and join all articles into one long string
contents = df.content.values.tolist()
contents = "".join(contents)
# restrict candidates by part of speech: place names, nouns, verbal nouns, verbs
print(analyse.textrank(contents, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')))
print("--------------------- separator ----------------")
# nouns and place names only
print(analyse.textrank(contents, topK=20, withWeight=False, allowPOS=('ns', 'n')))
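
To make the algorithm less of a black box, here is a simplified illustration of the idea behind TextRank, not jieba's exact implementation: build a co-occurrence graph over POS-filtered words within a sliding window, then rank the nodes with PageRank. It assumes networkx is installed.

import networkx as nx
import jieba.posseg as pseg

def textrank_sketch(text, topk=10, window=5, allow_pos=('ns', 'n', 'vn', 'v')):
    # keep only words whose part of speech is allowed, mirroring allowPOS above
    words = [w.word for w in pseg.cut(text) if w.flag in allow_pos and len(w.word) > 1]
    graph = nx.Graph()
    for i, w in enumerate(words):
        for u in words[i + 1:i + window]:   # words co-occurring within the window
            if u != w:
                graph.add_edge(w, u)
    scores = nx.pagerank(graph)             # PageRank score as word importance
    return sorted(scores, key=scores.get, reverse=True)[:topk]

print(textrank_sketch(contents[:10000]))     # run on a slice of the joined text to keep it quick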

 
