
Reading Files with Python for Chinese Word-Frequency Statistics

Read the contents of a file and count how often each character or word appears.
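
The body of this post counts word frequencies after segmenting with jieba, but the per-character count mentioned above needs no tokenizer at all. A minimal sketch (the file name some_text.txt is a placeholder, not from the original post):

from collections import Counter

# Count every non-whitespace character in the file.
with open('some_text.txt', encoding='utf-8') as f:
    char_counts = Counter(ch for ch in f.read() if not ch.isspace())

print(char_counts.most_common(20))   # 20 most frequent characters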

Part 1: Counting Chinese word frequencies from a CSV file

Case 1:

Using collections.Counter for the word count is quite concise; the code is as follows:

Data: movie_comments.csv, a file of roughly 230,000 movie reviews.

# -*- coding:utf-8 -*-
import jieba
import re
import pandas as pd
from collections import Counter

content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))

def token(string):
    # Keep only word characters (letters, digits and CJK characters)
    return re.findall(r'\w+', string)

articles_clean = [''.join(token(str(s))) for s in articles]
print(articles_clean[100])

def cut_word(string):
    # Segment a sentence into words with jieba
    return list(jieba.cut(string))

articles_words = [cut_word(string) for string in articles_clean]

list_set = []
for i in articles_words:
    list_set.extend(i)

words_count = Counter(list_set)
statics = words_count.most_common()[:50]   # 50 most frequent words
print(statics)

Result:

[('的', 328262), ('了', 102420), ('是', 73106), ('我', 50338), ('都', 36255), ('很', 34712), ('看', 34022), ('电影', 33675), ('也', 32065), ('和', 31290), ('在', 31245), ('不', 28435), ('有', 27939), ('就', 25685), ('人', 23909), ('好', 22858), ('啊', 20803), ('这', 17484), ('还', 17449), ('一个', 17343), ('你', 17282), ('还是', 16425), ('但', 15578), ('故事', 15010), ('没有', 14343), ('就是', 14007), ('喜欢', 13566), ('让', 13304), ('太', 12676), ('又', 11566), ('剧情', 11359), ('没', 10858), ('说', 10764), ('吧', 10747), ('他', 10675), ('不错', 10416), ('得', 10349), ('到', 10341), ('给', 10300), ('这个', 10058), ('上', 10054), ('被', 9939), ('对', 9824), ('最后', 9694), ('一部', 9693), ('片子', 9590), ('什么', 9571), ('能', 9532), ('与', 9168), ('多', 8977)]
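
As the list above shows, the top slots are dominated by function words such as 的, 了 and 是. If those are not wanted, one option is to drop stopwords before counting. A minimal sketch, assuming a stopwords.txt file with one word per line (the same kind of file the later sections of this post use):

from collections import Counter

# Load stopwords, one per line.
stopwords = {line.strip() for line in open('stopwords.txt', encoding='utf-8')}

# list_set is the flat word list built in the script above.
filtered_counts = Counter(w for w in list_set if w not in stopwords)
print(filtered_counts.most_common(50))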

Case 2:

However, for a large corpus, or on a machine with limited resources, it is better to preprocess the text and save it to disk first, and then count the word frequencies from that file. The code is as follows:

Preprocess and save to a local file:

# -*- coding:utf-8 -*-
import re
import pandas as pd

content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))

def token(string):
    return re.findall(r'\w+', string)

articles_clean = [''.join(token(str(s))) for s in articles]
# print(articles_clean[600])

# Write one cleaned review per line
with open('writedic.txt', 'w', encoding='utf-8') as f:
    for line in articles_clean:
        f.write(line + '\n')

Then do the segmentation and word-frequency count:

import jieba
from collections import Counter

Token = []
for i, lines in enumerate(open('writedic.txt', encoding='utf-8')):
    if i % 10000 == 0:
        print(i)              # progress indicator every 10,000 lines
    Token += jieba.cut(lines)
print(Token[:10])

words_count = Counter(Token)
statics = words_count.most_common()[:50]
print(statics)

The result is the same as above.
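
Note that the script above still accumulates every token in the Token list, so memory use grows with the corpus. If that is a concern, a minimal alternative sketch is to update the counter line by line instead of building the full token list first:

import jieba
from collections import Counter

words_count = Counter()
with open('writedic.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)                          # progress indicator
        words_count.update(jieba.cut(line))   # count tokens without storing them all

print(words_count.most_common(50))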

Part 2: Counting Chinese word frequencies from an Excel file, in three steps:

1. Read the file

2. Segment the text, load a custom user dictionary, strip digits, and remove stopwords

3. Count the word frequencies and sort them

The code is as follows:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pandas as pd
import re
import jieba

"""
Step 1: read the file
"""
f1 = open("write.txt", "w", encoding='utf-8')
# skiprows=[0] drops the first row; usecols=[0] keeps only the first column
reviews = pd.read_excel('1.xlsx', usecols=[0], skiprows=[0], sheet_name='Sheet1')

# Print the first three rows
# for i in range(3):
#     print("Review #", i + 1)
#     print(reviews.answer[i])
#     print()

"""
Step 2: segment, load the user dictionary, strip digits, remove stopwords
"""
jieba.load_userdict("jiebauserdict.txt")
stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]

def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)   # strip digits
    text = re.sub(r'\n', '', text)     # strip newlines
    text = jieba.lcut(text)            # segment into words
    for word in text:
        if word not in stopword:       # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    # print(lineswords)
    return lineswords

# Clean the content of the 'answer' column
clean_content = []
for summary in reviews.answer:
    clean_content.append(clean_text(summary))
print("Content cleaning is complete.")

# Print the first three cleaned rows
# for i in range(3):
#     print("Clean Review #", i + 1)
#     print(clean_content[i])
#     print()

"""
Step 3: count how often each word occurs and sort
"""
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1   # first occurrence: add to the dict
            else:
                count_dict[word] += 1
    # print(count_dict)
    result = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)  # sort by frequency
    print(result)
    for key in result:
        # key is a (word, count) tuple
        f1.write(key.__str__() + '\n')
    f1.flush()

# Count all word frequencies, e.g. {'hello': 7, 'good': 3}
word_counts = {}
count_words(word_counts, clean_content)
print("Vocabulary size:", len(word_counts))
f1.close()

Result: (the original post shows a screenshot of the sorted word counts here)
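
The script assumes two plain-text resources: jiebauserdict.txt and stopwords.txt. For reference, jieba.load_userdict expects one entry per line in the form "word [frequency] [POS tag]", with frequency and tag optional, while the stopword file here is simply one word per line. A hypothetical example of each (the entries are placeholders, not from the original post):

jiebauserdict.txt:
云计算 5 n
自然语言处理

stopwords.txt:
的
了
是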

Part 3: Counting Chinese word frequencies from a txt file:

Three steps: read the file, clean and segment the text, and count the word frequencies.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import jieba
from collections import Counter

stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]

# Text cleaning: strip digits, segment, remove stopwords
def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)   # strip digits
    text = jieba.lcut(text)            # segment into words
    for word in text:
        if word not in stopword:       # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    return lineswords

# Count word frequencies
def counter_word(data):
    # Join the cleaned lines with spaces so words from adjacent lines do not run together
    lines = ' '.join(data)
    data_list = lines.split()
    words_count = Counter(data_list)
    # print(words_count)
    count_res = words_count.most_common()[:50]
    return count_res

# Read the txt file ('utf-8-sig' strips a leading BOM if the file has one)
def read_content():
    data = []
    contents = [line.strip() for line in open("langchao.txt", 'r', encoding='utf-8-sig')]
    for content in contents:
        text = clean_text(content)
        data.append(text)
    result = counter_word(data)
    return result

if __name__ == '__main__':
    result = read_content()
    print(result)

Part 4: Filtering on the counted frequencies: if a word's frequency exceeds a set threshold, return the word and save it to a txt file:

The input file being read (writedic.txt):

# encoding: utf-8
from collections import Counter

f = open("userdic.txt", 'w', encoding='utf-8')

# Each line of writedic.txt becomes a list of space-separated words
result = []
for line in open("writedic.txt", "r", encoding='utf-8'):
    result.append(list(line.strip().split(" ")))

def get_keys(d, value):
    # Return every key in d whose value equals `value` (not used below)
    return [k for k, v in d.items() if v == value]

# print(result)
for words in result:
    words_count = Counter(words)
    for word in words_count:
        if words_count[word] > 1:   # threshold: keep words appearing more than once in a line
            print(word)
            f.write(word + ' ')
    f.write('\n')
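
The threshold above is hard-coded as "more than once per line". If the cutoff should be configurable, a minimal sketch that wraps the same logic in a function with a threshold parameter (the function name is illustrative, not from the original post):

from collections import Counter

def frequent_words(words, threshold=1):
    """Return the words whose count exceeds `threshold`."""
    counts = Counter(words)
    return [w for w, c in counts.items() if c > threshold]

# Example: keep words occurring more than twice
# print(frequent_words("a a a b b c".split(), threshold=2))   # -> ['a']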

The output file (userdic.txt; shown as a screenshot in the original post).
