
Several methods for word-frequency counting and sorting (hand-written / pandas / NLTK)

Word frequency sorting

# A small program for counting word frequencies in a list
'''
Use a dict to present the word frequencies of a list as key-value pairs
'''
ls = ["综合", "理工", "综合", "综合", "综合", "综合", "综合", "综合",
      "综合", "综合", "师范", "理工", "综合", "理工", "综合", "综合",
      "综合", "综合", "综合", "理工", "理工", "理工", "理工", "师范",
      "综合", "农林", "理工", "综合", "理工", "理工", "理工", "综合",
      "理工", "综合", "综合", "理工", "农林", "民族", "军事"]
d = {}
for word in ls:
    # If d has no such key, d.get(word, 0) returns 0, so the new word starts at a count of 1;
    # if the key already exists, its count is incremented by 1.
    d[word] = d.get(word, 0) + 1

Or alternatively:

word_dict = dict()
for word in ls:
    if word not in word_dict:
        word_dict[word] = 1
    else:
        word_dict[word] += 1

for k in d:
    print("{}:{}".format(k, d[k]))
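For comparison, the standard library's collections.Counter implements the same counting loop in a single call; a minimal sketch (Counter and its most_common method are standard-library facts, not part of the original post):

from collections import Counter

counts = Counter(ls)          # same counts as the hand-written dict loop
print(counts.most_common())   # (word, count) pairs, sorted by count descending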

Word-frequency counting for a Chinese txt file:

Steps:

1. Read all the data out of the txt file with read().

2. Run jieba word segmentation on the data.

3. Put the segmentation results into a dict.

4. Put the dict entries into a list in the desired format.

5. To sort, build list(dict1.items()) and then sort that list with a lambda key:

ls.sort(key=lambda x: x[1], reverse=True)

6. Build a new list and fill it with the sorted (word, count) pairs in the desired save format, or reuse ls and just change the format in place:

for i in range(100):
    ls[i] = "{}:{}".format(ls[i][0], ls[i][1])

7. Write the new list to a csv file with join.

import jieba

fi = open("天龙八部-网络版.txt", "r", encoding='utf-8')
fo = open("天龙八部-词语统计.txt", "w", encoding='utf-8')
txt = fi.read()
words = jieba.lcut(txt)  # without this line you would be counting single characters, not words
d = {}
for w in words:
    d[w] = d.get(w, 0) + 1
del d[' ']
del d['\n']
ls = []
for key in d:
    ls.append("{}:{}".format(key, d[key]))
fo.write(",".join(ls))
fi.close()
fo.close()
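The code above covers steps 1-4 and 7 but skips the sorting of steps 5-6. A minimal sketch of those two steps, reusing the dict d built above (it would go before the write and close calls):

ls = list(d.items())                       # step 5: dict -> list of (word, count) tuples
ls.sort(key=lambda x: x[1], reverse=True)  # sort by count, descending
# step 6: reformat the top 100 into "word:count" strings
top = ["{}:{}".format(w, c) for w, c in ls[:100]]
fo.write(",".join(top))                    # step 7: join and write out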

For sorting a list with sort(), see my other post:

https://blog.csdn.net/qq_41228218/article/details/87303183

Applying sort():

Sort Chinese word frequencies, print the top entries, and save them as csv

dict1 = {'w': 1, 'e': 2}
ls = list(dict1.items())
ls1 = []
for i in dict1:
    ls1.append("{}:{}".format(i, dict1[i]))
print(ls)
print(ls1)
# Result:
# [('w', 1), ('e', 2)]
# ['w:1', 'e:2']
# So ls, a list of (key, value) tuples, can be sorted by count, while ls1, a list of plain strings, cannot
ls.sort(key=lambda x: x[1], reverse=True)
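To sort without mutating the list, the built-in sorted() accepts the same key and reverse arguments and returns a new list (a standard-library fact, not from the original post):

top = sorted(dict1.items(), key=lambda x: x[1], reverse=True)
print(top)  # [('e', 2), ('w', 1)]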

On list.sort() with lambda:

dictListsort.sort(key=lambda x: x[1], reverse=True)

https://blog.csdn.net/qq_41228218/article/details/87303183

Full source:

import jieba

fi = open(r"FilePath", "r", encoding='utf-8')
fo = open("FilePath.csv", "w", encoding='utf-8')
txt = fi.read()
words = jieba.lcut(txt)  # without this line you would be counting single characters, not words
d = {}
for w in words:
    # if w in ''' \n,>;:'?!@#$%^&*()''':
    #     continue
    d[w] = d.get(w, 0) + 1
del d[' ']
del d['\n']
del d[","]
del d["。"]
del d["“"]
del d['”']
del d[':']
del d['?']
del d['…']
del d['!']
del d['、']
del d['‘']
del d["’"]
DictListSave = []
DictListSaveSort = []
for key in d:
    DictListSave.append("{}:{}".format(key, d[key]))
fo.write(",".join(DictListSave))  # save in unsorted order
dictListsort = list(d.items())
dictListsort.sort(key=lambda x: x[1], reverse=True)
# print the TOP 100
for i in range(100):
    word, count = dictListsort[i]
    print('{0:<20}{1:>10}'.format(word, count))
for l in dictListsort:
    DictListSaveSort.append("{}:{}".format(l[0], l[1]))
fo.write(",".join(DictListSaveSort))  # save again, sorted
fi.close()
fo.close()
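Joining "word:count" strings with commas produces one long line rather than structured rows. A minimal alternative sketch using the standard csv module (my substitution, not the original post's approach) writes one (word, count) pair per row:

import csv

with open("FilePath.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["word", "count"])  # header row
    writer.writerows(dictListsort)      # one (word, count) tuple per row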

 

For English word-frequency counting, see my other post:

https://blog.csdn.net/qq_41228218/article/details/87305610

The pandas Series:

import pandas as pd

list_data = ["综合", "理工", "综合", "综合", "综合", "综合", "综合", "综合",
             "综合", "综合", "师范", "理工", "综合", "理工", "综合", "综合",
             "综合", "综合", "综合", "理工", "理工", "理工", "理工", "师范",
             "综合", "农林", "理工", "综合", "理工", "理工", "理工", "综合",
             "理工", "综合", "综合", "理工", "农林", "民族", "军事"]
data = pd.Series(list_data)
print(data.value_counts())
# print the top three:
print(data.value_counts()[0:3])
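value_counts() returns a Series already sorted by count in descending order, which is why plain slicing yields the top entries; head() is an equivalent spelling (a pandas fact, not shown in the original post):

print(data.value_counts().head(3))  # same as data.value_counts()[0:3]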

pandas with English text:

import pandas as pd
import re

a = 'We need to use window.load, not document.ready, because in Chrome'.lower()
list_data = a.split()
list01 = [re.sub(',', ' ', i) for i in list_data]  # replace commas stuck to tokens with a space
print(pd.Series(list01).value_counts())
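The same cleanup can stay inside pandas through the .str accessor (Series.str.replace is a pandas fact; dropping the comma outright rather than swapping in a space is my choice):

s = pd.Series(a.split()).str.replace(',', '', regex=False)  # strip commas from tokens
print(s.value_counts())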

 

Result: each word is printed with its count, which shows how quick and convenient Series is.

Filtering keywords:

import pandas as pd

list_data = ["综合", "理工", "综合", "综合", "综合", "综合", "综合", "综合",
             "综合", "综合", "师范", "理工", "综合", "理工", "综合", "综合",
             "综合", "综合", "综合", "理工", "理工", "理工", "理工", "师范",
             "综合", "农林", "理工", "综合", "理工", "理工", "理工", "综合",
             "理工", "综合", "综合", "理工", "农林", "民族", "军事"]
FILTER_WORDS = ['这个', '那个', '什么', '怎么', '如果']
keywords_counts = pd.Series(list_data)
# keep only words longer than one character (drops single-character tokens left over from segmentation)
keywords_counts = keywords_counts[keywords_counts.str.len() > 1]
# filter out meaningless words
keywords_counts = keywords_counts[~keywords_counts.str.contains('|'.join(FILTER_WORDS))]
# print the top 30
keywords_counts = keywords_counts.value_counts()[:30]
print(keywords_counts)
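Because str.contains treats the joined filter list as a regex alternation, it also drops words that merely contain a filter word as a substring. For exact matches only, Series.isin is a stricter alternative at the same point in the pipeline (a pandas fact, not part of the original post):

keywords_counts = keywords_counts[~keywords_counts.isin(FILTER_WORDS)]  # drop exact matches only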

Word-frequency counting with NLTK:

from nltk import FreqDist

list_data = ["综合", "理工", "综合", "综合", "综合", "综合", "综合", "综合",
             "综合", "综合", "师范", "理工", "综合", "理工", "综合", "综合",
             "综合", "综合", "综合", "理工", "理工", "理工", "理工", "师范",
             "综合", "农林", "理工", "综合", "理工", "理工", "理工", "综合",
             "理工", "综合", "综合", "理工", "农林", "民族", "军事"]
fdist = FreqDist(list_data)
print(fdist["综合"])
# sort and print the top 5
standard_freq_vector = fdist.most_common(5)
print(standard_freq_vector)
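FreqDist also exposes the total token count and relative frequencies directly (N() and freq() are NLTK facts, not shown in the original post):

print(fdist.N())           # total number of tokens counted
print(fdist.freq("综合"))  # relative frequency of a word (count / total)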

 
