当前位置:   article > 正文

基于互信息与左右信息熵的新词发现_python实现基于互信息和左右信息熵的短语提取识别

python实现基于互信息和左右信息熵的短语提取识别
  1. import re
  2. from collections import Counter
  3. import numpy as np
  def ngram_words(file, ngram_cont):
      """Count every contiguous slice of ``file`` of length 1 to ngram_cont - 1.

      Args:
          file: The text to slice; the script passes a single string.
          ngram_cont: Exclusive upper bound on the slice length, so a value
              of 3 yields 1-grams and 2-grams only.

      Returns:
          A plain dict mapping each slice to its number of occurrences.
      """
      counts = Counter()
      text_len = len(file)
      for size in range(1, ngram_cont):
          # Slide a window of `size` characters across the whole text; the
          # range is empty when the text is shorter than the window.
          counts.update(
              file[start:start + size] for start in range(text_len - size + 1)
          )
      return dict(counts)
  def PMI(words_fre, pmi_threshold):
      """Select multi-character candidates whose parts co-occur strongly.

      Every candidate of length two or more is split at each internal
      position. Each split is scored as count(left) * count(right), and the
      candidate's own count divided by the smallest such product is compared
      with ``pmi_threshold``. The score is built from raw counts, not from
      probabilities, so the threshold is on that scale.

      Candidates that cannot be scored are skipped instead of raising: keys
      shorter than two characters (including the empty string), and
      candidates for which some split has a part that is absent from
      ``words_fre`` or has a zero count.

      Args:
          words_fre: Mapping from n-gram to its occurrence count.
          pmi_threshold: A candidate is kept only when its score is strictly
              greater than this value.

      Returns:
          The accepted candidates, in the iteration order of ``words_fre``.
      """
      new_words = []
      for word, freq in words_fre.items():
          if len(word) < 2:
              # Nothing to split: single characters and the empty string.
              continue
          products = [
              words_fre.get(word[:cut], 0) * words_fre.get(word[cut:], 0)
              for cut in range(1, len(word))
          ]
          smallest = min(products)
          if smallest <= 0:
              # A missing or zero-count part would mean dividing by zero.
              continue
          if freq / smallest > pmi_threshold:
              new_words.append(word)
      return new_words
  def calculate_entropy(list):
      """Return the base-2 Shannon entropy of the items in ``list``.

      Each distinct item contributes -p * log2(p), where p is its share of
      all items. An empty input yields 0 because there are no terms to sum.

      Args:
          list: Sequence of hashable items. The name shadows the builtin but
              is kept because it is part of the function's signature.

      Returns:
          The entropy in bits.
      """
      total = len(list)
      frequencies = Counter(list)
      terms = []
      for count in frequencies.values():
          share = count / total
          terms.append(share * np.log2(share))
      return (-1) * sum(terms)
  def Entropy_left_right(words, text, ent_threshold):
      """Keep the candidates whose neighbouring characters vary on both sides.

      For every occurrence of a candidate in ``text`` that has a character
      immediately before and after it, those two characters are collected.
      The candidate is kept when the smaller of the left-neighbour entropy
      and the right-neighbour entropy is strictly greater than
      ``ent_threshold``.

      Args:
          words: Candidate strings to test.
          text: The corpus the candidates were taken from.
          ent_threshold: Minimum entropy (exclusive) required on both sides.

      Returns:
          The accepted candidates, in their original order.
      """
      result_words = []
      for word in words:
          # The candidate is literal text, so escape it: characters such as
          # '+', '*' or '|' would otherwise change the match or fail to
          # compile. The lookahead is zero-width, so an occurrence whose left
          # neighbour is the previous occurrence's right neighbour still
          # counts. '.' does not match a newline, so occurrences next to a
          # line break are not counted.
          pattern = '(?=(.)%s(.))' % re.escape(word)
          contexts = re.findall(pattern, text)
          left_entropy = calculate_entropy([left for left, _ in contexts])
          right_entropy = calculate_entropy([right for _, right in contexts])
          if min(left_entropy, right_entropy) > ent_threshold:
              result_words.append(word)
      return result_words
  # Characters deleted from the corpus before any n-gram is counted.
  stop_word=['【','】',')','(','、',',','“','”','。','\n','《','》',' ','-','!','?','.','\'','[',']',':','/','.','"','\u3000','’','.',',','…','?']

  with open("result.txt",'r',encoding='utf8') as f:
      text = f.read()

  # Every stop_word entry is a single character, so one translate pass
  # deletes them all.
  text = text.translate(str.maketrans('', '', ''.join(stop_word)))

  ngram = 3            # passed to ngram_words as its length bound
  PMI_threshold = 0.05  # passed to PMI as its score cut-off
  ent_threshold = 1     # passed to Entropy_left_right as its entropy cut-off

  # Pipeline: count n-grams -> filter by PMI score -> filter by neighbour entropy.
  words_fre = ngram_words(text, ngram)
  new_words = PMI(words_fre, PMI_threshold)
  result = Entropy_left_right(new_words, text, ent_threshold)
  print(result)

 

声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号