赞
踩
- import re
- from collections import Counter
- import numpy as np
def ngram_words(file, ngram_cont):
    """Count every character n-gram of length 1 .. ngram_cont - 1 in *file*.

    Args:
        file: The text to scan; slices of it become the dictionary keys.
        ngram_cont: Exclusive upper bound on the n-gram length, so
            ``ngram_cont=3`` yields single characters and bigrams only.

    Returns:
        A plain dict mapping each n-gram to its number of occurrences,
        ordered by first appearance (shorter n-grams first).
    """
    counts = Counter()
    text_len = len(file)
    for size in range(1, ngram_cont):
        # Slide a window of `size` characters across the whole text.
        counts.update(
            file[start:start + size] for start in range(text_len - size + 1)
        )
    return dict(counts)
def PMI(words_fre, pmi_threshold):
    """Select multi-character keys whose parts stick together strongly.

    For every key of two or more characters, the key's count is divided by
    the smallest ``count(left) * count(right)`` over all ways of cutting the
    key into two non-empty parts. Keys whose ratio is strictly greater than
    *pmi_threshold* are returned. The score is built from raw counts and
    takes no logarithm.

    Args:
        words_fre: Mapping from n-gram to occurrence count. Every prefix and
            suffix of each multi-character key must also be a key; a missing
            part makes ``.get`` return ``None`` and the product raise
            ``TypeError``.
        pmi_threshold: Minimum ratio (exclusive) a key must exceed.

    Returns:
        The qualifying keys, in the iteration order of *words_fre*.
    """
    new_words = []
    for word, count in words_fre.items():
        # Keys shorter than two characters cannot be split. This also skips an
        # empty-string key, for which min() would have no splits to consider.
        if len(word) < 2:
            continue
        smallest_product = min(
            words_fre.get(word[:cut]) * words_fre.get(word[cut:])
            for cut in range(1, len(word))
        )
        if count / smallest_product > pmi_threshold:
            new_words.append(word)
    return new_words
def calculate_entropy(list):
    """Return the Shannon entropy, in bits, of the items in *list*.

    Args:
        list: A sequence of hashable items. The name shadows the builtin but
            is kept so existing callers are unaffected.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct item; ``0`` when the sequence is empty.
    """
    total = len(list)
    # One term p * log2(p) per distinct item, in first-seen order.
    terms = []
    for occurrences in Counter(list).values():
        share = occurrences / total
        terms.append(share * np.log2(share))
    return (-1) * sum(terms)
def Entropy_left_right(words, text, ent_threshold):
    """Keep the words whose neighbouring characters in *text* vary enough.

    For each word, every non-overlapping occurrence that has one character
    on each side is collected (the neighbour pattern ``.`` does not match a
    newline). The entropy of the left neighbours and of the right neighbours
    is computed with ``calculate_entropy``; the word is kept when the smaller
    of the two is strictly greater than *ent_threshold*.

    Args:
        words: Iterable of candidate words (strings).
        text: The corpus to search.
        ent_threshold: Minimum entropy (exclusive) required on both sides.

    Returns:
        The candidate words that pass, in their original order.
    """
    result_words = []
    for word in words:
        # Escape the word so characters such as "+", "*" or "(" are matched
        # literally instead of being interpreted as regex syntax.
        neighbours = re.findall('(.)%s(.)' % re.escape(word), text)
        left_entropy = calculate_entropy([left for left, _ in neighbours])
        right_entropy = calculate_entropy([right for _, right in neighbours])
        if min(left_entropy, right_entropy) > ent_threshold:
            result_words.append(word)
    return result_words
# Punctuation and whitespace characters removed from the corpus before counting.
stop_word = [
    '【', '】', ')', '(', '、', ',', '“', '”', '。', '\n', '《', '》', ' ', '-',
    '!', '?', '.', '\'', '[', ']', ':', '/', '.', '"', '\u3000', '’', '.', ',',
    '…', '?',
]

with open("result.txt", 'r', encoding='utf8') as f:
    text = f.read()

# Every stop entry is a single character, so one translate pass deletes them all.
text = text.translate(str.maketrans('', '', ''.join(stop_word)))

ngram = 3  # passed to ngram_words as the exclusive upper bound on n-gram length
PMI_threshold = 0.05
ent_threshold = 1

# Pipeline: count n-grams, filter by the count ratio, then by neighbour entropy.
words_fre = ngram_words(text, ngram)
new_words = PMI(words_fre, PMI_threshold)
result = Entropy_left_right(new_words, text, ent_threshold)
print(result)

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。