"""New-word discovery from n-gram counts and left/right boundary entropy.

Adapted from 涛笙依旧_; for study and exchange only. Please get in touch in
case of any infringement.
"""

import glob
import math
import re
from collections import Counter
from typing import Dict, Iterator, List, Tuple

# Stop-word loading.
stop_word_path = r'E:\论文\算法\停用词表\hit_stopwords.txt'

# Everything except CJK ideographs (U+4E00-U+9FA5), ASCII digits/letters and
# the space character is stripped from the corpus text.
_NON_TEXT_RE = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z ]+')


def get_stop_word(stop_word_path: str) -> List[str]:
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        stop_word_path: Path of the stop-word file (e.g. the HIT list).

    Returns:
        The lines of the file in order, without their line terminator.
        Blank lines are kept as empty strings.
    """
    with open(stop_word_path, encoding='utf-8') as f:
        # Strip only the newline: slicing with [:-1] would cut a real
        # character off the last entry when the file has no trailing newline.
        return [line.rstrip('\n') for line in f]


def text_generator(file_path: str) -> Iterator[Tuple[str, str]]:
    """Yield ``(title, cleaned_text)`` for every ``*.txt`` file in a directory.

    The title is the first line of the raw file content. The cleaned text is
    the whole content with ideographic spaces (U+3000) removed, surrounding
    whitespace stripped and every character outside CJK ideographs, ASCII
    letters/digits and the space character deleted (line breaks included).

    Args:
        file_path: Directory holding the corpus; only the directory is
            needed, not individual file names.

    Yields:
        ``(title, cleaned_text)`` tuples, in sorted file-name order so the
        enumeration is reproducible between runs.
    """
    for txt in sorted(glob.glob(f'{file_path}/*.txt')):
        # Text mode closes the handle deterministically and normalises
        # CRLF, so the title does not end with a stray '\r'.
        with open(txt, encoding='utf-8') as f:
            d = f.read()
        title = d.split("\n")[0]
        d = d.replace(u'\u3000', '').strip()
        yield title, _NON_TEXT_RE.sub('', d)


class NewWordFind():
    """Find candidate new words in a text.

    The pipeline is: count all n-grams (``n_gram_words``), keep those whose
    count ratio exceeds ``min_p`` (``PMI_filter``), then keep those whose
    left/right neighbour entropy and combined score pass the thresholds
    (``Entropy_left_right_filter``).
    """

    def __init__(self, n_gram=5, min_p=2, min_entropy=1, max_score=100, min_score=2):
        """Store the thresholds used by the filters.

        Args:
            n_gram: Maximum n-gram length (in characters) to enumerate.
            min_p: Exclusive lower bound on the ratio computed by
                ``PMI_filter``.
            min_entropy: Exclusive lower bound on the smaller of the left
                and right neighbour entropies.
            max_score: Exclusive upper bound on the combined score.
            min_score: Exclusive lower bound on the combined score.
        """
        self.n_gram = n_gram
        self.min_p = min_p
        self.min_entropy = min_entropy
        self.max_score = max_score
        self.min_score = min_score

    def n_gram_words(self, text: str) -> Dict[str, int]:
        """Count every substring of ``text`` with length 1..``self.n_gram``.

        Args:
            text: Input sentence/document.

        Returns:
            Mapping of substring -> number of (overlapping) occurrences.
            Keys appear grouped by length, shortest first, each group in
            order of first occurrence.
        """
        return dict(Counter(
            text[start:start + size]
            for size in range(1, self.n_gram + 1)
            for start in range(len(text) - size + 1)
        ))

    def PMI_filter(self, word_freq_dic: Dict[str, int]) -> Dict[str, List[float]]:
        """Filter noisy n-grams using a count ratio over all binary splits.

        For each key of length >= 2 the value
        ``min(count(left) * count(right)) / count(word)`` is computed over
        every split of the word into a non-empty left and right part, and
        the word is kept when that value is greater than ``self.min_p``.

        NOTE(review): this works on raw counts and is oriented as the
        reciprocal of textbook PMI - confirm that this is intended.

        Args:
            word_freq_dic: N-gram counts as produced by ``n_gram_words``;
                every proper prefix and suffix of a key must also be a key.

        Returns:
            Mapping of surviving word -> single-element list ``[ratio]``.

        Raises:
            KeyError: If a prefix or suffix of a key is missing from
                ``word_freq_dic``.
        """
        new_words_dic = {}
        for word, freq in word_freq_dic.items():
            if len(word) < 2:
                # Single characters (and an empty key) cannot be split.
                continue
            p_x_y = min(
                word_freq_dic[word[:i]] * word_freq_dic[word[i:]]
                for i in range(1, len(word))
            )
            mpi = p_x_y / freq
            if mpi > self.min_p:
                new_words_dic[word] = [mpi]
        return new_words_dic

    def calculate_entropy(self, char_list: List[str]) -> float:
        """Return the Shannon entropy (base 2) of the items in ``char_list``.

        Args:
            char_list: Sequence of characters (any hashable items work).

        Returns:
            The entropy in bits; ``0`` for an empty list.
        """
        total = len(char_list)
        # math.log2 is sufficient for scalars and keeps the result a plain
        # float, with no third-party dependency.
        return sum(
            -(count / total) * math.log2(count / total)
            for count in Counter(char_list).values()
        )

    def Entropy_left_right_filter(self, condinate_words_dic: Dict[str, List[float]],
                                  text: str) -> List[Dict[str, object]]:
        """Select final new words by neighbour entropy and combined score.

        For every candidate, the characters immediately left and right of
        its non-overlapping occurrences in ``text`` are collected;
        occurrences without a character on both sides are not counted. The
        score is ``ratio - min(left_entropy, right_entropy)``. A candidate
        is kept when the smaller entropy exceeds ``self.min_entropy`` and
        ``self.min_score < score < self.max_score``.

        Args:
            condinate_words_dic: Candidates as returned by ``PMI_filter``
                (word -> ``[ratio]``).
            text: The text the candidates were extracted from.

        Returns:
            List of dicts with keys ``word``, ``pmi``, ``left_entropy``,
            ``right_entropy`` and ``score``, sorted by score, highest first.
        """
        final_words_list = []
        for word, stats in condinate_words_dic.items():
            pmi = stats[0]
            # Escape the word so regex metacharacters in a candidate are
            # matched literally instead of being interpreted.
            left_right_char = re.findall('(.)%s(.)' % re.escape(word), text)
            left_entropy = self.calculate_entropy([pair[0] for pair in left_right_char])
            right_entropy = self.calculate_entropy([pair[1] for pair in left_right_char])
            boundary_entropy = min(left_entropy, right_entropy)
            score = pmi - boundary_entropy
            if boundary_entropy > self.min_entropy and self.min_score < score < self.max_score:
                final_words_list.append({
                    "word": word,
                    "pmi": pmi,
                    "left_entropy": left_entropy,
                    "right_entropy": right_entropy,
                    "score": score,
                })
        return sorted(final_words_list, key=lambda x: x['score'], reverse=True)


# ---- Settings to adapt ----
# Corpus location: give the directory only, not individual file names.
file_path = r"file_path/"
n_gram = 5
min_p = 2
min_entropy = 1
max_score = 100
min_score = 2


def main():
    """Run new-word discovery over every document in ``file_path`` and print the results."""
    stop_word = get_stop_word(r"hit_stopwords.txt")  # stop-word list
    new_word_find = NewWordFind(
        n_gram=n_gram,
        min_p=min_p,
        min_entropy=min_entropy,
        max_score=max_score,
        min_score=min_score,
    )
    for index, (title, text) in enumerate(text_generator(file_path)):
        print(f"\n index :{index} => title:{title}")
        for i in stop_word:
            text = text.replace(i, "")
        # Separate name so the ``n_gram`` setting is not overwritten.
        word_freq = new_word_find.n_gram_words(text)
        new_words_dic = new_word_find.PMI_filter(word_freq)
        new_words_list = new_word_find.Entropy_left_right_filter(new_words_dic, text)
        for new_words in new_words_list:
            print(f"{new_words}")


if __name__ == "__main__":
    main()
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.