当前位置:   article > 正文

新词发现的代码实现_新词发现算法代码

新词发现算法代码

'''
引用自:涛笙依旧_,仅限学习交流用,如有侵权请告知。
'''
import codecs
import glob
import re
from collections import Counter

import numpy as np

# 功能:停用词加载
# Windows path to a HIT stop-word list; get_stop_word() takes its own
# ``stop_word_path`` argument and does not read this module-level value.
stop_word_path = r'E:\论文\算法\停用词表\hit_stopwords.txt'


def get_stop_word(stop_word_path):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        stop_word_path: Path of the stop-word file.

    Returns:
        A list with one string per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # ``with`` closes the handle even if decoding fails part-way through.
    with open(stop_word_path, encoding='utf-8') as f:
        # Strip only the newline: slicing off the last character would eat a
        # real character when the final line has no trailing newline.
        return [line.rstrip('\n') for line in f]

# Corpus generator that also applies a first round of text clean-up.
def text_generator(file_path):
    """Yield ``(title, cleaned_text)`` for every ``*.txt`` file in a directory.

    Args:
        file_path: Directory that holds the corpus files. Only files directly
            inside it whose names end in ``.txt`` are read, as UTF-8.

    Yields:
        A 2-tuple per file, in sorted path order:
        * title -- the first line of the raw file content;
        * cleaned_text -- the whole content (title included) with ideographic
          spaces (U+3000) removed, surrounding whitespace stripped, and every
          character other than CJK ideographs U+4E00..U+9FA5, ASCII digits,
          ASCII letters and the space character deleted.
    """
    # glob returns paths in arbitrary order; sorting keeps the enumeration
    # index seen by callers stable between runs.
    for txt in sorted(glob.glob(f'{file_path}/*.txt')):
        # The built-in open() closes the handle via ``with`` and translates
        # "\r\n" to "\n", so the title never carries a stray "\r".
        with open(txt, encoding='utf-8') as f:
            d = f.read()
        title = d.split("\n", 1)[0]
        d = d.replace('\u3000', '').strip()
        yield title, re.sub('[^\u4e00-\u9fa50-9a-zA-Z ]+', '', d)
class NewWordFind():
    """Find candidate new words in raw text from n-gram statistics.

    Typical use chains the three steps: ``n_gram_words`` counts every n-gram,
    ``PMI_filter`` keeps the n-grams whose count ratio exceeds ``min_p``, and
    ``Entropy_left_right_filter`` keeps those whose neighbouring characters
    are varied enough and whose score lies strictly between ``min_score`` and
    ``max_score``.
    """

    def __init__(self, n_gram=5, min_p=2, min_entropy=1, max_score=100, min_score=2):
        """Store the thresholds used by the filtering steps.

        Args:
            n_gram: Longest n-gram length (in characters) that is counted.
            min_p: Lower bound (exclusive) for the ratio computed in
                ``PMI_filter``.
            min_entropy: Lower bound (exclusive) for the smaller of the left
                and right neighbour entropies.
            max_score: Upper bound (exclusive) for the combined score.
            min_score: Lower bound (exclusive) for the combined score.
        """
        self.n_gram = n_gram
        self.min_p = min_p
        self.min_entropy = min_entropy
        self.max_score = max_score
        self.min_score = min_score

    # Split ``text`` into n-grams and count them.
    def n_gram_words(self, text):
        """Count every substring of ``text`` of length 1 to ``self.n_gram``.

        Args:
            text: Input string.

        Returns:
            Dict mapping each substring to its number of occurrences
            (overlapping occurrences included).
        """
        counts = Counter(
            text[start:start + size]
            for size in range(1, self.n_gram + 1)
            for start in range(len(text) - size + 1)
        )
        return dict(counts)

    # Drop noisy n-grams using a count-based PMI-style ratio.
    def PMI_filter(self, word_freq_dic):
        """Keep multi-character n-grams whose count ratio exceeds ``min_p``.

        For each word of two or more characters, every two-way split of the
        word is considered; the smallest product of the two parts' counts is
        divided by the word's own count. The value is computed from raw
        counts, not probabilities, and no logarithm is taken.

        Args:
            word_freq_dic: Dict of n-gram counts, e.g. from ``n_gram_words``.
                It must contain a count for every prefix and suffix of each
                multi-character key.

        Returns:
            Dict mapping each kept word to a one-element list holding its
            ratio.

        Raises:
            KeyError: If a prefix or suffix of a key has no count.
        """
        new_words_dic = {}
        for word, freq in word_freq_dic.items():
            # Single characters (and an empty key) cannot be split in two.
            if len(word) < 2:
                continue
            min_part_product = min(
                word_freq_dic[word[:cut]] * word_freq_dic[word[cut:]]
                for cut in range(1, len(word))
            )
            ratio = min_part_product / freq
            if ratio > self.min_p:
                new_words_dic[word] = [ratio]
        return new_words_dic

    # Entropy of a list of characters.
    def calculate_entropy(self, char_list):
        """Return the base-2 entropy of the items in ``char_list``.

        Args:
            char_list: List of hashable items (characters in this class).

        Returns:
            The entropy in bits; ``0`` for an empty list.
        """
        total = len(char_list)
        counts = Counter(char_list)
        return -sum(
            (count / total) * np.log2(count / total)
            for count in counts.values()
        )

    # Use neighbour entropy to pick the final words from the candidates.
    def Entropy_left_right_filter(self, condinate_words_dic, text):
        """Filter candidate words by the entropy of their neighbours.

        For every candidate, the characters directly before and after each
        occurrence in ``text`` are collected (only occurrences that have a
        character on both sides, neither being a newline). The score is the
        candidate's stored ratio minus the smaller of the two neighbour
        entropies.

        Args:
            condinate_words_dic: Dict mapping candidate word to a list whose
                first element is its ratio, as returned by ``PMI_filter``.
            text: The text the candidates were extracted from.

        Returns:
            List of dicts with keys ``word``, ``pmi``, ``left_entropy``,
            ``right_entropy`` and ``score``, sorted by ``score`` descending.
        """
        final_words_list = []
        for word, stats in condinate_words_dic.items():
            pmi = stats[0]
            # re.escape: the word is literal text, not a pattern.
            # Look-ahead: matching without consuming lets two occurrences
            # share a boundary character (e.g. "X" twice in "aXbXc").
            pattern = '(?=(.)%s(.))' % re.escape(word)
            neighbours = re.findall(pattern, text)

            left_entropy = self.calculate_entropy([left for left, _ in neighbours])
            right_entropy = self.calculate_entropy([right for _, right in neighbours])

            lowest_entropy = min(left_entropy, right_entropy)
            score = pmi - lowest_entropy
            if lowest_entropy > self.min_entropy and self.min_score < score < self.max_score:
                final_words_list.append({
                    "word": word,
                    "pmi": pmi,
                    "left_entropy": left_entropy,
                    "right_entropy": right_entropy,
                    "score": score,
                })
        final_words_list.sort(key=lambda item: item['score'], reverse=True)
        return final_words_list

# Settings to adjust before running:
stop_word = get_stop_word(r"hit_stopwords.txt")  # stop-word list

# Corpus location: give the directory only, not an individual file name.
file_path = r"file_path/"

n_gram = 5          # longest n-gram length that is counted
min_p = 2           # threshold for the ratio computed by PMI_filter
min_entropy = 1     # threshold for the smaller left/right neighbour entropy
max_score = 100     # upper bound (exclusive) for the combined score
min_score = 2       # lower bound (exclusive) for the combined score

new_word_find = NewWordFind(n_gram=n_gram, min_p=min_p, min_entropy=min_entropy, max_score=max_score, min_score=min_score)

for index, (title, text) in enumerate(text_generator(file_path)):
    print(f"\n index :{index} => title:{title}")
    # Remove every stop word from the document before counting n-grams.
    for i in stop_word:
        text = text.replace(i, "")

    # Use a separate name for the counts so the ``n_gram`` setting above is
    # not overwritten by a dict on the first iteration.
    ngram_freq = new_word_find.n_gram_words(text)
    new_words_dic = new_word_find.PMI_filter(ngram_freq)
    new_words_list = new_word_find.Entropy_left_right_filter(new_words_dic, text)

    for new_words in new_words_list:
        print(f"{new_words}")
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146

  • 1
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/不正经/article/detail/347189
推荐阅读
相关标签
  

闽ICP备14008679号