基于互信息+信息熵的新词发现_头歌基于互信息和信息熵的新词提取答案

作者：空白诗007 | 2024-07-12 16:42:00

踩

头歌基于互信息和信息熵的新词提取答案

from nltk.probability import FreqDist

# Read the whole corpus into memory. The context manager guarantees the file
# handle is released even if reading or decoding fails.
with open(r"C:\Users\machuanbin\Desktop\santi.txt", encoding='utf-8') as f:
    text = f.read()

# Punctuation and whitespace characters that are stripped from the corpus
# before any n-grams are built.
stop_word = ['【', '】', ')', '(', '、', '，', '“', '”', '。', '\n', '《', '》', ' ', '-', '！', '？', '.', '\'', '[', ']', '：',
             '/', '.', '"', '\u3000', '’', '．', ',', '…', '?']
for i in stop_word:
    text = text.replace(i, "")

print(text)

# Thresholds used by the filtering steps further down the script.
min_entropy = 0.8  # the smaller of left/right neighbour entropy must exceed this
min_p = 7  # log(p(word) / (p(part1) * p(part2))) must exceed this
max_gram = 4  # longest n-gram length that is extracted
count_appear = 20  # n-grams occurring fewer times than this are ignored


def gram(text, max_gram):
    """Return every contiguous slice of ``text`` that is ``max_gram`` long.

    The slices are listed in order of their start position. For
    ``max_gram == 1`` the result is simply the list of single elements of
    ``text``. When ``text`` is shorter than ``max_gram`` the list is empty.
    """
    if max_gram == 1:
        return list(text)
    window_count = len(text) + 1 - max_gram
    return [text[start:start + max_gram] for start in range(window_count)]


import math


def pro(word):
    """Return the relative frequency of ``word`` among n-grams of its length.

    Looks up the module-level tables ``freq_all`` and ``word_all``, both of
    which are indexed by n-gram length: the count of ``word`` is divided by
    the total number of n-grams of that length.
    """
    n = len(word)
    return freq_all[n][word] / len(word_all[n])


def entropy(alist):
    """Return the Shannon entropy (natural logarithm) of the items in ``alist``.

    Each distinct item contributes ``-p * log(p)``, where ``p`` is its share
    of the list.

    Args:
        alist: A sized collection of hashable items.

    Returns:
        The entropy in nats; ``0.0`` for an empty collection.
    """
    # Plain frequency counting only needs the standard library; the import is
    # local so this function does not rely on any other part of the file.
    from collections import Counter

    total = len(alist)
    if not total:
        # Nothing observed: define the entropy as zero explicitly.
        return 0.0
    return -sum(n / total * math.log(n / total) for n in Counter(alist).values())


# Both tables are indexed by n-gram length; slot 0 is a placeholder so that a
# length can be used directly as the index.
word_all = [0]
freq_all = [0]
for n in range(1, max_gram + 1):
    grams = gram(text, n)
    word_all.append(grams)
    freq_all.append(FreqDist(grams))

# Keep the n-grams whose mutual-information score passes the threshold.
final_word = []
for n in range(2, max_gram + 1):
    # Iterate over the distinct n-grams (the keys of the frequency table)
    # instead of every occurrence in the corpus: the score depends only on the
    # n-gram itself, so each one needs to be evaluated exactly once and no
    # de-duplication pass is required afterwards.
    for word, count in freq_all[n].items():
        if count < count_appear:
            continue
        # Smallest product of the two parts' probabilities over all split points.
        # NOTE(review): the minimum product gives the largest score among the
        # splits; confirm that the most favourable split is the intended test.
        p = min(pro(word[:k]) * pro(word[k:]) for k in range(1, len(word)))
        if math.log(pro(word) / p) > min_p:
            final_word.append(word)
print(final_word)
# Filter the candidates by the entropy of their left and right neighbours.
import re

final_word2 = []
for word in final_word:
    # Capture the character immediately before and after every occurrence.
    # - No spaces in the pattern: all spaces were stripped from `text`, so a
    #   pattern containing them could never match.
    # - re.escape: a candidate may contain regex metacharacters.
    # - Lookahead: matches consume nothing, so occurrences that share a
    #   neighbouring character (or overlap) are all counted.
    lr = re.findall('(?=(.)%s(.))' % re.escape(word), text)
    left_entropy = entropy([pair[0] for pair in lr])
    right_entropy = entropy([pair[1] for pair in lr])
    # A real word appears in varied contexts on both sides, so the weaker
    # side must still clear the threshold.
    if min(left_entropy, right_entropy) > min_entropy:
        final_word2.append(word)

print(final_word2)

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/空白诗007/article/detail/814884