当前位置:   article > 正文

Python实现求文本单词与构建的词典词汇的互信息并计算情感得分_python计算中文文本互信息

python计算中文文本互信息

Python实现求文本单词与构建的词典词汇的互信息

通过词频计算求出单词概率和词对概率

def _update(self):
        """遍历整个数据集,更新各个变量"""
        for sentence in self.dataSet: # 此处sentence是一个经过分词过后的列表
            n = len(sentence)
            self.count += n * (n - 1) / 2
            self.num += n
            for word in sentence:
                self.wordCount[word] += 1
            for i in range(n): # 这样的话无论是[x][y]还会[y][x]的结果就会是一样了
                for j in range(n):
                    if sentence[i] in self.pos_feature or sample[j] in self.pos_feature:
                        self.pos_pairsCount[sample[i]][sample[j]] += 1
                    elif sample[i] in self.neg_feature or sample[j] in self.neg_feature:
                        self.neg_pairsCount[sample[i]][sample[j]] += 1
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

情感得分计算

 def calculate_sentiment_value(self,word):

        sum_pos = []
        # 找到所有与该单词相关的正向词汇,遍历每一个的互信息 最终进行
        for feature in self.pos_feature:
            single_sentiment_value = self.pos_query(word,feature) # 求这两个单词的互信息
            sum_pos.append(single_sentiment_value)
        print(sum_pos)
        neg_pos = []
        for feature in self.neg_feature:
            single_sentiment_value = self.neg_query(word,feature) # 求这两个单词的互信息
            neg_pos.append(single_sentiment_value)
        neg_pos = []
        score = sum(sum_pos) - sum(neg_pos)
        print(neg_pos)
        return score
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16

上面就是比较核心的部分了

整体的代码

from collections import defaultdict
from math import log2

class I():
    """Score word sentiment via mutual information with seed-word lists.

    MI is estimated from co-occurrence counts over a tokenized data set:
    I(x, y) = p(x, y) * log2(p(x, y) / (p(x) * p(y))).

    Fixes versus the original:
    * ``calculate_sentiment_value`` no longer clears the negative-side list
      before summing it (the score previously ignored negative seeds).
    * ``pos_query``/``neg_query`` return 0.0 when a pair never co-occurs;
      the original evaluated ``log2(0)`` and raised ValueError (this fired
      on the demo data, e.g. for neg_query('r', 'y')).
    * Shared MI formula factored into the private helper ``_mi``.
    """

    def __init__(self, dataSet):
        self.wordCount = defaultdict(lambda: 0)  # word -> frequency
        # word -> word -> co-occurrence count, for pairs involving a positive seed
        self.pos_pairsCount = defaultdict(lambda: defaultdict(lambda: 0))
        # same, for pairs involving a negative seed (and no positive seed)
        self.neg_pairsCount = defaultdict(lambda: defaultdict(lambda: 0))
        self.count = 0.0  # total number of unordered word pairs
        self.num = 0.0  # total number of word tokens
        self.dataSet = dataSet
        self.sentiment_value = defaultdict(lambda: 0)
        self.pos_feature = ['r', 'z']  # positive seed words
        self.neg_feature = ['y', 't']  # negative seed words
        self._update()

    def _update(self):
        """Walk the data set once and fill in all the counters."""
        for sample in self.dataSet:
            n = len(sample)
            self.count += n * (n - 1) / 2  # C(n, 2) unordered pairs in this sentence
            self.num += n
            for word in sample:
                self.wordCount[word] += 1
            # Count ordered pairs so [x][y] and [y][x] end up symmetric.
            for i in range(n):
                for j in range(n):
                    if sample[i] in self.pos_feature or sample[j] in self.pos_feature:
                        self.pos_pairsCount[sample[i]][sample[j]] += 1
                    elif sample[i] in self.neg_feature or sample[j] in self.neg_feature:
                        self.neg_pairsCount[sample[i]][sample[j]] += 1

    def _mi(self, x, y, pairsCount):
        """MI of x and y using the given co-occurrence table.

        Returns -1 when x or y never appears in the data set (relation
        unknown, original sentinel kept), and 0.0 when they never co-occur
        (lim p->0 of p*log p is 0; the original raised ValueError here).
        """
        p_x = self.wordCount[x] / self.num
        p_y = self.wordCount[y] / self.num
        if p_x == 0 or p_y == 0:  # x or y absent from the data set
            return -1
        p_xy = pairsCount[x][y] / self.count
        if p_xy == 0:  # never co-occur: contribute nothing
            return 0.0
        return p_xy * log2(p_xy / p_x / p_y)

    def pos_query(self, x, y):
        """MI of x and y over pairs involving a positive seed word."""
        return self._mi(x, y, self.pos_pairsCount)

    def neg_query(self, x, y):
        """MI of x and y over pairs involving a negative seed word."""
        return self._mi(x, y, self.neg_pairsCount)

    def calculate_sentiment_value(self, word):
        """Return sum(MI with positive seeds) - sum(MI with negative seeds)."""
        pos_scores = [self.pos_query(word, feature) for feature in self.pos_feature]
        neg_scores = [self.neg_query(word, feature) for feature in self.neg_feature]
        return sum(pos_scores) - sum(neg_scores)
            
if __name__ == '__main__':

    # Toy corpus: each inner list is one tokenized sentence.
    dataSet = [['r', 'z'],
               ['x', 'y', 't', 's', 'z'],
               ['z'],
               ['s', 'x', 'r'],
               ['x', 't', 'r', 'z', 'y'],
               ['z', 'x', 't', 's', 'y']]
    model = I(dataSet)

    # Score every distinct word that occurred in the corpus.
    for token in list(model.wordCount.keys()):
        print(token)
        score = model.calculate_sentiment_value(token)
        print("sentiment orientation score of '{}' is {}".format(token, score))

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88

在计算的过程中可能会出现一些问题(例如当两个词从未共现时 p(x,y)=0,log2(0) 会抛出 ValueError;以及负向得分列表在求和前被误清空),当然只是一些小问题,就留给读者自行去解决了

一些额外的辅助的文本处理代码

# Read the file sentence-by-sentence and process each line
# (could also be rewritten with a `with` statement)

word_fd = FreqDist()  # frequency distribution over every word in the corpus
cond_word_fd = ConditionalFreqDist()  # per-condition word frequencies (normal text vs. ad text)

def new_read_line(filename, content):
    """Read *filename* (UTF-8) line by line and append token lists to *content*.

    Each line has its newline characters removed and is then split on single
    spaces; the resulting list is appended to *content* (mutated in place,
    nothing is returned — same contract as the original).

    Bug fixed: the original opened the file and never closed it; the
    ``with`` block guarantees the handle is released.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            s = re.sub('\n', '', line)  # drop the trailing newline
            content.append(re.split(' ', s))

def new_read_dic(filename):
    """Read a GBK-encoded dictionary file and return one entry per line.

    Newlines and all spaces are stripped from each line; the cleaned
    strings are returned as a list (same contract as the original).

    Bugs fixed: the original never closed the file handle and shadowed the
    builtin ``str`` with its accumulator list.
    """
    entries = []
    with open(filename, 'r', encoding='gbk') as f:
        for line in f:
            s = re.sub('\n', '', line)  # drop the trailing newline
            s = re.sub(' ', '', s)  # dictionary entries contain no spaces
            entries.append(s)
    return entries
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27

需要导入的类

import pynlpir
import re
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC, libsvm, liblinear
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from random import shuffle
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

from collections import defaultdict
from math import log2
# 个人认为defaultdict相当好用
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/72747
推荐阅读
相关标签
  

闽ICP备14008679号