Sentence level, paragraph level, document level, etc.
TF (Term Frequency): term frequency
TF = \frac{\text{number of occurrences of the term}}{\text{total number of terms in the document}}
IDF (Inverse Document Frequency): inverse document frequency
IDF = \log\left(\frac{\text{total number of documents}}{\text{number of documents containing the term} + 1}\right)
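As a quick worked example with made-up numbers: if a term occurs 3 times in a 100-word document, TF = 3/100 = 0.03; if it appears in 9 of 1000 documents, IDF = \log_{10}(1000 / (9 + 1)) = 2 (the code below uses log base 10), so the combined score is TF \times IDF = 0.06.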
Drawbacks of TF-IDF
from collections import Counter
from math import log10
from re import split
from jieba.posseg import dt

# Part-of-speech tags kept as keyword candidates
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())

def cut(text):
    """Split the text into clauses, then yield words whose POS tag is allowed."""
    for sentence in split(r'[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for w in dt.cut(sentence):
            if len(w.word) > 1 and w.flag in FLAGS:
                yield w.word
class TFIDF:
    def __init__(self):
        self.idf = None      # word -> IDF value
        self.idf_max = None  # fallback IDF for unseen words

    def fit(self, texts):
        texts = [set(cut(text)) for text in texts]
        lent = len(texts)
        words = set(w for t in texts for w in t)
        self.idf = {w: log10(lent / (sum((w in t) for t in texts) + 1)) for w in words}
        self.idf_max = log10(lent)
        return self

    def get_idf(self, word):
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Score each word by summing its IDF over its occurrences (TF * IDF) and return the top n."""
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [i[0] for i in counter.most_common(top_n)]
tfidf = TFIDF().fit(['奶茶', '巧克力奶茶', '巧克力酸奶', '巧克力', '巧克力']*2)
print(tfidf.extract('酸奶巧克力奶茶'))
Two kinds of position weights are given below.
Coarse-grained
Word position | Weight |
---|---|
Title | 10 |
Tail word of the title | 20 |
First sentence | 4 |
Last sentence | 3 |
First paragraph | 3 |
Last paragraph | 2 |
Others | 1 |
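A minimal sketch of how the coarse-grained weights above could be applied follows; the function name position_weight, the input format (a title string plus a list of paragraphs, each a list of sentence strings), and the substring matching are all assumptions made here for illustration.
def position_weight(word, title, paragraphs):
    """Return the highest position weight from the table above that applies to `word`."""
    weight = 1                        # "Others"
    if title.endswith(word):
        weight = max(weight, 20)      # tail word of the title
    elif word in title:
        weight = max(weight, 10)      # elsewhere in the title
    if paragraphs:
        first_par, last_par = paragraphs[0], paragraphs[-1]
        if word in first_par[0]:
            weight = max(weight, 4)   # first sentence
        if word in last_par[-1]:
            weight = max(weight, 3)   # last sentence
        if any(word in s for s in first_par):
            weight = max(weight, 3)   # anywhere in the first paragraph
        if any(word in s for s in last_par):
            weight = max(weight, 2)   # anywhere in the last paragraph
    return weight

print(position_weight('奶茶', '巧克力奶茶', [['奶茶加糖', '巧克力加奶'], ['酸奶最后']]))  # -> 20, the word ends the title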
Fine-grained
"""https://blog.csdn.net/Yellow_python/article/details/104580509"""
from sklearn.gaussian_process import GaussianProcessRegressor
from jieba import lcut
X = [[0], [.1], [.2], [.3], [.4], [.5], [.6], [.7], [.8], [.9], [1]]
Y = [[1], [.2], [.04], [.02], [.01], [0], [0], [.01], [.03], [.1], [.5]]
class GPR:
"""高斯过程回归"""
def __init__(self):
self.model = GaussianProcessRegressor()
self.model.fit(X, Y)
def predict(self, position):
return self.model.predict([[position]])[0]
def extract(self, text, judge):
words = lcut(text)
le = len(words) - 1
entities = [(self.predict(i/le)[0], words[i]) for i in range(le+1) if judge(words[i])]
return entities
def visualization():
from matplotlib import pyplot as mp
w = [[i / 500] for i in range(501)]
z = GPR().model.predict(w)
mp.scatter(X, Y, s=66, color='g')
mp.scatter(w, z, s=6, color='r')
mp.show()
"""实体抽取并返回权重"""
print(GPR().extract('剑圣联合守望者斩杀大法师', lambda x: x in {'剑圣', '大法师', '守望者'}))
"""权重分布可视化"""
visualization()
Example sentence: 剑圣联合守望者斩杀大法师
Word | Position | Position percentage | Weight |
---|---|---|---|
剑圣 | 0 | 0% | 1.00 |
守望者 | 2 | 50% | 0.00 |
大法师 | 4 | 100% | 0.50 |
The longer the text, the smaller a single word's contribution to it. Several formulas are given below:
Two kinds of word-length weights are given below:
Word length | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
---|---|---|---|---|---|---|---|---|---|---|
weight = 1 - \frac{1}{length} | 0.00 | 0.50 | 0.67 | 0.75 | 0.80 | 0.83 | 0.86 | 0.88 | 0.89 | 0.90 |
weight = \sqrt{length} | 1 | 1.414 | 1.732 | 2 | 2.236 | 2.449 | 2.646 | 2.828 | 3 | 3.162 |
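A minimal sketch reproducing the two word-length weights from the table above; the function names are my own.
from math import sqrt

def length_weight_v1(word):
    """weight = 1 - 1/len(word): single-character words get weight 0."""
    return 1 - 1 / len(word)

def length_weight_v2(word):
    """weight = sqrt(len(word)): grows more slowly than the length itself."""
    return sqrt(len(word))

for w in ['茶', '奶茶', '巧克力奶茶']:
    print(w, round(length_weight_v1(w), 2), round(length_weight_v2(w), 3))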
When a word's frequency is 1, its weight is 1;
when its frequency is greater than 1, one of the two word-span weights below is used.
Simple version
weight = 1 + \frac{\text{position of the last occurrence} - \text{position of the first occurrence}}{\text{total length}}
Complex version
weight = \prod_{i=1}^{n-1} \left(1 + \frac{\text{position}_{i+1} - \text{position}_i}{\text{total length}}\right)
def span1(text):
    """Word span + word frequency (simple version)"""
    words = list(text)  # tokenize into a list; character splitting is used here as a stand-in
    reversed_words = words[::-1]
    le = len(words)
    return {w: 2 - ((words.index(w) + reversed_words.index(w) + 1) / le) for w in set(words)}

def span2(text):
    """Word span + word frequency (complex version)"""
    c, position = dict(), dict()
    words = list(text)  # tokenize into a list; character splitting is used here as a stand-in
    length = len(words)
    for i in range(length):
        word = words[i]
        if word not in c:
            c[word] = 1
        else:
            c[word] *= 1 + ((i - position[word]) / length)
        position[word] = i
    return c
_text = '清水水水水水水水里清'
print(span1(_text))
print(span2(_text))
Example sentence: 清水水水水水水水里清
Word | Simple-version weight | Complex-version weight |
---|---|---|
清 | 1.9 | 1.9 |
水 | 1.6 | 1.771561 |
里 | 1 | 1 |
In general, content words outweigh function words, and nouns get relatively high weights.
To decide the actual weight values, we can use some labeled data to build a part-of-speech weight model; a small sketch follows.
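A minimal sketch of such a hand-set part-of-speech weight table, applied with jieba.posseg; the specific weight values are illustrative assumptions, not values from the article.
from jieba.posseg import lcut

# Illustrative, hand-set POS weights (assumptions made for this sketch)
POS_WEIGHT = {'n': 1.0, 'nr': 1.2, 'ns': 1.2, 'nt': 1.2, 'nz': 1.2,  # nouns and named entities
              'vn': 0.8, 'v': 0.6, 'a': 0.5,                          # verbal nouns, verbs, adjectives
              'd': 0.2, 'p': 0.1, 'u': 0.1, 'c': 0.1}                 # adverbs and function words

def pos_weights(text):
    """Return (word, flag, weight) triples; unseen flags fall back to 0.3."""
    return [(w.word, w.flag, POS_WEIGHT.get(w.flag, 0.3))
            for w in lcut(text) if w.flag != 'x']  # 'x' marks punctuation / non-words

print(pos_weights('车主打开车盖,用苹果照了一下水箱'))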
Give topic-related words higher weights.
For example, the sentence 【车主打开车盖,用苹果照了一下水箱】 belongs to the 【automotive】 topic, so although 【水箱】 (radiator) and 【苹果】 (apple/iPhone) are both nouns, 【水箱】 should get the higher weight.
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from re import split
from jieba import cut
from collections import Counter
from numpy import argmax
from pandas import DataFrame

def segment(text):
    for phrase in split(r'[^a-zA-Z\u4e00-\u9fa5]+', text.strip()):
        for word in cut(phrase):
            yield word

def clf_word(texts, labels, clf=MultinomialNB()):
    """Classify words by topic with the given classifier (naive Bayes by default) and save the result to Excel"""
    # Vectorize
    vectorizer = TfidfVectorizer(tokenizer=segment)
    x = vectorizer.fit_transform(texts)
    # Train the classifier
    clf.fit(x, labels)
    classes = clf.classes_
    print(clf.__class__.__name__, clf.score(x, labels), *classes)
    # Classify each word
    c = Counter(w for t in texts for w in segment(t)).most_common()
    ls = []
    for word, freq in c:
        predict_proba = clf.predict_proba(vectorizer.transform([word]))[0]  # class probabilities
        label = classes[argmax(predict_proba)]  # predicted class
        ls.append([freq, word, label, *predict_proba])
    df = DataFrame(ls, columns=['freq', 'word', 'label', *classes])
    df.to_excel('a.xlsx', index=False)  # save to Excel

if __name__ == '__main__':
    from data9 import X, Y  # load the 9-class news corpus
    clf_word(X, Y)
In certain negation contexts, a keyword's weight should be reduced to zero or below.
Example sentence: 大力发展智能冰箱、智能空调、智能热水器等高新技术(不含智能手机和智能电视)
Extracted entities and weights: 【智能冰箱: 2】【智能空调: 2】【智能热水器: 2】【智能手机: -1】【智能电视: -1】
{'不包含': -1, '不包括': -1, '不含': -1, '除外': -1, '包含': 1, '包括': 1}
re.compile(r'(?<![无没])不[^a-zA-Z\W\d_限无没]|除外')
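A minimal sketch of one way to use the marker dictionary above: each entity starts from a default weight of +1 and flips to -1 when the nearest preceding marker in its clause is negative. The clause splitting, the default weight, and the function name are assumptions made here for illustration.
import re

MARKERS = {'不包含': -1, '不包括': -1, '不含': -1, '除外': -1, '包含': 1, '包括': 1}
MARKER_RE = re.compile('|'.join(MARKERS))

def entity_weights(text, entities):
    """Weight each entity by the last marker preceding it in its clause (default +1)."""
    weights = {}
    for clause in re.split('[,,;;。!!??()()]', text):
        for ent in entities:
            pos = clause.find(ent)
            if pos < 0:
                continue
            sign = 1                                    # no marker -> positive context
            for m in MARKER_RE.finditer(clause[:pos]):
                sign = MARKERS[m.group()]               # the last marker before the entity wins
            weights[ent] = sign
    return weights

print(entity_weights('大力发展智能冰箱、智能空调、智能热水器等高新技术(不含智能手机和智能电视)',
                     ['智能冰箱', '智能空调', '智能热水器', '智能手机', '智能电视']))
Running this on the example sentence keeps +1 for the three entities before 不含 and gives -1 to 智能手机 and 智能电视.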
The following cases are not eligible for support:
1. XXXXXXXX
2. XXXXXXXX
The applicant requirements are as follows:
1. XXXXXXXX
2. XXXXXXXX
(The snippet below simulates how such a marker word's influence decays over the positions that follow it.)
from matplotlib import pyplot as mp

# Marker words and their weights
x2y = {'b': 20, 'c': 10, 'd': 30}
# Build synthetic data
length = 100
x = ['a'] * length
x[15], x[30], x[75] = 'b', 'c', 'd'
# Compute weights: each marker boosts the positions after it, decaying with distance
y = [0] * length
for i in range(length):
    if x[i] in x2y:
        weight = x2y[x[i]]
        for j in range(i + 1, length):
            y[j] += max(0, weight - abs(i - j))
mp.plot(y)
mp.show()
from collections import defaultdict
from jieba.posseg import dt

ALLOW_POS = frozenset(('ns', 'n', 'vn', 'v'))

class WeightedUndigraph:
    d = 0.85  # damping factor

    def __init__(self):
        self.graph = defaultdict(list)

    def add_edge(self, start, end, weight):
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        ws = defaultdict(float)
        outSum = defaultdict(float)
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)
        for x in range(10):  # 10 iterations
            for n in sorted(self.graph.keys()):  # sorting makes the result more stable
                s = 0
                for e in self.graph[n]:
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s
        min_rank, max_rank = min(ws.values()), max(ws.values())
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)  # normalize the weights; no need to multiply by 100
        return ws
class TextRank:
    def __init__(self):
        self.tokenizer = dt
        self.allow_pos = ALLOW_POS
        self.span = 5  # co-occurrence window size

    def flag_filter(self, wp):
        return (wp.flag in self.allow_pos) and (len(wp.word.strip()) >= 2)

    def text_rank(self, sentence, n=20, with_weight=False, allow_pos=ALLOW_POS, with_flag=False):
        """
        Parameters:
        - n: number of keywords to return
        - with_weight: whether to return weights
        - allow_pos: allowed part-of-speech tags
        - with_flag: whether to return the part-of-speech tag
        """
        self.allow_pos = frozenset(allow_pos)
        g = WeightedUndigraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.flag_filter(wp):
                for j in range(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.flag_filter(words[j]):
                        continue
                    if allow_pos and with_flag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1
        for terms, w in cm.items():
            g.add_edge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if with_weight:
            tags = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
        return tags[:n] if n else tags
_t = '越来越多的国产汽车出现在大众的眼中,国产汽车的整体性能也在不断优化,但和老牌的欧美汽车相比还存在着一定的差距'
print(TextRank().text_rank(_t, with_weight=True, with_flag=True))
Excluding outliers via clustering
To be developed
Failure 1:
I tried computing a word's weight from the dispersion of its word-vector components, but it failed. The results looked like this:
High-frequency word vector [5 5 5 5 5 5 0 -5 -5 -5]: high dispersion
Mid-frequency word vector [4 3 3 3 3 3 0 -3 -3 -3]: medium dispersion
Low-frequency word vector [3 1 1 1 1 1 0 -1 -1 -1]: low dispersion
from re import split, fullmatch
from gensim.models import Word2Vec
from jieba import cut
from numpy import var
from pandas import DataFrame

def lcut(text):
    """Split the text into short sentences and keep only alphabetic / Chinese tokens."""
    return [w for s in split('[\n。…;;!!??]+', text) for w in cut(s) if fullmatch('[a-zA-Z\u4e00-\u9fa5]+', w)]

def word2vector(texts):
    """Train word vectors, then store the variance of each word's vector as a candidate weight"""
    sentences = [lcut(t) for t in texts]
    # gensim < 4 API; on gensim >= 4 use vector_size=75 and wv.index_to_key instead
    wv = Word2Vec(sentences, size=75, window=10, sg=1).wv
    DataFrame([(w, var(wv[w]), *wv[w]) for w in wv.index2word], columns=[
        'word', 'weight', *(str(i) for i in range(75))]).to_excel('b.xlsx', index=False)

from data9 import X  # the 9-class news corpus; labels are not imported
word2vector(X)
Failure 2:
Dispersion of a word's feature distribution: a word whose feature distribution has higher variance (i.e., is concentrated on a few features rather than spread evenly) should get a higher weight.
For example, given two words with the same IDF, if the first is concentrated in one class of documents while the second is scattered across documents of different types, the first word deserves the higher weight.
I tried to measure each word's topic-distribution dispersion with a topic model trained on long documents split into short pieces, but this failed as well: most words ended up with fairly uniform topic distributions, and some stopwords actually had the most concentrated ones.
from gensim import corpora, models
import re, jieba, numpy as np, pandas as pd

def word_lda(texts, num_topics=50):
    # Tokenize: split long documents into short pieces, then cut each piece into words
    words_ls = [[w for w in jieba.cut(s) if re.fullmatch('[a-zA-Z\u4e00-\u9fa5]+', w)]
                for t in texts for s in re.split('[\n。…;;!!??]+', t)]
    # Build the dictionary
    dictionary = corpora.Dictionary(words_ls)
    # Using the dictionary, convert each piece into a sparse bag-of-words vector
    corpus = [dictionary.doc2bow(words) for words in words_ls]
    # LDA model; num_topics sets the number of topics
    lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    # Topic-word probability matrix, normalized per word
    matrix = lda.state.get_lambda()
    matrix = matrix / np.sum(matrix, axis=0)
    # Group words by their most probable topic and save to Excel
    pd.DataFrame({
        'word': [dictionary[i] for i in range(len(dictionary))],
        'topic': np.argmax(matrix, axis=0),
        'probability': np.max(matrix, axis=0),
    }).sort_values(by=['topic', 'probability'], ascending=False).to_excel('word_lda.xlsx', index=False)

from data9 import X  # the 9-class news corpus; labels are not imported
word_lda(X)