Natural Language Processing
The Meaning of Words
How a thesaurus is used depends on the specific NLP application. For example, in information retrieval, if we know in advance that automobile and car are synonyms, we can merge the search results for automobile into the search results for car.
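As a toy illustration of this idea, here is a minimal sketch of synonym-based query expansion. The synonym table and the search_index function are hypothetical stand-ins, not part of any particular library:

# hypothetical hand-written synonym table; in practice it would come from a thesaurus such as WordNet
SYNONYMS = {'car': ['automobile', 'auto', 'motorcar']}

def expand_query(query, synonyms=SYNONYMS):
    # return the query term together with its known synonyms
    return [query] + synonyms.get(query, [])

def search_all(query, search_index):
    # run the (hypothetical) search_index function for the query and each synonym, then merge the results
    results = []
    for term in expand_query(query):
        results.extend(search_index(term))
    return results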
Installing NLTK
conda install nltk
After installation, import the module and try looking up a word:
from nltk.corpus import wordnet
wordnet.synsets('car')
This raises an error saying the wordnet corpus cannot be found (running nltk.download('wordnet') did not succeed in downloading it either).
Getting Synonyms with WordNet
To get synonyms, use the wordnet.synsets() method. One thing to note is that a single word can have multiple meanings, so when retrieving synonyms you need to specify (from among those meanings) which one you mean:
>>> from nltk.corpus import wordnet
>>> wordnet.synsets('car')
[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]
Use the wordnet.synset() method to obtain the synset "car.n.01". Calling the definition() method on that synset shows what it means:
>>> car = wordnet.synset('car.n.01')  # the synset (group of synonyms)
>>> car.definition()
'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
To list the words contained in the synset, use the lemma_names() method:
>>> car.lemma_names()
['car', 'auto', 'automobile', 'machine', 'motorcar']
WordNet and the Word Network
To trace the links between words, use the hypernym_paths() method. Hypernym is a term used mainly in linguistics and means a superordinate (more general) word:
>>> car.hypernym_paths()[0]
[Synset('entity.n.01'), Synset('physical_entity.n.01'),
Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('instrumentality.n.03'), Synset('container.n.01'),
Synset('wheeled_vehicle.n.01'), Synset('self-propelled_vehicle.n.01'), Synset('motor_vehicle.n.01'), Synset('car.n.01')]
car.hypernym_paths() returns a list whose elements contain the individual paths. Why a list? Because there can be more than one path between words. In the example above, there are multiple paths from the starting word entity down to car.
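To see this, a quick sketch (outputs are omitted here because the exact paths depend on the installed WordNet version):

paths = car.hypernym_paths()
print(len(paths))                     # how many distinct entity-to-car paths exist
print([s.name() for s in paths[-1]])  # inspect another of the paths as a list of synset names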
Semantic Similarity Based on WordNet
To compute the similarity between words, use the path_similarity() method, which returns a real number between 0 and 1 (the larger the value, the more similar the words). Internally, path_similarity() computes the similarity from the shared path in the word network shown in Figure B-2 (car and motorcycle share the longest common path, so their similarity comes out highest):
>>> car = wordnet.synset('car.n.01')
>>> novel = wordnet.synset('novel.n.01')
>>> dog = wordnet.synset('dog.n.01')
>>> motorcycle = wordnet.synset('motorcycle.n.01')
>>> car.path_similarity(novel)
0.05555555555555555
>>> car.path_similarity(dog)
0.07692307692307693
>>> car.path_similarity(motorcycle)
0.3333333333333333
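For reference, NLTK's path_similarity is, to our understanding, computed as 1 / (shortest hypernym-path distance + 1); a quick check under that assumption:

>>> car.shortest_path_distance(motorcycle)  # car and motorcycle are both direct hyponyms of motor_vehicle.n.01
2
>>> 1 / (car.shortest_path_distance(motorcycle) + 1)  # matches path_similarity above
0.3333333333333333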
text = 'You say goodbye and I say hello.'  # sample sentence used as the corpus
text = text.lower()  # lowercase everything so words at the start of a sentence are treated like any other word
text = text.replace('.', ' .')  # separate the period so the text can be split on spaces
print(text) # you say goodbye and i say hello .
words = text.split(' ')  # split the sentence on spaces
print(words) # ['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']
Tokenization can also be done with regular expressions: import the re module and call re.split('(\W+)?', text).
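A minimal sketch of such regex-based tokenization; it uses re.findall with an assumed pattern instead of re.split, which keeps empty strings out of the result:

import re

text = 'You say goodbye and I say hello.'.lower()
words = re.findall(r'\w+|\S', text)  # \w+ matches runs of word characters, \S picks up the remaining '.'
print(words)  # ['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']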
>>> word_to_id = {}  # maps a word to its word ID
>>> id_to_word = {}  # maps a word ID back to its word
>>>
>>> for word in words:
... if word not in word_to_id:
... new_id = len(word_to_id)
... word_to_id[word] = new_id
... id_to_word[new_id] = word
>>> import numpy as np
>>> corpus = [word_to_id[w] for w in words]
>>> corpus = np.array(corpus)  # the corpus as an array of word IDs
>>> corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
We put these preprocessing steps together in a preprocess() function:
def preprocess(text):
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word
Depending on the situation, the context could also be only the words to the left or only the words to the right, and it could take sentence boundaries into account. For simplicity, this book uses contexts that ignore sentence boundaries and include the same number of words on each side.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus)
# [0 1 2 3 4 1 5 6]
print(id_to_word)
# {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}
# corpus is the list of word IDs
# vocab_size is the vocabulary size
# window_size is the window size
def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix
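Next, to measure how alike two word vectors are, we use cosine similarity:

similarity(x, y) = (x · y) / (||x|| ||y||)

The cos_similarity() function below implements this; the small eps added to each norm keeps an all-zero vector from causing division by zero.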
def cos_similarity(x, y, eps=1e-8):
nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
return np.dot(nx, ny)
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
c0 = C[word_to_id['you']]  # word vector for "you"
c1 = C[word_to_id['i']]  # word vector for "i"
print(cos_similarity(c0, c1))
# 0.7071067691154799
# cosine similarity ranges from -1 to 1, so this value is relatively high (the two words are similar)
""" @ query: 查询词 @ word_to_id: 单词到单词ID 的字典 @ id_to_word 单词ID 到单词的字典 @ word_matrix 汇总了单词向量的矩阵,假定保存了与各行对应的单词向量 @ top 显示到前几位 """ def most_similar(query, word_to_id, id_to_word, word_matrix, top=5): # 取出查询词 if query not in word_to_id: print('%s is not found' % query) return print('\n[query] ' + query) query_id = word_to_id[query] query_vec = word_matrix[query_id] # 计算余弦相似度 vocab_size = len(id_to_word) similarity = np.zeros(vocab_size) for i in range(vocab_size): similarity[i] = cos_similarity(word_matrix[i], query_vec) # 基于余弦相似度,按降序输出值 count = 0 for i in (-1 * similarity).argsort(): # argsort 返回排序后的索引 if id_to_word[i] == query: continue print(' %s: %s' % (id_to_word[i], similarity[i])) count += 1 if count >= top: return
Pointwise Mutual Information
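With C(x, y) the co-occurrence count of words x and y, C(x) and C(y) their individual counts, and N the total number of co-occurrences, PMI and its positive variant PPMI are defined as:

PMI(x, y)  = log2( P(x, y) / (P(x) P(y)) ) = log2( C(x, y) · N / (C(x) C(y)) )
PPMI(x, y) = max(0, PMI(x, y))

The ppmi() function below computes this for every entry of the co-occurrence matrix (eps keeps log2 from being applied to zero).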
def ppmi(C, verbose=False, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total//100+1) == 0:
                    print('%.1f%% done' % (100*cnt/total))
    return M
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)
np.set_printoptions(precision=3)  # show 3 significant digits
print('covariance matrix')
print(C)
print('-'*50)
print('PPMI')
print(W)
output:
covariance matrix
[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]]
--------------------------------------------------
PPMI
[[ 0.     1.807  0.     0.     0.     0.     0.   ]
 [ 1.807  0.     0.807  0.     0.807  0.807  0.   ]
 [ 0.     0.807  0.     1.807  0.     0.     0.   ]
 [ 0.     0.     1.807  0.     1.807  0.     0.   ]
 [ 0.     0.807  0.     1.807  0.     0.     0.   ]
 [ 0.     0.807  0.     0.     0.     0.     2.807]
 [ 0.     0.     0.     0.     0.     2.807  0.   ]]
This amounts to performing a spectral decomposition and keeping the k largest principal components.
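Written out, SVD factorizes the PPMI matrix W as

W = U S V^T

where U and V have orthonormal columns and S is a diagonal matrix of singular values sorted in descending order; keeping only the first k columns of U gives the k-dimensional dense word vectors.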
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(id_to_word)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)
# SVD
U, S, V = np.linalg.svd(W)
If the matrix size is N, the computational complexity of SVD reaches O(N^3), so faster methods such as Truncated SVD are usually used instead. Truncated SVD speeds things up by truncating (discarding) the smaller singular values.
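As a sketch, scikit-learn's randomized_svd can be dropped in where np.linalg.svd was used above; the component count here is an arbitrary choice for illustration, and the same call appears in the PTB experiment below:

from sklearn.utils.extmath import randomized_svd

k = 2  # number of singular vectors to keep
U_trunc, S_trunc, V_trunc = randomized_svd(W, n_components=k, n_iter=5, random_state=None)  # computes only the top-k part of the SVD
word_vecs = U_trunc  # U_trunc already has just k columns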
# word vector for the word with ID 0:
print(C[0])  # co-occurrence matrix
# [0 1 0 0 0 0 0]
print(W[0])  # PPMI matrix
# [ 0. 1.807 0. 0. 0. 0. 0. ]
print(U[0]) # SVD
# [ 3.409e-01 -1.110e-16 -1.205e-01 -4.441e-16 0.000e+00 -9.323e-01
# 2.226e-16]
The sparse vector W[0] has been transformed by SVD into the dense vector U[0]. To reduce the dimensionality of this dense vector, for example down to two dimensions, just take its first two elements:
print(U[0, :2])
# [ 3.409e-01 -1.110e-16]
import matplotlib.pyplot as plt

for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))  # draw each word's text at position (x, y) in the 2D plot
plt.scatter(U[:,0], U[:,1], alpha=0.5)
plt.show()
The Penn Treebank Corpus
In the PTB corpus used here, rare words are replaced with the special token <unk> (unknown), concrete numbers are replaced with "N", and the special token <eos> (end of sentence) marks the end of each sentence.
from dataset import ptb
corpus, word_to_id, id_to_word = ptb.load_data('train')
print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print()
print('id_to_word[0]:', id_to_word[0])
print('id_to_word[1]:', id_to_word[1])
print('id_to_word[2]:', id_to_word[2])
print()
print("word_to_id['car']:", word_to_id['car'])
output:
corpus size: 929589
corpus[:30]: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29]
id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz
word_to_id['car']: 3856
word_to_id['happy']: 4428
word_to_id['lexus']: 7426
window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('calculating PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    # truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)
except ImportError:
    # SVD (slow)
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]

querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
output:
[query] you
 i: 0.702039909619
 we: 0.699448543998
 've: 0.554828709147
 do: 0.534370693098
 else: 0.512044146526

[query] year
 month: 0.731561990308
 quarter: 0.658233992457
 last: 0.622425716735
 earlier: 0.607752074689
 next: 0.601592506413

[query] car
 luxury: 0.620933665528
 auto: 0.615559874277
 cars: 0.569818364381
 vehicle: 0.498166879744
 corsica: 0.472616831915

[query] toyota
 motor: 0.738666107068
 nissan: 0.677577542584
 motors: 0.647163210589
 honda: 0.628862370943
 lexus: 0.604740429865