In this chapter we first look at methods that predate deep learning, practicing text processing with Python: splitting text into words, converting words to word IDs, and so on.

Many kinds of corpora are used in NLP. Here we start with a simple text consisting of a single sentence as our corpus.
# 1. A simple corpus
>>> text = 'You say goodbye and I say hello.'

# 2. Tokenization
# (1) lower() converts every letter to lowercase
>>> text = text.lower()
# (2) To account for the period ending the sentence, insert a space before it
>>> text = text.replace('.', ' .')
>>> text
'you say goodbye and i say hello .'
# (3) Split on spaces
>>> words = text.split(' ')
>>> words
['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']

# 3. Assign an ID to each word
>>> word_to_id = {}
>>> id_to_word = {}
>>> for word in words:
...     if word not in word_to_id:
...         new_id = len(word_to_id)
...         word_to_id[word] = new_id
...         id_to_word[new_id] = word
...
>>> id_to_word
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}
>>> word_to_id
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

# 4. Use a Python list comprehension to turn the word list into a list of
#    word IDs, then convert that into a NumPy array
>>> import numpy as np
>>> corpus = [word_to_id[w] for w in words]
>>> corpus = np.array(corpus)
>>> corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
(A list comprehension such as a = [x**2 for x in xs] builds a new list by applying an expression to each element of an existing list xs.)
Bundling these steps into a single preprocess() function:

import numpy as np

def preprocess(text):
    '''Convert a text into a list of word IDs plus the two word/ID dictionaries.'''
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word
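Calling it on the sample sentence reproduces the interactive steps above in one line:

>>> corpus, word_to_id, id_to_word = preprocess('You say goodbye and I say hello.')
>>> corpus
array([0, 1, 2, 3, 4, 1, 5, 6])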
Under the distributional hypothesis, the most straightforward way to represent a word as a vector is to count the words that appear around it.
def create_co_matrix(corpus, vocab_size, window_size=1):
    '''Build a co-occurrence matrix.
    :param corpus: corpus (list of word IDs)
    :param vocab_size: vocabulary size
    :param window_size: window size (a window size of 1 means one word
                        on each side counts as context)
    :return: co-occurrence matrix
    '''
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix
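As a quick sanity check (a sketch, not part of the original listing): on the sample corpus, the row for 'say' should count its window-1 neighbors 'you', 'goodbye', 'i', and 'hello' once each:

>>> corpus, word_to_id, id_to_word = preprocess('You say goodbye and I say hello.')
>>> C = create_co_matrix(corpus, len(word_to_id))
>>> C[word_to_id['say']]
array([1, 0, 1, 0, 1, 1, 0], dtype=int32)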
Representative similarity measures include the vector inner product and the Euclidean distance, but for measuring the similarity of word vector representations, cosine similarity is the usual choice.
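Written out, with x·y the dot product and ‖x‖ the L2 norm:

cos_similarity(x, y) = x·y / (‖x‖‖y‖)

The value is 1 when the two vectors point in exactly the same direction and -1 when they point in opposite directions.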
def cos_similarity(x, y, eps=1e-8):
    '''Compute cosine similarity.
    :param x: vector
    :param y: vector
    :param eps: small value to avoid division by zero
    :return: cosine similarity of x and y
    '''
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

c0 = C[word_to_id['you']]  # word vector for 'you'
c1 = C[word_to_id['i']]    # word vector for 'i'
print(cos_similarity(c0, c1))
# 0.7071067691154799, i.e. 1/sqrt(2): the two words are fairly similar
When a word is given as a query, we display the words similar to it in descending order of similarity.
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    '''Search for the words most similar to the query.
    :param query: query word
    :param word_to_id: dictionary from word to word ID
    :param id_to_word: dictionary from word ID to word
    :param word_matrix: matrix of word vectors, one vector per row
    :param top: number of results to display
    '''
    # 1. Fetch the query word's ID and word vector
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # 2. Cosine similarity between the query vector and every other word vector
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # 3. Print the words in descending order of similarity
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

most_similar('you', word_to_id, id_to_word, C, top=5)

# Output
[query] you
 goodbye: 0.7071067691154799
 i: 0.7071067691154799
 hello: 0.7071067691154799
 say: 0.0
 and: 0.0

That 'goodbye' and 'hello' score as high as 'i' is counterintuitive, but not surprising for a corpus of a single sentence.
PMI (pointwise mutual information) between words x and y is defined as

PMI(x, y) = log2( P(x, y) / (P(x)P(y)) )

where P(x) is the probability of word x occurring and P(x, y) the probability of x and y co-occurring. In terms of a co-occurrence matrix with pair counts C(x, y), individual counts C(x), and total count N, this is PMI(x, y) = log2( C(x, y)·N / (C(x)C(y)) ). Since PMI goes to negative infinity when two words never co-occur, in practice we use the positive PMI:

PPMI(x, y) = max(0, PMI(x, y))
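For intuition, a small worked example with made-up counts: suppose N = 10000, 'the' occurs 1000 times, 'car' 20 times, 'drive' 10 times, and the pairs ('the', 'car') and ('car', 'drive') co-occur 10 and 5 times respectively. Then

PMI('the', 'car') = log2(10 · 10000 / (1000 · 20)) = log2(5) ≈ 2.32
PMI('car', 'drive') = log2(5 · 10000 / (20 · 10)) = log2(250) ≈ 7.97

so PMI ranks 'drive' as more related to 'car' than the far more frequent 'the', even though the raw co-occurrence counts say the opposite.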
def ppmi(C, verbose=False, eps=1e-8):
    '''Compute the PPMI (positive pointwise mutual information) matrix.
    :param C: co-occurrence matrix
    :param verbose: whether to print progress
    :param eps: small value to avoid log2(0)
    :return: PPMI matrix
    '''
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)          # total number of co-occurrences in the matrix
    S = np.sum(C, axis=0)  # per-word co-occurrence counts (column sums)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                # print progress roughly 100 times in total
                if cnt % (total // 100 + 1) == 0:
                    print('%.1f%% done' % (100 * cnt / total))
    return M
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3)  # show 3 significant digits
print('covariance matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)
covariance matrix
[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]]
--------------------------------------------------
PPMI
[[0.    1.807 0.    0.    0.    0.    0.   ]
 [1.807 0.    0.807 0.    0.807 0.807 0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.   ]
 [0.    0.    1.807 0.    1.807 0.    0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.   ]
 [0.    0.807 0.    0.    0.    0.    2.807]
 [0.    0.    0.    0.    0.    2.807 0.   ]]
We want to reduce the number of vector dimensions while retaining as much of the "important information" as possible. A matrix in which most elements are 0, like the one above, is called a sparse matrix. The key idea is to find the important axes of the sparse matrix (the directions in which the data is spread out) and re-represent it in fewer dimensions, converting it into a dense matrix in which most elements are non-zero. There are many dimensionality reduction methods; here we use singular value decomposition (SVD).
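SVD factorizes an arbitrary matrix X into the product of three matrices:

X = U S V^T

where U and V are orthogonal matrices and S is a diagonal matrix whose diagonal holds the singular values in descending order. Since the singular values encode the importance of each axis, keeping only the first k columns of U yields dense k-dimensional word vectors that approximate X as well as any rank-k representation can.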
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)

# SVD
U, S, V = np.linalg.svd(W)

print(C[0])  # co-occurrence matrix (row for 'you')
# [0 1 0 0 0 0 0]
print(W[0])  # PPMI matrix
# [0.    1.807 0.    0.    0.    0.    0.   ]
print(U[0])  # dense vector from SVD
# [-3.4094876e-01 -1.1102230e-16 -3.8857806e-16 -1.2051624e-01
#   0.0000000e+00  9.3232495e-01  2.2259700e-16]
To reduce this dense vector to two dimensions, simply take its first two elements: U[0, :2].
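The 2D vectors can be visualized by plotting each word at its coordinates in the first two columns of U. A minimal sketch, assuming matplotlib is installed:

import matplotlib.pyplot as plt

# label each point with its word, then scatter-plot all word vectors
for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
plt.show()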
Compared with the raw PTB text, a few preprocessing steps have been applied: rare words are replaced with the special token <unk>, concrete numbers are replaced with N, and so on.
from dataset import ptb  # PTB helper module that ships with the book's source code

corpus, word_to_id, id_to_word = ptb.load_data('train')

print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print()
print('id_to_word[0]:', id_to_word[0])
print('id_to_word[1]:', id_to_word[1])
print('id_to_word[2]:', id_to_word[2])
print()
print("word_to_id['car']:", word_to_id['car'])
print("word_to_id['happy']:", word_to_id['happy'])
print("word_to_id['lexus']:", word_to_id['lexus'])
# Output
corpus size: 929589
corpus[:30]: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]

id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz

word_to_id['car']: 3856
word_to_id['happy']: 4428
word_to_id['lexus']: 7426
Below we apply the count-based method to the PTB dataset, this time performing SVD on the large matrix with a faster implementation: sklearn's randomized_svd().
window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('calculating co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('calculating PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    # truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)
except ImportError:
    # full SVD (slow)
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]

querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
# Output
[query] you
 i: 0.6907097697257996
 we: 0.6247817277908325
 do: 0.5822529792785645
 anybody: 0.5639793872833252
 'd: 0.501946210861206

[query] year
 month: 0.6549689769744873
 quarter: 0.6407794952392578
 next: 0.6044325828552246
 months: 0.5909066200256348
 earlier: 0.5837885141372681

[query] car
 auto: 0.6536476612091064
 luxury: 0.6262210011482239
 cars: 0.6077972650527954
 corsica: 0.5058659315109253
 vehicle: 0.4927710294723511

[query] toyota
 motor: 0.7512580156326294
 motors: 0.6617915630340576
 mazda: 0.6069322824478149
 lexus: 0.591224193572998
 honda: 0.576309084892273
As an aside, rather than indexing manually with for i in range(len(list1)): print(i, list1[i]), the Pythonic way to iterate with indices is enumerate(), which yields (index, item) pairs:

for index, item in enumerate(list1):
    print(index, item)
And argsort(), which most_similar() relies on, returns the indices that would sort an array in ascending order; negating the array first yields descending order:

>>> x = np.array([100, -20, 2])
>>> x.argsort()
array([1, 2, 0])
>>> (-x).argsort()
array([0, 2, 1])