Link to the word2vec reference material.
The word2vec code is as follows:
import numpy as np
from collections import defaultdict

class word2vec():
    def __init__(self):
        # Reads the global settings dict defined below
        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    def generate_training_data(self, settings, corpus):
        """
        Build the training data
        """
        # defaultdict(int): a dict that returns a default int (0) for missing keys
        word_counts = defaultdict(int)
        # Walk through the corpus and count every word
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        # Vocabulary size
        self.v_count = len(word_counts.keys())
        # List of the words in the vocabulary
        self.words_list = list(word_counts.keys())
        # Dict mapping word -> index
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        # Dict mapping index -> word
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        training_data = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(sentence[i])
                w_context = []
                # Note: the right end of range() excludes i + self.window itself
                for j in range(i - self.window, i + self.window):
                    if j != i and j <= sent_len - 1 and j >= 0:
                        w_context.append(self.word2onehot(sentence[j]))
                training_data.append([w_target, w_context])
        # dtype=object because the context lists have different lengths (ragged array)
        return np.array(training_data, dtype=object)

    def word2onehot(self, word):
        # One-hot encode a word
        word_vec = [0 for i in range(0, self.v_count)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec

    def train(self, training_data):
        # Randomly initialise the weights w1 and w2
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
        for i in range(self.epochs):
            self.loss = 0
            # w_t is the one-hot vector of the target word
            # w_t -> w_target, w_c -> w_context
            for w_t, w_c in training_data:
                # Forward pass
                y_pred, h, u = self.forward(w_t)
                # Error summed over all context words
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
                # Backward pass: update the parameters
                self.backprop(EI, h, w_t)
                # Accumulate the total loss
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
            print('Epoch:', i, "Loss:", self.loss)

    def forward(self, x):
        """
        Forward pass
        """
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """
        Numerically stable softmax
        """
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x)

    def backprop(self, e, h, x):
        d1_dw2 = np.outer(h, e)
        d1_dw1 = np.outer(x, np.dot(self.w2, e.T))
        self.w1 = self.w1 - (self.lr * d1_dw1)
        self.w2 = self.w2 - (self.lr * d1_dw2)

    def word_vec(self, word):
        """
        Get a word's vector by looking up its index in the weight matrix
        """
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w

    def vec_sim(self, word, top_n):
        """
        Find the most similar words
        """
        v_w1 = self.word_vec(word)
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            # np.linalg.norm defaults to the 2-norm, i.e. the square root of the sum of squares
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den
            word = self.index_word[i]
            word_sim[word] = theta
        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
        for word, sim in words_sorted[:top_n]:
            print(word, sim)

    def get_w(self):
        w1 = self.w1
        return w1

# Hyperparameters
settings = {
    'window_size': 2,      # window size m
    'n': 10,               # dimension of the word embeddings; also the hidden-layer size
    'epochs': 50,          # number of passes over the whole training set
    'learning_rate': 0.01  # learning rate
}

# Prepare the data
text = "natural language processing and machine learning is fun and exciting"
# Tokenise the corpus by splitting on whitespace
corpus = [[word.lower() for word in text.split()]]
print(corpus)

# Create a word2vec object
w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)

# Train
w2v.train(training_data)

# Get the vector of a word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# Find similar words
w2v.vec_sim("machine", 3)
The original sentence is "natural language processing and machine learning is fun and exciting".
After counting the word frequencies and building the word/index mappings, the (target word, context words) pairs extracted from this sentence look like the list below (a small sketch that reproduces this windowing logic follows the list):
[
['natural', ['language']], ['language', ['natural', 'processing']],
['processing', ['natural', 'language', 'and']],
['and', ['language', 'processing', 'machine']],
['machine', ['processing', 'and', 'learning']],
['learning', ['and', 'machine', 'is']],
['is', ['machine', 'learning', 'fun']],
['fun', ['learning', 'is', 'and']],
['and', ['is', 'fun', 'exciting']],
['exciting', ['fun', 'and']]
]
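To make the windowing explicit, here is a minimal sketch (not part of the original code) that rebuilds the same (target, context) pairs at the word level; window = 2 mirrors settings['window_size']:

sentence = "natural language processing and machine learning is fun and exciting".lower().split()
window = 2

pairs = []
for i, target in enumerate(sentence):
    context = []
    # Mirrors the article's loop: range(i - window, i + window), so the
    # right-hand side of the window is one word shorter than the left.
    for j in range(i - window, i + window):
        if j != i and 0 <= j <= len(sentence) - 1:
            context.append(sentence[j])
    pairs.append([target, context])

for p in pairs:
    print(p)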
Calling
training_data = w2v.generate_training_data(settings, corpus)
returns training_data with the following structure:
training_data =
[[list([1, 0, 0, 0, 0, 0, 0, 0, 0]) list([[0, 1, 0, 0, 0, 0, 0, 0, 0]])]
[list([0, 1, 0, 0, 0, 0, 0, 0, 0])
list([[1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]])]
..........
[list([0, 0, 0, 0, 0, 0, 0, 0, 1])
list([[0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0]])]]
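Each row pairs the one-hot vector of the target word with the one-hot vectors of its context words. As a quick illustration, word2onehot behaves like this (a sketch that assumes the 9-word vocabulary is built in first-occurrence order):

words_list = ['natural', 'language', 'processing', 'and', 'machine',
              'learning', 'is', 'fun', 'exciting']
word_index = {w: i for i, w in enumerate(words_list)}

def word2onehot(word):
    vec = [0] * len(words_list)
    vec[word_index[word]] = 1
    return vec

print(word2onehot('natural'))   # [1, 0, 0, 0, 0, 0, 0, 0, 0]
print(word2onehot('language'))  # [0, 1, 0, 0, 0, 0, 0, 0, 0]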
Next we step into the train method:
self.w1 = np.random.uniform(-1,1,(self.v_count,self.n))
self.w2 = np.random.uniform(-1,1,(self.n,self.v_count))
Here self.v_count = 9 and self.n = 10, so self.w1 is a (9, 10) matrix and self.w2 is a (10, 9) matrix, both filled with values drawn uniformly from (-1, 1).
self.w1 =
[[-0.88093613 0.44287707 -0.20015634 -0.17542098 -0.18688373 0.25044748
0.86296623 0.85030189 0.78452837 -0.18417995]
......
[-0.52173874 0.2372753 -0.06543664 0.18024424 0.28042927 0.34655803
0.06426065 0.79247053 -0.60444507 0.45783363]]
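A quick way to sanity-check those shapes (a sketch, independent of the particular random values printed above):

import numpy as np

v_count, n = 9, 10
w1 = np.random.uniform(-1, 1, (v_count, n))
w2 = np.random.uniform(-1, 1, (n, v_count))
print(w1.shape, w2.shape)   # (9, 10) (10, 9)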
Next, look at the training loop:
for w_t,w_c in training_data:
y_pred,h,u = self.forward(w_t)
......
Before the backward pass, first examine the forward pass:
def forward(self, x):
"""
Forward pass
"""
h = np.dot(self.w1.T, x)
u = np.dot(self.w2.T, h)
y_c = self.softmax(u)
return y_c, h, u
Written out as formulas:
h = w1.T · x
u = w2.T · h = w2.T · (w1.T · x)
y_c = softmax(u) = softmax(w2.T · (w1.T · x))
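A small shape check of this forward pass (a sketch with hypothetical random weights; v_count = 9, n = 10, and the index 4 for 'machine' is an assumption based on first-occurrence vocabulary order):

import numpy as np

v_count, n = 9, 10
w1 = np.random.uniform(-1, 1, (v_count, n))
w2 = np.random.uniform(-1, 1, (n, v_count))

x = np.zeros(v_count)
x[4] = 1                                     # one-hot vector of the target word
h = np.dot(w1.T, x)                          # shape (10,); simply row 4 of w1
u = np.dot(w2.T, h)                          # shape (9,)
y = np.exp(u - np.max(u)); y = y / y.sum()   # softmax
print(h.shape, u.shape, round(y.sum(), 6))   # (10,) (9,) 1.0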
Then the error is summed over all context words:
EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
This is the gradient of the softmax/cross-entropy loss with respect to the output scores u, summed over the context words (see the worked equation below).
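For reference, the loss accumulated in train() and its gradient with respect to the scores u can be written as follows (standard skip-gram with a full softmax; C = len(w_c) is the number of context words and j_c* is the index of the c-th context word):

L = -\sum_{c=1}^{C} u_{j_c*} + C \cdot \log \sum_{j=1}^{V} \exp(u_j)

\partial L / \partial u = \sum_{c=1}^{C} (y_pred - y_c) = EI

which is exactly the EI computed above, and matches the self.loss line inside train().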
Next the code back-propagates and updates the parameters; this is the step worth looking at closely:
self.backprop(EI,h,w_t)
Stepping into the backprop function as quoted in the original analysis (note that d1_dw1 here uses w1.T, unlike the full listing at the top of the page):
def backprop(self,e,h,x):
d1_dw2 = np.outer(h,e)
d1_dw1 = np.outer(x,np.dot(self.w1.T,e.T))
self.w1 = self.w1 - (self.lr*d1_dw1)
self.w2 = self.w2 - (self.lr*d1_dw2)
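As a quick sanity check on the shapes involved (a sketch with hypothetical random values; e stands in for EI with length v_count = 9, x is a one-hot vector of length 9, h has length n = 10):

import numpy as np

v_count, n = 9, 10
w1 = np.random.uniform(-1, 1, (v_count, n))
e = np.random.rand(v_count)              # stands in for EI
x = np.zeros(v_count); x[0] = 1          # one-hot target word
h = np.dot(w1.T, x)                      # (10,)

d1_dw2 = np.outer(h, e)                  # (10, 9), same shape as w2
d1_dw1 = np.outer(x, np.dot(w1.T, e))    # (9, 10), same shape as w1
print(d1_dw1.shape, d1_dw2.shape)

Both gradients at least have the right shapes to be subtracted from w1 and w2; whether the formulas themselves are correct is what the rest of this section debates.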
Let's unpack what d1_dw2 and d1_dw1 compute.
First, write out what e, h and x are in terms of the forward pass:
e = EI = softmax(w2 * (w1 * word))
h = w2 * (w1 * word)    (note: in the forward code, h = w1.T · x is the hidden vector and u = w2.T · h is the score vector)
x = word

So the corresponding values are:

d1_dw1 = word * (w2 * softmaxloss) = softmaxloss * w2 * word
d1_dw2 = softmaxloss * (w2.T * (w1.T * word)) = softmaxloss * w2.T * word * w1.T
Personally, I think the partial derivatives used in this back-propagation are written incorrectly.
I feel it should be the following (differentiating with respect to w1.T leaves only w2, and differentiating with respect to w2.T leaves only w1):
def backprop(self, e, h, x):
#d1_dw2 = np.outer(h, e)
d1_dw2 = np.outer(x, np.dot(self.w1.T,e.T)).T
d1_dw1 = np.outer(x, np.dot(self.w2, e.T))
#d1_dw1 = np.outer(x,np.dot(self.w1.T,e.T))
self.w1 = self.w1 - (self.lr * d1_dw1)
self.w2 = self.w2 - (self.lr * d1_dw2)
Essentially, given the embedding of the centre word together with w1 and w2, multiplying the centre word's one-hot vector by w1 and then w2 pushes the model's predictions towards the surrounding context words.
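Since the two backprop variants disagree, a neutral way to settle the question is a finite-difference check: perturb one entry of w1 or w2, recompute the loss, and compare the numerical slope with the analytical gradient. A minimal sketch follows; the toy sizes (v_count = 5, n = 3), the RNG seed, and the chosen indices are assumptions for illustration only, and the candidate gradients are written in the form used by the full listing at the top of the page, so either variant can be swapped in for comparison:

import numpy as np

# Toy sizes and data (assumptions for this check only)
rng = np.random.default_rng(0)
v, n = 5, 3
w1 = rng.uniform(-1, 1, (v, n))
w2 = rng.uniform(-1, 1, (n, v))
x = np.eye(v)[0]                          # one-hot target word
contexts = [np.eye(v)[1], np.eye(v)[3]]   # one-hot context words

def loss(w1, w2):
    # Same loss as accumulated in train()
    h = w1.T @ x
    u = w2.T @ h
    return -sum(u[c.argmax()] for c in contexts) + len(contexts) * np.log(np.sum(np.exp(u)))

# Candidate analytical gradients (swap in either backprop variant here)
h = w1.T @ x
u = w2.T @ h
y = np.exp(u - u.max()); y = y / y.sum()
EI = sum(y - c for c in contexts)
g_w2 = np.outer(h, EI)             # candidate dL/dw2
g_w1 = np.outer(x, w2 @ EI)        # candidate dL/dw1

# Numerical slope for one entry of each matrix
eps = 1e-6
w1p = w1.copy(); w1p[0, 1] += eps
w2p = w2.copy(); w2p[2, 4] += eps
print((loss(w1p, w2) - loss(w1, w2)) / eps, g_w1[0, 1])
print((loss(w1, w2p) - loss(w1, w2)) / eps, g_w2[2, 4])

If a candidate gradient is correct, the two numbers on each line should agree to several decimal places.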
Finally, let's look at how word similarity is computed:
def vec_sim(self, word, top_n):
    """
    Find the most similar words
    """
    v_w1 = self.word_vec(word)
    word_sim = {}
    for i in range(self.v_count):
        v_w2 = self.w1[i]
        theta_sum = np.dot(v_w1, v_w2)
        # np.linalg.norm defaults to the 2-norm, i.e. the square root of the sum of squares
        theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
        theta = theta_sum / theta_den
        word = self.index_word[i]
        word_sim[word] = theta
    words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
    for word, sim in words_sorted[:top_n]:
        print(word, sim)
It walks over every word in the vocabulary, looks up its embedding (a row of w1), computes theta = dot(v_w1, v_w2) / (||v_w1|| * ||v_w2||) against the query word's vector, then sorts by this similarity and prints the top_n most similar words.
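theta here is plain cosine similarity; a tiny sketch with made-up vectors (not real embeddings):

import numpy as np

a = np.array([0.2, -0.5, 0.8])   # hypothetical word vectors
b = np.array([0.1, -0.4, 0.9])
cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos)                        # close to 1.0 for nearly parallel vectors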