
PyTorch: Word Vectors (Training Word Embeddings with PyTorch)

Based on the video tutorial by 蓝思诺特:
https://www.bilibili.com/video/BV1Ky4y1g7Nk?p=6



Drawbacks of one-hot encoding

  • Extremely sparse
  • Does not capture similarity or relatedness between words

Word vectorization projects each word into an n-dimensional space. The two mainstream simple ways to train word vectors are CBOW and Skip-gram; the sketch below contrasts a one-hot vector with a dense embedding.
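
As a quick illustration (toy vocabulary made up for this example), a one-hot vector is as long as the vocabulary and almost entirely zeros, while an embedding maps every word to a short, trainable vector:

import torch
import torch.nn.functional as F

vocab = {'cat': 0, 'dog': 1, 'car': 2, 'truck': 3}

# one-hot: one dimension per vocabulary entry, a single 1 and the rest zeros
one_hot = F.one_hot(torch.tensor(vocab['cat']), num_classes=len(vocab)).float()
print(one_hot)  # tensor([1., 0., 0., 0.])

# dense embedding: every word maps to a short vector whose values are learned
embed = torch.nn.Embedding(num_embeddings=len(vocab), embedding_dim=2)
print(embed(torch.tensor(vocab['cat'])))  # a 2-d trainable vector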


[Figure: CBOW and Skip-gram model diagrams]

  • CBOW: the words before and after predict the middle word; the number of words taken on each side is called the window_size. The sketch below shows the pair construction.
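
A minimal sketch of how a 5-token window (window_size = 2) turns a sentence into (context, target) pairs; the example sentence is made up for illustration:

sentence = ['the', 'quick', 'brown', 'fox', 'jumps', 'over']

pairs = []
for i in range(len(sentence) - 4):
    window = sentence[i:i + 5]
    context = window[:2] + window[3:]  # the two tokens on each side
    target = window[2]                 # the middle token
    pairs.append((context, target))

print(pairs[0])  # (['the', 'quick', 'fox', 'jumps'], 'brown')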

CBOW implementation

# each row can be viewed as one document
docs = [
    # digits
    ['5', '2', '4', '8', '6', '2', '3', '6', '4'],
    ['4', '8', '5', '6', '9', '5', '5', '6'],
    ['1', '1', '5', '2', '3', '3', '8'],
    ['3', '6', '9', '6', '8', '7', '4', '6', '3'],
    ['8', '9', '9', '6', '1', '4', '3', '4'],
    ['1', '0', '2', '0', '2', '1', '3', '3', '3', '3', '3'],
    ['9', '3', '3', '0', '1', '4', '7', '8'],
    ['9', '9', '8', '5', '6', '7', '1', '2', '3', '0', '1', '0'],

    # letters with a few digits mixed in
    ['a', 't', 'g', 'q', 'e', 'h', '9', 'u', 'f'],
    ['e', 'q', 'y', 'u', 'o', 'i', 'p', 's'],
    ['q', 'o', '9', 'p', 'l', 'k', 'j', 'o', 'k', 'k', 'o', 'p'],
    ['h', 'g', 'y', 'i', 'u', 't', 't', 'a', 'e', 'q'],
    ['i', 'k', 'd', 'q', 'r', 'e', '9', 'e', 'a', 'd'],
    ['o', 'p', 'd', 'g', '9', 's', 'a', 'f', 'g', 'a'],
    ['i', 'u', 'y', 'g', 'h', 'k', 'l', 'a', 's', 'w'],
    ['o', 'l', 'u', 'y', 'a', 'o', 'g', 'f', 's'],
    ['o', 'p', 'i', 'u', 'y', 'g', 'd', 'a', 's', 'j', 'd', 'l'],
    ['u', 'k', 'i', 'l', 'o', '9', 'l', 'j', 's'],
    ['y', 'g', 'i', 's', 'h', 'k', 'j', 'l', 'f', 'r', 'f'],
    ['i', 'o', 'h', '9', 'n', '9', 'd', '9', 'f', 'a', '9'],
]

# %%
# build the vocabulary: word -> index
zidian = {}
for doc in docs:
    for word in doc:
        if word not in zidian:
            zidian[word] = len(zidian)

zidian['0'], len(zidian) # (9, 30)

# %%
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn


# define the dataset
class DocDataset(Dataset):
    def __init__(self):
        xs = []
        ys = []

        for doc in docs:

            # encode the sentence into indices using the vocabulary
            doc_encode = [zidian[word] for word in doc]

            # slide over each sentence, stopping 4 tokens before the end
            for i in range(0, len(doc) - 4):

                # x is the two tokens on each side of the 5-token window
                xs.append([
                    doc_encode[i + 0], doc_encode[i + 1], doc_encode[i + 3],
                    doc_encode[i + 4]
                ])

                # y is the middle token of the 5
                ys.append(doc_encode[i + 2])

        self.xs = torch.LongTensor(xs)
        self.ys = torch.LongTensor(ys)

    def __getitem__(self, i):
        return self.xs[i], self.ys[i]

    def __len__(self):
        return len(self.xs)


len(DocDataset()) # 113
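
As a sanity check, the first (context, target) pair can be decoded back into tokens; `inv_zidian` is a small helper introduced here (not in the original code):

dataset = DocDataset()
inv_zidian = {idx: word for word, idx in zidian.items()}

# decode the first sample: x is the two tokens on each side, y is the middle token
x0, y0 = dataset[0]
print([inv_zidian[int(i)] for i in x0], '->', inv_zidian[int(y0)])
# for the first document this prints ['5', '2', '8', '6'] -> '4'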

# %%
# data loader
def get_dataloader():
    dataloader = DataLoader(dataset=DocDataset(),
                            batch_size=4,
                            shuffle=True,
                            drop_last=True)
    return dataloader


for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[0], sample[0].shape, sample[1], sample[1].shape

# %%
class CBOW(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.embed = torch.nn.Embedding(30, 2)  # 30 words, each mapped to a 2-d vector
        self.embed.weight.data.normal_(0, 0.1)  # initialise weights from a normal distribution (mean 0, std 0.1)

        self.fc = torch.nn.Linear(2, 30)  # fully connected output over the vocabulary, so the loss can be computed

    def forward(self, x):

        # [b,4] -> [b,4,2]: b samples, 4 context words, each word as a 2-d vector
        x = self.embed(x)  # embedding lookup

        # [b,4,2] -> [b,2]: average the 4 context word vectors into one
        x = torch.mean(x, dim=1)

        #[b,2] -> [b,30]
        x = self.fc(x)

        return x


model = CBOW()
out = model(sample[0])
out, out.shape


criteon = torch.nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=1e-2)

model.train()
for epoch in range(1000):
    for i, data in enumerate(get_dataloader()):
        x, y = data
        optim.zero_grad()

        # forward pass
        # [b,4] -> [b,30]
        out = model(x)

        loss = criteon(out, y)
        loss.backward()
        optim.step()

    if epoch % 200 == 0:
        print(epoch, loss.item())


colors = []
idxs = []
for word, idx in zidian.items():
    idxs.append(idx)
    if word in '1234567890':
        colors.append('red')
        continue
    colors.append('blue')

#[30] -> [30,2]
embed = model.embed(torch.LongTensor(idxs)).detach().numpy()

# plot the 30 words as points in the 2-d embedding space
from matplotlib import pyplot as plt
plt.scatter(embed[:, 0], embed[:, 1], c=colors)
plt.show()
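
Since the original motivation was that one-hot vectors cannot express word similarity, a natural follow-up check is to look at nearest neighbours in the learned embedding space. This is a minimal sketch (not part of the original tutorial); `query` is an arbitrary token from the toy vocabulary:

import torch.nn.functional as F

# cosine similarity between one token's embedding and every other token's
weights = model.embed.weight.data                  # [30, 2]
query = zidian['3']                                # an arbitrary token from the toy vocabulary
sims = F.cosine_similarity(weights[query].unsqueeze(0), weights, dim=1)

inv_zidian = {idx: word for word, idx in zidian.items()}
top = sims.argsort(descending=True)[1:4]           # the 3 closest tokens, skipping the token itself
print([inv_zidian[int(i)] for i in top])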


[Figure: scatter plot of the learned CBOW embeddings; digit tokens in red, letter tokens in blue]


Skip-gram implementation

Use the current word to predict the previous word, the next word, and the words two positions before and after it; see the sketch below.
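
A minimal sketch of the pair construction (the sentence is made up for illustration): each (centre word, one neighbouring word) combination becomes its own training sample, unlike CBOW where the whole context forms one input.

sentence = ['the', 'quick', 'brown', 'fox', 'jumps']

pairs = []
for i in range(len(sentence)):
    for j in (-2, -1, 1, 2):
        if 0 <= i + j < len(sentence):
            pairs.append((sentence[i], sentence[i + j]))

print(pairs[:4])
# [('the', 'quick'), ('the', 'brown'), ('quick', 'the'), ('quick', 'brown')]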

# %%
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# define the dataset
class DocDataset(Dataset):
    def __init__(self):
        xs = []
        ys = []

        for doc in docs:

            # every word in the sentence becomes an x (centre word)
            for i in range(len(doc)):

                # each offset around the centre word gives a y (context word)
                for j in [-2, -1, 1, 2]:

                    # skip offsets that fall outside the sentence
                    if i + j < 0 or i + j >= len(doc):
                        continue

                    xs.append(zidian[doc[i]])
                    ys.append(zidian[doc[i + j]])

        self.xs = torch.LongTensor(xs)
        self.ys = torch.LongTensor(ys)

    def __getitem__(self, i):
        return self.xs[i], self.ys[i]

    def __len__(self):
        return len(self.xs)

len(DocDataset())

# %%
# data loader
def get_dataloader():
    dataloader = DataLoader(dataset=DocDataset(),
                            batch_size=4,
                            shuffle=True,
                            drop_last=True)
    return dataloader


for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[0], sample[0].shape, sample[1], sample[1].shape


class SkipGram(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.embed = torch.nn.Embedding(30, 2)
        self.embed.weight.data.normal_(0,0.1)
        
        self.fc = torch.nn.Linear(2, 30)

    def forward(self, x):
        #[b] -> [b,2]
        x = self.embed(x)
        #[b,2] -> [b,30]
        x = self.fc(x)
        return x

model = SkipGram()
out = model(sample[0])
out[:2], out.shape


import random
criteon = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=1e-2)

model.train()
for epoch in range(200):
    for i, data in enumerate(get_dataloader()):
        x, y = data 
        optim.zero_grad()

        # forward pass
        # [b] -> [b,30]
        out = model(x)

        loss = criteon(out, y)
        loss.backward()
        optim.step()

    if epoch % 20 == 0:
        print(epoch, loss)

# %%
colors = []
idxs = []
for word, idx in zidian.items():
    idxs.append(idx)
    if word in '1234567890':
        colors.append('red')
        continue
    colors.append('blue')

embed = model.embed(torch.LongTensor(idxs))
embed = embed.detach().numpy()

from matplotlib import pyplot as plt
plt.scatter(embed[:, 0], embed[:, 1], c=colors)
plt.show()


[Figure: scatter plot of the learned Skip-gram embeddings; digit tokens in red, letter tokens in blue]


ELMo implementation

ELMo trains a forward and a backward LSTM language model over the same sentences and uses their hidden states as contextual word vectors; the code below trains both directions on the MSR paraphrase data and then concatenates the two directions into an embedding.

# %%
zidian = {}
with open('./data/msr_paraphrase/zidian.txt') as fr:
    for line in fr.readlines():
        k, v = line.split(' ')
        zidian[k] = int(v)

zidian['<PAD>'], len(zidian)

# %%
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn


# define the dataset
class MsrDataset(Dataset):
    def __init__(self):
        self.data = pd.read_csv('./data/msr_paraphrase/数字化数据.txt', nrows=2000)

    def __getitem__(self, i):
        return self.data.iloc[i]

    def __len__(self):
        return len(self.data)


len(MsrDataset())

# %%
def to_tensor(data):
    b = len(data)
    # 2*b sentences, each padded or truncated to 30 tokens
    xs = np.zeros((b * 2, 30))

    for i in range(b):
        same, s1, s2 = data[i]

        # add start/end markers and pad with <PAD> to a uniform length
        s1 = [zidian['<SOS>']] + s1.split(',')[:28] + [
            zidian['<EOS>']
        ] + [zidian['<PAD>']] * 28
        xs[i] = s1[:30]

        s2 = [zidian['<SOS>']] + s2.split(',')[:28] + [
            zidian['<EOS>']
        ] + [zidian['<PAD>']] * 28
        xs[b + i] = s2[:30]

    return torch.LongTensor(xs)


# data loader
def get_dataloader():
    dataloader = DataLoader(dataset=MsrDataset(),
                            batch_size=8,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=to_tensor)
    return dataloader


for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[:5], sample.shape
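
To see what the collate function produced, one row of the batch can be decoded back into tokens (a small sketch reusing the vocabulary loaded above; `inv_zidian` is a helper introduced here):

inv_zidian = {idx: word for word, idx in zidian.items()}

# the expected pattern is <SOS>, the sentence tokens, <EOS>, then <PAD> up to length 30
print([inv_zidian[int(i)] for i in sample[0]])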

# %%
class ForwardBackward(nn.Module):
    def __init__(self, flip):
        super().__init__()

        self.rnn1 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)
        self.rnn2 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)

        self.fc = nn.Linear(in_features=256, out_features=4300)

        self.flip = flip

    def forward(self, x):
        b = x.shape[0]

        # initialise the LSTM hidden and cell states
        h = torch.zeros(1, b, 256)
        c = torch.zeros(1, b, 256)

        # run the sequence through the LSTMs; the shape is unchanged
        #[16,29,256] -> [16,29,256]

        # for the backward direction, reverse x along the time dimension,
        # e.g. the first matrix below becomes the second:
        '''
        [[1,2,3],
         [4,5,6]]

        [[3,2,1],
         [6,5,4]]'''
        if self.flip:
            x = torch.flip(x, dims=(1, ))

        out1, (h, c) = self.rnn1(x, (h, c))
        out2, (h, c) = self.rnn2(out1, (h, c))

        # the outputs computed from the reversed x come out reversed as well; flip them back
        if self.flip:
            x = torch.flip(x, dims=(1, ))
            out1 = torch.flip(out1, dims=(1, ))
            out2 = torch.flip(out2, dims=(1, ))

        # fully connected output over the vocabulary
        #[16,29,256] -> [16,29,4300]
        out3 = self.fc(out2)

        return x, out1, out2, out3


x = torch.randn(16, 29, 256)  # random dummy input just to check output shapes
out = ForwardBackward(flip=True)(x)
len(out), out[-1].shape

# %%
class ELMo(nn.Module):
    def __init__(self):
        super().__init__()

        self.embed = nn.Embedding(num_embeddings=4300,
                                  embedding_dim=256,
                                  padding_idx=0)

        self.fw = ForwardBackward(flip=False)
        self.bw = ForwardBackward(flip=True)

    def forward(self, x):
        # embed the token indices
        #[16,30] -> [16,30,256]
        x = self.embed(x)

        # forward direction: each token predicts the next one, so the last token is not fed in
        outs_f = self.fw(x[:, :-1, :])

        # backward direction: each token predicts the previous one, so the first token is not fed in
        outs_b = self.bw(x[:, 1:, :])

        return outs_f, outs_b


out = ELMo()(sample)
len(out), len(out[0]), out[0][-1].shape

# %%
model = ELMo()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.CrossEntropyLoss()

for epoch in range(1):
    for i, x in enumerate(get_dataloader()):
        #x = [b,30]
        opt.zero_grad()

        # run the model
        outs_f, outs_b = model(x)

        # only the fully connected (logit) outputs are needed for the loss
        #[b,29,4300]
        outs_f = outs_f[-1]
        outs_b = outs_b[-1]

        # the forward model predicts the next token, so the first token is not a target
        #[b,30] -> [b,29]
        x_f = x[:, 1:]
        # the backward model predicts the previous token, so the last token is not a target
        #[b,30] -> [b,29]
        x_b = x[:, :-1]

        # flatten so the loss can be computed over all positions at once
        #[b,29,4300] -> [b*29,4300]
        outs_f = outs_f.reshape(-1, 4300)
        outs_b = outs_b.reshape(-1, 4300)
        #[b,29] -> [b*29]
        x_f = x_f.reshape(-1)
        x_b = x_b.reshape(-1)

        # compute the forward and backward losses and average them as the total loss
        loss_f = loss_func(outs_f, x_f)
        loss_b = loss_func(outs_b, x_b)
        loss = (loss_f + loss_b) / 2

        loss.backward()
        opt.step()

        if i % 20 == 0:
            # token-level accuracy (counted over all 29 positions, including padding)
            correct_f = (x_f == outs_f.argmax(axis=1)).sum().item()
            correct_b = (x_b == outs_b.argmax(axis=1)).sum().item()
            total = x.shape[0] * 29
            print(epoch, i, loss.item(), correct_f / total, correct_b / total)

# %%
def get_emb(x):
    # run the model
    outs_f, outs_b = model(x)

    # for the word-vector encoding, the output of any layer can be used (here the first LSTM layer)
    #[16,29,256]
    outs_f = outs_f[1]
    outs_b = outs_b[1]

    # the forward and backward outputs are not aligned; keep only the overlapping positions
    #[16,28,256]
    outs_f = outs_f[:, 1:]
    outs_b = outs_b[:, :-1]

    # concatenate them to form the final encoding
    #[16,28,256 + 256]
    embed = torch.cat((outs_f, outs_b), dim=2)

    #[16,28,512]
    return embed


get_emb(sample).shape
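
The resulting [b*2, 28, 512] tensor can feed a downstream task. As one possible use (a sketch, not part of the original tutorial), mean-pooling over the time dimension gives one 512-d vector per sentence, and the two halves of the batch can then be compared for the paraphrase pairs:

# mean-pool the contextual vectors into a single sentence vector
emb = get_emb(sample)                  # [16, 28, 512]
sent_vec = emb.mean(dim=1)             # [16, 512]

# the collate function stacked sentence 1 of every pair first, then sentence 2,
# so row i and row i + batch_size belong to the same paraphrase pair
b = sent_vec.shape[0] // 2
sims = torch.cosine_similarity(sent_vec[:b], sent_vec[b:], dim=1)
print(sims.shape)                      # [8]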


