The code is based on Section 10.8 of Dive into Deep Learning (PyTorch edition), with the word-embedding part changed.
GitHub address. The code relies on functions from the d2l package.
A working environment for Dive into Deep Learning (PyTorch edition) is CUDA 10 + PyTorch 1.2 + torchtext 0.4.
Since some torchtext 0.4 functions differ from the current releases, the official torchtext 0.4 documentation is linked below for reference.
For setting up CUDA 10 + PyTorch, see this author's article.
pip install torchtext==0.4.0
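After installing, a quick check (my own addition) that the versions match what the book's code expects:

import torch, torchtext
print(torch.__version__, torchtext.__version__)  # expecting something like 1.2.0 and 0.4.0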
import os
import tarfile

fname = os.path.join(DATA_ROOT, "aclImdb_v1.tar.gz")
if not os.path.exists(os.path.join(DATA_ROOT, "aclImdb")):
    print("Extracting from the archive...")
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)
!pip install transformers
import torch
from transformers import BertModel, BertTokenizer
# Download bert-base-uncased (lowercase vocabulary, 768-dimensional hidden states)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='./transformers/bert-base-uncased/')
model = BertModel.from_pretrained('bert-base-uncased', cache_dir='./transformers/bert-base-uncased/')
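To confirm the download works and that the hidden states really are 768-dimensional, a small check (my own addition; the sample sentence is arbitrary):

sample_ids = torch.tensor([tokenizer.encode("a quick test", add_special_tokens=True)])
with torch.no_grad():
    sample_out = model(sample_ids)[0]  # last hidden states
print(sample_out.shape)                # (1, sequence_length, 768)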
import os
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F

import sys
sys.path.append("..")
import d2lzh_pytorch as d2l

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_ROOT = "../../data"  # change this to the directory that holds the extracted dataset
print(torch.__version__, device)
from transformers import BertModel, BertTokenizer
# Use the bert-base model; its vocabulary is lowercased
model_name = 'bert-base-uncased'
# Load the tokenizer that matches the model
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
# Load the model
model = BertModel.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
model.to(device)
model.eval()
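One note I am adding here (not in the original code): the optimizer defined later only receives the TextCNN's parameters, so BERT itself is never updated, but gradients still flow through it during training unless they are switched off. Freezing BERT explicitly saves some memory:

for param in model.parameters():
    param.requires_grad = False  # use BERT purely as a frozen feature extractor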
train_data = d2l.read_imdb('train', data_root=os.path.join(DATA_ROOT, "aclImdb"))
test_data = d2l.read_imdb('test', data_root=os.path.join(DATA_ROOT, "aclImdb"))
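For reference, the book's d2l.read_imdb returns a shuffled list of [review_text, label] pairs, with label 1 for positive and 0 for negative; a quick check of that format (my own addition):

print(len(train_data))   # 25000 reviews in the training set
print(train_data[0][1])  # a 0/1 sentiment label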
Here I wrote a pile of for loops to force the data into the same format as in Dive into Deep Learning (PyTorch edition). It is a clumsy approach, so please go easy on me.
def pretreatment(original_data):
    # original_data: list of [review_text, label]; each review is converted to BERT token ids
    # and padded or truncated to a fixed length of 500 (102 is the id of BERT's [SEP] token)
    i = 0
    for element in original_data:
        temporary = []
        original_data[i][0] = torch.tensor(tokenizer.encode(element[0], add_special_tokens=True))
        if (original_data[i][0].shape)[0] > 500:
            # truncate to 500 tokens and put [SEP] back at the end
            original_data[i][0] = original_data[i][0][:500]
            original_data[i][0][499] = 102
        elif (original_data[i][0].shape)[0] < 500:
            # pad with zeros up to 500 tokens and place [SEP] at position 499
            n = torch.zeros(500)
            n[: (original_data[i][0].shape)[0]-1] = original_data[i][0][:(original_data[i][0].shape)[0]-1]
            original_data[i][0] = n
            original_data[i][0][499] = 102
        temporary.append(element[1])
        original_data[i][1] = torch.tensor(temporary)
        i = i + 1
    features = torch.cat([original_data[i][0].unsqueeze(0).long() for i in range(len(original_data))])
    labels = torch.cat([original_data[i][1] for i in range(len(original_data))], 0)
    return features, labels
train_set = Data.TensorDataset(*(pretreatment(train_data)))
test_set = Data.TensorDataset(*(pretreatment(test_data)))
batch_size = 2  # adjust batch_size to fit your GPU memory
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size, shuffle=True)
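A quick look at one batch, in the same spirit as the checks in the book:

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)  # e.g. X torch.Size([2, 500]) y torch.Size([2])
    break
print('#batches:', len(train_iter))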
My GPU is a GTX 1050 with 4 GB of memory, which can handle batch_size = 2, but at first I trained for 5 epochs with only batch_size = 1.
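The TextCNN below relies on GlobalMaxPool1d, which Section 10.8 of the book defines as max pooling over the entire time dimension; it is reproduced here so the snippet is self-contained:

class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        # x shape: (batch_size, channels, seq_len); pool over the whole time dimension
        return F.max_pool1d(x, kernel_size=x.shape[2])  # -> (batch_size, channels, 1)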
class TextCNN(nn.Module):
    def __init__(self, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no parameters, so one instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # create several one-dimensional convolutional layers

        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2*embed_size,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, inputs):
        # BERT's last hidden states serve as the word vectors: shape (batch_size, 500, 768)
        outputs = model(inputs)[0]
        # Concatenate two copies so the channel size is 2*embed_size, matching the book's
        # two-embedding setup: (batch, seq_len, 2*embed_size)
        embeddings = torch.cat((
            outputs,
            outputs), dim=2)
        # Conv1d expects the channel dimension (the word-vector dimension) before the time dimension
        embeddings = embeddings.permute(0, 2, 1)
        # For each conv layer, max-over-time pooling gives a tensor of shape (batch_size, channels, 1);
        # squeeze the last dimension and concatenate along the channel dimension
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs
There are some changes here, and my own thinking was not entirely clear, so it may well be wrong, but it does run.
embed_size, kernel_sizes, nums_channels = 768, [3, 4, 5], [100, 100, 100]
net = TextCNN(embed_size, kernel_sizes, nums_channels)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
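As a quick sanity check (my own addition, assuming d2l.train has already moved net onto the device), a dummy batch of arbitrary token ids should produce one score per class:

dummy_ids = torch.randint(1000, 2000, (2, 500), device=device)  # arbitrary but valid BERT token ids
with torch.no_grad():
    print(net(dummy_ids).shape)  # expect torch.Size([2, 2])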
def predict_sentiment(net, sentence):
    """sentence is an English review string"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True), device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'
print("Please enter an English sentence reviewing a movie:")
s = input()
# No padding or truncation to 500 is applied here; worth trying
print(predict_sentiment(net, s))
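For example (made-up test sentences, in the same spirit as the book's):

print(predict_sentiment(net, "this movie is so great"))  # hopefully 'positive'
print(predict_sentiment(net, "this movie is so bad"))    # hopefully 'negative'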
For this result, apart from batch_size = 1, the hyperparameters are the same as in the code above. I trained it while sleeping, so I am fairly relaxed about the training time. My impression is that after 5 epochs the model has not overfitted, and the gap between train acc and test acc is quite small, so it could be trained for a few more epochs ╭(╯^╰)╮.
The same network with 300-dimensional GloVe vectors clearly overfits.
As for the Transformer itself, my personal computer simply cannot handle it; it is far too slow.
Result of testing with an input sentence:
Even I feel my code is a bit ridiculous. This is just a small exercise of mine, and there are surely many things wrong with it, since I have not really mastered BERT yet. I will keep working at it.