This version contains no major changes, except that torchtext no longer needs to be configured and d2l is no longer imported, which means any version of PyTorch can be used. The only environment setup required is `pip install transformers`.
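As a quick sanity check that the environment is ready (a minimal sketch; any reasonably recent versions should work):

```python
import torch
import transformers
print(torch.__version__, transformers.__version__, torch.cuda.is_available())
```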
```python
import os
from tqdm import tqdm
import random
import time
import torch
from torch import nn
import torch.utils.data as Data
import torch.nn.functional as F
import sys
sys.path.append("..")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_ROOT = "../../data"
print(torch.__version__, device)

from transformers import BertModel, BertTokenizer
# Here we use the bert-base model with a lowercased vocabulary
model_name = 'bert-base-uncased'
# Load the tokenizer that matches the model
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
# Load the pretrained model
model = BertModel.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
model.to(device)
model.eval()
```
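For reference, `tokenizer.encode` with `add_special_tokens=True` wraps the text in the special tokens [CLS] (id 101) and [SEP] (id 102) for bert-base-uncased; that id 102 is why the value is hard-coded in the padding code below:

```python
ids = tokenizer.encode("a great movie", add_special_tokens=True)
print(ids[0], ids[-1])                                 # 101 102
print(tokenizer.cls_token_id, tokenizer.sep_token_id)  # 101 102
```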
```python
def read_imdb(folder='train', data_root="/Datasets/aclImdb"):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

train_data = read_imdb('train', data_root=os.path.join(DATA_ROOT, "aclImdb"))
test_data = read_imdb('test', data_root=os.path.join(DATA_ROOT, "aclImdb"))
```
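A quick look at the loaded data (the full aclImdb dataset has 25,000 reviews per split):

```python
print(len(train_data), len(test_data))  # 25000 25000
review, label = train_data[0]
print(label, review[:80])               # a 0/1 label and the start of a lowercased review
```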
```python
def pretreatment(original_data):
    # Convert each [review, label] pair into fixed-length (500) token ids.
    # 102 is the [SEP] id and 0 the [PAD] id of bert-base-uncased.
    for i, element in enumerate(original_data):
        ids = torch.tensor(tokenizer.encode(element[0], add_special_tokens=True))
        if ids.shape[0] > 500:
            ids = ids[:500]
            ids[499] = 102  # keep [SEP] as the last token after truncation
        elif ids.shape[0] < 500:
            n = torch.zeros(500)
            n[:ids.shape[0] - 1] = ids[:ids.shape[0] - 1]  # copy everything before the original [SEP]
            ids = n
            ids[499] = 102  # pin [SEP] at position 499, after the zero padding
        original_data[i][0] = ids
        original_data[i][1] = torch.tensor([element[1]])
    features = torch.cat([original_data[i][0].unsqueeze(0).long() for i in range(len(original_data))])
    labels = torch.cat([original_data[i][1] for i in range(len(original_data))], 0)
    return features, labels

train_set = Data.TensorDataset(*(pretreatment(train_data)))
test_set = Data.TensorDataset(*(pretreatment(test_data)))
batch_size = 2
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size, shuffle=True)
del train_data
del test_data
del train_set
del test_set
```
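As an aside, recent transformers releases can do this truncation and padding in one call. A hedged alternative sketch (argument names assume transformers >= 3.x); note that unlike `pretreatment`, this keeps [SEP] directly after the text rather than pinning it at position 499:

```python
enc = tokenizer("a great movie", max_length=500, truncation=True,
                padding='max_length', return_tensors='pt')
print(enc['input_ids'].shape)  # torch.Size([1, 500])
```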
```python
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2])  # shape: (batch_size, channel, 1)

class TextCNN(nn.Module):
    def __init__(self, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        # self.embedding = nn.Embedding(len(vocab), embed_size)
        # Embedding layer that does not participate in training
        # self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no weights, so one instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # create multiple one-dimensional convolutional layers

        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2 * embed_size,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, inputs):
        # BERT runs in eval mode and its parameters are not passed to the optimizer,
        # so it acts as a frozen feature extractor here
        outputs = model(inputs)[0]  # shape: (batch_size, seq_len, 768)
        # Concatenate two copies of the BERT output to match in_channels = 2*embed_size
        # (in the original TextCNN these were a trainable and a frozen embedding)
        embeddings = torch.cat((
            outputs,
            outputs), dim=2)  # (batch, seq_len, 2*embed_size)
        # Conv1d expects (batch, channels, seq_len), so move the embedding dimension
        # (the channel dimension of the 1-D convolution) before the sequence dimension
        embeddings = embeddings.permute(0, 2, 1)
        # For each conv layer, max-over-time pooling yields a tensor of shape
        # (batch_size, channels, 1); squeeze the last dimension, then concatenate
        # along the channel dimension
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs

embed_size, kernel_sizes, nums_channels = 768, [3, 4, 5], [100, 100, 100]
net = TextCNN(embed_size, kernel_sizes, nums_channels)
```
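A quick shape check before training (a sketch, not part of the original pipeline; it runs BERT on random token ids, so it can be slow on CPU):

```python
with torch.no_grad():
    dummy = torch.randint(0, tokenizer.vocab_size, (2, 500), device=device)
    print(net.to(device)(dummy).shape)  # torch.Size([2, 2]): one logit per class
```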
```python
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # If no device is specified, use net's device
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()  # evaluation mode; this disables dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:  # custom models (not used after Section 3.13 of the book); GPU not considered
                if 'is_training' in net.__code__.co_varnames:  # if the function has an is_training argument
                    # set is_training to False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
```
```python
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

lr, num_epochs = 0.001, 1
# BERT's parameters are not in net, so only the TextCNN layers are optimized
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

os.makedirs("./SaveModel", exist_ok=True)  # make sure the save directory exists
PATH = "./SaveModel/BertBaseEpoch1.pth"
torch.save(net, PATH)
```
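Note that `torch.save(net, PATH)` pickles the whole module object, so loading it later requires the same class definitions in scope. A common alternative (a sketch; the `_state` filename is illustrative) is to save only the weights:

```python
torch.save(net.state_dict(), "./SaveModel/BertBaseEpoch1_state.pth")
# later: net2 = TextCNN(embed_size, kernel_sizes, nums_channels)
#        net2.load_state_dict(torch.load("./SaveModel/BertBaseEpoch1_state.pth"))
```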
```python
def predict_sentiment(net, sentence):
    """sentence is an English review given as a string"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True), device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'

print("Please enter an English movie review:")
s = input()
# No padding to 500 is needed here: global max pooling handles variable lengths,
# as long as the input has at least max(kernel_sizes) tokens
print(predict_sentiment(net, s))
```
The following standalone script reloads the saved model for inference, so predictions can be made without rerunning training:

```python
import torch
from torch import nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from transformers import BertModel, BertTokenizer
# Here we use the bert-base model with a lowercased vocabulary
model_name = 'bert-base-uncased'
# Load the tokenizer that matches the model
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
# Load the pretrained model
model = BertModel.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
model.to(device)
model.eval()

class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2])  # shape: (batch_size, channel, 1)

class TextCNN(nn.Module):
    def __init__(self, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        # self.embedding = nn.Embedding(len(vocab), embed_size)
        # Embedding layer that does not participate in training
        # self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no weights, so one instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # create multiple one-dimensional convolutional layers

        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2 * embed_size,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, inputs):
        outputs = model(inputs)[0]  # shape: (batch_size, seq_len, 768)
        # Concatenate two copies of the BERT output to match in_channels = 2*embed_size
        embeddings = torch.cat((
            outputs,
            outputs), dim=2)  # (batch, seq_len, 2*embed_size)
        # Conv1d expects (batch, channels, seq_len), so move the embedding dimension first
        embeddings = embeddings.permute(0, 2, 1)
        # Max-over-time pooling gives (batch_size, channels, 1) per conv layer;
        # squeeze the last dimension and concatenate along the channel dimension
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs

PATH = "./SaveModel/BertBaseEpoch1.pth"
# The whole module object was pickled, so the class definitions above must be in
# scope; on PyTorch >= 2.6 this may additionally need torch.load(PATH, weights_only=False)
net = torch.load(PATH)
net.to(device)

def predict_sentiment(net, sentence):
    """sentence is an English review given as a string"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True), device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'

print("Please enter an English movie review:")
s = input()
# No padding to 500 is needed: global max pooling handles variable lengths,
# as long as the input has at least max(kernel_sizes) tokens
print(predict_sentiment(net, s))
```