First, load the dataset. Here a separate train.py file is created to read the data and attach labels by hand (everything could also be done in a single file):
```python
import pandas as pd

def load_data(path):
    data = pd.read_csv(path, encoding='utf-8', sep='|', nrows=14)
    data = data.content
    # Manually assigned labels (0: neutral, 1: dissatisfied, 2: strongly dissatisfied)
    label_list = [2, 0, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 1, 1]
    data = pd.DataFrame(data)
    data["label"] = label_list
    return data
```
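For reference, `load_data` expects a `|`-separated CSV with a `content` column and at least 14 rows (`nrows=14`, matching the 14 entries in `label_list`). The dataset file itself is not included in the post; a purely illustrative stand-in could be generated like this:

```python
# Illustrative stand-in only: the real data/train_dataset/train_dataset.csv
# is not shown in the post. load_data() just needs a '|'-separated file
# with a 'content' column and at least 14 rows.
rows = ["content"] + [f"示例通话文本 {i}" for i in range(14)]
with open("data/train_dataset/train_dataset.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(rows))
```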
Then create another .py file in the same directory, load the dataset, and run training and testing:
```python
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
import transformers
from transformers import AutoModel, AutoTokenizer
import matplotlib.pyplot as plt

train_curve = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # fall back to CPU if no GPU

# Hyperparameters; the model is the basic Chinese BERT
batch_size = 2
epoches = 100
model = "bert-base-chinese"
hidden_size = 768
n_class = 3
maxlen = 8

from train import load_data

data = load_data("data/train_dataset/train_dataset.csv")
sentences = list(data.content)
labels = list(data.label)

# print(sentences)
# print(labels)

# Build the inputs BERT expects:
# input_ids: token ids from the tokenizer's vocabulary
# attention_mask: same length as input_ids; 1 at real-token positions, 0 at padding
# token_type_ids: 0 for tokens of the first sentence, 1 for the second
class MyDataset(Data.Dataset):  # a Dataset subclass must implement __len__ and __getitem__
    def __init__(self, sentences, labels=None, with_labels=True):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.with_labels = with_labels
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        # Select the sentence at the given index
        sent = self.sentences[index]

        # Tokenize the sentence to get token ids, attention mask and token type ids
        encoded_pair = self.tokenizer(sent,
                                      padding='max_length',  # pad to max_length
                                      truncation=True,       # truncate to max_length
                                      max_length=maxlen,
                                      return_tensors='pt')   # return torch.Tensor objects

        token_ids = encoded_pair['input_ids'].squeeze(0)        # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor: 1 for real tokens, 0 for padding
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor: 0 for 1st-sentence tokens, 1 for 2nd

        if self.with_labels:  # True if the dataset has labels
            label = self.labels[index]
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=0)
```
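As a quick sanity check (not part of the original script), you can print what the tokenizer returns for a single sentence; the 1/0 pattern in `attention_mask` matches the comments above. A minimal sketch using the same `bert-base-chinese` tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-chinese")
enc = tok("你好", padding='max_length', truncation=True,
          max_length=8, return_tensors='pt')
print(enc['input_ids'])       # shape [1, 8]; starts with the [CLS] id (101)
print(enc['attention_mask'])  # 1 at real-token positions, 0 at padding
print(enc['token_type_ids'])  # all 0, since there is only one sentence
```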
```python
# model
class BertClassify(nn.Module):
    def __init__(self):
        super(BertClassify, self).__init__()
        self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)
        self.linear = nn.Linear(hidden_size, n_class)  # classify the [CLS] vector with a single fully connected layer
        self.dropout = nn.Dropout(0.5)

    def forward(self, X):
        input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # returns an output dict
        # Classify with the pooled [CLS] vector of the last layer
        # outputs.pooler_output: [bs, hidden_size]
        logits = self.linear(self.dropout(outputs.pooler_output))

        return logits

bc = BertClassify().to(device)

optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2)
loss_fn = nn.CrossEntropyLoss()

# train
sum_loss = 0
total_step = len(train)
for epoch in range(epoches):
    for i, batch in enumerate(train):
        optimizer.zero_grad()
        batch = tuple(p.to(device) for p in batch)
        pred = bc([batch[0], batch[1], batch[2]])
        loss = loss_fn(pred, batch[3])
        sum_loss += loss.item()  # accumulate the loss

        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch + 1, epoches, i + 1, total_step, loss.item()))
    train_curve.append(sum_loss)
    sum_loss = 0  # reset after recording each epoch's loss
```
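The post never saves the fine-tuned weights. If you want to reuse the model without retraining, a minimal sketch (the file name is an arbitrary choice, not from the original):

```python
# Not in the original post: persist the fine-tuned weights.
torch.save(bc.state_dict(), 'bert_classify.pt')  # file name is arbitrary

# To reload later:
# bc = BertClassify().to(device)
# bc.load_state_dict(torch.load('bert_classify.pt', map_location=device))
```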
```python
# test
bc.eval()  # eval mode disables dropout; gradient tracking is disabled by no_grad below
with torch.no_grad():
    test_text = ['服务,喂,你好,是这样的,我那个我的电话因为上次突然把给我停机了,我不知道啥意思,然后我就好长时间没有使用,我这次拿着10,怎么我欠费160多了?噢,我想问一下,我这机器现在是在停机状态吧,']
    test = MyDataset(test_text, labels=None, with_labels=False)
    x = test.__getitem__(0)  # returns token_ids, attn_masks, token_type_ids (no label, since with_labels=False)
    x = tuple(p.unsqueeze(0).to(device) for p in x)  # add a batch dimension (batch size 1) and move to device
    pred = bc([x[0], x[1], x[2]])
    pred = pred.data.max(dim=1, keepdim=True)[1]
    if pred[0][0] == 0:
        print('中性')      # neutral
    elif pred[0][0] == 1:
        print('不满')      # dissatisfied
    else:
        print('强烈不满')  # strongly dissatisfied

pd.DataFrame(train_curve).plot()  # loss curve
plt.show()
```
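To score more than one sentence at a time, the same `MyDataset` can be reused with `with_labels=False` and a `DataLoader`. A sketch with made-up inputs (the `id2label` mapping follows the if/elif branches above):

```python
# Sketch, not in the original post: batch inference over several sentences.
id2label = {0: '中性', 1: '不满', 2: '强烈不满'}  # neutral / dissatisfied / strongly dissatisfied
test_texts = ['信号很好,谢谢', '一直乱扣费,太过分了']  # illustrative inputs
loader = Data.DataLoader(MyDataset(test_texts, with_labels=False), batch_size=batch_size)
with torch.no_grad():
    for batch in loader:
        batch = tuple(p.to(device) for p in batch)
        preds = bc([batch[0], batch[1], batch[2]]).argmax(dim=1)
        for p in preds.tolist():
            print(id2label[p])
```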