The dataset used in this experiment can be downloaded here.
Full code: github or gitee
from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import torch
device0 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for training
device1 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for evaluation
data = pd.read_table('./data/train.txt', header=None)  # columns: text, label
data.columns = ['text', 'label']
text=[i for i in data['text']]
label=[i for i in data['label']]
# A column can be selected via df.colname; value_counts() counts the label distribution here
df2 = data.label.value_counts()
print(df2)
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.dataset = df

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label,
                  "input_ids": input_ids, "attention_mask": attention_mask}
        # print(sample)
        return sample

print('text2token')
from transformers import AutoTokenizer, AutoModel
# added_token = ['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese", additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

def text2token(text, tokenizer, max_length=100):
    text2id = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = text2id["input_ids"].tolist()
    attention_mask = text2id["attention_mask"].tolist()
    return input_ids, attention_mask

input_ids, attention_mask = text2token(text, tokenizer, max_length=100)
data['input_ids'] = input_ids
data['attention_mask'] = attention_mask

train_data = data.sample(frac=0.8)
test_data = data[~data.index.isin(train_data.index)]
print(len(train_data), len(test_data))
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

print('DataLoader')
# split into batches of size batch_size
batch_size = 16
train_loader = DataLoader(
    SentimentDataset(train_data),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0
)
test_loader = DataLoader(
    SentimentDataset(test_data),
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

import pickle
with open('train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)
with open('test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)
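Note that because __getitem__ returns input_ids as a plain Python list, the default collate function transposes the batch: batch['input_ids'] comes out as a list of 100 tensors of length batch_size, which is why the training code later uses torch.stack(...).t(). A quick check of one batch makes the layout visible (a sketch, assuming the loaders above were built):

# Peek at one batch to see the layout produced by the default collate function.
sample_batch = next(iter(train_loader))
print(len(sample_batch['input_ids']))                      # 100 (one tensor per token position)
print(sample_batch['input_ids'][0].shape)                  # torch.Size([batch_size])
print(torch.stack(sample_batch['input_ids']).t().shape)    # torch.Size([batch_size, 100])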
If the loaders were saved earlier, they can be loaded back directly:
import pickle
with open("train_loader.pkl",'rb') as f:
train_loader = pickle.loads(f.read())
with open("test_loader.pkl",'rb') as f:
test_loader = pickle.loads(f.read())
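Keep in mind that unpickling a DataLoader requires the SentimentDataset class to be importable in the loading script. An alternative sketch (the file names are assumptions for illustration) persists the processed DataFrames instead and rebuilds the loaders on load:

# Alternative: save the processed DataFrames and rebuild the DataLoaders later.
train_data.to_pickle('train_data.pkl')
test_data.to_pickle('test_data.pkl')

# ... in the loading script, with SentimentDataset defined/imported:
train_data = pd.read_pickle('train_data.pkl')
test_data = pd.read_pickle('test_data.pkl')
train_loader = DataLoader(SentimentDataset(train_data), batch_size=16, shuffle=True)
test_loader = DataLoader(SentimentDataset(test_data), batch_size=16, shuffle=False)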
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

class fn_cls(nn.Module):
    def __init__(self, device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-chinese")  # same pretrained weights as the tokenizer
        self.model.resize_token_embeddings(len(tokenizer))  # resize in case special tokens were added
        self.model.to(device)
        # self.dropout = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, 1)

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        # outputs[0]: last hidden states, torch.Size([batch, 100, 768])
        # outputs[1]: pooled [CLS] output, torch.Size([batch, 768])
        # outputs[0][:, 0, :] would also give torch.Size([batch, 768])
        x = outputs[1]
        # x = self.dropout(x)
        x = self.l1(x)
        return x

# cls = fn_cls(device0)
# from torch import optim
# optimizer = optim.Adam(cls.parameters(), lr=1e-4)
sigmoid = nn.Sigmoid()
criterion = nn.BCELoss()  # weight=weight
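Before wiring up the optimizer, it can help to confirm that the pooled output really feeds a single logit per sentence. A minimal shape check (a sketch; check_cls and the sample sentence are illustrative only):

# Sanity check: one sentence in, one logit out.
check_cls = fn_cls(device0)
check_cls.to(device0)  # also moves the linear head to the device
check_enc = tokenizer(["这部电影很好看"], max_length=100, padding='max_length',
                      truncation=True, return_tensors="pt")
with torch.no_grad():
    check_out = check_cls(check_enc["input_ids"].to(device0),
                          attention_mask=check_enc["attention_mask"].to(device0))
print(check_out.shape)  # expected: torch.Size([1, 1])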
from sklearn import metrics
import numpy as np
from tqdm import tqdm

def test(device_test):
    cls.to(device_test)
    cls.eval()
    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(test_loader):
        with torch.no_grad():
            label = batch['label'].to(device_test).float().view(-1, 1)  # batch size * 1
            label_all.append(label)
            input_ids = torch.stack(batch['input_ids']).t().to(device_test)  # batch size * 100
            attention_mask = torch.stack(batch['attention_mask']).t().to(device_test)  # batch size * 100
            # forward pass
            output = cls(input_ids, attention_mask=attention_mask)  # batch size * 1
            output = sigmoid(output)  # batch size * 1
            total += len(output)
            # loss
            loss = criterion(output, label)
            epoch_loss += loss
            ave_loss = epoch_loss / total
            # round probabilities to 0/1
            output = output.round()
            output_all.append(output)
            # accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrect: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(test_loader), 100. * batch_idx / len(test_loader),
                    correct, total, 100. * acc, ave_loss
                ), end="\r")
    # final summary
    print('correct: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
        correct, total, 100. * acc, ave_loss))
    # cuda tensors can't be converted to numpy directly; move them to cpu first
    output_all = torch.cat(output_all, 0)
    label_all = torch.cat(label_all, 0)
    output_all = np.array(output_all.cpu())
    label_all = np.array(label_all.cpu())
    acc_score = metrics.accuracy_score(label_all, output_all)
    print(metrics.classification_report(label_all, output_all))
    print("accuracy:", acc_score)
    return acc, epoch_loss.item()

# test(device1)
train_acc_l = []
train_epoch_loss_l = []
test_acc_l = []
test_epoch_loss_l = []

def train_one_epoch(device_train, epoch_num):
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "start_______________")
    print("______________________________________________")
    print("______________________________________________")
    cls.to(device_train)
    cls.train()
    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(train_loader):
        label = batch['label'].to(device_train).float().view(-1, 1)  # batch size * 1
        input_ids = torch.stack(batch['input_ids']).t().to(device_train)  # batch size * 100
        attention_mask = torch.stack(batch['attention_mask']).t().to(device_train)  # batch size * 100
        # forward pass
        output = cls(input_ids, attention_mask=attention_mask)  # batch size * 1
        output = sigmoid(output)  # batch size * 1
        # loss and optimizer step
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        with torch.no_grad():
            # round probabilities to 0/1
            output = output.round()
            output_all.append(output)
            label_all.append(label)
            total += len(output)
            # accumulate epoch loss
            epoch_loss += loss
            ave_loss = epoch_loss / total
            # accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrect: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(train_loader), 100. * batch_idx / len(train_loader),
                    correct, total, 100. * acc, ave_loss
                ), end="\r")
    # final summary
    print('correct: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
        correct, total, 100. * acc, ave_loss))
    # cuda tensors can't be converted to numpy directly; move them to cpu first
    with torch.no_grad():
        output_all = torch.cat(output_all, 0)
        label_all = torch.cat(label_all, 0)
        output_all = np.array(output_all.cpu())
        label_all = np.array(label_all.cpu())
        acc_score = metrics.accuracy_score(label_all, output_all)
        # print(metrics.classification_report(label_all, output_all))
        # print("accuracy:", acc_score)
    test_acc, test_epoch_loss = test(device1)
    print('train_acc:', acc, 'train_epoch_loss:', epoch_loss.item(),
          'test_acc:', test_acc, 'test_epoch_loss:', test_epoch_loss)
    train_acc_l.append(acc)
    train_epoch_loss_l.append(epoch_loss.item())
    test_acc_l.append(test_acc)
    test_epoch_loss_l.append(test_epoch_loss)
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "end_______________")
    print("______________________________________________")
    print("______________________________________________")
    return test_epoch_loss

# train_one_epoch(device0, 0)
import time

cls = fn_cls(device0)
from torch import optim
# cls = torch.load("./data/yxl_best.model", map_location=device0)
optimizer = optim.Adam(cls.parameters(), lr=1e-4)
test(device1)

# Keep training until the test loss stops decreasing,
# saving the current best model before each epoch.
now_loss = 999
pre_epoch_loss = 9999
epoch = 0
while now_loss < pre_epoch_loss:
    torch.save(cls, "./data/yxl_best.model")
    pre_epoch_loss = now_loss
    now_loss = train_one_epoch(device0, epoch)
    epoch += 1
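The per-epoch lists collected above are not used further in this script; if matplotlib is available, a small sketch like the following can plot them (the file name acc_loss_curve.png is arbitrary):

# Optional: plot the accuracy and loss curves collected during training.
import matplotlib.pyplot as plt

epochs_range = range(len(train_acc_l))
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(epochs_range, train_acc_l, label='train_acc')
ax1.plot(epochs_range, test_acc_l, label='test_acc')
ax1.set_xlabel('epoch'); ax1.set_ylabel('accuracy'); ax1.legend()
ax2.plot(epochs_range, train_epoch_loss_l, label='train_epoch_loss')
ax2.plot(epochs_range, test_epoch_loss_l, label='test_epoch_loss')
ax2.set_xlabel('epoch'); ax2.set_ylabel('loss'); ax2.legend()
fig.savefig('acc_loss_curve.png')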
def predict(device, s_l, cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        text2id = tokenizer(
            s_l,
            max_length=100,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        input_ids = text2id["input_ids"].to(device)
        mask = text2id["attention_mask"].to(device)
        output = cls(input_ids, attention_mask=mask)
        output1 = sigmoid(output)
        output2 = output1.round()
        return output1, output2

from tqdm import tqdm

def run(device, s_l, cls, bs):  # bs is the batch size
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        len_ = len(s_l)
        all_end_lgs = []
        all_end = []
        for start in tqdm(range(0, len_, bs)):
            li_i = s_l[start:min(start + bs, len_)]
            text2id = tokenizer(
                li_i,
                max_length=100,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )
            input_ids = text2id["input_ids"].to(device)
            mask = text2id["attention_mask"].to(device)
            output = cls(input_ids, attention_mask=mask)
            output1 = sigmoid(output)
            output2 = output1.round()
            all_end_lgs = all_end_lgs + output1.tolist()
            all_end = all_end + output2.tolist()
        return all_end, all_end_lgs
Prediction example:
s = ['好好好好好好好',
'坏坏坏坏坏坏坏坏',]
print(predict(device1,s,cls)[1])
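For larger inputs, run() processes sentences in batches; a usage sketch (the file ./data/pred.txt is a hypothetical input with one sentence per line):

# Hypothetical batch prediction over a text file, one sentence per line.
with open('./data/pred.txt', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]
labels, probs = run(device1, sentences, cls, bs=32)
for sent, lab, prob in zip(sentences, labels, probs):
    print(sent, int(lab[0]), round(prob[0], 4))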