
[Hands-on] Fine-tuning BERT for Binary Text Classification

The data used in this experiment can be downloaded here.
Full code: GitHub | Gitee

1. Preparation Before Training

Specify the GPUs used for training and prediction.

from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import torch

device0 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for training
device1 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for evaluation


Read and inspect the data


data=pd.read_table('./data/train.txt',header=None)  # columns: text, label
data.columns = ['text', 'label']
text=[i for i in data['text']]
label=[i for i in data['label']]

# A column can be selected via data.colname; value_counts() counts how many samples each label has
df2 = data.label.value_counts()  
print(df2)


Build the training data

class SentimentDataset(Dataset):
    def __init__(self,df):
        self.dataset = df
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label,"input_ids":input_ids,"attention_mask":attention_mask}
        # print(sample)
        return sample
        
print('text2token')
from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def text2token(text,tokenizer,max_length=100):
    text2id = tokenizer(
        text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
    )
    input_ids=text2id["input_ids"].tolist()
    attention_mask=text2id["attention_mask"].tolist()
    return input_ids,attention_mask

input_ids,attention_mask=text2token(text,tokenizer,max_length=100)

data['input_ids']=input_ids
data['attention_mask']=attention_mask

train_data = data.sample(frac=0.8)
test_data=data[~data.index.isin(train_data.index)]
print(len(train_data),len(test_data))
train_data=train_data.reset_index(drop=True)
test_data=test_data.reset_index(drop=True)

print('DataLoader')
# split the data into batches of size batch_size


batch_size=16
train_loader = DataLoader(
    SentimentDataset(train_data), 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=0
)
test_loader = DataLoader(
    SentimentDataset(test_data), 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=0
)
import pickle
with open('train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)
with open('test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)
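
A quick optional check (not part of the original code): because input_ids and attention_mask are stored as Python lists in the DataFrame, the default collate function returns them as a list of max_length tensors, each of shape [batch_size]. This is why the training and test code below rebuilds the usual [batch_size, max_length] layout with torch.stack(...).t().

batch = next(iter(train_loader))
print(type(batch['input_ids']), len(batch['input_ids']))  # list of 100 tensors
print(torch.stack(batch['input_ids']).t().shape)          # torch.Size([batch_size, 100])
print(batch['label'].shape)                               # torch.Size([batch_size])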

If the DataLoaders were saved earlier, they can be reloaded directly like this:

import pickle
with open("train_loader.pkl",'rb') as f:
    train_loader  = pickle.loads(f.read())
with open("test_loader.pkl",'rb') as f:
    test_loader  = pickle.loads(f.read())

2. Model Definition, Training and Test Code

Define the model

from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
class fn_cls(nn.Module):
    def __init__(self,device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-chinese")
        self.model.resize_token_embeddings(len(tokenizer))  # needed if special tokens were added to the tokenizer
        self.model.to(device)
        # self.dropout = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, 1)

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        # outputs[0]: last hidden states, shape [batch_size, seq_len, 768]
        # outputs[1]: pooled [CLS] output, shape [batch_size, 768]
        # outputs[0][:, 0, :]: raw [CLS] token states, shape [batch_size, 768]
        x = outputs[1]
        # x = self.dropout(x)
        x = self.l1(x)
        return x
# cls = fn_cls(device0)

# from torch import optim
# optimizer = optim.Adam(cls.parameters(), lr=1e-4)
sigmoid = nn.Sigmoid()
criterion = nn.BCELoss()#weight=weight
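
As a quick optional sanity check (not from the original post), a throwaway classifier can be run on one batch to confirm the output shape is [batch_size, 1]:

cls_check = fn_cls(device0)
batch = next(iter(train_loader))
with torch.no_grad():
    ids = torch.stack(batch['input_ids']).t().to(device0)
    mask = torch.stack(batch['attention_mask']).t().to(device0)
    logits = cls_check(ids, attention_mask=mask)
print(logits.shape)  # expected: torch.Size([16, 1]) for batch_size=16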

Test code

from sklearn import metrics
import numpy as np
from tqdm import tqdm

def test(device_test):
    cls.to(device_test)
    cls.eval()

    epoch_loss=0
    total=0
    correct=0
    output_all=[]
    label_all=[]
    for batch_idx,batch in enumerate(test_loader):
        with torch.no_grad():
            label=batch['label'].to(device_test).float().view(-1,1)#batch size * 1
            label_all.append(label)
            input_ids=torch.stack(batch['input_ids']).t().to(device_test)#batch size * 100
            attention_mask=torch.stack(batch['attention_mask']).t().to(device_test)#batch size * 100
            
            # compute model output
            output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
            output = sigmoid(output)  # batch_size * 1
            total += len(output)

            # compute loss
            loss = criterion(output, label)
            epoch_loss += loss
            ave_loss = epoch_loss / total

            # round probabilities to 0/1 predictions
            output = output.round()
            output_all.append(output)

            # compute accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total

            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrect: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(test_loader), 100. * batch_idx / len(test_loader),
                    correct, total, 100. * acc,
                    ave_loss
                    ), end="\r")
            
            
            
    # final summary for this evaluation pass
    print('correct: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    correct, total, 100. * acc,
                    ave_loss))

    # tensors must be moved to CPU before converting to numpy
    # (otherwise: "can't convert cuda device type tensor to numpy")
    output_all=torch.cat(output_all,0)
    label_all=torch.cat(label_all,0)
    
    output_all=np.array(output_all.cpu())
    label_all=np.array(label_all.cpu())
    acc_score=metrics.accuracy_score(label_all,output_all)
    print(metrics.classification_report(label_all,output_all))
    print("准确率:",acc_score )
    
    return acc,epoch_loss.item()

# test(device1)

Training code

train_acc_l=[]
train_epoch_loss_l=[]
test_acc_l=[]
test_epoch_loss_l=[]

def train_one_epoch(device_train,epoch_num):
    print("______________________________________________")
    print("______________________________________________")
    print("_______________",epoch_num,"start_______________")
    print("______________________________________________")
    print("______________________________________________")
    cls.to(device_train)
    cls.train()

    epoch_loss=0
    total=0
    correct=0
    output_all=[]
    label_all=[]
    for batch_idx,batch in enumerate(train_loader):
        label=batch['label'].to(device_train).float().view(-1,1)#batch size * 1
        input_ids=torch.stack(batch['input_ids']).t().to(device_train)#batch size * 100
        attention_mask=torch.stack(batch['attention_mask']).t().to(device_train)#batch size * 100

        # compute model output
        output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
        output = sigmoid(output)  # batch_size * 1

        # compute loss and update parameters
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
            
        with torch.no_grad():
            # round probabilities to 0/1 predictions
            output=output.round()
            output_all.append(output)
            label_all.append(label)
            total+=len(output)
            
            #epoch_loss
            epoch_loss+=loss
            ave_loss=epoch_loss/total
            
            # compute accuracy
            add_correct=(output== label).sum().item()
            correct+=add_correct
            acc=correct/total
            
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrect: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(train_loader), 100. * batch_idx / len(train_loader),
                    correct, total, 100. * acc,
                    ave_loss
                    ), end="\r")
            
            
            
    # final summary for this epoch
    print('correct: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    correct, total, 100. * acc,
                    ave_loss))

    # tensors must be moved to CPU before converting to numpy
    # (otherwise: "can't convert cuda device type tensor to numpy")
    with torch.no_grad():
        output_all=torch.cat(output_all,0)
        label_all=torch.cat(label_all,0)

        output_all=np.array(output_all.cpu())
        label_all=np.array(label_all.cpu())
        acc_score=metrics.accuracy_score(label_all,output_all)
        
    # print(metrics.classification_report(label_all,output_all))
    # print("准确率:",acc_score )
    
    test_acc,test_epoch_loss=test(device1)
    print('train_acc:',acc,'train_epoch_loss:',epoch_loss.item(),'test_acc:',test_acc,'test_epoch_loss:',test_epoch_loss)
    train_acc_l.append(acc)
    train_epoch_loss_l.append(epoch_loss.item())
    test_acc_l.append(test_acc)
    test_epoch_loss_l.append(test_epoch_loss)
    print("______________________________________________")
    print("______________________________________________")
    print("_______________",epoch_num,"end_______________")
    print("______________________________________________")
    print("______________________________________________")
    return test_epoch_loss
    
# train_one_epoch(device0,0)

3. Fine-tuning

import time

cls = fn_cls(device0)

from torch import optim
# cls=torch.load("./data/yxl_best.model",map_location=device0)
optimizer = optim.Adam(cls.parameters(), lr=1e-4)
test(device1)  # baseline evaluation before fine-tuning
# simple early stopping: keep training as long as the test loss keeps decreasing;
# the model is saved at the start of each loop iteration, so the checkpoint is the best model so far
now_loss = 999
pre_epoch_loss = 9999
epoch = 0
while now_loss < pre_epoch_loss:
    torch.save(cls, "./data/yxl_best.model")
    pre_epoch_loss = now_loss
    now_loss = train_one_epoch(device0, epoch)
    epoch += 1
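
The per-epoch metrics are collected in train_acc_l, train_epoch_loss_l, test_acc_l and test_epoch_loss_l. A minimal plotting sketch (not part of the original post, assumes matplotlib is installed):

import matplotlib.pyplot as plt

# visualize how training/test accuracy evolve across epochs
epochs = range(len(train_acc_l))
plt.plot(epochs, train_acc_l, label='train accuracy')
plt.plot(epochs, test_acc_l, label='test accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()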

    

4. Prediction and Batch Prediction

def predict(device,s_l,cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        text2id = tokenizer(
            s_l, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
        )
        input_ids=text2id["input_ids"].to(device)
        mask=text2id["attention_mask"].to(device)
        output = cls(input_ids, attention_mask=mask)
        output1=sigmoid(output)
        output2=output1.round()
        return output1,output2
from tqdm import tqdm
def run(device, s_l, cls, bs):
    # bs is the batch size
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        len_ = len(s_l)
        all_end_lgs = []
        all_end = []
        for start in tqdm(range(0, len_, bs)):
            li_i = s_l[start:min(start+bs, len_)]
            text2id = tokenizer(
                li_i, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
                )
            input_ids=text2id["input_ids"].to(device)
            mask=text2id["attention_mask"].to(device)
            output = cls(input_ids, attention_mask=mask)
            output1=sigmoid(output)
            output2=output1.round()
            all_end_lgs = all_end_lgs + output1.tolist()
            all_end = all_end + output2.tolist()
    return all_end,all_end_lgs
        
    


Prediction example:

s = ['好好好好好好好',
'坏坏坏坏坏坏坏坏',]
print(predict(device1,s,cls)[1])
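
For a larger list of sentences, the run function defined above performs batched prediction; a usage sketch (the batch size of 16 is just an example):

# preds holds the rounded 0/1 labels, probs the sigmoid probabilities
preds, probs = run(device1, s, cls, bs=16)
print(preds)
print(probs)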