
Text Classification with a PyTorch BERT Model


First, load the dataset. Here we create a separate train.py file that reads the data and attaches labels (you could also do everything in a single file).

import pandas as pd
import numpy as np

def load_data(path):
    data = pd.read_csv(path, encoding='utf-8', sep='|', nrows=14)
    data = data.content
    label_list = [2, 0, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 1, 1]  # manually assigned labels
    data = pd.DataFrame(data)
    data["label"] = label_list
    return data
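If you want to sanity-check what load_data returns without the real CSV, here is a minimal sketch that replays the same steps on a made-up, '|'-separated in-memory file (the content column name and separator match the code above; the sample rows are invented):

import io
import pandas as pd

# Two made-up '|'-separated rows with a 'content' column, mimicking train_dataset.csv
csv_text = "content|other\n你好,请问话费怎么查?|x\n我要投诉,态度太差了!|y\n"
data = pd.read_csv(io.StringIO(csv_text), sep='|')
data = pd.DataFrame(data.content)
data["label"] = [0, 2]  # manually assigned labels, as in load_data
print(data)  # a DataFrame with 'content' and 'label' columns, one row per sentence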

Then create another .py file in the same directory that imports the dataset and runs training and testing.

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
import transformers
from transformers import AutoModel, AutoTokenizer
import matplotlib.pyplot as plt

train_curve = []
device = torch.device('cuda')

# Hyperparameters; the model is the basic Chinese BERT
batch_size = 2
epoches = 100
model = "bert-base-chinese"
hidden_size = 768
n_class = 3
maxlen = 8

from train import load_data
data = load_data("data/train_dataset/train_dataset.csv")
sentences = list(data.content)
labels = list(data.label)
# print(sentences)
# print(labels)
# Build the inputs BERT expects (a concrete example of these three tensors follows this code block):
# input_ids: token ids from the tokenizer's vocabulary
# attention_mask: same length as input_ids; 1 at real-token positions, 0 at padding positions
# token_type_ids: 0 for tokens of the first sentence, 1 for tokens of the second sentence
class MyDataset(Data.Dataset):  # a Dataset subclass must implement __len__ and __getitem__
    def __init__(self, sentences, labels=None, with_labels=True):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.with_labels = with_labels
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        # Select the sentence at the given index
        sent = self.sentences[index]
        # Tokenize to get token ids, attention mask and token type ids
        encoded_pair = self.tokenizer(sent,
                                      padding='max_length',  # pad to max_length
                                      truncation=True,       # truncate to max_length
                                      max_length=maxlen,
                                      return_tensors='pt')   # return torch.Tensor objects
        token_ids = encoded_pair['input_ids'].squeeze(0)            # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)      # binary tensor: 0 for padded positions, 1 for the rest
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor: 0 for 1st-sentence tokens, 1 for 2nd-sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.labels[index]
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=0)
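To make the three tensors returned by __getitem__ concrete, here is a minimal standalone sketch (it loads the same bert-base-chinese tokenizer, so the first run downloads it; the sample sentence is made up):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
enc = tokenizer("话费查询",            # made-up 4-character sentence
                padding='max_length',
                truncation=True,
                max_length=8,
                return_tensors='pt')
print(enc['input_ids'])       # e.g. [[101, ..., 102, 0, 0]]: [CLS], 4 tokens, [SEP], then padding
print(enc['attention_mask'])  # [[1, 1, 1, 1, 1, 1, 0, 0]]: 0 marks the padded positions
print(enc['token_type_ids'])  # [[0, 0, 0, 0, 0, 0, 0, 0]]: all 0, since there is only one sentence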
# model
class BertClassify(nn.Module):
    def __init__(self):
        super(BertClassify, self).__init__()
        self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)
        self.linear = nn.Linear(hidden_size, n_class)  # classify the [CLS] vector with a single fully connected layer
        self.dropout = nn.Dropout(0.5)

    def forward(self, X):
        input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # returns an output object
        # Classify with the pooled [CLS] vector (see the note after this block)
        # outputs.pooler_output: [bs, hidden_size]
        logits = self.linear(self.dropout(outputs.pooler_output))
        return logits

bc = BertClassify().to(device)
optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2)  # note: 2e-5~5e-5 is the more typical range when fine-tuning BERT
loss_fn = nn.CrossEntropyLoss()
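One detail worth knowing: outputs.pooler_output is not the raw [CLS] hidden state; it is the [CLS] vector passed through BERT's extra pooling layer (a Linear followed by tanh). If you would rather classify on the raw [CLS] vector, a drop-in variant of forward looks like this (a sketch, reusing the same self.bert, self.linear and self.dropout defined above):

    def forward(self, X):
        input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        # outputs.last_hidden_state: [bs, seq_len, hidden_size]
        cls_vec = outputs.last_hidden_state[:, 0]   # raw [CLS] hidden state, [bs, hidden_size]
        return self.linear(self.dropout(cls_vec))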
# train
sum_loss = 0
total_step = len(train)
for epoch in range(epoches):
    for i, batch in enumerate(train):
        optimizer.zero_grad()
        batch = tuple(p.to(device) for p in batch)
        pred = bc([batch[0], batch[1], batch[2]])
        loss = loss_fn(pred, batch[3])
        sum_loss += loss.item()  # accumulate the loss
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch + 1, epoches, i + 1, total_step, loss.item()))
    train_curve.append(sum_loss)
    sum_loss = 0  # reset to 0 after recording each epoch's loss
# test
bc.eval()  # switch dropout to eval mode; torch.no_grad() below disables gradient tracking
with torch.no_grad():
    test_text = ['服务,喂,你好,是这样的,我那个我的电话因为上次突然把给我停机了,我不知道啥意思,然后我就好长时间没有使用,我这次拿着10,怎么我欠费160多了?噢,我想问一下,我这机器现在是在停机状态吧,']
    test = MyDataset(test_text, labels=None, with_labels=False)
    x = test.__getitem__(0)  # the three encodings: token ids, attention mask, token type ids (no label)
    x = tuple(p.unsqueeze(0).to(device) for p in x)  # add a batch dimension (batch size 1 at test time)
    pred = bc([x[0], x[1], x[2]])
    pred = pred.data.max(dim=1, keepdim=True)[1]
    if pred[0][0] == 0:
        print('中性')      # neutral
    elif pred[0][0] == 1:
        print('不满')      # dissatisfied
    else:
        print('强烈不满')  # strongly dissatisfied

pd.DataFrame(train_curve).plot()  # loss curve
plt.show()
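As a side note, pred.data.max(dim=1, keepdim=True)[1] simply extracts the index of the largest logit; torch.argmax expresses the same thing more directly. A tiny standalone illustration with made-up logits:

import torch

logits = torch.tensor([[0.2, 1.5, -0.3]])       # made-up logits for one test sentence, n_class = 3
print(logits.data.max(dim=1, keepdim=True)[1])  # tensor([[1]])
print(torch.argmax(logits, dim=1))              # tensor([1]): the same predicted class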
