
BERT Text Classification on the Meituan Waimai Review Dataset

I. Introduction to the BERT Model

BERT stands for Bidirectional Encoder Representations from Transformers; it is a pre-trained language representation model. Rather than pre-training with a traditional unidirectional language model, or a shallow concatenation of two unidirectional models, as earlier work did, BERT uses a masked language model (MLM) objective, which lets it learn deep bidirectional language representations. When the BERT paper was published, it reported new state-of-the-art results on 11 NLP (Natural Language Processing) tasks. The overall structure of the model is shown in the figure below.

[Figure: BERT model architecture, omitted]
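To make the MLM objective concrete, here is a minimal sketch (the example sentence is made up for illustration) that uses the Hugging Face fill-mask pipeline with bert-base-chinese to predict the character hidden behind [MASK]:

from transformers import pipeline

# bert-base-chinese was pre-trained with the masked language model objective,
# so it can be used directly to fill in a masked token.
fill_mask = pipeline("fill-mask", model="bert-base-chinese")

# The model predicts the character behind [MASK] (default: top 5 candidates).
for pred in fill_mask("这家店的外卖很[MASK],下次还会再点。"):
    print(pred["token_str"], round(pred["score"], 4))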

II. Implementation

1. Download the pretrained model and dataset

BERT-Chinese:

https://huggingface.co/bert-base-chinese

Meituan waimai user review dataset (waimai_10k):

https://huggingface.co/datasets/XiangPan/waimai_10k/blob/main/waimai_10k.csv
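You can download both from the pages above, or fetch them programmatically with the huggingface_hub client (a minimal sketch; the local paths are up to you):

from huggingface_hub import snapshot_download, hf_hub_download

# Pull the pretrained model (weights, vocab, config) into the local cache.
snapshot_download(repo_id="bert-base-chinese")

# Pull the review CSV into the current directory.
hf_hub_download(repo_id="XiangPan/waimai_10k",
                filename="waimai_10k.csv",
                repo_type="dataset",
                local_dir=".")

The CSV has two columns, label (1 for a positive review, 0 for a negative one) and review, which the training script below renames to labels and text.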

2. Training code

import pandas as pd
import torch
from torch.utils.data import DataLoader
import datasets
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from torchkeras import KerasModel

df = pd.read_csv("waimai_10k.csv")
ds = datasets.Dataset.from_pandas(df)
ds = ds.shuffle(42)  # shuffle the samples
ds = ds.rename_columns({"review": "text", "label": "labels"})

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')  # must match the model

if __name__ == '__main__':
    ds_encoded = ds.map(lambda example: tokenizer(example["text"],
                                                  max_length=50,
                                                  truncation=True,
                                                  padding='max_length'),
                        batched=True,
                        batch_size=20,
                        num_proc=2)  # map supports batching and multiprocessing

    # Convert the columns to PyTorch tensors.
    ds_encoded.set_format(type="torch",
                          columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

    # Split into train, validation and test sets.
    ds_train_val, ds_test = ds_encoded.train_test_split(test_size=0.2).values()
    ds_train, ds_val = ds_train_val.train_test_split(test_size=0.2).values()

    # Dynamic batching could be done in collate_fn (see the sketch after this listing).
    def collate_fn(examples):
        return tokenizer.pad(examples)

    dl_train = DataLoader(ds_train, batch_size=16, collate_fn=collate_fn)
    dl_val = DataLoader(ds_val, batch_size=16, collate_fn=collate_fn)
    dl_test = DataLoader(ds_test, batch_size=16, collate_fn=collate_fn)

    # Grab one batch to sanity-check the data pipeline.
    for batch in dl_train:
        break

    # Load the model (adds a classification head for this task type).
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese',
                                                               num_labels=2)
    print(dict(model.named_children()).keys())
    output = model(**batch)  # forward pass on the sample batch

    # Custom step logic plugged into torchkeras.KerasModel.
    class StepRunner:
        def __init__(self, net, loss_fn, accelerator, stage="train",
                     metrics_dict=None, optimizer=None, lr_scheduler=None):
            self.net, self.loss_fn, self.metrics_dict, self.stage = net, loss_fn, metrics_dict, stage
            self.optimizer, self.lr_scheduler = optimizer, lr_scheduler
            self.accelerator = accelerator
            if self.stage == 'train':
                self.net.train()
            else:
                self.net.eval()

        def __call__(self, batch):
            out = self.net(**batch)

            # loss (computed inside the model from the `labels` column)
            loss = out.loss

            # predictions
            preds = out.logits.argmax(axis=1)

            # backward pass
            if self.optimizer is not None and self.stage == "train":
                self.accelerator.backward(loss)
                self.optimizer.step()
                if self.lr_scheduler is not None:
                    self.lr_scheduler.step()
                self.optimizer.zero_grad()

            all_loss = self.accelerator.gather(loss).sum()
            labels = batch['labels']
            acc = (preds == labels).sum() / ((labels > -1).sum())
            all_acc = self.accelerator.gather(acc).mean()

            # losses
            step_losses = {self.stage + "_loss": all_loss.item(),
                           self.stage + "_acc": all_acc.item()}

            # metrics
            step_metrics = {}
            if self.stage == "train":
                if self.optimizer is not None:
                    step_metrics['lr'] = self.optimizer.state_dict()['param_groups'][0]['lr']
                else:
                    step_metrics['lr'] = 0.0
            return step_losses, step_metrics

    KerasModel.StepRunner = StepRunner

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    keras_model = KerasModel(model,
                             loss_fn=None,
                             optimizer=optimizer)

    keras_model.fit(train_data=dl_train,
                    val_data=dl_val,
                    ckpt_path='bert_waimai.pt',
                    epochs=100,
                    patience=10,
                    monitor="val_acc",
                    mode="max",
                    plot=True,
                    wandb=False,
                    quiet=True)

    model.eval()
    model.config.id2label = {0: "差评", 1: "好评"}  # 0: negative review, 1: positive review
    model.save_pretrained("waimai_10k_bert")
    tokenizer.save_pretrained("waimai_10k_bert")
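A note on the collate_fn comment above: because every review is already padded to max_length=50 inside map, tokenizer.pad has nothing left to pad there. A minimal sketch of actual dynamic batching (assuming you drop padding='max_length' from the map call and keep the rest of the listing, including set_format, unchanged) pads each batch only to its own longest sequence:

# Tokenize without padding, so each example keeps its true length.
ds_encoded = ds.map(lambda example: tokenizer(example["text"],
                                              max_length=50, truncation=True),
                    batched=True)

# Pad each batch only to the longest sequence in that batch and return tensors.
def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")

This is the same behavior transformers ships as DataCollatorWithPadding; with short, similar-length reviews the savings are small, but for longer texts it avoids wasting compute on pad tokens.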

3. Training results

[Figure: training curves logged during fit(), omitted]

4. Test code

from transformers import pipeline

# Load the fine-tuned checkpoint saved by the training script.
classifier = pipeline("text-classification", model="waimai_10k_bert")

while True:
    text = input('Enter a review (or "q" to quit): ')
    if text == "q":
        break
    result = classifier(text)
    print(result)
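The pipeline returns a list of dicts like [{'label': '好评', 'score': ...}], with the label strings taken from the id2label mapping set during training. If you want the raw probabilities without the pipeline wrapper, a manual forward pass (a minimal sketch; classify is a hypothetical helper name) looks like this:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("waimai_10k_bert")
model = AutoModelForSequenceClassification.from_pretrained("waimai_10k_bert")
model.eval()

def classify(text):
    inputs = tokenizer(text, truncation=True, max_length=50, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = logits.softmax(dim=-1).squeeze(0)
    pred = int(probs.argmax())
    # Map the class index back through the id2label set at training time.
    return model.config.id2label[pred], float(probs[pred])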

Finally:

Above are the training curves, the test results, and the trained weights. If you'd like the source code and the trained weight file, follow the author's WeChat public account and reply "BERT美团". The account occasionally publishes related design content, including but not limited to: signal processing, communication simulation, algorithm design, MATLAB App Designer, GUI design, Simulink simulation, and more. Hope it helps!

