Fine-tuning the bert-base-chinese pretrained model on a binary classification task
import pandas as pd from transformers import BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments import torch model_name = "./bert-base-chinese" path = "./abuse_22.csv" df = pd.read_csv(path, encoding="utf-8") texts = df["content"][:1000].tolist() labels = df["punish_result"][:1000].tolist() texts = list(map(lambda x: str(x), texts)) class Dataset(torch.utils.data.Dataset): def __init__(self, encodings, labels): self.encodings = encodings self.labels = labels def __getitem__(self, idx): item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} item['labels'] = torch.tensor(self.labels[idx]) return item def __len__(self): return len(self.labels) model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) tokenizer = BertTokenizerFast.from_pretrained(model_name) # 参考这里 https://blog.csdn.net/weixin_42924890/article/details/139269528 train_encodings = tokenizer(texts, truncation=True, padding=True, max_length=512) encodings = Dataset(train_encodings, labels) args = TrainingArguments(output_dir='./output_dir', evaluation_strategy='epoch', no_cuda=True, num_train_epochs=2, learning_rate=1e-4, weight_decay=1e-2, per_device_eval_batch_size=32, per_device_train_batch_size=32) trainer = Trainer( model=model, args=args, train_dataset=encodings, ) # 开始训练 trainer.train()