Load a pretrained model with each of two frameworks (PaddlePaddle/PaddleNLP and PyTorch/Transformers) and classify sentence pairs.
Data download: the Qianyan (千言) text-similarity datasets (lcqmc, bq_corpus, paws-x).
With paddlenlp the pretrained model can be loaded directly, which is convenient.
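For example, a minimal sketch of the loading and tokenization calls the full script below relies on (model name ernie-gram-zh as used there; the example pair is arbitrary, and the weights are downloaded and cached by PaddleNLP on first use):

import paddlenlp

# load the matching tokenizer and encoder weights by name
tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained("ernie-gram-zh")
model = paddlenlp.transformers.ErnieGramModel.from_pretrained("ernie-gram-zh")

# encode one (arbitrary) sentence pair the same way the training script does
encoded = tokenizer(text="世界上什么东西最小", text_pair="世界上什么东西最小?", max_seq_len=512)
print(encoded["input_ids"])
print(encoded["token_type_ids"])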
# %%
# Competition page
# https://aistudio.baidu.com/aistudio/competition/detail/45
import time
import os
import numpy as np
import paddle
import paddlenlp
import paddle.nn.functional as F
import paddle.distributed as dist  # for parallel training
from functools import partial
from paddlenlp.data import Stack, Pad, Tuple
import paddle.nn as nn
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import LinearDecayWithWarmup


def read(data, datasetname, predict=False):
    # turn the raw examples into a generator of uniform dicts
    if not predict:
        for d in data:
            label = d["label"]
            if datasetname != "lcqmc":
                text1, text2 = d["sentence1"], d["sentence2"]
            else:
                text1, text2 = d["query"], d["title"]
            yield {"label": label, "text1": text1, "text2": text2}
    else:
        for d in data:
            if datasetname != "lcqmc":
                text1, text2 = d["sentence1"], d["sentence2"]
            else:
                text1, text2 = d["query"], d["title"]
            yield {"text1": text1, "text2": text2}


def convert_data(data, tokenizer, datasetname, max_seq_len=512, is_test=False):
    # encode one example into model inputs
    text1, text2 = data["text1"], data["text2"]
    encoded_inputs = tokenizer(text=text1, text_pair=text2, max_seq_len=max_seq_len)
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]
    if not is_test:
        label = np.array([data["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    return input_ids, token_type_ids


class PretrainedModel(nn.Layer):
    # pretrained model + FC classification head
    def __init__(self, pretrained_model, dropout=None):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        self.clf = nn.Linear(self.ptm.config["hidden_size"], 2)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
        cls_embedding = self.dropout(cls_embedding)
        logits = self.clf(cls_embedding)
        probs = F.softmax(logits)
        return probs


class Recongnizer():
    # sentence-pair recognizer
    def __init__(self, datasetname, state_dict_path=None):
        # state_dict_path: optional path to previously saved model parameters
        self.seed = 100
        paddle.seed(self.seed)  # fix the random seed
        self.batch_size = 128
        self.epochs = 20
        self.max_seq_len = 512
        self.datasetname = datasetname
        # one-line dataset loading built into paddlenlp
        self.train_ds, self.dev_ds, self.test_ds = load_dataset(datasetname, splits=["train", "dev", "test"])
        # tokenizer that matches the pretrained model
        self.tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained("ernie-gram-zh")
        # https://gitee.com/paddlepaddle/PaddleNLP/blob/develop/docs/model_zoo/transformers.rst
        # load the pretrained model
        self.pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained("ernie-gram-zh")
        self.model = PretrainedModel(self.pretrained_model)
        if state_dict_path:
            # if a parameter file was given, load it directly
            try:
                state_dict = paddle.load(state_dict_path)
                self.model.set_dict(state_dict)
            except Exception:
                print("Failed to load model parameters!")
        self.pathname = "checkpoint"
        self.global_step = 0
        if not os.path.exists(self.pathname):
            os.mkdir(self.pathname)
        self.save_dir = ""
        self.save_param_path = ""

    def fit(self):
        # wrap the raw splits as MapDataset objects
        train_ds = load_dataset(read, data=self.train_ds, datasetname=self.datasetname, lazy=False)
        dev_ds = load_dataset(read, data=self.dev_ds, datasetname=self.datasetname, lazy=False)
        test_ds = load_dataset(read, data=self.test_ds, datasetname=self.datasetname, predict=True, lazy=False)
        # show a few examples
        for i, example in enumerate(train_ds):
            if i < 5:
                print(example)
        input_ids, token_type_ids, label = convert_data(train_ds[0], self.tokenizer, self.datasetname)
        print(input_ids)
        # [1, 692, 811, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2, 329, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2]
        print(token_type_ids)
        # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        print(label)
        # [1]

        # per-example conversion function
        trans_func = partial(convert_data, tokenizer=self.tokenizer, datasetname=self.datasetname,
                             max_seq_len=self.max_seq_len)
        # batch the samples and pad each field;
        # sequences are padded to the longest length within each batch, which keeps the padding overhead low
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),
            Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id),
            Stack(dtype="int64")
        ): [d for d in fn(samples)]
        # sampler
        batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=self.batch_size, shuffle=True)
        # data loaders
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds.map(trans_func),
            batch_sampler=batch_sampler,
            collate_fn=batchify_fn,
            return_list=True
        )
        batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=self.batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds.map(trans_func),
            batch_sampler=batch_sampler,
            collate_fn=batchify_fn,
            return_list=True
        )
        num_training_steps = len(train_data_loader) * self.epochs
        # learning-rate schedule
        lr_scheduler = LinearDecayWithWarmup(5e-5, num_training_steps, 0.0)
        # parameters that receive weight decay
        decay_params = [
            p.name for n, p in self.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        # gradient clipping
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        # optimizer
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=self.model.parameters(),
            weight_decay=0.0,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=clip
        )
        # loss function
        criterion = paddle.nn.loss.CrossEntropyLoss()
        # precision metric (combined with recall in evaluate() to compute F1)
        metric = paddle.metric.Precision()
        t_start = time.time()
        F1 = 0  # best F1 so far
        for epoch in range(1, self.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                input_ids, token_type_ids, labels = batch
                probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
                loss = criterion(probs, labels)
                metric.update(np.argmax(probs, axis=1), labels)
                acc = metric.accumulate()
                self.global_step += 1
                if self.global_step % 10 == 0:
                    print("global step %d, epoch: %d, batch: %d, loss: %.5f, precision: %.5f, speed: %.2f step/s"
                          % (self.global_step, epoch, step, loss, acc, 10 / (time.time() - t_start)))
                    t_start = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if self.global_step % 100 == 0:
                    _, F1, improve = self.evaluate(criterion, metric, dev_data_loader, F1, "dev")
                    if improve:
                        print("dev F1: {:.3f}, model saved to: ".format(F1) + self.save_param_path)
                    else:
                        print("best F1: {:.3f}, no improvement in this evaluation.".format(F1))
        print("----- training finished -----")
        # reload the best parameters and predict on the test set for submission
        state_dict = paddle.load(self.save_param_path)
        self.model.set_dict(state_dict)
        self.writeToFile(test_ds)

    @paddle.no_grad()
    def evaluate(self, criterion, metric, data_loader, F1, phase="dev"):
        self.model.eval()
        metric.reset()
        recall = paddle.metric.Recall()
        recall.reset()
        losses = []
        prob_list = []
        for batch in data_loader:
            input_ids, token_type_ids, labels = batch
            probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
            prob_list.extend(probs)
            loss = criterion(probs, labels)
            losses.append(loss.numpy())
            metric.update(np.argmax(probs, axis=1), labels)
            recall.update(np.argmax(probs, axis=1), labels)
        acc = metric.accumulate()
        rec = recall.accumulate()
        f1 = 0 if (acc + rec) == 0.0 else 2 * acc * rec / (acc + rec)
        improve = False
        if f1 > F1:
            # keep the parameters with the highest F1
            F1 = f1
            improve = True
            self.save_dir = os.path.join(self.pathname, "best_model_state")
            self.save_param_path = os.path.join(self.save_dir, "model_state_pdparams_F1_" + str(round(F1, 4)))
            paddle.save(self.model.state_dict(), self.save_param_path)
            self.tokenizer.save_pretrained(self.save_dir)
        print("eval {} loss: {:.5}, precision: {:.5}, recall: {:.5}".format(phase, np.mean(losses), acc, rec))
        self.model.train()
        metric.reset()
        return prob_list, F1, improve

    def predict(self, text1, text2):
        # predict a single sentence pair
        encoded_inputs = self.tokenizer(text=text1, text_pair=text2, max_seq_len=self.max_seq_len)
        input_ids = encoded_inputs["input_ids"]
        token_type_ids = encoded_inputs["token_type_ids"]
        predict_data_loader = [(input_ids, token_type_ids)]
        batch_probs = []
        self.model.eval()
        with paddle.no_grad():
            for batch_data in predict_data_loader:
                input_ids, token_type_ids = batch_data
                input_ids = paddle.to_tensor([input_ids])
                token_type_ids = paddle.to_tensor([token_type_ids])
                # the model's forward already applies softmax, so its outputs are probabilities
                batch_prob = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
                batch_probs.append(batch_prob.numpy())
        batch_probs = np.concatenate(batch_probs, axis=0)
        return batch_probs

    def writeToFile(self, test_ds):
        # predict on the test set and write the submission file
        with open(self.datasetname + ".tsv", "w", encoding="utf-8") as f:
            f.write("index\tprediction\n")
            for i, d in enumerate(test_ds):
                prob = self.predict(d["text1"], d["text2"])
                label = 1 if prob[0][1] >= 0.5 else 0
                f.write(str(i) + "\t" + str(label) + "\n")


if __name__ == "__main__":
    dist.init_parallel_env()  # initialize the parallel environment
    # launch with: python -m paddle.distributed.launch --gpus '0,1' xxx.py &
    # notes on parallel training:
    # https://aistudio.baidu.com/aistudio/projectdetail/1222066
    # https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html
    datasetnames = ["lcqmc", "bq_corpus", "paws-x"]
    for name in datasetnames:
        model = Recongnizer(name)
        model.fit()
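A minimal usage sketch for the class above, assuming it is in scope; the checkpoint path is hypothetical and stands for a file written by an earlier fit() run, and the sentence pair is arbitrary:

# hypothetical checkpoint path produced by an earlier fit() run
recognizer = Recongnizer("lcqmc", state_dict_path="checkpoint/best_model_state/model_state_pdparams_F1_0.88")
probs = recognizer.predict("世界上什么东西最小", "世界上什么东西最小?")
print(probs)                     # e.g. [[0.03 0.97]]
print(int(probs[0][1] >= 0.5))   # 1 means the pair is predicted as similar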
Pretrained model download: https://huggingface.co/nghuyong/ernie-1.0
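A minimal sketch of loading those weights with the transformers library, assuming they have been downloaded to ../ERNIE_pretrain (the local path the script below uses); the example pair is arbitrary:

import torch
from transformers import AutoTokenizer, AutoModel

# assumption: the nghuyong/ernie-1.0 weights were downloaded to ../ERNIE_pretrain
tokenizer = AutoTokenizer.from_pretrained("../ERNIE_pretrain")
model = AutoModel.from_pretrained("../ERNIE_pretrain")

inputs = tokenizer("世界上什么东西最小", "世界上什么东西最小?", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.pooler_output.shape)  # torch.Size([1, 768]); this pooled [CLS] vector feeds the classifier below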
# %%
# Competition page
# https://aistudio.baidu.com/aistudio/competition/detail/45
import time
import os
import numpy as np
import torch
from datetime import timedelta
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn import metrics

np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(4)
torch.backends.cudnn.deterministic = True  # make results reproducible across runs


def get_time_dif(start_time):
    """Return the elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def load(file, test=False):
    # load one split from a tsv file
    data = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            res = line[:-1].split('\t')
            if not test:
                data.append({"text1": res[0], "text2": res[1], "label": int(res[2])})
            else:
                data.append({"text1": res[0], "text2": res[1]})
    return data


def load_dataset(datasetname):
    # load the train/dev/test splits
    train_ds = load(os.path.join(datasetname, "train.tsv"))
    dev_ds = load(os.path.join(datasetname, "dev.tsv"))
    test_ds = load(os.path.join(datasetname, "test.tsv"), True)
    return train_ds, dev_ds, test_ds


def convert_data(data, tokenizer, max_seq_len=512, is_test=False):
    # encode a single example (dict) or a batch of examples (list) into model inputs
    if isinstance(data, dict):
        text1, text2 = data["text1"], data["text2"]
        if not is_test:
            label = [data["label"]]
    else:
        text1, text2, label = [], [], []
        for d in data:
            if not is_test:
                label.append(d["label"])
            text1.append(d["text1"])
            text2.append(d["text2"])
    # tokenizer arguments are documented at
    # https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=tokenizer
    x = tokenizer(text=text1, text_pair=text2, add_special_tokens=True, padding='longest',
                  max_length=max_seq_len, return_tensors='np', return_token_type_ids=True,
                  return_attention_mask=True, truncation=True)
    input_ids, token_type_ids, mask = x["input_ids"], x["token_type_ids"], x["attention_mask"]
    if not is_test:
        return input_ids, token_type_ids, mask, np.array(label, dtype="int64")
    return input_ids, token_type_ids, mask


class DatasetIterator(object):
    # simple batched data iterator
    def __init__(self, tokenizer, data, batch_size, device, max_seq_len=512, pred=False):
        self.batch_size = batch_size
        self.pred = pred
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.dataset = []
        self.batch_data = []
        self.ct = 0
        for d in data:
            # group examples into batches of batch_size
            self.batch_data.append(d)
            self.ct += 1
            if self.ct == self.batch_size:
                self.dataset.append(self.batch_data)
                self.ct = 0
                self.batch_data = []
        if self.ct != 0:
            self.dataset.append(self.batch_data)
        self.n_batches = len(self.dataset)
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        # convert numpy arrays to tensors on the target device
        input_ids = torch.LongTensor(np.array(datas[0])).to(self.device)
        token_type_ids = torch.LongTensor(np.array(datas[1])).to(self.device)
        mask = torch.LongTensor(np.array(datas[2])).to(self.device)
        if not self.pred:
            label = torch.LongTensor(np.array(datas[3])).to(self.device)
            return (input_ids, token_type_ids, mask), label
        return (input_ids, token_type_ids, mask)

    def __next__(self):
        if self.index < self.n_batches:
            batches = self.dataset[self.index]
            self.index += 1
            batches = convert_data(batches, self.tokenizer, self.max_seq_len, self.pred)
            batches = self._to_tensor(batches)
            return batches
        else:
            self.index = 0
            raise StopIteration

    def __iter__(self):
        return self

    def __len__(self):
        return self.n_batches


def bulid_iterator(tokenizer, dataset, batch_size, device, max_seq_len=512, pred=False):
    return DatasetIterator(tokenizer, dataset, batch_size, device, max_seq_len, pred)


class PretrainedModel(nn.Module):
    # pretrained model + FC classification head
    def __init__(self, pretrained_model, dropout=None):
        super(PretrainedModel, self).__init__()
        self.ptm = pretrained_model
        for param in self.ptm.parameters():
            param.requires_grad = True  # fine-tune the whole encoder
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        self.clf = nn.Linear(768, 2)

    def forward(self, x):
        out = self.ptm(input_ids=x[0], attention_mask=x[2], token_type_ids=x[1])
        cls_embedding = self.dropout(out['pooler_output'])
        logits = self.clf(cls_embedding)
        probs = F.softmax(logits, dim=1)
        return probs


class Recongnizer():
    # sentence-pair recognizer
    def __init__(self, datasetname, state_dict_path=None):
        # state_dict_path: optional path to previously saved model parameters
        self.seed = torch.initial_seed()
        self.datasetname = datasetname
        self.batch_size = 64
        self.epochs = 3
        self.max_seq_len = 512
        self.lr = 1e-5
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.train_ds, self.dev_ds, self.test_ds = load_dataset(self.datasetname)
        # tokenizer of the pretrained model, see https://huggingface.co/nghuyong/ernie-1.0
        self.tokenizer = AutoTokenizer.from_pretrained("../ERNIE_pretrain")
        self.pretrained_model = AutoModel.from_pretrained("../ERNIE_pretrain")
        self.model = PretrainedModel(self.pretrained_model).to(self.device)
        if state_dict_path:
            try:
                state_dict = torch.load(state_dict_path)
                self.model.load_state_dict(state_dict)
            except Exception:
                print("Failed to load model parameters!")
        self.pathname = "checkpoint_pt"
        self.global_step = 0
        if not os.path.exists(self.pathname):
            os.mkdir(self.pathname)
        self.save_param_path = os.path.join(self.pathname, "best_params_pt")

    def fit(self):
        # build the batched iterators
        train_ds = bulid_iterator(self.tokenizer, self.train_ds, self.batch_size, self.device, self.max_seq_len)
        dev_ds = bulid_iterator(self.tokenizer, self.dev_ds, self.batch_size, self.device, self.max_seq_len)
        test_ds = bulid_iterator(self.tokenizer, self.test_ds, self.batch_size, self.device, self.max_seq_len, pred=True)
        # show a couple of batches
        for i, example in enumerate(train_ds):
            if i < 2:
                (input_ids, token_type_ids, mask), label = example
                print(input_ids)
                # [1, 692, 811, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2, 329, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2]
                print(token_type_ids)
                # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
                print(label)
                # [1]
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # parameter groups: no weight decay for bias and LayerNorm parameters
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        # optimizer
        optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=self.lr)
        t_start = time.time()
        dev_best_loss = float('inf')
        self.model.train()
        for epoch in range(1, self.epochs + 1):
            for step, batch in enumerate(train_ds, start=1):
                x, labels = batch
                probs = self.model(x)
                self.model.zero_grad()
                loss = F.cross_entropy(probs, labels)
                loss.backward(retain_graph=False)
                optimizer.step()
                self.global_step += 1
                if self.global_step % 10 == 0:
                    true = labels.data.cpu()
                    predit = torch.max(probs.data, 1)[1].cpu()
                    train_acc = metrics.accuracy_score(true, predit)
                    dev_acc, dev_loss = self.evaluate(dev_ds)
                    if dev_loss < dev_best_loss:
                        # keep the model with the lowest dev loss
                        dev_best_loss = dev_loss
                        torch.save(self.model.state_dict(), self.save_param_path)
                        improve = '*'
                    else:
                        improve = ''
                    time_dif = get_time_dif(t_start)
                    msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                    print(msg.format(self.global_step, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                    self.model.train()
        print("----- training finished -----")
        # reload the best parameters and predict on the test set for submission
        state_dict = torch.load(self.save_param_path)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.writeToFile(self.test_ds)

    def evaluate(self, data_loader, test=False):
        # evaluate on dev (or test)
        self.model.eval()
        loss_total = 0
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():
            for x, labels in data_loader:
                outputs = self.model(x)
                loss = F.cross_entropy(outputs, labels)
                loss_total = loss_total + loss
                labels = labels.data.cpu().numpy()
                predict = torch.max(outputs.data, 1)[1].cpu().numpy()
                labels_all = np.append(labels_all, labels)
                predict_all = np.append(predict_all, predict)
        acc = metrics.accuracy_score(labels_all, predict_all)
        if test:
            return acc, loss_total / len(data_loader), predict_all
        return acc, loss_total / len(data_loader)

    def predict(self, text1, text2):
        # predict a single sentence pair
        d = {"text1": text1, "text2": text2}
        input_ids, token_type_ids, mask = convert_data(d, self.tokenizer, max_seq_len=self.max_seq_len, is_test=True)
        input_ids = torch.LongTensor(input_ids).to(self.device)
        token_type_ids = torch.LongTensor(token_type_ids).to(self.device)
        mask = torch.LongTensor(mask).to(self.device)
        predict_data_loader = [(input_ids, token_type_ids, mask)]
        batch_probs = []
        self.model.eval()
        with torch.no_grad():
            for x in predict_data_loader:
                batch_prob = self.model(x)
                batch_probs.append(batch_prob.cpu().numpy())
        batch_probs = np.concatenate(batch_probs, axis=0)
        return batch_probs

    def writeToFile(self, test_ds):
        # write predictions for the test set to a submission file
        with open(self.datasetname + ".tsv", "w", encoding="utf-8") as f:
            f.write("index\tprediction\n")
            for i, d in enumerate(test_ds):
                prob = self.predict(d["text1"], d["text2"])
                label = 1 if prob[0][1] >= 0.5 else 0
                f.write(str(i) + "\t" + str(label) + "\n")


if __name__ == "__main__":
    datasetnames = ["lcqmc", "bq_corpus", "paws-x"]
    for name in datasetnames:
        model = Recongnizer(name)
        model.fit()
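As with the PaddleNLP version, a minimal usage sketch for this class, assuming fit() has already written the checkpoint to checkpoint_pt/best_params_pt and that the example pair is arbitrary:

recognizer = Recongnizer("lcqmc", state_dict_path=os.path.join("checkpoint_pt", "best_params_pt"))
probs = recognizer.predict("世界上什么东西最小", "世界上什么东西最小?")
label = 1 if probs[0][1] >= 0.5 else 0
print(probs, label)  # probability pair and the predicted label (1 = similar)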