On a low-spec GPU, T5 fine-tuning only fits if you train one sample at a time. The training code is as follows:
```python
import datetime
import json
import os
import random

import pandas as pd
import torch
import transformers
from torch.utils.tensorboard import SummaryWriter
from transformers import T5Tokenizer, T5ForConditionalGeneration


def preprocess(text):
    # ChatYuan expects literal "\n" / "\t" escapes instead of real newlines/tabs
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    return text


def postprocess(text):
    return text.replace("\\n", "\n").replace("\\t", "\t")


def train():
    lr = 1.5e-4
    num_warmup_steps = 2000
    epochs = 3
    tb_writer = SummaryWriter(log_dir="t5/summary")
    output_dir = "t5/my_model/"
    batch_size = 1
    gradient_accumulation = 1
    max_grad_norm = 1
    log_step = 1

    colum_data = pd.read_excel("data/rewrite_train.xlsx")
    data_json_list = json.loads(colum_data.to_json(force_ascii=False, orient="records"))

    print('calculating total steps')
    # total optimizer steps over the whole run: samples * epochs, not samples / epochs
    total_steps = int(len(data_json_list) * epochs / batch_size / gradient_accumulation)

    os.makedirs(output_dir, exist_ok=True)

    tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
    model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")
    # On Colab, switch the notebook runtime to GPU for much faster training
    model.train()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # transformers.AdamW is deprecated in recent versions; torch.optim.AdamW
    # is the equivalent (bias correction is always on)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps)

    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.datetime.now()
        print('time: {}'.format(now))
        random.shuffle(data_json_list)

        for step, each in enumerate(data_json_list):
            input_ids = tokenizer(preprocess(each.get("input")), return_tensors="pt").input_ids.long().to(device)
            labels = tokenizer(preprocess(each.get("label")), return_tensors="pt").input_ids.long().to(device)
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # optimizer step
            if (step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
                    print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                        datetime.datetime.now().hour,
                        datetime.datetime.now().minute,
                        step + 1,
                        epoch + 1,
                        running_loss / log_step))
                    running_loss = 0

            # checkpoint every 10 steps (very frequent; raise this for long runs)
            if step % 10 == 0 and step >= 10:
                ckpt_dir = output_dir + 'model_epoch{}_step{}'.format(epoch + 1, step)
                os.makedirs(ckpt_dir, exist_ok=True)
                print('saving model for epoch {}, step {}'.format(epoch + 1, step))
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(ckpt_dir)

        print('saving model for epoch {}'.format(epoch + 1))
        epoch_dir = output_dir + 'model_epoch{}'.format(epoch + 1)
        os.makedirs(epoch_dir, exist_ok=True)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(epoch_dir)
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    final_dir = output_dir + 'final_model'
    os.makedirs(final_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(final_dir)


print("begin train now")
train()
print("train end")
```
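
The script reads its training pairs from `data/rewrite_train.xlsx` and looks up the `input` and `label` keys of each record, so the spreadsheet needs at least those two columns. As a minimal sketch, assuming those column names and the file path used above (the two rows are made up for illustration), a compatible file can be generated like this:

```python
# Minimal sketch of the expected training file; the example rows are hypothetical.
# Writing .xlsx with pandas requires openpyxl.
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
rows = [
    {"input": "用户:帮我改写这句话:今天天气不错。\n小智:", "label": "今天的天气相当好。"},
    {"input": "用户:帮我改写这句话:这个产品很好用。\n小智:", "label": "这款产品的使用体验很好。"},
]
pd.DataFrame(rows).to_excel("data/rewrite_train.xlsx", index=False)
```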
If you have GPU memory to spare, you can use the batched training approach instead:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/27 16:39
# reference: https://github.com/Shivanandroy/T5-Finetuning-PyTorch
# data download: https://pan.baidu.com/s/1cwKLNZD7-rsdETogacP2jw?pwd=mefc (extraction code: mefc)
# @Author : sparkle_code_guy
import os

import numpy as np
import pandas as pd
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import T5Tokenizer, T5ForConditionalGeneration

device = 'cuda' if cuda.is_available() else 'cpu'


class YourDataSetClass(Dataset):
    """
    Custom dataset for reading the dataframe and loading it into the
    dataloader that feeds the network during fine-tuning.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.rewrite_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""
        return len(self.target_text)

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        # normalize whitespace so both sides are clean single-line strings
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.rewrite_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }


def train(epoch, tokenizer, model, device, loader, optimizer, summary_writer, output_dir):
    """
    Runs one epoch of training with the parameters passed from the main function.
    """
    model.train()
    for step, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        # shift targets for teacher forcing: the decoder sees y[:-1] and predicts y[1:]
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100  # ignore pad positions in the loss
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        summary_writer.add_scalar('epoch/loss_{}'.format(epoch), loss.item(), step)

        if step % 100000 == 0 and step > 0:
            print("[Saving Model]...\n")
            # Saving an intermediate checkpoint
            path = os.path.join(output_dir, 'model_epoch{}_step{}'.format(epoch + 1, step))
            os.makedirs(path, exist_ok=True)
            model.save_pretrained(path)
            tokenizer.save_pretrained(path)


def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/"):
    """
    T5 trainer
    """
    os.makedirs(output_dir, exist_ok=True)

    # Set random seeds for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed

    print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Define the model and send it to the device (GPU/CPU)
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    print("[Data]: Reading data...\n")

    # Keep only the source and target columns of the raw dataset
    dataframe = dataframe[[source_text, target_text]]

    # Creation of Dataset and Dataloader; with train_size = 1, all of the
    # data is used for training (no validation split)
    train_size = 1
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    train_dataset = train_dataset.reset_index(drop=True)

    print(f"FULL Dataset: {dataframe.shape}")
    print(f"TRAIN Dataset: {train_dataset.shape}")

    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Parameters for the training dataloader
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }
    training_loader = DataLoader(training_set, **train_params)

    # Optimizer used to tune the weights of the network during training
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    print("[Initiating Fine Tuning]...\n")
    summary_writer = SummaryWriter(log_dir="t5/summary_task")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer, summary_writer, output_dir)
        print("[Saving Model]...\n")
        # Save model and tokenizer after each epoch
        path = os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))
        os.makedirs(path, exist_ok=True)
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)
        print(f"[Model] Model saved @ {path}\n")


if __name__ == '__main__':
    model_params = {
        "MODEL": "ClueAI/ChatYuan-large-v1",  # model_type: t5-base/t5-large
        "TRAIN_BATCH_SIZE": 8,  # training batch size
        "TRAIN_EPOCHS": 3,  # number of training epochs
        "LEARNING_RATE": 1e-4,  # learning rate
        "MAX_SOURCE_TEXT_LENGTH": 768,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH": 512,  # max length of target text
        "SEED": 42,  # set seed for reproducibility
    }
    train_dataframe = pd.read_csv("data/new_data.txt", sep='\t')
    T5Trainer(train_dataframe, "input", "label", model_params)
```
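
One detail worth noting in `train()` above is the manual target shift for teacher forcing: the decoder input drops the last target token, the labels drop the first, and pad positions are set to -100 so the cross-entropy loss ignores them. A toy tensor (assuming `pad_token_id` is 0, which holds for T5 tokenizers) makes the slicing concrete:

```python
import torch

pad_id = 0  # T5's pad_token_id is 0; adjust if your tokenizer differs
y = torch.tensor([[3, 7, 8, 9, pad_id]])  # one padded target sequence
y_ids = y[:, :-1].contiguous()            # decoder_input_ids -> [[3, 7, 8, 9]]
lm_labels = y[:, 1:].clone()              # labels            -> [[7, 8, 9, 0]]
lm_labels[y[:, 1:] == pad_id] = -100      # mask pads         -> [[7, 8, 9, -100]]
print(y_ids, lm_labels)
```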
Link: Baidu Netdisk (enter the extraction code)
Extraction code: nrb9
For serving the model, the inference code looks like this:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/27 16:39
# @Author : sparkle_code_guy
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")

# On Colab, switch the notebook runtime to GPU for faster inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()


def preprocess(text):
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    return text


def postprocess(text):
    return text.replace("\\n", "\n").replace("\\t", "\t")


def answer(text, sample=True, top_p=1, temperature=0.7):
    '''sample: whether to sample; for generation tasks this can be True.
    top_p: between 0 and 1; higher values give more diverse output.'''
    text = preprocess(text)
    print(len(text))  # debug: length of the preprocessed input
    encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=768,
                         return_tensors="pt").to(device)
    if not sample:
        out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False,
                             max_new_tokens=512, num_beams=1, length_penalty=0.6)
    else:
        out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False,
                             max_new_tokens=512, do_sample=True, top_p=top_p,
                             temperature=temperature, no_repeat_ngram_size=3)
    out_text = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
    return postprocess(out_text[0])


def rewrite_message(input):
    print("query message:", input)
    answer_message_list = []
    # sample four candidate answers for the same query
    for each in range(4):
        answer_message_list.append("方案{0}:".format(each + 1) + answer_message(input))
    return "\n\n".join(answer_message_list)


def answer_message(input):
    # ChatYuan expects the "用户:...\n小智:" dialogue format
    input_format = input.replace("\n", "。")
    input_text = "用户:" + input_format + "\n小智:"
    output_text = answer(input_text)
    return f"{output_text}"


import gradio as gr

examples_list = ["example1"]

synthesis_interface = gr.Interface(
    rewrite_message,
    inputs=gr.components.Textbox(lines=10, interactive=True, placeholder="enter your question ..."),
    outputs=gr.components.Textbox(lines=10, interactive=False),
    cache_examples=False,
    title="问答",
    examples_per_page=5,
    examples=examples_list,
    live=False)
synthesis_interface.launch(share=False, server_name='0.0.0.0', server_port=7860)
```
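
Note that the script above loads the base `ClueAI/ChatYuan-large-v1` checkpoint. To serve the weights you just fine-tuned, point `from_pretrained` at one of the directories the training scripts saved; a minimal sketch, assuming the default output paths used above:

```python
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Directory written by the single-sample script ("t5/my_model/final_model"),
# or by the batched script (e.g. "./outputs/model_epoch3", which also saves
# the tokenizer alongside the model).
finetuned_dir = "t5/my_model/final_model"
model = T5ForConditionalGeneration.from_pretrained(finetuned_dir)
# The single-sample script saves only the model, so in that case keep loading
# the tokenizer from the base checkpoint:
tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
```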