当前位置:   article > 正文

基于T5的模型微调以及对应的数据介绍_t5微调

t5微调

在配置较低的GPU上微调T5时,显存只够每次用单条数据训练(batch_size=1),训练代码如下:

  1. # 使用
  2. import datetime
  3. import json
  4. import os
  5. import transformers
  6. from torch.utils.tensorboard import SummaryWriter
  7. from transformers import T5Tokenizer, T5ForConditionalGeneration
  8. import torch
  def preprocess(text):
      """Escape real newlines and tabs as the two-character sequences ``\\n`` and ``\\t``."""
      # Replacement order matches the original chain: newline first, then tab.
      for raw, escaped in (("\n", "\\n"), ("\t", "\\t")):
          text = text.replace(raw, escaped)
      return text
  def postprocess(text):
      """Turn the escaped sequences ``\\n`` and ``\\t`` back into real newlines and tabs."""
      # Replacement order matches the original chain: newline first, then tab.
      for escaped, raw in (("\\n", "\n"), ("\\t", "\t")):
          text = text.replace(escaped, raw)
      return text
  def train():
      """Fine-tune ClueAI/ChatYuan-large-v1 one example per step.

      Reads ``data/rewrite_train.xlsx`` (the ``input`` and ``label`` columns are
      used), trains for a fixed number of epochs with AdamW and a linear
      warmup/decay learning-rate schedule, logs the loss to TensorBoard under
      ``t5/summary`` and writes checkpoints below ``t5/my_model/``: one every
      10 steps, one per epoch and a final one.
      """
      import random

      import pandas as pd

      lr = 1.5e-4
      num_warmup_steps = 2000
      epochs = 3
      output_dir = "t5/my_model/"
      batch_size = 1
      gradient_accumulation = 1
      max_grad_norm = 1
      log_step = 1

      tb_writer = SummaryWriter(log_dir="t5/summary")

      colum_data = pd.read_excel("data/rewrite_train.xlsx")
      data_json_list = json.loads(colum_data.to_json(force_ascii=False, orient="records"))

      # Number of optimizer steps over the WHOLE run: examples * epochs, divided
      # by how many examples feed one optimizer step. (Dividing by ``epochs``
      # made the schedule end, and the learning rate hit 0, far too early.)
      # NOTE: if this is not larger than num_warmup_steps the linear schedule
      # reaches 0 right after warmup; lower num_warmup_steps for small datasets.
      total_steps = int(len(data_json_list) * epochs / batch_size / gradient_accumulation)

      # makedirs also creates missing parent directories and tolerates reruns.
      os.makedirs(output_dir, exist_ok=True)

      def save_model(dirname):
          """Write the (unwrapped) model weights to ``output_dir + dirname``."""
          path = output_dir + dirname
          os.makedirs(path, exist_ok=True)
          model_to_save = model.module if hasattr(model, 'module') else model
          model_to_save.save_pretrained(path)

      tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
      model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")
      model.train()
      # On Colab, switch the notebook runtime to GPU; fall back to CPU so the
      # script still starts on machines without CUDA.
      device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
      model.to(device)

      print('calculating total steps')
      # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
      # weight_decay are set to the values transformers.AdamW used by default.
      optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
      scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                               num_warmup_steps=num_warmup_steps,
                                                               num_training_steps=total_steps)
      print('starting training')
      overall_step = 0
      running_loss = 0
      for epoch in range(epochs):
          print('epoch {}'.format(epoch + 1))
          now = datetime.datetime.now()
          print('time: {}'.format(now))
          random.shuffle(data_json_list)
          for step, each in enumerate(data_json_list):
              input_ids = tokenizer(preprocess(each.get("input")), return_tensors="pt").input_ids.long().to(device)
              labels = tokenizer(preprocess(each.get("label")), return_tensors="pt").input_ids.long().to(device)
              outputs = model(input_ids=input_ids, labels=labels)
              loss = outputs.loss
              if gradient_accumulation > 1:
                  # Scale so the accumulated gradient is the mean over micro-steps.
                  loss = loss / gradient_accumulation
              loss.backward()
              running_loss += loss.item()

              # Optimizer step once enough micro-steps have been accumulated.
              if (step + 1) % gradient_accumulation == 0:
                  # Clip the fully accumulated gradient right before stepping.
                  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                  optimizer.step()
                  optimizer.zero_grad()
                  scheduler.step()
                  overall_step += 1
                  if overall_step % log_step == 0:
                      tb_writer.add_scalar('loss', loss.item(), overall_step)
                      current = datetime.datetime.now()
                      print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                          current.hour,
                          current.minute,
                          step + 1,
                          epoch + 1,
                          running_loss / log_step))
                      running_loss = 0

              # Intermediate checkpoint every 10 examples (skipping step 0).
              if step % 10 == 0 and step >= 10:
                  print('saving model for epoch {}, step {}'.format(epoch + 1, step))
                  save_model('model_epoch{}_step{}'.format(epoch + 1, step))

          print('saving model for epoch {}'.format(epoch + 1))
          save_model('model_epoch{}'.format(epoch + 1))
          print('epoch {} finished'.format(epoch + 1))
          then = datetime.datetime.now()
          print('time: {}'.format(then))
          print('time for one epoch: {}'.format(then - now))

      print('training finished')
      save_model('final_model')
      # Flush pending TensorBoard events and release the log file.
      tb_writer.close()
  # Script entry: announce the start, run the whole training loop via train(),
  # then announce completion.
  print("begin train now")
  train()
  print("train end")

如果你是土豪,可以使用批量的训练方法:

  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # @Time : 2023/2/27 16:39
  4. # reference: https://github.com/Shivanandroy/T5-Finetuning-PyTorch
  5. #数据下载:链接:https://pan.baidu.com/s/1cwKLNZD7-rsdETogacP2jw?pwd=mefc 提取码:mefc
  6. # @Author : sparkle_code_guy
  7. import os
  8. from torch.utils.tensorboard import SummaryWriter
  9. from transformers import T5Tokenizer, T5ForConditionalGeneration
  10. import torch
  11. from torch.utils.data import Dataset, DataLoader
  12. from torch import cuda
  13. import numpy as np
  14. import pandas as pd
  # Pick the compute device once at import time: CUDA when available, otherwise CPU.
  device = 'cuda' if cuda.is_available() else 'cpu'
  class YourDataSetClass(Dataset):
      """
      Dataset that tokenizes one (source, target) text pair per item so a
      DataLoader can batch the pairs for fine-tuning.
      """

      def __init__(
          self, dataframe, tokenizer, source_len, target_len, source_text, target_text
      ):
          """
          Initializes a Dataset class

          Args:
              dataframe (pandas.DataFrame): Input dataframe
              tokenizer (transformers.tokenizer): Transformers tokenizer
              source_len (int): Max length of source text
              target_len (int): Max length of target text
              source_text (str): column name of source text
              target_text (str): column name of target text
          """
          self.tokenizer = tokenizer
          self.data = dataframe
          self.source_len = source_len
          self.rewrite_len = target_len
          self.target_text = self.data[target_text]
          self.source_text = self.data[source_text]

      def __len__(self):
          """returns the length of dataframe"""
          return len(self.target_text)

      def _encode(self, text, max_length):
          """Tokenize ``text``, padded/truncated to exactly ``max_length`` tokens."""
          # padding="max_length" already pads to max_length; the deprecated
          # pad_to_max_length flag was redundant and only triggered warnings.
          return self.tokenizer.batch_encode_plus(
              [text],
              max_length=max_length,
              truncation=True,
              padding="max_length",
              return_tensors="pt",
          )

      def __getitem__(self, index):
          """return the input ids, attention masks and target ids"""
          # Positional lookup: DataLoader indices are 0..len-1, which label-based
          # Series[index] only honours when the frame has a default RangeIndex.
          source_text = str(self.source_text.iloc[index])
          target_text = str(self.target_text.iloc[index])

          # Collapse every run of whitespace (including newlines) to one space.
          source_text = " ".join(source_text.split())
          target_text = " ".join(target_text.split())

          source = self._encode(source_text, self.source_len)
          target = self._encode(target_text, self.rewrite_len)

          # Drop only the batch dimension added by encoding a one-element list;
          # a bare squeeze() would also collapse a length-1 sequence.
          source_ids = source["input_ids"].squeeze(0)
          source_mask = source["attention_mask"].squeeze(0)
          target_ids = target["input_ids"].squeeze(0)

          return {
              "source_ids": source_ids.to(dtype=torch.long),
              "source_mask": source_mask.to(dtype=torch.long),
              "target_ids": target_ids.to(dtype=torch.long),
              "target_ids_y": target_ids.to(dtype=torch.long),
          }
  def train(epoch, tokenizer, model, device, loader, optimizer,summary_writer,output_dir):
      """
      Run one training epoch over ``loader``.

      Args:
          epoch (int): zero-based epoch index; used in the TensorBoard tag and,
              plus one, in checkpoint directory names
          tokenizer: tokenizer providing ``pad_token_id`` and ``save_pretrained``
          model: seq2seq model called with ``input_ids``, ``attention_mask`` and
              ``labels``; element 0 of its output is used as the loss
          device: device the batch tensors are moved to
          loader: iterable of batches with ``source_ids``, ``source_mask`` and
              ``target_ids`` tensors
          optimizer: optimizer stepped once per batch
          summary_writer: object with ``add_scalar(tag, value, step)``
          output_dir (str): directory that receives intermediate checkpoints
      """
      model.train()
      for step, data in enumerate(loader, 0):
          # Pass the complete target sequence as labels and let the model build
          # decoder_input_ids itself (shift right + decoder start token). The
          # earlier manual y[:, :-1] / y[:, 1:] split fed the first target token
          # as decoder input, so the model was never trained to predict it from
          # the start token that generate() begins with.
          # clone() keeps the masking below from mutating the loader's batch.
          labels = data["target_ids"].to(device, dtype=torch.long).clone()
          # -100 marks padding positions to be ignored by the loss.
          labels[labels == tokenizer.pad_token_id] = -100
          ids = data["source_ids"].to(device, dtype=torch.long)
          mask = data["source_mask"].to(device, dtype=torch.long)

          outputs = model(
              input_ids=ids,
              attention_mask=mask,
              labels=labels,
          )
          loss = outputs[0]

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          summary_writer.add_scalar('epoch/loss_{}'.format(epoch), loss.item(), step)

          # Intermediate checkpoint every 100000 batches (never at step 0).
          if step % 100000 == 0 and step > 0:
              print("[Saving Model]...\n")
              path = os.path.join(output_dir, 'model_epoch{}_step{}'.format(epoch + 1, step))
              # makedirs tolerates existing directories and missing parents.
              os.makedirs(path, exist_ok=True)
              model.save_pretrained(path)
              tokenizer.save_pretrained(path)
  def T5Trainer(
      dataframe, source_text, target_text, model_params, output_dir="./outputs/"
  ):
      """
      Fine-tune a T5 checkpoint on ``dataframe`` and save it after every epoch.

      Args:
          dataframe (pandas.DataFrame): data holding the source/target columns
          source_text (str): column name of the source text
          target_text (str): column name of the target text
          model_params (dict): must provide "MODEL", "SEED",
              "MAX_SOURCE_TEXT_LENGTH", "MAX_TARGET_TEXT_LENGTH",
              "TRAIN_BATCH_SIZE", "LEARNING_RATE" and "TRAIN_EPOCHS"
          output_dir (str): directory that receives one ``model_epoch{N}``
              sub-directory (model + tokenizer) per epoch
      """
      # makedirs also creates missing parents and tolerates reruns.
      os.makedirs(output_dir, exist_ok=True)

      # Seed torch and numpy for reproducibility
      torch.manual_seed(model_params["SEED"])  # pytorch random seed
      np.random.seed(model_params["SEED"])  # numpy random seed

      print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

      # Tokenizer and model both come from the checkpoint named in model_params.
      tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
      model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
      model = model.to(device)

      print("[Data]: Reading data...\n")

      # Keep only the two columns that are actually used.
      dataframe = dataframe[[source_text, target_text]]

      # The whole frame is used for training (frac=1, no validation split);
      # sample() only shuffles the rows reproducibly.
      train_size = 1
      train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
      train_dataset = train_dataset.reset_index(drop=True)

      print(f"FULL Dataset: {dataframe.shape}")
      print(f"TRAIN Dataset: {train_dataset.shape}")

      training_set = YourDataSetClass(
          train_dataset,
          tokenizer,
          model_params["MAX_SOURCE_TEXT_LENGTH"],
          model_params["MAX_TARGET_TEXT_LENGTH"],
          source_text,
          target_text,
      )

      train_params = {
          "batch_size": model_params["TRAIN_BATCH_SIZE"],
          "shuffle": True,
          "num_workers": 0,
      }
      training_loader = DataLoader(training_set, **train_params)

      optimizer = torch.optim.Adam(
          params=model.parameters(), lr=model_params["LEARNING_RATE"]
      )

      print("[Initiating Fine Tuning]...\n")

      # One writer for the whole run (train() already tags scalars per epoch);
      # creating a new one every epoch left the earlier writers unclosed.
      summary_writer = SummaryWriter(log_dir="t5/summary_task")
      try:
          for epoch in range(model_params["TRAIN_EPOCHS"]):
              train(epoch, tokenizer, model, device, training_loader, optimizer,summary_writer,output_dir)

              print("[Saving Model]...\n")
              # Checkpoint after each epoch so every epoch can be loaded on its own.
              path = os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))
              os.makedirs(path, exist_ok=True)
              model.save_pretrained(path)
              tokenizer.save_pretrained(path)
              # Report the directory that was really written; the old message
              # named a "model_files" folder that is never created.
              print(f"""[Model] Model saved @ {path}\n""")
      finally:
          summary_writer.close()
  if __name__ == '__main__':
      # Hyper-parameters consumed by T5Trainer.
      model_params = dict(
          MODEL="ClueAI/ChatYuan-large-v1",  # checkpoint to fine-tune
          TRAIN_BATCH_SIZE=8,  # training batch size
          TRAIN_EPOCHS=3,  # number of training epochs
          LEARNING_RATE=1e-4,  # learning rate
          MAX_SOURCE_TEXT_LENGTH=768,  # max length of source text
          MAX_TARGET_TEXT_LENGTH=512,  # max length of target text
          SEED=42,  # set seed for reproducibility
      )

      # Tab-separated training data; the "input" and "label" columns are used.
      train_dataframe = pd.read_csv("data/new_data.txt", sep='\t')
      T5Trainer(train_dataframe, "input", "label", model_params)

训练数据集

链接:百度网盘 请输入提取码

提取码:nrb9

关于模型的应用部分:

  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # @Time : 2023/2/27 16:39
  4. # @Author : sparkle_code_guy
  import torch
  from transformers import T5Tokenizer, T5ForConditionalGeneration

  # Load the tokenizer and model once at import time; answer() reuses them.
  tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
  model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")

  # Use the GPU when one is available (on Colab, switch the notebook runtime to
  # GPU for faster inference); fall back to CPU instead of crashing without CUDA.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  def preprocess(text):
      """Escape real newlines and tabs as the two-character sequences ``\\n`` and ``\\t``."""
      # Replacement order matches the original chain: newline first, then tab.
      for raw, escaped in (("\n", "\\n"), ("\t", "\\t")):
          text = text.replace(raw, escaped)
      return text
  def postprocess(text):
      """Turn the escaped sequences ``\\n`` and ``\\t`` back into real newlines and tabs."""
      # Replacement order matches the original chain: newline first, then tab.
      for escaped, raw in (("\\n", "\n"), ("\\t", "\t")):
          text = text.replace(escaped, raw)
      return text
  def answer(text, sample=True, top_p=1, temperature=0.7):
      '''Generate a reply for ``text`` with the module-level model.

      sample: whether to sample; can be set to True for generation tasks.
      top_p: between 0 and 1; passed to generate() when sampling
          (presumably larger values give more varied output - confirm).
      temperature: sampling temperature, passed to generate() when sampling.
      '''
      text = preprocess(text)
      print(len(text))
      encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=768, return_tensors="pt").to(device)

      # Arguments shared by both decoding modes.
      generate_kwargs = dict(return_dict_in_generate=True, output_scores=False, max_new_tokens=512)
      if sample:
          generate_kwargs.update(do_sample=True, top_p=top_p, temperature=temperature, no_repeat_ngram_size=3)
      else:
          generate_kwargs.update(num_beams=1, length_penalty=0.6)

      out = model.generate(**encoding, **generate_kwargs)
      decoded = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
      # Only one prompt was encoded, so the first decoded sequence is the reply.
      return postprocess(decoded[0])
  def rewrite_message(input):
      """Return four independently generated replies for ``input``, each with a
      numbered prefix (starting at 0), separated by blank lines."""
      print("query message:",input)
      candidates = [
          "方案{0}:".format(idx) + answer_message(input)
          for idx in range(4)
      ]
      return "\n\n".join(candidates)
  def answer_message(input):
      """Wrap ``input`` in the user/assistant prompt template and return the reply text."""
      # Newlines inside the user text become full stops so the prompt stays on one line.
      single_line = input.replace("\n", "。")
      prompt = "用户:" + single_line + "\n小智:"
      reply = answer(prompt)
      return "{}".format(reply)
  import gradio as gr

  # Example prompts offered below the input box.
  examples_list = ["example1"]

  # Editable question box in, read-only answer box out.
  _question_box = gr.components.Textbox(lines=10,interactive=True,placeholder="enter your question ...")
  _answer_box = gr.components.Textbox(lines=10,interactive=False)

  synthesis_interface = gr.Interface(
      rewrite_message,
      inputs=_question_box,
      outputs=_answer_box,
      cache_examples=False,
      title="问答",
      examples_per_page=5,
      examples=examples_list,
      live=False,
  )

  # Binds to all network interfaces on port 7860, without a public share link.
  synthesis_interface.launch(share=False,server_name='0.0.0.0',server_port=7860)
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/480089
推荐阅读
相关标签
  

闽ICP备14008679号