
Qwen Source Code Walkthrough: finetune.py

# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.
from dataclasses import dataclass, field
import json
import math
import logging
import os
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import Trainer, GPTQConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

IGNORE_TOKEN_ID = LabelSmoother.ignore_index


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: str = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = False


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    use_lora: bool = False


@dataclass
class LoraArguments:
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = field(
        default_factory=lambda: ["c_attn", "c_proj", "w1", "w2"]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False


def maybe_zero_3(param):
    if hasattr(param, "ds_id"):
        assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param


# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                bias_name = k.split("lora_")[0] + "bias"
                lora_bias_names.add(bias_name)
            elif "bias" in k:
                maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
    return to_return


local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"):
    """Collects the state dict and dump to disk."""
    # check if zero3 mode enabled
    if deepspeed.is_deepspeed_zero3_enabled():
        state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
    else:
        if trainer.args.use_lora:
            state_dict = get_peft_state_maybe_zero_3(
                trainer.model.named_parameters(), bias
            )
        else:
            state_dict = trainer.model.state_dict()
    if trainer.args.should_save and trainer.args.local_rank == 0:
        trainer._save(output_dir, state_dict=state_dict)


def preprocess(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
    system_message: str = "You are a helpful assistant."
) -> Dict:
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = tokenizer.im_start_id
    im_end = tokenizer.im_end_id
    nl_tokens = tokenizer('\n').input_ids
    _system = tokenizer('system').input_ids + nl_tokens
    _user = tokenizer('user').input_ids + nl_tokens
    _assistant = tokenizer('assistant').input_ids + nl_tokens

    # Apply prompt templates
    input_ids, targets = [], []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        input_id, target = [], []
        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
        input_id += system
        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
        assert len(input_id) == len(target)
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            _input_id = tokenizer(role).input_ids + nl_tokens + \
                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
            input_id += _input_id
            if role == '<|im_start|>user':
                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
            elif role == '<|im_start|>assistant':
                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
            else:
                raise NotImplementedError
            target += _target
        assert len(input_id) == len(target)
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
        input_ids.append(input_id[:max_len])
        targets.append(target[:max_len])
    input_ids = torch.tensor(input_ids, dtype=torch.int)
    targets = torch.tensor(targets, dtype=torch.int)

    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
    )


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
        super(SupervisedDataset, self).__init__()

        rank0_print("Formatting inputs...")
        sources = [example["conversations"] for example in raw_data]
        data_dict = preprocess(sources, tokenizer, max_len)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        self.attention_mask = data_dict["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i],
            attention_mask=self.attention_mask[i],
        )

This is Python code built on the deepspeed, transformers, and peft libraries; it defines the data-, model-, and training-related arguments and configuration. The key pieces are explained below, snippet by snippet:

from deepspeed import zero

  • Imports the zero module from the deepspeed library. DeepSpeed is a deep-learning optimization library, and ZeRO (Zero Redundancy Optimizer) is the feature that shards model states across devices to cut the memory footprint of training.


from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

  • Imports ZeroParamStatus from DeepSpeed's ZeRO runtime. It is an enum describing whether a parameter partitioned under ZeRO stage 3 is currently materialized on the local rank; maybe_zero_3 above checks it before gathering a sharded parameter.
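
A minimal sketch of what that gather pattern looks like in isolation (it assumes a ZeRO-3 engine has already partitioned the model; full_norm is a hypothetical helper, not part of finetune.py):

from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

def full_norm(param):
    # Under ZeRO-3 each rank holds only a shard of the parameter, and the
    # parameter stays flagged NOT_AVAILABLE until gathered, so reading
    # param.data directly would see an empty placeholder tensor.
    if hasattr(param, "ds_id") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
        with zero.GatheredParameters([param]):  # all-gather the shards
            return param.data.norm().item()     # the full tensor is visible here
    return param.data.norm().item()             # plain (non-ZeRO-3) parameter

This is the same pattern maybe_zero_3 uses above to pull a full CPU copy of each parameter before saving.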


import transformers

  • Imports the transformers library, which provides a large collection of pretrained models (BERT, GPT-2, and so on) together with the training utilities used below.


from transformers import Trainer, GPTQConfig, deepspeed

  • Imports Trainer, GPTQConfig, and deepspeed from transformers. Trainer is the class that drives the training loop; GPTQConfig is the configuration for GPTQ quantization (relevant when fine-tuning a quantized model with QLoRA); and the deepspeed submodule is transformers' DeepSpeed integration, whose is_deepspeed_zero3_enabled helper is called in safe_save_model_for_hf_trainer above.
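
For the QLoRA path, the base model is typically loaded with a GPTQ quantization config; a hedged sketch (the exact kwargs follow the usual Qwen q_lora recipe and are not shown in this excerpt):

from transformers import AutoModelForCausalLM, GPTQConfig

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B",
    quantization_config=GPTQConfig(bits=4, disable_exllama=True),  # 4-bit GPTQ weights
    trust_remote_code=True,
)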


from transformers.trainer_pt_utils import LabelSmoother

  • Imports LabelSmoother from transformers' trainer_pt_utils module. LabelSmoother implements label smoothing, a regularization trick that can improve generalization; here it is imported chiefly for its ignore_index constant, used just below.


from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

  • Imports LoraConfig, get_peft_model, and prepare_model_for_kbit_training from peft, Hugging Face's Parameter-Efficient Fine-Tuning library. LoraConfig describes a LoRA adapter (rank, alpha, dropout, target modules), get_peft_model wraps a base model with those adapters, and prepare_model_for_kbit_training readies a quantized (4-/8-bit) model for QLoRA training.
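
A hedged sketch of how these three imports typically come together later in finetune.py (the from_pretrained call and the QLoRA line are assumptions based on the usual Qwen recipe, not shown in this excerpt; the LoraConfig values mirror the LoraArguments defaults above):

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj", "w1", "w2"],  # Qwen attention/MLP projections
    bias="none",
    task_type="CAUSAL_LM",
)

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# Only needed for QLoRA (q_lora=True), i.e. when the base model is quantized:
# model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

model = get_peft_model(model, lora_config)  # attach trainable LoRA adapters
model.print_trainable_parameters()          # only a small fraction is trainable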


IGNORE_TOKEN_ID = LabelSmoother.ignore_index

  • Defines the constant IGNORE_TOKEN_ID as LabelSmoother's ignore_index attribute, which is -100. Positions labeled with this ID are skipped when the loss is computed; preprocess below uses it to mask out the system prompt and user turns so that only the assistant replies contribute to training.
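
A minimal sketch of why -100 works as a mask (the logits and labels here are illustrative, not produced by a real tokenizer or model):

import torch
import torch.nn.functional as F
from transformers.trainer_pt_utils import LabelSmoother

IGNORE_TOKEN_ID = LabelSmoother.ignore_index  # -100

logits = torch.randn(4, 10)  # 4 token positions, vocabulary of 10
labels = torch.tensor([3, IGNORE_TOKEN_ID, 7, IGNORE_TOKEN_ID])

# cross_entropy defaults to ignore_index=-100, so positions 1 and 3
# (the masked prompt tokens) contribute nothing to the loss.
loss = F.cross_entropy(logits, labels)

# Equivalent to averaging only over the unmasked positions:
keep = labels != IGNORE_TOKEN_ID
assert torch.allclose(loss, F.cross_entropy(logits[keep], labels[keep]))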


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")

  • Uses Python's dataclass decorator to define ModelArguments, whose single field model_name_or_path defaults to "Qwen/Qwen-7B" (a Hugging Face Hub model ID or a local checkpoint path).


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: str = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = False

  • Defines DataArguments with dataclass to hold the data-related arguments: the training data path, the evaluation data path, and whether to preprocess lazily (tokenize each example on first access rather than all up front). A sample of the expected file format is sketched below.
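
SupervisedDataset above reads example["conversations"], and preprocess expects turns with "from"/"value" keys, so the JSON file at data_path plausibly looks like the following (a hypothetical two-turn sample, not taken from a real dataset):

import json

raw_data = [
    {
        "conversations": [
            {"from": "user", "value": "What is DeepSpeed ZeRO?"},
            {"from": "assistant",
             "value": "A memory optimization that shards model states across GPUs."},
        ]
    }
]

# Write it out in the shape the training script can load back in.
with open("train.json", "w") as f:
    json.dump(raw_data, f, ensure_ascii=False, indent=2)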


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    use_lora: bool = False

  • Defines TrainingArguments as a subclass of transformers.TrainingArguments, adding a cache directory, the optimizer (adamw_torch by default), the maximum sequence length (8192; sequences are right-padded and possibly truncated), and a use_lora switch on top of the stock training arguments; a parsing sketch follows.
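
These dataclasses (together with LoraArguments) are designed to be parsed from the command line with transformers.HfArgumentParser; a minimal sketch of how the training entry point plausibly consumes them (the flag values are illustrative):

import transformers

parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
)
model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses()

# e.g. invoked as:
#   python finetune.py --model_name_or_path Qwen/Qwen-7B \
#       --data_path train.json --output_dir out --use_lora True
print(model_args.model_name_or_path, training_args.model_max_length)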

In short, this part of the code defines the argument and configuration classes related to the model, the data, and training. I hope this explanation helps!
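
Finally, a hedged end-to-end usage sketch tying the listing together (it assumes the Qwen tokenizer, which exposes the im_start_id/im_end_id attributes preprocess relies on; the eod_id-as-padding line is the usual Qwen convention, not shown in this excerpt):

import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eod_id  # Qwen ships without a pad token

raw_data = json.load(open("train.json"))   # the format sketched earlier
dataset = SupervisedDataset(raw_data, tokenizer=tokenizer, max_len=8192)

sample = dataset[0]
print(sample["input_ids"].shape)       # torch.Size([8192]) after right-padding
print(sample["attention_mask"].sum())  # number of non-padding tokens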
