The source code below comes from the Chinese-LLaMA-3 project.
def accuracy(predictions, references, normalize=True, sample_weight=None):
    return {
        "accuracy": float(
            accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
        )
    }

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics but we need to shift the labels
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    return accuracy(predictions=preds, references=labels)

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)
The functions above take the values produced by the model's forward pass and compare them with the ground-truth labels of the training samples to compute the evaluation metric; they are not discussed in detail here (for more background, consult ChatGPT).
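To make the shift concrete, here is a minimal standalone sketch (toy tensors invented for illustration, not part of the original script) of why preds[:, :-1] is compared against labels[:, 1:]: in causal language modeling, the prediction at position i is the model's guess for the token at position i+1.

import torch

# Toy batch of one sequence with four token ids (hypothetical values).
labels = torch.tensor([[10, 11, 12, 13]])
# preprocess_logits_for_metrics has already reduced the logits to argmax token ids;
# suppose the model predicted the next token correctly at every position.
preds = torch.tensor([[11, 12, 13, 99]])  # position i holds the prediction for token i+1

shifted_labels = labels[:, 1:].reshape(-1)  # tensor([11, 12, 13])
shifted_preds = preds[:, :-1].reshape(-1)   # tensor([11, 12, 13])
print((shifted_preds == shifted_labels).float().mean())  # tensor(1.) -> 100% token accuracy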
def fault_tolerance_data_collator(features: List) -> Dict[str, Any]:
    if not isinstance(features[0], Mapping):
        features = [vars(f) for f in features]
    first = features[0]
    batch = {}

    # Special handling for labels.
    # Ensure that tensor is created with the correct type
    # (it should be automatically the case, but let's make sure of it.)
    if "label" in first and first["label"] is not None:
        label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
        dtype = torch.long if isinstance(label, int) else torch.float
        batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
    elif "label_ids" in first and first["label_ids"] is not None:
        if isinstance(first["label_ids"], torch.Tensor):
            batch["labels"] = torch.stack([f["label_ids"] for f in features])
        else:
            dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float
            batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)

    # Handling of all other possible keys.
    # Again, we will use the first element to figure out which key/values are not None for this model.
    try:
        for k, v in first.items():
            if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
                if isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features])
                elif isinstance(v, np.ndarray):
                    batch[k] = torch.tensor(np.stack([f[k] for f in features]))
                else:
                    batch[k] = torch.tensor([f[k] for f in features])
    except ValueError:  # quick fix by simply take the first example
        for k, v in first.items():
            if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
                if isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([features[0][k]] * len(features))
                elif isinstance(v, np.ndarray):
                    batch[k] = torch.tensor(np.stack([features[0][k]] * len(features)))
                else:
                    batch[k] = torch.tensor([features[0][k]] * len(features))
    return batch
The collator above handles batch-level data preprocessing.
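As a quick illustration (a minimal sketch with made-up feature dicts, assuming the collator and its imports from the script are available), the collator takes a list of tokenized examples and returns stacked tensors:

# Two hypothetical pre-tokenized examples of equal length, as produced by
# the group_texts function further below.
features = [
    {"input_ids": [1, 2, 3, 4], "labels": [1, 2, 3, 4]},
    {"input_ids": [5, 6, 7, 8], "labels": [5, 6, 7, 8]},
]
batch = fault_tolerance_data_collator(features)
print(batch["input_ids"].shape)  # torch.Size([2, 4])
print(batch["labels"].shape)     # torch.Size([2, 4])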
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The tokenizer for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=False,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    low_cpu_mem_usage: bool = field(
        default=False,
        metadata={
            "help": (
                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
                "set True will benefit LLM loading time and RAM consumption."
            )
        },
    )

    def __post_init__(self):
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
MODEL_TYPES → ('bart', 'bert', 'bert-generation', 'big_bird', 'bigbird_pegasus', 'biogpt', 'blenderbot', 'blenderbot-small', 'bloom', 'camembert', 'llama', 'codegen', 'cpmant', 'ctrl', 'data2vec-text', 'electra', 'ernie', 'falcon', 'fuyu', 'gemma', 'git', 'gpt2', 'gpt2', 'gpt_bigcode', 'gpt_neo', 'gpt_neox', 'gpt_neox_japanese', 'gptj', 'llama', 'marian', 'mbart', 'mega', 'megatron-bert', 'mistral', 'mixtral', 'mpt', 'musicgen', 'mvp', 'open-llama', 'openai-gpt', 'opt', 'pegasus', 'persimmon', 'phi', 'plbart', 'prophetnet', 'qdqbert', 'qwen2', 'reformer', 'rembert', 'roberta', 'roberta-prelayernorm', 'roc_bert', 'roformer', 'rwkv', 'speech_to_text_2', 'stablelm', 'transfo-xl', 'trocr', 'whisper', 'xglm', 'xlm', 'xlm-prophetnet', 'xlm-roberta', 'xlm-roberta-xl', 'xlnet', 'xmod')
The key parameters defined in the ModelArguments class are:
model_name_or_path: the model checkpoint name or path used to initialize the weights. If it is not set, the model is trained from scratch.
tokenizer_name_or_path: the tokenizer name or path used for initialization. If it is not set, the model is trained from scratch.
model_type: if training from scratch, a model type must be chosen from the MODEL_TYPES list.
config_overrides: a string used to override some of the default config settings when training from scratch, e.g. to change a few of the model's hyperparameters.
config_name: the pretrained config name or path, used when it differs from model_name.
tokenizer_name: the pretrained tokenizer name or path, used when it differs from model_name.
cache_dir: the directory in which to store pretrained models downloaded from huggingface.co.
use_fast_tokenizer: whether to use a fast tokenizer (backed by the tokenizers library).
model_revision: the specific model version to use (a branch name, tag name, or commit id).
use_auth_token: whether to use the token generated by running huggingface-cli login (required for private models).
torch_dtype: override the default torch.dtype and load the model in this dtype. If auto is passed, the dtype is derived automatically from the model's weights.
low_cpu_mem_usage: whether to create the model as an empty shell and only materialize its parameters once the pretrained weights are loaded. Setting it to True reduces loading time and RAM consumption for large language models (LLMs).
The __post_init__ method is a special method that is called right after the dataclass is instantiated. Here it checks whether config_overrides is used together with config_name or model_name_or_path and raises a ValueError if so, because that combination would lead to conflicting configurations.
This class is normally used together with a command-line parsing library (such as argparse) so that these parameters can be read and parsed from the command line and then used to configure and initialize the model.
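For illustration, here is a minimal sketch (argument values are hypothetical, and it assumes the ModelArguments dataclass above is in scope) of how HfArgumentParser fills such a dataclass from command-line style arguments:

from transformers import HfArgumentParser

parser = HfArgumentParser(ModelArguments)
# parse_args_into_dataclasses accepts an explicit argument list instead of reading sys.argv.
(model_args,) = parser.parse_args_into_dataclasses(
    args=["--model_name_or_path", "meta-llama/Meta-Llama-3-8B", "--torch_dtype", "bfloat16"]
)
print(model_args.model_name_or_path, model_args.torch_dtype)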
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_dir: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[float] = field(
        default=0.05,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )
    data_cache_dir: Optional[str] = field(default="./", metadata={"help": "The datasets processed stored"})

    def __post_init__(self):
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
The key parameters defined in the DataTrainingArguments class are:
dataset_dir: the name of the dataset to use (via the datasets library).
dataset_config_name: the configuration name of the dataset to use (via the datasets library).
train_file: the input training data file (a text file).
validation_file: an optional input evaluation data file used to evaluate perplexity (a text file).
max_train_samples: for debugging or faster training; if set, the number of training examples is truncated to this value.
max_eval_samples: for debugging or faster training; if set, the number of evaluation examples is truncated to this value.
streaming: whether to enable streaming mode.
block_size: optional input sequence length after tokenization. The training dataset is cut into blocks of this size for training. Defaults to the model's maximum input length for single-sentence inputs (taking special tokens into account).
overwrite_cache: whether to overwrite the cached training and evaluation sets.
validation_split_percentage: the fraction of the training set used as the validation set when there is no separate validation split.
preprocessing_num_workers: the number of processes to use for preprocessing.
keep_linebreaks: whether to keep line breaks when using TXT files.
data_cache_dir: the directory in which the processed datasets are stored.
The __post_init__ method is called right after the dataclass is instantiated. Here it checks whether streaming mode is enabled and, if so, verifies that the installed datasets library is at least version 2.0.0, since the streaming feature requires datasets>=2.0.0.
These arguments are used to configure and load the datasets: they let the user specify the location, format, and size of the training and evaluation data, and how that data should be processed.
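A minimal sketch (paths and values are hypothetical, assuming the DataTrainingArguments dataclass above is in scope) of how these fields are typically filled and how __post_init__ enforces the datasets version when streaming is requested:

# dataset_dir is expected to contain *.txt files (see the processing loop further below),
# and data_cache_dir is where the processed Arrow datasets are stored.
data_args = DataTrainingArguments(
    dataset_dir="./pretrain_corpus",
    data_cache_dir="./pretrain_cache",
    block_size=1024,
    validation_split_percentage=0.05,
)
# With streaming=True, __post_init__ calls require_version("datasets>=2.0.0", ...),
# which raises an error if the installed datasets library is too old.
streaming_args = DataTrainingArguments(dataset_dir="./pretrain_corpus", streaming=True)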
@dataclass
class MyTrainingArguments(TrainingArguments):
    trainable: Optional[str] = field(default="q_proj,v_proj")
    lora_rank: Optional[int] = field(default=8)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_alpha: Optional[float] = field(default=32.)
    modules_to_save: Optional[str] = field(default=None)
    debug_mode: Optional[bool] = field(default=False)
    peft_path: Optional[str] = field(default=None)
    use_flash_attention_2: Optional[bool] = field(default=False)
    double_quant: Optional[bool] = field(default=True)
    quant_type: Optional[str] = field(default="nf4")
    load_in_kbits: Optional[int] = field(default=16)
    full_finetuning: Optional[bool] = field(default=False)
Because it inherits from TrainingArguments, MyTrainingArguments keeps all of those parameters and adds its own.
The key parameters defined in the MyTrainingArguments class are:
trainable: the trainable (LoRA target) layers, defaulting to "q_proj,v_proj".
lora_rank: the rank used for Low-Rank Adaptation (LoRA), defaulting to 8.
lora_dropout: the LoRA dropout rate, defaulting to 0.1.
lora_alpha: the LoRA alpha (scaling) parameter, defaulting to 32.0.
modules_to_save: the list of modules to save, defaulting to None.
debug_mode: whether to enable debug mode, defaulting to False.
peft_path: the path to an existing PEFT (Parameter-Efficient Fine-Tuning) adapter, defaulting to None.
use_flash_attention_2: whether to use FlashAttention-2, defaulting to False.
double_quant: whether to use double quantization, defaulting to True.
quant_type: the quantization type, defaulting to "nf4".
load_in_kbits: the bit width in which to load the model, defaulting to 16.
full_finetuning: whether to perform full-parameter fine-tuning, defaulting to False.
These parameters give fine-grained control over the training process and let users adjust it to their specific needs: debug_mode turns debugging features on or off, while full_finetuning controls whether the entire model is fine-tuned.
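To give a sense of what lora_rank controls, here is a small standalone sketch (a 4096 x 4096 attention projection is assumed purely for illustration) that computes how many extra trainable parameters LoRA adds per targeted weight matrix:

def lora_params_per_matrix(d_out: int, d_in: int, rank: int) -> int:
    # LoRA learns the weight update as B @ A, where A has shape (rank, d_in) and
    # B has shape (d_out, rank), while the original weight W stays frozen.
    return rank * d_in + d_out * rank

# Assumed dimensions for illustration: a 4096 x 4096 attention projection.
print(lora_params_per_matrix(4096, 4096, rank=8))   # 65536 extra parameters per matrix
print(lora_params_per_matrix(4096, 4096, rank=64))  # 524288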
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, MyTrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
    # If we pass only one argument to the script and it's the path to a json file,
    # let's parse it to get our arguments.
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
send_example_telemetry("run_clm", model_args, data_args)
This creates an HfArgumentParser instance, which inherits from argparse.ArgumentParser. The instance is used to parse command-line arguments and can handle the parameters defined across several dataclasses at once.
If a JSON file is provided, the parameters are parsed from that JSON file; otherwise, parser.parse_args_into_dataclasses parses the command-line arguments. The parsed values are stored in the model_args, data_args, and training_args variables. The project's .sh launch script uses the second path and passes the arguments directly.
send_example_telemetry sends anonymous telemetry about how the transformers examples are used back to Hugging Face, which helps the Hugging Face team understand how users work with the library and its examples so they can make better decisions and optimizations.
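A minimal sketch of the JSON path (the file name and values are hypothetical; the project's .sh scripts use the direct command-line path instead):

import json

# Hypothetical config file; the keys must match fields of the three dataclasses.
with open("run_pt.json", "w") as f:
    json.dump(
        {
            "model_name_or_path": "meta-llama/Meta-Llama-3-8B",
            "dataset_dir": "./pretrain_corpus",
            "output_dir": "./output_pt",
            "lora_rank": 8,
        },
        f,
    )

model_args, data_args, training_args = parser.parse_json_file(json_file="run_pt.json")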
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,  # if training_args.local_rank in [-1, 0] else logging.WARN,
    handlers=[logging.StreamHandler(sys.stdout)],
)
if training_args.should_log:
    # The default of training_args.log_level is passive, so we set log level at info here to have that default.
    transformers.utils.logging.set_verbosity_info()
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", ...): configures the logging module. The format argument defines the layout of each log record (timestamp, log level, logger name, and message), and datefmt defines the date/time format.
level=logging.INFO,  # if training_args.local_rank in [-1, 0] else logging.WARN: sets the log level. The commented-out condition shows the intent: log at INFO level only on the main process (local_rank of -1 or 0) and at WARN level otherwise; as written, the level is always INFO.
handlers=[logging.StreamHandler(sys.stdout)]: sets the log handler so that log output goes to standard output (usually the terminal).
if training_args.should_log:: checks whether training_args.should_log is True; if it is, the next line is executed.
transformers.utils.logging.set_verbosity_info(): sets the transformers library's log level to INFO.
logger.setLevel(log_level): sets the script's logger to log_level.
datasets.utils.logging.set_verbosity(log_level): sets the datasets library's log level to log_level.
transformers.utils.logging.set_verbosity(log_level): sets the transformers library's log level to log_level.
transformers.utils.logging.enable_default_handler(): enables the transformers library's default log handler.
transformers.utils.logging.enable_explicit_format(): enables the transformers library's explicit log format.
logger.warning(f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"): logs a warning summarizing the process rank, device, number of GPUs, whether distributed training is enabled, and whether 16-bit training is used.
logger.info(f"Training/evaluation parameters {training_args}"): logs the training/evaluation parameters.
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )
# Set a random seed so that the randomness in model training is reproducible.
# Set seed before initializing model.
set_seed(training_args.seed)
config_kwargs = {
    "cache_dir": model_args.cache_dir,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.config_name:
    config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
elif model_args.model_name_or_path:
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
    config = CONFIG_MAPPING[model_args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")
    if model_args.config_overrides is not None:
        logger.info(f"Overriding config: {model_args.config_overrides}")
        config.update_from_string(model_args.config_overrides)
        logger.info(f"New config: {config}")

tokenizer_kwargs = {
    "cache_dir": model_args.cache_dir,
    "use_fast": model_args.use_fast_tokenizer,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
elif model_args.tokenizer_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path, **tokenizer_kwargs)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )
# Preprocessing the datasets.
# First we tokenize all the texts.
# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        output = tokenizer(examples["text"])
    # clm input could be much much longer than block_size
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )
    return output

if data_args.block_size is None:
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
            " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
            " override this default with `--block_size xxx`."
        )
        block_size = 1024
else:
    if data_args.block_size > tokenizer.model_max_length:
        logger.warning(
            f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
            f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
        )
    block_size = min(data_args.block_size, tokenizer.model_max_length)

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
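A worked example of what group_texts does (a standalone demo with a tiny block size and hypothetical token ids): all texts in a batch are concatenated, the tail that does not fill a complete block is dropped, and labels is a copy of input_ids.

from itertools import chain

demo_block_size = 4  # tiny value for illustration only; the script uses 1024 or the model maximum

def group_texts_demo(examples):
    # Same logic as group_texts above, with a small block size.
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // demo_block_size) * demo_block_size
    result = {
        k: [t[i:i + demo_block_size] for i in range(0, total_length, demo_block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

print(group_texts_demo({"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]}))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]], 'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}
# Token 9, the remainder after the last full block, is dropped.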
with training_args.main_process_first(desc="dataset map tokenization and grouping"):
    lm_datasets = []
    path = Path(data_args.dataset_dir)
    files = [file.name for file in path.glob("*.txt")]
    if training_args.debug_mode is True:
        files = [files[0]]
    for idx, file in enumerate(files):
        data_file = os.path.join(path, file)
        filename = ''.join(file.split(".")[:-1])
        cache_path = os.path.join(data_args.data_cache_dir, filename + f"_{block_size}")
        os.makedirs(cache_path, exist_ok=True)
        try:
            processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
            logger.info(f'training datasets-{filename} has been loaded from disk')
        except Exception:
            cache_dir = os.path.join(data_args.data_cache_dir, filename + f"_text_{block_size}")
            os.makedirs(cache_dir, exist_ok=True)
            raw_dataset = load_dataset("text", data_files=data_file, cache_dir=cache_dir, keep_in_memory=False)
            logger.info(f"{file} has been loaded")
            tokenized_dataset = raw_dataset.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns="text",
                load_from_cache_file=True,
                keep_in_memory=False,
                cache_file_names={k: os.path.join(cache_dir, 'tokenized.arrow') for k in raw_dataset},
                desc="Running tokenizer on dataset",
            )
            grouped_datasets = tokenized_dataset.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=True,
                keep_in_memory=False,
                cache_file_names={k: os.path.join(cache_dir, 'grouped.arrow') for k in tokenized_dataset},
                desc=f"Grouping texts in chunks of {block_size}",
            )
            processed_dataset = grouped_datasets
            processed_dataset.save_to_disk(cache_path)
        if idx == 0:
            lm_datasets = processed_dataset['train']
        else:
            assert lm_datasets.features.type == processed_dataset["train"].features.type
            lm_datasets = concatenate_datasets([lm_datasets, processed_dataset["train"]])
    lm_datasets = lm_datasets.train_test_split(test_size=data_args.validation_split_percentage)
if training_args.do_train:
    train_dataset = lm_datasets['train']
    if data_args.max_train_samples is not None:
        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
        train_dataset = train_dataset.select(range(max_train_samples))
    logger.info(f"Num train_samples {len(train_dataset)}")
    logger.info("Training example:")
    logger.info(tokenizer.decode(train_dataset[0]['input_ids']))
if training_args.do_eval:
    eval_dataset = lm_datasets["test"]
    if data_args.max_eval_samples is not None:
        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
        eval_dataset = eval_dataset.select(range(max_eval_samples))
    logger.info(f"Num eval_samples {len(eval_dataset)}")
    logger.info("Evaluation example:")
    logger.info(tokenizer.decode(eval_dataset[0]['input_ids']))
compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
if training_args.load_in_kbits in [4, 8]:
    if training_args.modules_to_save is not None:
        load_in_8bit_skip_modules = training_args.modules_to_save.split(',')
    else:
        load_in_8bit_skip_modules = None
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=training_args.load_in_kbits == 4,
        load_in_8bit=training_args.load_in_kbits == 8,
        llm_int8_threshold=6.0,
        load_in_8bit_skip_modules=load_in_8bit_skip_modules,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=training_args.double_quant,
        bnb_4bit_quant_type=training_args.quant_type  # {'fp4', 'nf4'}
    )
else:
    quantization_config = None
if quantization_config is not None:
    logger.info(f"quantization_config:{quantization_config.to_dict()}")
This sets up the model quantization configuration from the training arguments and logs the full quantization configuration.
training_args.modules_to_save indicates that certain specific modules need to be saved.
load_in_8bit_skip_modules specifies which modules should be skipped when loading the 8-bit quantized model, either because those modules would cause problems during quantization or to keep them at their original precision.
if model_args.model_name_or_path:
    torch_dtype = (
        model_args.torch_dtype
        if model_args.torch_dtype in ["auto", None]
        else getattr(torch, model_args.torch_dtype)
    )
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=model_args.low_cpu_mem_usage,
        device_map=device_map,
        quantization_config=quantization_config,
        attn_implementation="flash_attention_2" if training_args.use_flash_attention_2 else "sdpa"
    )
else:
    model = AutoModelForCausalLM.from_config(config)
    n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
    logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
if training_args.load_in_kbits in [4, 8]:
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
model.config.use_cache = False
model_vocab_size = model.get_output_embeddings().weight.size(0)
tokenizer_vocab_size = len(tokenizer)
logger.info(f"Model vocab size: {model_vocab_size}")
logger.info(f"Tokenizer vocab size: {tokenizer_vocab_size}")
if model_vocab_size != tokenizer_vocab_size:
    logger.info(f"Resize model vocab size to {tokenizer_vocab_size}")
    model.resize_token_embeddings(len(tokenizer))
if not training_args.full_finetuning:
    if training_args.peft_path is not None:
        logger.info("Peft from pre-trained model")
        model = PeftModel.from_pretrained(model, training_args.peft_path, device_map=device_map, is_trainable=True)
    else:
        logger.info("Init new peft model")
        target_modules = training_args.trainable.split(',')
        modules_to_save = training_args.modules_to_save
        if modules_to_save is not None:
            modules_to_save = modules_to_save.split(',')
        lora_rank = training_args.lora_rank
        lora_dropout = training_args.lora_dropout
        lora_alpha = training_args.lora_alpha
        logger.info(f"target_modules: {target_modules}")
        logger.info(f"lora_rank: {lora_rank}")
        logger.info(f"modules_to_save: {modules_to_save}")
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=target_modules,
            inference_mode=False,
            r=lora_rank, lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            modules_to_save=modules_to_save)
        model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=fault_tolerance_data_collator,
    compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
    if training_args.do_eval and not is_torch_xla_available()
    else None,
)
The data_collator argument is a data-collator instance that packs samples and labels into batches in the format the model expects.
preprocess_logits_for_metrics preprocesses the model outputs before the evaluation metrics are computed.
This builds the trainer from the model, the training arguments, the datasets, the tokenizer, the data collator, the metric function, and the logits preprocessing function.
# Training
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload
    metrics = train_result.metrics
    max_train_samples = (
        data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
# Evaluation
if training_args.do_eval:
    logger.info("*** Evaluate ***")
    metrics = trainer.evaluate()
    max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
If the training arguments request evaluation, the model's perplexity on the validation set is computed and the evaluation results are logged.
Question: how is the pre-training step set up, and how does it differ from other fine-tuning approaches?
My tentative understanding: the usual fine-tuning approach implements the dialogue capability through prompt templates and computes the loss on the model's outputs for those fixed inputs, whereas this script simply feeds whole documents into the model and does not need the large model's outputs for fixed inputs.
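For reference, perplexity is simply the exponential of the evaluation loss, so the conversion is a one-liner (the numbers below are illustrative):

import math

eval_loss = 2.0  # hypothetical value reported by trainer.evaluate() as metrics["eval_loss"]
print(math.exp(eval_loss))  # 7.389..., the corresponding perplexity

try:
    perplexity = math.exp(1000.0)  # an absurdly large loss overflows the float range
except OverflowError:
    perplexity = float("inf")
print(perplexity)  # inf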