pip install -q -U bitsandbytes
pip install -q -U git+https://github.com/huggingface/transformers.git
pip install -q -U git+https://github.com/huggingface/peft.git
pip install -q -U git+https://github.com/huggingface/accelerate.git
pip install trl
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
Since we are fine-tuning the base model, there is no strict prompt template we have to follow. The dataset we use is already formatted with the Llama 3 chat template, so it should work well for downstream tasks that use the Llama 3 chat format. If you bring your own data, you can define whatever format you like, as long as you use the same format in your downstream tasks.
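If you do bring your own data, here is a minimal sketch (not part of the original tutorial) of converting it into the same single-"text"-field, Llama 3 chat format that guanaco-llama3-1k uses. The column names "instruction" and "response" are hypothetical placeholders for your own fields.

from datasets import Dataset

# Hypothetical raw examples; replace with your own columns.
raw_examples = [
    {"instruction": "What is LoRA?",
     "response": "LoRA is a parameter-efficient fine-tuning method."},
]

def to_llama3_text(example):
    # Build a single "text" string that follows the Llama 3 chat template.
    return {
        "text": (
            "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
            f"{example['instruction']}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{example['response']}<|eot_id|>"
        )
    }

custom_dataset = Dataset.from_list(raw_examples).map(to_llama3_text)
print(custom_dataset[0]["text"])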
base_model_id = "meta-llama/Meta-Llama-3-8B"
dataset_name = "scooterman/guanaco-llama3-1k"
new_model = "llama3-8b-SFT"

from datasets import load_dataset

dataset = load_dataset(dataset_name, split="train")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
Many tutorials simply paste a list of parameters and leave the reader to figure out what each one does. Below, I have added comments explaining what each parameter does!
# Output directory where the results and checkpoints are stored
output_dir = "./results"

# Number of training epochs - how many times the model sees the whole dataset
num_train_epochs = 1  # Increase this for a larger finetune

# Enable fp16/bf16 training. This is the dtype used for training. Since we are on an A100
# we can set bf16 to True because it supports that type of computation.
bf16 = True

# Batch size is the number of training examples used in a single forward and backward pass.
per_device_train_batch_size = 4

# Gradients are accumulated over multiple mini-batches before updating the model weights.
# This effectively trains with a larger batch size on hardware with limited memory
# (here the effective batch size per GPU is 4 * 2 = 8).
gradient_accumulation_steps = 2

# Memory optimization technique that reduces RAM usage during training by recomputing
# intermediate activations instead of retaining them throughout the entire forward pass,
# trading computation time for lower memory consumption.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs)
max_steps = 5

# Ratio of steps for a linear warmup (from 0 to the learning rate)
warmup_ratio = 0.03

# Group sequences of similar length into the same batch.
# Saves memory and speeds up training considerably.
group_by_length = True

# Save a checkpoint every X update steps
save_steps = 100

# Log every X update steps
logging_steps = 5
Set up a Weights & Biases (wandb) account to monitor this fine-tuning run.
pip install wandb
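A minimal sketch of logging in so the training run below can report metrics; the project name is an arbitrary placeholder, not from the original text, and the explicit wandb.init call is optional because the Trainer also creates a run automatically when report_to="wandb" is set.

import wandb

# Prompts for (or reads from the environment) your W&B API key.
wandb.login()

# Optional: "llama3-8b-sft" is just a placeholder project name.
wandb.init(project="llama3-8b-sft")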
import wandb

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    report_to="wandb",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

# Save the trained model
trainer.model.save_pretrained(new_model)
To deploy this model for extremely fast inference, we use vLLM and host an OpenAI-compatible endpoint. You may need to restart the kernel before running the cells below.
pip install vllm
python -O -u -m vllm.entrypoints.openai.api_server \
--host=127.0.0.1 \
--port=8000 \
--model=brev-llama3-8b-SFT \
--tokenizer=meta-llama/Meta-Llama-3-8B \
--tensor-parallel-size=2
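Once the server is up, here is a minimal sketch of querying it with the openai Python client (pip install openai; this client is not part of the original tutorial). The model name must match the --model value passed to vLLM above, and since we fine-tuned a base model on Llama 3-formatted text, the prompt follows the same chat format as the training data.

from openai import OpenAI

# vLLM's OpenAI-compatible server does not check the API key, but the client requires one.
client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")

prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)

completion = client.completions.create(
    model="brev-llama3-8b-SFT",
    prompt=prompt,
    max_tokens=128,
)
print(completion.choices[0].text)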
Prompt structure for conversations with the Instruct version:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>
{{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{{ model_answer_1 }}<|eot_id|>
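As a quick check (a sketch, assuming you have access to the gated meta-llama/Meta-Llama-3-8B-Instruct repository), the Instruct tokenizer's built-in chat template produces exactly this structure; the example messages are placeholders.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# tokenize=False returns the raw prompt string; add_generation_prompt=True appends
# the assistant header so the model knows to start its reply.
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))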
Running it requires about 16 GB of GPU memory, which includes consumer GPUs such as the RTX 3090 or 4090.
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])
Quantized version: loading in 4-bit requires only about 7 GB of memory to run.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={
        "torch_dtype": torch.float16,
        "quantization_config": {"load_in_4bit": True},
        "low_cpu_mem_usage": True,
    },
)
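As a side note (my addition, not stated in the original text): the dict-style quantization_config above is shorthand for passing an explicit BitsAndBytesConfig, which was already imported in the fine-tuning section. An equivalent sketch:

import torch
import transformers
from transformers import BitsAndBytesConfig

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # same model as above

# Equivalent to {"load_in_4bit": True}; other 4-bit options keep their defaults.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={
        "torch_dtype": torch.float16,
        "quantization_config": bnb_config,
        "low_cpu_mem_usage": True,
    },
)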