I have recently been fine-tuning large models of various sizes, including Llama-2-7b, Llama-2-13b, Llama-2-70b, Yi-34b, Qwen-14b, and Qwen-72b. To fine-tune large models on limited resources and save GPU memory, consider LoRA, an algorithm from the paper "LoRA: Low-Rank Adaptation of Large Language Models". Two packages currently implement it: loralib and peft. peft integrates with Hugging Face transformers, which makes it very convenient to use.
A large part of an LLM consists of linear layers. LoRA attaches two low-rank matrices to each linear layer's weight and adds their product to the linear computation, which reduces both the number of fine-tuned parameters and the GPU memory consumption. The rank r of the low-rank factors can be chosen freely and is typically much smaller than the LLM's hidden_size. The LoRA computation can be written as:
y = (w^{T} + \alpha\, lora\_{A}^{T} lora\_{B}^{T})\, x = w^{T} x + \alpha\, lora\_{A}^{T} lora\_{B}^{T} x
where w \in R^{m\times h}, lora\_{A} \in R^{r\times h}, and lora\_{B} \in R^{m\times r}. The rank r can be set by the user; it is usually small and far smaller than m (and h), so the parameter count of lora\_{A} \in R^{r\times h} and lora\_{B} \in R^{m\times r} is much smaller than that of w \in R^{m\times h}. When fine-tuning an LLM with LoRA, only the LoRA parameters attached to each linear layer receive gradient updates (lora\_{A} and lora\_{B} in the expression above); the original model parameters are frozen. This reduces the number of trainable parameters, saves GPU memory, and speeds up training, so LoRA is a good choice when GPU resources are limited. Moreover, lora\_{A}^{T} lora\_{B}^{T} \in R^{h\times m} has exactly the same shape as w^{T}, so after training the product lora\_{A}^{T} lora\_{B}^{T} can be merged into the original weight w^{T}: no new parameters are added to the model, and inference runs exactly as it does for the original model. Sometimes the embedding layer and the lm head also need to be fine-tuned; they can be trained together with LoRA, and peft supports training these two layers.
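As a quick sanity check on the shapes above, here is a minimal, self-contained PyTorch sketch (not the peft implementation; the sizes m, h, r and the scaling alpha are illustrative values chosen here, not taken from the original post) that computes the LoRA forward pass and then merges the low-rank update back into the frozen weight:

import torch

# Hypothetical sizes for illustration only; r is much smaller than m and h.
m, h, r, alpha = 4096, 11008, 8, 16

w = torch.randn(m, h)       # frozen pretrained weight, w in R^{m x h}
lora_A = torch.randn(r, h)  # trainable, lora_A in R^{r x h}
lora_B = torch.zeros(m, r)  # trainable, lora_B in R^{m x r}; zero init => no change at step 0
x = torch.randn(m)          # input, x in R^{m}

# Forward pass from the formula: y = (w^T + alpha * lora_A^T lora_B^T) x
y = w.T @ x + alpha * (lora_A.T @ (lora_B.T @ x))
assert torch.allclose(y, w.T @ x)  # holds because lora_B starts at zero

# The low-rank update has the same shape as w^T, so it can be merged for inference.
w_T_merged = w.T + alpha * (lora_A.T @ lora_B.T)  # R^{h x m}, same shape as w^T
assert w_T_merged.shape == w.T.shape

# Trainable parameters: r*(m + h) for LoRA versus m*h for full fine-tuning.
print(r * (m + h), "LoRA params vs", m * h, "full params")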
Below is an example of fine-tuning Qwen with LoRA, implemented with peft and transformers. Note that the code targets Qwen1, not Qwen2.
import torch
from peft import PeftModel
from peft import LoraConfig
from peft import get_peft_model
from peft.tuners.lora import LoraLayer
from modeling_qwen import QWenLMHeadModel


def load_qwen_model_lora(pretrain_model_path,
                         use_gradient_checkpointing,
                         lora_r,
                         lora_alpha,
                         lora_dropout,
                         bf16=False,
                         fp16=False,
                         checkpoint_dir=None):
    # Load the base Qwen model in the requested dtype.
    model = QWenLMHeadModel.from_pretrained(
        pretrain_model_path,
        torch_dtype=torch.bfloat16 if bf16 else torch.float16 if fp16 else torch.float32)
    print("loading model")
    model.config.torch_dtype = (torch.float16 if fp16 else
                                (torch.bfloat16 if bf16 else torch.float32))
    if use_gradient_checkpointing:
        model.gradient_checkpointing_enable()
        print("using gradient_checkpointing_enable")
        model.config.use_cache = False

    # Attach LoRA to every linear layer found in the model.
    target_modules = find_all_linear_names(model)
    print("linear layer names in the model:", target_modules)
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    if checkpoint_dir is not None:
        print("loading checkpoint to resume training.")
        model = PeftModel.from_pretrained(model, checkpoint_dir)
        for name, param in model.named_parameters():
            if 'lora_' in name:
                param.requires_grad = True
    else:
        print('adding LoRA modules...')
        model = get_peft_model(model, config)

    # Cast LoRA layers, layer norms, lm_head and embeddings to suitable dtypes.
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if bf16:
                module = module.to(torch.bfloat16)
        if 'ln' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'wte' in name:
            if hasattr(module, 'weight'):
                if bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)
    return model


def find_all_linear_names(model):
    # Collect the (short) names of all torch.nn.Linear modules as LoRA targets.
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    print("lora name:", lora_module_names)
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
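A hypothetical call might look like the following; the checkpoint path and LoRA hyperparameters below are placeholders, not values from the original post:

model = load_qwen_model_lora(
    pretrain_model_path="/path/to/Qwen-14B",  # placeholder path
    use_gradient_checkpointing=True,
    lora_r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bf16=True,
)
model.print_trainable_parameters()  # peft utility: reports how few parameters are trainable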
The model is then trained with the transformers Trainer; DeepSpeed or DDP can also be used for training.
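A minimal sketch of wiring the LoRA model into the transformers Trainer follows; the dataset, data collator, and all hyperparameter values are placeholders, and a DeepSpeed config could be passed via the deepspeed argument of TrainingArguments:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="qwen_lora_out",           # placeholder output directory
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=3,
    bf16=True,
    logging_steps=10,
    save_steps=500,
    # deepspeed="ds_config.json",         # optional: enable DeepSpeed with a config file
)

trainer = Trainer(
    model=model,                          # the PeftModel returned by load_qwen_model_lora
    args=training_args,
    train_dataset=train_dataset,          # placeholder: your tokenized dataset
    data_collator=data_collator,          # placeholder: e.g. a causal-LM collator
)
trainer.train()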
Next, training the embedding and lm head layers during LoRA fine-tuning. If the model's vocabulary has been extended, these two layers need to be trained. This only requires specifying them in the LoraConfig (via modules_to_save).
def load_qwen_model_lora(pretrain_model_path,
                         use_gradient_checkpointing,
                         lora_r,
                         lora_alpha,
                         lora_dropout,
                         bf16=False,
                         fp16=False,
                         checkpoint_dir=None,
                         finetuning_embedding_and_lm_head=True):
    model = QWenLMHeadModel.from_pretrained(
        pretrain_model_path,
        torch_dtype=torch.bfloat16 if bf16 else torch.float16 if fp16 else torch.float32)
    print("loading model")
    model.config.torch_dtype = (torch.float16 if fp16 else
                                (torch.bfloat16 if bf16 else torch.float32))
    if use_gradient_checkpointing:
        model.gradient_checkpointing_enable()
        print("using gradient_checkpointing_enable")
        model.config.use_cache = False
    target_modules = find_all_linear_names(model)
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        # Also save (and train) the lm head and the token embedding.
        modules_to_save=['base_model.model.lm_head', 'base_model.model.transformer.wte']
    )
    if checkpoint_dir is not None:
        print("loading adapters from checkpoint.")
        model = PeftModel.from_pretrained(model, checkpoint_dir)
        for name, param in model.named_parameters():
            if 'lora_' in name:
                param.requires_grad = True
    else:
        print('adding LoRA modules...')
        model = get_peft_model(model, config)
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if bf16:
                module = module.to(torch.bfloat16)
        if 'ln' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'wte' in name:
            if hasattr(module, 'weight'):
                if bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)
    # Decide whether the lm head and the embedding are trained.
    if finetuning_embedding_and_lm_head:
        for name, param in model.named_parameters():
            if 'lm_head' in name or 'wte' in name:
                param.requires_grad = True
    return model
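After building the model, one can verify that the embedding and lm head are trainable alongside the LoRA weights; this small check is only an illustration and is not part of the original code:

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print("lm_head / wte trainable:",
      [n for n in trainable if 'lm_head' in n or 'wte' in n])
print("number of trainable LoRA tensors:",
      sum('lora_' in n for n in trainable))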
QLoRA combines model quantization with LoRA to reduce GPU memory usage further and improve training efficiency; it can be seen as an optimization of LoRA. A Qwen example follows. In practice, 4-bit quantization is used for fine-tuning.
import os
import torch
from peft import PeftModel
from peft import LoraConfig
from peft import get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
from modeling_qwen import QWenLMHeadModel


def load_qwen_model_qlora(pretrain_model_path,
                          use_gradient_checkpointing,
                          bits,
                          double_quant,
                          quant_type,
                          lora_r,
                          lora_alpha,
                          lora_dropout,
                          checkpoint_dir=None,
                          bf16=False,
                          fp16=False,
                          finetuning_embedding_and_lm_head=False):
    # To fine-tune the embedding and lm head together with LoRA there are two options:
    # either go through the branch below, or add the layer names to modules_to_save in LoraConfig.
    model_dict = None
    if finetuning_embedding_and_lm_head:
        replace_peft_save_pretrined()                             # author's helper, not shown here
        model_dict = load_model_state_dict(pretrain_model_path)   # author's helper, not shown here
        if checkpoint_dir is not None:
            finetuning_checkpoint_file = os.path.join(checkpoint_dir, 'embedding_and_lm_head.pt')
            if os.path.exists(finetuning_checkpoint_file):
                # Load the embedding / lm head weights saved at the checkpoint
                # and copy them into the base state dict.
                finetuning_state = torch.load(finetuning_checkpoint_file,
                                              map_location=lambda storage, loc: storage)
                for n, p in finetuning_state.items():
                    n = n.replace('base_model.model.', '')
                    if n in model_dict:
                        model_dict[n] = p
                    else:
                        raise NameError

    weight_dtype = torch.bfloat16 if bf16 else torch.float16 if fp16 else torch.float32
    model = QWenLMHeadModel.from_pretrained(
        pretrain_model_path,
        torch_dtype=weight_dtype,
        state_dict=model_dict,
        load_in_4bit=bits == 4,
        load_in_8bit=bits == 8,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=bits == 4,
            load_in_8bit=bits == 8,
            bnb_4bit_compute_dtype=weight_dtype,
            bnb_4bit_use_double_quant=double_quant,
            bnb_4bit_quant_type=quant_type
        ),
    )
    print("loading model")
    model = prepare_model_for_kbit_training(model,
                                            use_gradient_checkpointing=use_gradient_checkpointing)
    # Bits-aware variant of find_all_linear_names: with 4/8-bit loading the linear layers
    # are bitsandbytes modules rather than torch.nn.Linear.
    target_modules = find_all_linear_names(bits, model)
    print("linear layer names in the model:", target_modules)
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        # modules_to_save=['base_model.model.lm_head', 'base_model.model.transformer.wte']
    )
    if checkpoint_dir is not None:
        print("loading adapters from checkpoint.")
        model = PeftModel.from_pretrained(model, checkpoint_dir)
    else:
        print('adding LoRA modules...')
        model = get_peft_model(model, config)
    for name, param in model.named_parameters():
        if finetuning_embedding_and_lm_head:
            if 'lm_head' in name or 'wte' in name:
                param.requires_grad = True
    return model
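A hypothetical 4-bit call might look like this; the path is a placeholder, while double quantization and the "nf4" quant type are standard bitsandbytes options that map to bnb_4bit_use_double_quant and bnb_4bit_quant_type above:

model = load_qwen_model_qlora(
    pretrain_model_path="/path/to/Qwen-14B",  # placeholder path
    use_gradient_checkpointing=True,
    bits=4,                  # 4-bit quantization, as used in practice
    double_quant=True,       # bnb_4bit_use_double_quant
    quant_type="nf4",        # bnb_4bit_quant_type
    lora_r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bf16=True,
)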
This concludes the introduction to using LoRA and QLoRA. If anything here is misunderstood, corrections are welcome.