
Fine-tuning ChatGLM2-6B on a single GPU with QLoRA and mixed precision, and fixing the problem of the QLoRA loss turning into nan!


I recently changed jobs and my new work involves large models, so I found some time to run the ChatGLM2-6B demo and fine-tune the model with QLoRA and LoRA.

This is a quick write-up to document the process; it doubles as a simple tutorial and also covers a pit I stepped into: QLoRA training becoming unstable with the loss turning into nan.

This tutorial does not explain how LoRA works internally; look that up separately if you need it.

1. I had already downloaded the ChatGLM2-6B model from HuggingFace onto the server, because the server cannot reach HuggingFace directly. I put it under /data/tmp/chatGLM2_6b_pretrain; it contains the model weights plus a few config files, all downloaded straight from the HuggingFace repository (a download sketch follows).
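
If your machine (or a mirror endpoint) can reach HuggingFace, a minimal download sketch, assuming the huggingface_hub package is installed and using the public THUDM/chatglm2-6b repo id:

# Hypothetical download sketch: assumes huggingface_hub is installed and that the
# machine can reach HuggingFace or a configured mirror.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="THUDM/chatglm2-6b",                  # public ChatGLM2-6B repository
    local_dir="/data/tmp/chatGLM2_6b_pretrain",   # same path used throughout this post
)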

2. Print the model structure

from transformers import AutoModel

model_name = "/data/tmp/chatGLM2_6b_pretrain"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
print(model)
ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_features=4096, out_features=65024, bias=False)
  )
)
The query_key_value matrix is not three square matrices concatenated together: it should be Wq of 4096×4096, Wk of 4096×256 and Wv of 4096×256, because the model uses grouped-query (multi-query) attention. A quick dimension check is sketched below.
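
A sanity check of that 4608 output width, assuming the values in ChatGLM2-6B's config (32 query heads, head dim 128, 2 key/value groups):

# Sanity check of the query_key_value output width under grouped-query attention.
# Assumed config values: 32 query heads, head_dim 128, 2 key/value groups.
hidden_size = 4096
num_heads = 32
head_dim = hidden_size // num_heads   # 128
kv_groups = 2

q_out = num_heads * head_dim          # 4096  -> Wq
k_out = kv_groups * head_dim          # 256   -> Wk
v_out = kv_groups * head_dim          # 256   -> Wv
print(q_out + k_out + v_out)          # 4608, matches the printed Linear layer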

3. Print the model structure after adding LoRA

from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import LoraConfig, get_peft_model, TaskType

model_name = "/data/tmp/chatGLM2_6b_pretrain"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

config = LoraConfig(
    peft_type="LORA",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    fan_in_fan_out=False,
    bias='lora_only',
    target_modules=["query_key_value"]
)

model = get_peft_model(model, config)
print(model)
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): Linear(
                  in_features=4096, out_features=4608, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                )
                (core_attention): CoreAttention(
                  (attention_dropout): Dropout(p=0.0, inplace=False)
                )
                (dense): Linear(in_features=4096, out_features=4096, bias=False)
              )
              (post_attention_layernorm): RMSNorm()
              (mlp): MLP(
                (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
                (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
              )
            )
          )
          (final_layernorm): RMSNorm()
        )
        (output_layer): Linear(in_features=4096, out_features=65024, bias=False)
      )
    )
  )
)

Notice that two extra linear layers now hang off the query_key_value module, lora_A and lora_B; these are the layers that actually get trained. A minimal sketch of what they compute is shown below.
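
The pair simply adds a scaled low-rank update to the frozen projection; an illustrative sketch (not PEFT's actual implementation) of the forward pass:

# Minimal sketch of what the injected lora_A / lora_B pair computes.
# Illustrative only; PEFT's real implementation lives in peft.tuners.lora.
import torch
import torch.nn as nn

hidden, out_dim, r, alpha = 4096, 4608, 8, 16
base = nn.Linear(hidden, out_dim, bias=True)   # frozen query_key_value projection
lora_A = nn.Linear(hidden, r, bias=False)      # trainable, 4096 -> 8
lora_B = nn.Linear(r, out_dim, bias=False)     # trainable, 8 -> 4608
dropout = nn.Dropout(p=0.1)

x = torch.randn(1, hidden)
# output = frozen projection + (alpha / r) * low-rank update
y = base(x) + (alpha / r) * lora_B(lora_A(dropout(x)))
print(y.shape)                                 # torch.Size([1, 4608])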

4. Prepare the dataset. We use the firefly dataset; you can download the jsonl version from HuggingFace yourself. The train/test split has to be prepared in advance. qa_dataset.py:

# -*- coding: utf-8 -*-
from torch.utils.data import Dataset
import torch
import json
import numpy as np


class QADataset(Dataset):
    def __init__(self, data_path, tokenizer, max_source_length, max_target_length) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.max_seq_length = self.max_source_length + self.max_target_length

        self.data = []
        with open(data_path, "r", encoding='utf-8') as f:
            for line in f:
                if not line or line == "":
                    continue
                json_line = json.loads(line)
                # {'kind': 'NLI', 'input': '自然语言推理:\n前提:家里人心甘情愿地养他,还有几家想让他做女婿的\n假设:他是被家里人收养的孤儿', 'target': '中立'}
                kind = json_line["kind"]
                input = json_line["input"]
                target = json_line["target"]
                self.data.append({
                    "question": input,
                    "answer": "--**" + kind + "**--\n" + target
                })
        print("data load , size:", len(self.data))

    def preprocess(self, question, answer):
        prompt = self.tokenizer.build_prompt(question, None)

        a_ids = self.tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                      max_length=self.max_source_length)

        b_ids = self.tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                      max_length=self.max_target_length - 1)  # leave one position for the eos_token appended below

        context_length = len(a_ids)
        input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
        labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]

        pad_len = self.max_seq_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        labels = labels + [self.tokenizer.pad_token_id] * pad_len
        labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]
        return input_ids, labels

    def __getitem__(self, index):
        item_data = self.data[index]

        input_ids, labels = self.preprocess(**item_data)

        return {
            "input_ids": torch.LongTensor(np.array(input_ids)),
            "labels": torch.LongTensor(np.array(labels))
        }

    def __len__(self):
        return len(self.data)


if __name__ == "__main__":
    # split the raw firefly file into train / test / an 80k training subset
    with open("/data/tmp/firefly_data/firefly-train-1.1M.jsonl", "r", encoding='utf-8') as f_read, \
         open("/data/tmp/firefly_data/firefly_train80000.jsonl", "w", encoding='utf-8') as f_trainx, \
         open("/data/tmp/firefly_data/firefly_train.jsonl", "w", encoding='utf-8') as f_train, \
         open("/data/tmp/firefly_data/firefly_test.jsonl", "w", encoding='utf-8') as f_test:
        lines = f_read.readlines()

        f_test.writelines(lines[:1000])
        f_train.writelines(lines[1000:])
        f_trainx.writelines(lines[1000:81000])
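
To double-check that preprocess masks what we expect (prompt and padding positions are set to -100 and therefore ignored by the loss, only the answer plus the eos token contribute), here is a small inspection sketch, assuming the model path and the test split above exist on disk:

# Quick check of the label masking produced by QADataset.
# Assumes the pretrained tokenizer path and the firefly_test.jsonl split exist.
from transformers import AutoTokenizer
from qa_dataset import QADataset

tokenizer = AutoTokenizer.from_pretrained("/data/tmp/chatGLM2_6b_pretrain", trust_remote_code=True)
ds = QADataset("/data/tmp/firefly_data/firefly_test.jsonl", tokenizer, 60, 360)

labels = ds[0]["labels"]
print((labels == -100).sum().item(), "masked prompt/padding positions")
print((labels != -100).sum().item(), "positions that contribute to the loss")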

5. Train LoRA at half precision. This eats a lot of GPU memory: batch_size can only be 1 and usage already reaches about 30 GB, and training takes a long time. To address the memory problem I later tried QLoRA.

train_lora.py

# -*- coding: utf-8 -*-
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from qa_dataset import QADataset
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
import torch
import os, time, sys
import numpy as np


def train(epoch, model, device, loader, optimizer, gradient_accumulation_steps, model_output_dir):
    model.train()
    time1 = time.time()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    for index, data in enumerate(train_bar):
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        labels = data['labels'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            labels=labels,
        )
        loss = outputs.loss
        # backward pass: compute the gradients for this batch
        loss.backward()
        losses.append(loss.item())
        # gradient accumulation
        if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(loader) - 1:
            # update the network parameters
            optimizer.step()
            # clear the accumulated gradients
            optimizer.zero_grad()

        if index % 300 == 0:
            model_save_path = os.path.join(model_output_dir, "index_{}".format(index))
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)
        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))


def validate(tokenizer, model, device, loader, max_length):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(tqdm(loader, file=sys.stdout, desc="Validation Data")):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                do_sample=False,
                temperature=0
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                     generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals


def main():
    model_name = "/data/tmp/chatGLM2_6b_pretrain"
    train_json_path = "/data/tmp/firefly_data/firefly_train20000.jsonl"
    val_json_path = "/data/tmp/firefly_data/firefly_test.jsonl"
    max_source_length = 60
    max_target_length = 360
    epochs = 1
    batch_size = 1
    lr = 1e-4
    lora_rank = 8
    lora_alpha = 32
    gradient_accumulation_steps = 16
    model_output_dir = "output"
    # device
    device = torch.device("cuda:0")

    # load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # setup peft
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)
    model.is_parallelizable = True
    model.model_parallel = True
    model.print_trainable_parameters()
    # cast to half precision
    model = model.half()
    model.float()

    print("Start Load Train Data...")
    train_params = {
        "batch_size": batch_size,
        "shuffle": True,
        "num_workers": 0,
    }
    training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
    training_loader = DataLoader(training_set, **train_params)
    print("Start Load Validation Data...")
    val_params = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": 0,
    }
    val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
    val_loader = DataLoader(val_set, **val_params)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
    model = model.to(device)
    print("Start Training...")
    for epoch in range(epochs):
        train(epoch, model, device, training_loader, optimizer, gradient_accumulation_steps, model_output_dir)
        # print("Save Model To ", model_output_dir)
        # model.save_pretrained(model_output_dir)
    # validation
    print("Start Validation...")
    with torch.no_grad():
        predictions, actuals = validate(tokenizer, model, device, val_loader, max_target_length)
        # save the validation results
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        val_data_path = os.path.join(model_output_dir, "predictions.csv")
        final_df.to_csv(val_data_path)
        print("Validation Data To ", val_data_path)


if __name__ == '__main__':
    main()

6. Many people do not have a 32 GB V100, so even batch_size=1 will not fit. That is why I also looked at QLoRA training. Compared with LoRA, QLoRA adds a quantization step: the base model is quantized to 4-bit NormalFloat (NF4), so its memory footprint becomes very small and more memory is left for the LoRA parameters.

However, 4-bit NormalFloat combined with a manual .half() mixed-precision setup makes the loss very unstable; it can turn into nan almost as soon as training starts, and even float32 combined with float16 was unstable. This is the pit I stepped into: with a manual model.half() the loss would always end up as nan. Switching to PyTorch's official automatic mixed precision fixed it, because it handles the autocasting and gradient scaling automatically, so values no longer overflow or underflow the fp16 range and turn into nan (see the short fp16 range check below).
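
A tiny illustration of the fp16 range problem that GradScaler works around:

# Why a bare fp16 setup blows up: fp16 tops out around 65504 and tiny gradients
# underflow to zero; GradScaler scales the loss up before backward and unscales
# before optimizer.step() so small gradients stay representable.
import torch

print(torch.finfo(torch.float16).max)               # 65504.0
print(torch.tensor(70000.0, dtype=torch.float16))   # inf  -> later ops become nan
print(torch.tensor(1e-8, dtype=torch.float16))      # 0.   -> gradient silently lost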

Why use fp16 mixed precision at all instead of plain float32? In practice, mixed precision sped training up by roughly 5-6x, and large-model runs already take a long time, so the time cost matters a lot.

With 20k training samples:
QLoRA at half precision finishes in about 2 hours but is unstable; at float32 it needs 11+ hours.
LoRA at half precision takes a little under 5 hours and is fairly stable.
If QLoRA cannot be trained at half precision the loss is unstable, and falling back to float32 is just trading time for memory; apart from lowering memory usage, QLoRA would lose its advantage entirely. Fortunately PyTorch's official mixed precision rescues half-precision training.

train_qlora.py

# -*- coding: utf-8 -*-
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
from qa_dataset import QADataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from tqdm import tqdm
import torch
import os, time, sys
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)
import bitsandbytes as bnb
from collections import defaultdict
import numpy as np


def verify_model_dtype(model):
    """
    Inspect how the model's parameters are distributed across dtypes.
    """
    dtype2param_num = defaultdict(int)             # parameter count per dtype
    dtype2param_name = defaultdict(list)           # parameter names per dtype
    dtype2trainable_param_num = defaultdict(int)   # trainable parameter count per dtype
    dtype2trainable_param_name = defaultdict(list) # trainable parameter names per dtype
    for name, p in model.named_parameters():
        dtype = p.dtype
        dtype2param_num[dtype] += p.numel()
        dtype2param_name[dtype].append(name)
        if p.requires_grad:
            dtype2trainable_param_num[dtype] += p.numel()
            dtype2trainable_param_name[dtype].append(name)
    # dtype distribution over all parameters
    total = 0
    print('verify all params of the model')
    for k, v in dtype2param_num.items():
        total += v
    for k, v in dtype2param_num.items():
        print(k, v, v / total)
    for k, v in dtype2trainable_param_name.items():
        print(k, v)

    print()
    # dtype distribution over the trainable parameters
    print('verify trainable params the model')
    total_trainable = 0
    for k, v in dtype2trainable_param_num.items():
        total_trainable += v
    for k, v in dtype2trainable_param_num.items():
        print(k, v, v / total_trainable)
    for k, v in dtype2trainable_param_num.items():
        print(k, v)


def find_all_linear_names(model):
    """
    Find all linear layers so an adapter can be attached to each of them.
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


def train(epoch, model, device, loader, optimizer, scaler, gradient_accumulation_steps, model_output_dir):
    model.train()
    time1 = time.time()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    for index, data in enumerate(train_bar):
        optimizer.zero_grad()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)

            outputs = model(
                input_ids=input_ids,
                labels=labels,
            )
            loss = outputs.loss
            losses.append(loss.item())

        scaler.scale(loss).backward()
        # Unscales the gradients of optimizer's assigned params in-place
        # scaler.unscale_(optimizer)

        # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        # optimizer's gradients are already unscaled, so scaler.step does not unscale them,
        # although it still skips optimizer.step() if the gradients contain infs or NaNs.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        # # plain backward pass without AMP
        # loss.backward()
        # optimizer.step()
        # gradient accumulation
        # if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(loader) - 1:
        #     # update the network parameters
        #     # optimizer.step()
        #     scaler.step(optimizer)
        #     scaler.update()
        #     # clear the accumulated gradients
        #     optimizer.zero_grad()

        if index % 300 == 0:
            model_save_path = os.path.join(model_output_dir, "index_{}".format(index))
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)
        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))
        # print the loss every 100 steps
        # if index % 100 == 0 or index == len(loader) - 1:
        #     time2 = time.time()
        #     tqdm.write(
        #         f"{index}, epoch: {epoch} -loss: {str(loss)} ; each step's time spent: {(str(float(time2 - time1) / float(index + 0.0001)))}")


def validate(tokenizer, model, device, loader, max_length):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(tqdm(loader, file=sys.stdout, desc="Validation Data")):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                do_sample=False,
                temperature=0
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                     generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals


def main():
    model_name = "/data/tmp/chatGLM2_6b_pretrain"
    train_json_path = "/data/tmp/firefly_data/firefly_train80000.jsonl"
    val_json_path = "/data/tmp/firefly_data/firefly_test.jsonl"
    max_source_length = 128
    max_target_length = 512
    epochs = 1
    batch_size = 16
    lr = 1e-4
    lora_rank = 32
    lora_alpha = 32
    gradient_accumulation_steps = 16
    model_output_dir = "output"
    # device
    device = torch.device("cuda:0")
    lora_dropout = 0.05

    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    # load the 4-bit quantized model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=0,
        load_in_4bit=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        ),
    )

    # casts all the non int8 modules to full precision (fp32) for stability
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
    print(f'memory footprint of model: {model.get_memory_footprint()/(1024*1024*1024)} GB')

    # find all linear layers that could receive an adapter
    target_modules = find_all_linear_names(model)
    print("linear layers:", target_modules)
    # initialise the LoRA config
    peft_config = LoraConfig(
        r=lora_rank,
        lora_alpha=lora_alpha,
        # target_modules=target_modules,
        target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, peft_config)

    # model.is_parallelizable = True
    # model.model_parallel = True
    model.print_trainable_parameters()
    # cast to half precision
    # model = model.half()  # You shouldn't call half() manually on the model or the data. Without torch's
    #                       # automatic mixed precision and gradient scaling, a manual half() makes the loss
    #                       # turn into nan; with the official AMP the manual half() must be removed, otherwise it errors.
    # model.float()
    model.config.torch_dtype = torch.float32
    # inspect how the model's parameters are distributed across dtypes
    verify_model_dtype(model)

    print(model)

    print("Start Load Train Data...")
    train_params = {
        "batch_size": batch_size,
        "shuffle": True,
        "num_workers": 0,
    }
    training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
    training_loader = DataLoader(training_set, **train_params)
    print("Start Load Validation Data...")
    val_params = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": 0,
    }
    val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
    val_loader = DataLoader(val_set, **val_params)

    scaler = torch.cuda.amp.GradScaler()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
    model = model.to(device)
    print("Start Training...")
    for epoch in range(epochs):
        train(epoch, model, device, training_loader, optimizer, scaler, gradient_accumulation_steps, model_output_dir)
        print("Save Model To ", model_output_dir)
        model.save_pretrained(model_output_dir)
    # validation
    print("Start Validation...")
    with torch.no_grad():
        predictions, actuals = validate(tokenizer, model, device, val_loader, max_target_length)
        # save the validation results
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        val_data_path = os.path.join(model_output_dir, "predictions.csv")
        final_df.to_csv(val_data_path)
        print("Validation Data To ", val_data_path)


if __name__ == '__main__':
    main()
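
Note that the loop above steps the optimizer on every batch and leaves gradient_accumulation_steps unused (the accumulation branch is commented out). A sketch of how the same GradScaler flow could be combined with gradient accumulation, following the pattern from the PyTorch AMP docs (not tested on this exact setup):

# Sketch: train() with gradient accumulation wired into the GradScaler flow.
# Assumes the same imports and helper objects as train_qlora.py above.
import numpy as np
import torch
from tqdm import tqdm


def train_with_accumulation(epoch, model, device, loader, optimizer, scaler,
                            gradient_accumulation_steps):
    model.train()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    optimizer.zero_grad()
    for index, data in enumerate(train_bar):
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model(
                input_ids=data["input_ids"].to(device, dtype=torch.long),
                labels=data["labels"].to(device, dtype=torch.long),
            )
            # divide so the accumulated gradient matches one large batch
            loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()
        losses.append(loss.item() * gradient_accumulation_steps)

        if (index + 1) % gradient_accumulation_steps == 0 or index == len(loader) - 1:
            scaler.step(optimizer)   # skipped automatically if grads contain inf/nan
            scaler.update()
            optimizer.zero_grad()

        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))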

PyTorch's official automatic mixed precision example:

# Creates model and optimizer in default precision
model = Net().cuda()
optimizer = optim.SGD(model.parameters(), ...)

# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            loss = loss_fn(output, target)

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

With batch=16, lora_rank=32 and target_modules=["query_key_value","dense","dense_h_to_4h","dense_4h_to_h"], i.e. LoRA on every linear layer of the model, QLoRA stays under 29 GB of GPU memory.

By contrast, LoRA with batch=1 and lora_rank=8, training only the query_key_value matrix, already uses 30 GB and is noticeably slower.

If you hit the error ValueError: Attempting to unscale FP16 gradients., the official guidance is that you should not call .half() manually on the model or on the data, so remove that line:
# model = model.half()
With the official mixed precision training, the loss does indeed decrease stably.
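
A minimal repro sketch of that error, assuming a CUDA device and a toy linear model (the point is only that fp16 parameters plus GradScaler trigger this check):

# Manually calling .half() gives the parameters fp16 gradients, which GradScaler
# refuses to unscale; keeping the parameters in fp32 and letting autocast handle
# the forward pass avoids the error.
import torch

net = torch.nn.Linear(4, 4).cuda().half()          # manual .half() -> fp16 params and grads
opt = torch.optim.SGD(net.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

with torch.autocast(device_type="cuda", dtype=torch.float16):
    loss = net(torch.randn(2, 4, device="cuda", dtype=torch.float16)).mean()

scaler.scale(loss).backward()
scaler.step(opt)   # raises ValueError: Attempting to unscale FP16 gradients.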

7. With QLoRA plus torch fp16 automatic mixed precision, training is stable. Model outputs:

Prediction results from ChatGLM2-6B after QLoRA fine-tuning:
[Round 1]
问:在上海的苹果代工厂,较低的基本工资让工人们形成了“软强制”的加班默契。加班能多拿两三千,“自愿”加班成为常态。律师提示,加班后虽能获得一时不错的报酬,但过重的工作负荷会透支身体,可能对今后劳动权利造成不利影响。
输出摘要:
答: --**Summary**--
苹果代工厂员工调查:为何争着“自愿”加班
[Round 1]
问:上联:把酒邀春,春日三人醉
下联:
答: --**Couplet**--
梳妆佩玉,玉王点一娇

Actual labels:

--**Summary**--
苹果代工厂员工调查:为何争着“自愿”加班
--**Couplet**--
梳妆佩玉,玉王点一娇

8. Load the model with QLoRA for inference (model_test.py). QLoRA inference also uses very little GPU memory.

from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
from peft import PeftConfig, PeftModel, LoraConfig, get_peft_model, TaskType
import torch
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)

device = torch.device("cuda:0")

model_name = "/data/tmp/chatGLM2_6b_pretrain"
lora_dir = "output"

# load the 4-bit quantized base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    ),
)

# peft_config = LoraConfig(
#     r=lora_rank,
#     lora_alpha=lora_alpha,
#     # target_modules=target_modules,
#     target_modules=["query_key_value", "dense_h_to_4h"],
#     lora_dropout=lora_dropout,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# model = get_peft_model(model, peft_config)

# model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# attach the trained LoRA adapter
config = PeftConfig.from_pretrained(lora_dir)
model = PeftModel.from_pretrained(model, lora_dir)

model = model.to(device)
model.eval()

while True:
    text = input("问题:")
    response, history = model.chat(tokenizer, text, history=[])
    print("回答:", response)
问题:邓紫棋在北京鸟巢开演唱会,唱了音乐《 画》 。 请找出这段话中的实体
回答: --NER--
北京鸟巢,邓紫棋