Fine-tuning is a machine learning technique that is especially common in deep learning and natural language processing (NLP). It takes a model that has been pre-trained on a large dataset and tunes it on a smaller, more specific task. Fine-tuning usually improves performance on the target task because the model already carries general knowledge learned from large amounts of data.
Fine-tuning is a very powerful tool: it lets you leverage the capabilities of a pre-trained model while customizing it for a specific task, so a high-performing model can be trained with relatively little data and compute.
import re
import json
from sklearn.model_selection import train_test_split
import os

# Path to the local Code Alpaca jsonl file (raw string so the backslashes are not treated as escapes)
data_pth = r'F:\python_code\AIGC\Finetune\code_alpaca_en\code_alpaca.jsonl'

# Open the jsonl file and read it line by line
data = []
with open(data_pth, 'r', encoding='utf-8') as file:
    for line in file:
        # Parse each line into a JSON object
        json_data = json.loads(line)
        # Inspect the record if needed, e.g. print it to the console
        # print(json_data)
        data.append(json_data)
print(data[0])
Output:
{'instruction': 'Create an array of length 5 which contains all even numbers between 1 and 10.', 'input': '', 'output': 'arr = [2, 4, 6, 8, 10]'}
# Split the dataset into train and test sets
train, test = train_test_split(data, test_size=0.1)

def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w', encoding='utf-8')
    data = ''
    for texts in data_json:
        summary_instruction = str(texts['instruction']).strip()  # the prompt
        summary_output = str(texts['output']).strip()            # the answer
        # Replace any whitespace matched by the regex with a single space
        summary = re.sub(r"\s", " ", summary_instruction + summary_output)
        data += summary + " "
    f.write(data)

build_text_files(train, 'train_dataset_V2.txt')
build_text_files(test, 'test_dataset_V2.txt')

print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))
返回:
Train dataset length: 18019
Test dataset length: 2003
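As a quick sanity check (assuming the two .txt files were written to the current working directory, as in the calls above), you can read back the first few hundred characters of the training file to confirm the instruction/output pairs were concatenated as expected:

# Sanity check: peek at the beginning of the generated training text file
with open('train_dataset_V2.txt', 'r', encoding='utf-8') as f:
    preview = f.read(300)
print(preview)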
from transformers import AutoTokenizer

# Local model directory
model_name = r'G:\hugging_fase_model2\gpt2'
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_path = 'train_dataset_V2.txt'
test_path = 'test_dataset_V2.txt'

# Load the dataset and collator helpers
from transformers import TextDataset, DataCollatorForLanguageModeling

# Tokenize the text files into datasets
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128)
    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Build the datasets
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)
print(train_dataset[0])
Output:
tensor([16594, 257, 33476, 12405, 284, 1064, 262, 1438, 286,
257,
6491, 3025, 938, 1438, 318, 705, 17919, 6, 290, 468,
262, 4511, 3884, 4179, 13, 46506, 1438, 16034, 36707, 33411,
938, 3672, 796, 705, 17919, 6, 38678, 11050, 3884, 32374,
22196, 34, 27564, 2043, 352, 26, 220, 6889, 257, 15612,
14392, 2438, 284, 1064, 262, 2160, 286, 262, 37014, 422,
352, 284, 838, 7004, 751, 62, 77, 17024, 3419, 220,
220, 220, 14048, 2160, 1081, 34142, 220, 220, 220, 2160,
796, 657, 220, 220, 220, 220, 220, 220, 220, 1114,
1312, 796, 352, 1675, 838, 220, 220, 220, 220, 220,
220, 2160, 796, 2160, 1343, 1312, 220, 220, 220, 7406,
220, 220, 220, 6997, 70, 14253, 5855, 13065, 286, 477,
3146, 422, 352, 284, 838, 318, 1058, 366])
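To confirm that this block really is the concatenated instruction/output text, you can decode the token IDs back into a string with the tokenizer loaded above (a quick verification step):

# Decode the first training block back to text to verify the tokenization
print(tokenizer.decode(train_dataset[0]))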
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="../gpt2-gerchef_v2",   # directory where the model checkpoints are saved
    overwrite_output_dir=True,
    num_train_epochs=5,                # total number of training epochs
    per_device_train_batch_size=32,    # batch size for training
    per_device_eval_batch_size=64,     # batch size for evaluation
    eval_steps=400,                    # evaluate the model every 400 steps
    save_steps=400,                    # save a checkpoint every 400 steps
    warmup_steps=400,                  # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,
    report_to="tensorboard"
    # max_steps=7000                   # maximum number of training steps; overrides num_train_epochs if set
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
import os
os.environ["WANDB_DISABLED"] = "true"
trainer.train()
Output:
Training takes roughly 20 minutes.
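After training, you can optionally run an evaluation pass on the held-out split to check the loss on the test data; a minimal sketch using the same Trainer object:

# Evaluate the fine-tuned model on test_dataset and print the metrics (e.g. eval_loss)
metrics = trainer.evaluate()
print(metrics)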
trainer.save_model()
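Note that trainer.save_model() writes the model weights and config to output_dir; if you also want the tokenizer files stored next to them so the checkpoint directory is self-contained, you can save the tokenizer explicitly (an optional extra step):

# Optionally save the tokenizer alongside the model weights
tokenizer.save_pretrained(training_args.output_dir)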
from transformers import pipeline
# Load the selected fine-tuned checkpoint into a text-generation pipeline
chef_v2 = pipeline('text-generation',model=r'F:\python_code\AIGC\gpt2-gerchef_v2\checkpoint-2000', tokenizer=r'G:\hugging_fase_model2\gpt2')
print(chef_v2('Create a dictionary where the keys are the integers up to 50, and the values are the cubes of those same integers.'))
Output:
[{'generated_text': "Create a dictionary where the keys are the integers up to 50, and the values are the cubes of those same integers.cubeDict = {'x': 50, 'y': 50} Write code to calculate the Fibonacci series up"}]
Looking at the output here, the generated text already has a hint of code writing about it.
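The pipeline's default generation settings are fairly conservative; if you want longer or more varied completions, the standard generation parameters can be passed directly to the call (the values below are only illustrative):

# Generate a longer, sampled completion from the fine-tuned model
result = chef_v2(
    'Create a dictionary where the keys are the integers up to 50, and the values are the cubes of those same integers.',
    max_length=128,     # total length (prompt + completion) in tokens
    do_sample=True,     # sample instead of greedy decoding
    top_k=50,           # restrict sampling to the 50 most likely tokens
    temperature=0.7,    # lower temperature = less random output
)
print(result[0]['generated_text'])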
from transformers import pipeline
# Load the original (not fine-tuned) GPT-2 model for comparison
model_name = r'G:\hugging_fase_model2\gpt2'
chef = pipeline('text-generation',model=model_name, tokenizer=model_name)
print(chef('Create a dictionary where the keys are the integers up to 50, and the values are the cubes of those same integers.'))
Output:
[{'generated_text': "Create a dictionary where the keys are the integers up to 50, and the values are the cubes of those same integers. Add the value you desire.\n\nFor example, let's add 1000 in the dictionary.\n\n(dictionary: 10"}]
That is all for this example; thanks for reading. The attachments have been uploaded.