How XTuner works
Create the environment
studio-conda xtuner0.1.17
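studio-conda is an InternStudio helper that clones a prepared base environment. Outside InternStudio, a plain conda environment should work as well; a minimal sketch (Python 3.10 here is an assumption, not stated by the original):
conda create -n xtuner0.1.17 python=3.10 -y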
Activate the environment
conda activate xtuner0.1.17
Create directories
- # Go to the home directory (~ means "the current user's home path")
- cd ~
- # Create a version folder and enter it, to follow along with this tutorial
- mkdir -p /root/xtuner0117 && cd /root/xtuner0117
Clone the code
git clone -b v0.1.17 https://github.com/InternLM/xtuner
Install from source
- # Enter the source directory
- cd /root/xtuner0117/xtuner
-
- # Install XTuner from source
- pip install -e '.[all]'
Dataset preparation
- # Create the ft folder and enter it
- mkdir -p /root/ft && cd /root/ft
-
- # Create a data folder inside ft to hold the dataset
- mkdir -p /root/ft/data && cd /root/ft/data
Create a generate_data.py file
touch /root/ft/data/generate_data.py
Script content:
- import json
-
- # Set the user's name
- name = 'JeffDing菜鸟'  # change this to your own name
- # Number of times the sample should be repeated
- n = 10000
-
- # Initialize the OpenAI-format data structure
- data = [
-     {
-         "messages": [
-             {
-                 "role": "user",
-                 "content": "请做一下自我介绍"
-             },
-             {
-                 "role": "assistant",
-                 "content": "我是{}的小助手,内在是上海AI实验室书生·浦语的1.8B大模型哦".format(name)
-             }
-         ]
-     }
- ]
-
- # Append the initial conversation to the data list n times
- # (each append adds a reference to the same dict, so the file ends up with n+1 identical entries)
- for i in range(n):
-     data.append(data[0])
-
- # Write the data list to a file named 'personal_assistant.json'
- with open('personal_assistant.json', 'w', encoding='utf-8') as f:
-     # json.dump writes the data in JSON format
-     # ensure_ascii=False keeps Chinese characters human-readable
-     # indent=4 pretty-prints the file
-     json.dump(data, f, ensure_ascii=False, indent=4)
Run generate_data.py
- # Make sure you are in the data folder first
- cd /root/ft/data
-
- # Run the script
- python /root/ft/data/generate_data.py
Model preparation
- # Create the target folder and make sure it exists.
- # The -p option also creates missing parent directories and does not error if the folder already exists.
- mkdir -p /root/ft/model
-
- # Create a symbolic link to the shared model
- ln -s /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b /root/ft/model
Selecting a configuration file
- # List all built-in configuration files
- # xtuner list-cfg
-
- # Find the configuration files that support the internlm2-1.8b model
- xtuner list-cfg -p internlm2_1_8b
Output:
- ==========================CONFIGS===========================
- PATTERN: internlm2_1_8b
- -------------------------------
- internlm2_1_8b_full_alpaca_e3
- internlm2_1_8b_qlora_alpaca_e3
- =============================================================
Explanation of the configuration file name
Although our dataset is not alpaca but the small assistant dataset generated by the script above, we are fine-tuning internlm2-chat-1.8b with QLoRA, so the closest configuration file is internlm2_1_8b_qlora_alpaca_e3 (the name encodes the model internlm2_1_8b, the QLoRA method, the alpaca dataset, and e3 for three epochs). We therefore copy this configuration file:
- # Create a folder to hold the config file
- mkdir -p /root/ft/config
-
- # Use XTuner's copy-cfg feature to copy the config file to the specified location
- xtuner copy-cfg internlm2_1_8b_qlora_alpaca_e3 /root/ft/config
The copied file, internlm2_1_8b_qlora_alpaca_e3_copy.py, with our modifications applied, looks like this:
- # Copyright (c) OpenMMLab. All rights reserved.
- import torch
- from datasets import load_dataset
- from mmengine.dataset import DefaultSampler
- from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
- LoggerHook, ParamSchedulerHook)
- from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
- from peft import LoraConfig
- from torch.optim import AdamW
- from transformers import (AutoModelForCausalLM, AutoTokenizer,
- BitsAndBytesConfig)
-
- from xtuner.dataset import process_hf_dataset
- from xtuner.dataset.collate_fns import default_collate_fn
- from xtuner.dataset.map_fns import openai_map_fn, template_map_fn_factory
- from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
- VarlenAttnArgsToMessageHubHook)
- from xtuner.engine.runner import TrainLoop
- from xtuner.model import SupervisedFinetune
- from xtuner.parallel.sequence import SequenceParallelSampler
- from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
-
- #######################################################################
- # PART 1 Settings #
- #######################################################################
- # Model
- pretrained_model_name_or_path = '/root/ft/model/internlm2-chat-1_8b'
- use_varlen_attn = False
-
- # Data
- alpaca_en_path = '/root/ft/data/personal_assistant.json'
- prompt_template = PROMPT_TEMPLATE.default
- max_length = 1024
- pack_to_max_length = True
-
- # parallel
- sequence_parallel_size = 1
-
- # Scheduler & Optimizer
- batch_size = 1 # per_device
- accumulative_counts = 16
- accumulative_counts *= sequence_parallel_size
- dataloader_num_workers = 0
- max_epochs = 2
- optim_type = AdamW
- lr = 2e-4
- betas = (0.9, 0.999)
- weight_decay = 0
- max_norm = 1 # grad clip
- warmup_ratio = 0.03
-
- # Save
- save_steps = 300
- save_total_limit = 3 # Maximum checkpoints to keep (-1 means unlimited)
-
- # Evaluate the generation performance during the training
- evaluation_freq = 300
- SYSTEM = ''
- evaluation_inputs = ['请你介绍一下你自己', '你是谁', '你是我的小助手吗']
-
- #######################################################################
- # PART 2 Model & Tokenizer #
- #######################################################################
- tokenizer = dict(
- type=AutoTokenizer.from_pretrained,
- pretrained_model_name_or_path=pretrained_model_name_or_path,
- trust_remote_code=True,
- padding_side='right')
-
- model = dict(
- type=SupervisedFinetune,
- use_varlen_attn=use_varlen_attn,
- llm=dict(
- type=AutoModelForCausalLM.from_pretrained,
- pretrained_model_name_or_path=pretrained_model_name_or_path,
- trust_remote_code=True,
- torch_dtype=torch.float16,
- quantization_config=dict(
- type=BitsAndBytesConfig,
- load_in_4bit=True,
- load_in_8bit=False,
- llm_int8_threshold=6.0,
- llm_int8_has_fp16_weight=False,
- bnb_4bit_compute_dtype=torch.float16,
- bnb_4bit_use_double_quant=True,
- bnb_4bit_quant_type='nf4')),
- lora=dict(
- type=LoraConfig,
- r=64,
- lora_alpha=16,
- lora_dropout=0.1,
- bias='none',
- task_type='CAUSAL_LM'))
-
- #######################################################################
- # PART 3 Dataset & Dataloader #
- #######################################################################
- alpaca_en = dict(
- type=process_hf_dataset,
- dataset=dict(type=load_dataset, path='json', data_files=dict(train=alpaca_en_path)),
- tokenizer=tokenizer,
- max_length=max_length,
- dataset_map_fn=openai_map_fn,
- template_map_fn=dict(
- type=template_map_fn_factory, template=prompt_template),
- remove_unused_columns=True,
- shuffle_before_pack=True,
- pack_to_max_length=pack_to_max_length,
- use_varlen_attn=use_varlen_attn)
-
- sampler = SequenceParallelSampler \
- if sequence_parallel_size > 1 else DefaultSampler
- train_dataloader = dict(
- batch_size=batch_size,
- num_workers=dataloader_num_workers,
- dataset=alpaca_en,
- sampler=dict(type=sampler, shuffle=True),
- collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
-
- #######################################################################
- # PART 4 Scheduler & Optimizer #
- #######################################################################
- # optimizer
- optim_wrapper = dict(
- type=AmpOptimWrapper,
- optimizer=dict(
- type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
- clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
- accumulative_counts=accumulative_counts,
- loss_scale='dynamic',
- dtype='float16')
-
- # learning policy
- # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
- param_scheduler = [
- dict(
- type=LinearLR,
- start_factor=1e-5,
- by_epoch=True,
- begin=0,
- end=warmup_ratio * max_epochs,
- convert_to_iter_based=True),
- dict(
- type=CosineAnnealingLR,
- eta_min=0.0,
- by_epoch=True,
- begin=warmup_ratio * max_epochs,
- end=max_epochs,
- convert_to_iter_based=True)
- ]
-
- # train, val, test setting
- train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
-
- #######################################################################
- # PART 5 Runtime #
- #######################################################################
- # Log the dialogue periodically during the training process, optional
- custom_hooks = [
- dict(type=DatasetInfoHook, tokenizer=tokenizer),
- dict(
- type=EvaluateChatHook,
- tokenizer=tokenizer,
- every_n_iters=evaluation_freq,
- evaluation_inputs=evaluation_inputs,
- system=SYSTEM,
- prompt_template=prompt_template)
- ]
-
- if use_varlen_attn:
- custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
-
- # configure default hooks
- default_hooks = dict(
- # record the time of every iteration.
- timer=dict(type=IterTimerHook),
- # print log every 10 iterations.
- logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
- # enable the parameter scheduler.
- param_scheduler=dict(type=ParamSchedulerHook),
- # save checkpoint per `save_steps`.
- checkpoint=dict(
- type=CheckpointHook,
- by_epoch=False,
- interval=save_steps,
- max_keep_ckpts=save_total_limit),
- # set sampler seed in distributed evrionment.
- sampler_seed=dict(type=DistSamplerSeedHook),
- )
-
- # configure environment
- env_cfg = dict(
- # whether to enable cudnn benchmark
- cudnn_benchmark=False,
- # set multi process parameters
- mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
- # set distributed parameters
- dist_cfg=dict(backend='nccl'),
- )
-
- # set visualizer
- visualizer = None
-
- # set log level
- log_level = 'INFO'
-
- # load from which checkpoint
- load_from = None
-
- # whether to resume training from the loaded checkpoint
- resume = False
-
- # Defaults to use random seed and disable `deterministic`
- randomness = dict(seed=None, deterministic=False)
-
- # set log processor
- log_processor = dict(by_epoch=False)
Main modifications
- from xtuner.dataset.map_fns import openai_map_fn, template_map_fn_factory (imports openai_map_fn)
- alpaca_en_path: path to the dataset
- dataset_map_fn: dataset map function (set to openai_map_fn)
- pretrained_model_name_or_path: path to the model
A quick way to sanity-check these edits is shown below.
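To verify the edits before launching training, you can load the copied config with mmengine (installed as an XTuner dependency) and print the key fields. This is an optional check, not part of the original workflow:
- from mmengine.config import Config
-
- # Load the modified config and confirm the paths point where we expect
- cfg = Config.fromfile('/root/ft/config/internlm2_1_8b_qlora_alpaca_e3_copy.py')
- print(cfg.pretrained_model_name_or_path)  # /root/ft/model/internlm2-chat-1_8b
- print(cfg.alpaca_en_path)                 # /root/ft/data/personal_assistant.json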
Standard training
xtuner train /root/ft/config/internlm2_1_8b_qlora_alpaca_e3_copy.py --work-dir /root/ft/train
Accelerated training with DeepSpeed
- # Use deepspeed to accelerate training
- xtuner train /root/ft/config/internlm2_1_8b_qlora_alpaca_e3_copy.py --work-dir /root/ft/train_deepspeed --deepspeed deepspeed_zero2
Model conversion
- # Create a folder for the converted HuggingFace-format model
- mkdir -p /root/ft/huggingface
-
- # Model conversion
- # xtuner convert pth_to_hf ${CONFIG_FILE_PATH} ${PTH_CHECKPOINT_PATH} ${SAVE_PATH}
- xtuner convert pth_to_hf /root/ft/config/internlm2_1_8b_qlora_alpaca_e3_copy.py /root/ft/train_deepspeed/iter_768.pth /root/ft/huggingface
xtuner convert arguments:
Argument | Description |
---|---|
--fp32 | Convert and save the weights in fp32 precision; defaults to fp16 if not specified |
--max-shard-size {GB} | Maximum size of each weight shard file (default 2GB) |
If needed, these can be appended to the conversion command above. Since the model files in this test are small and the model has already been shown to overfit, they were not added; with them, the command would look like this:
xtuner convert pth_to_hf /root/ft/train/internlm2_1_8b_qlora_alpaca_e3_copy.py /root/ft/train/iter_768.pth /root/ft/huggingface --fp32 --max-shard-size 2GB
Model merging
- # Create a folder named final_model to store the merged model files
- mkdir -p /root/ft/final_model
-
- # Work around an MKL threading conflict
- export MKL_SERVICE_FORCE_INTEL=1
-
- # Merge the adapter into the base model
- # xtuner convert merge ${NAME_OR_PATH_TO_LLM} ${NAME_OR_PATH_TO_ADAPTER} ${SAVE_PATH}
- xtuner convert merge /root/ft/model /root/ft/huggingface /root/ft/final_model
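As an optional check that the merge produced a self-contained model, you can load /root/ft/final_model directly with transformers (a minimal sketch; the chat() helper comes from InternLM2's trust_remote_code modeling file and requires a GPU):
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- model_dir = '/root/ft/final_model'
- tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(
-     model_dir, trust_remote_code=True, torch_dtype=torch.float16).cuda().eval()
-
- # history is a list of (query, response) pairs; empty for a fresh conversation
- response, history = model.chat(tokenizer, '请你介绍一下你自己', history=[])
- print(response)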
Chat test
- # Chat with the merged model
- xtuner chat /root/ft/final_model --prompt-template internlm2_chat
xtuner chat argument list:
Argument | Description |
---|---|
--system | Specify the SYSTEM text, used to insert specific system-level information into the conversation |
--system-template | Specify the SYSTEM template, used to customize the system prompt |
--bits | Number of bits the LLM runs with, determining the computation precision |
--bot-name | Name of the bot, used to identify it in the conversation or other interactions |
--with-plugins | List of plugins to use at runtime, for extending or enhancing functionality |
--no-streamer | Disable streaming mode, for scenarios where the whole output should be produced at once |
--lagent | Enable lagent, for specific runtime environments or optimizations |
--command-stop-word | Stop words for commands; command parsing stops when one of them is encountered |
--answer-stop-word | Stop words for answers; generation stops when one of them is produced |
--offload-folder | Folder used for offloading model weights when loading or unloading them |
--max-new-tokens | Maximum number of new tokens to generate, controlling output length |
--temperature | Sampling temperature; higher values make the output more diverse, lower values make it more deterministic |
--top-k | Number of highest-probability tokens kept for top-k filtering, affecting output diversity |
--top-p | Cumulative probability threshold; only the smallest token set whose cumulative probability exceeds top-p is kept, affecting coherence |
--seed | Random seed, for reproducible generation |
Besides these, another very important argument is --adapter, which lets you chat with the converted adapter layered on top of the original model before the two are merged. A model loaded this way behaves almost identically to the merged model, so you can test the adapters converted from different checkpoint files to find the best one before doing the final merge (see the example below).
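For example, to chat with the base model plus the HuggingFace-format adapter produced above (a sketch using the paths from this walkthrough):
xtuner chat /root/ft/model --adapter /root/ft/huggingface --prompt-template internlm2_chat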
Web demo deployment
Install dependencies
pip install streamlit==1.24.0
Download the project code
- # Create a folder for the InternLM code
- mkdir -p /root/ft/web_demo && cd /root/ft/web_demo
-
- # Clone the InternLM source
- git clone https://github.com/InternLM/InternLM.git
-
- # Enter the repository
- cd /root/ft/web_demo/InternLM
Replace the contents of /root/ft/web_demo/InternLM/chat/web_demo.py with the following code:
- """This script refers to the dialogue example of streamlit, the interactive
- generation code of chatglm2 and transformers.
- We mainly modified part of the code logic to adapt to the
- generation of our model.
- Please refer to these links below for more information:
- 1. streamlit chat example:
- https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
- 2. chatglm2:
- https://github.com/THUDM/ChatGLM2-6B
- 3. transformers:
- https://github.com/huggingface/transformers
- Please run with the command `streamlit run path/to/web_demo.py
- --server.address=0.0.0.0 --server.port 7860`.
- Using `python path/to/web_demo.py` may cause unknown problems.
- """
- # isort: skip_file
- import copy
- import warnings
- from dataclasses import asdict, dataclass
- from typing import Callable, List, Optional
-
- import streamlit as st
- import torch
- from torch import nn
- from transformers.generation.utils import (LogitsProcessorList,
- StoppingCriteriaList)
- from transformers.utils import logging
-
- from transformers import AutoTokenizer, AutoModelForCausalLM # isort: skip
-
- logger = logging.get_logger(__name__)
-
-
- @dataclass
- class GenerationConfig:
- # this config is used for chat to provide more diversity
- max_length: int = 32768
- top_p: float = 0.8
- temperature: float = 0.8
- do_sample: bool = True
- repetition_penalty: float = 1.005
-
-
- @torch.inference_mode()
- def generate_interactive(
- model,
- tokenizer,
- prompt,
- generation_config: Optional[GenerationConfig] = None,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor],
- List[int]]] = None,
- additional_eos_token_id: Optional[int] = None,
- **kwargs,
- ):
- inputs = tokenizer([prompt], padding=True, return_tensors='pt')
- input_length = len(inputs['input_ids'][0])
- for k, v in inputs.items():
- inputs[k] = v.cuda()
- input_ids = inputs['input_ids']
- _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
- if generation_config is None:
- generation_config = model.generation_config
- generation_config = copy.deepcopy(generation_config)
- model_kwargs = generation_config.update(**kwargs)
- bos_token_id, eos_token_id = ( # noqa: F841 # pylint: disable=W0612
- generation_config.bos_token_id,
- generation_config.eos_token_id,
- )
- if isinstance(eos_token_id, int):
- eos_token_id = [eos_token_id]
- if additional_eos_token_id is not None:
- eos_token_id.append(additional_eos_token_id)
- has_default_max_length = kwargs.get(
- 'max_length') is None and generation_config.max_length is not None
- if has_default_max_length and generation_config.max_new_tokens is None:
- warnings.warn(
- f"Using 'max_length''s default ({repr(generation_config.max_length)}) \
- to control the generation length. "
- 'This behaviour is deprecated and will be removed from the \
- config in v5 of Transformers -- we'
- ' recommend using `max_new_tokens` to control the maximum \
- length of the generation.',
- UserWarning,
- )
- elif generation_config.max_new_tokens is not None:
- generation_config.max_length = generation_config.max_new_tokens + \
- input_ids_seq_length
- if not has_default_max_length:
- logger.warn( # pylint: disable=W4902
- f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) "
- f"and 'max_length'(={generation_config.max_length}) seem to "
- "have been set. 'max_new_tokens' will take precedence. "
- 'Please refer to the documentation for more information. '
- '(https://huggingface.co/docs/transformers/main/'
- 'en/main_classes/text_generation)',
- UserWarning,
- )
-
- if input_ids_seq_length >= generation_config.max_length:
- input_ids_string = 'input_ids'
- logger.warning(
- f"Input length of {input_ids_string} is {input_ids_seq_length}, "
- f"but 'max_length' is set to {generation_config.max_length}. "
- 'This can lead to unexpected behavior. You should consider'
- " increasing 'max_new_tokens'.")
-
- # 2. Set generation parameters if not already defined
- logits_processor = logits_processor if logits_processor is not None \
- else LogitsProcessorList()
- stopping_criteria = stopping_criteria if stopping_criteria is not None \
- else StoppingCriteriaList()
-
- logits_processor = model._get_logits_processor(
- generation_config=generation_config,
- input_ids_seq_length=input_ids_seq_length,
- encoder_input_ids=input_ids,
- prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
- logits_processor=logits_processor,
- )
-
- stopping_criteria = model._get_stopping_criteria(
- generation_config=generation_config,
- stopping_criteria=stopping_criteria)
- logits_warper = model._get_logits_warper(generation_config)
-
- unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
- scores = None
- while True:
- model_inputs = model.prepare_inputs_for_generation(
- input_ids, **model_kwargs)
- # forward pass to get next token
- outputs = model(
- **model_inputs,
- return_dict=True,
- output_attentions=False,
- output_hidden_states=False,
- )
-
- next_token_logits = outputs.logits[:, -1, :]
-
- # pre-process distribution
- next_token_scores = logits_processor(input_ids, next_token_logits)
- next_token_scores = logits_warper(input_ids, next_token_scores)
-
- # sample
- probs = nn.functional.softmax(next_token_scores, dim=-1)
- if generation_config.do_sample:
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
- else:
- next_tokens = torch.argmax(probs, dim=-1)
-
- # update generated ids, model inputs, and length for next step
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
- model_kwargs = model._update_model_kwargs_for_generation(
- outputs, model_kwargs, is_encoder_decoder=False)
- unfinished_sequences = unfinished_sequences.mul(
- (min(next_tokens != i for i in eos_token_id)).long())
-
- output_token_ids = input_ids[0].cpu().tolist()
- output_token_ids = output_token_ids[input_length:]
- for each_eos_token_id in eos_token_id:
- if output_token_ids[-1] == each_eos_token_id:
- output_token_ids = output_token_ids[:-1]
- response = tokenizer.decode(output_token_ids)
-
- yield response
- # stop when each sentence is finished
- # or if we exceed the maximum length
- if unfinished_sequences.max() == 0 or stopping_criteria(
- input_ids, scores):
- break
-
-
- def on_btn_click():
- del st.session_state.messages
-
-
- @st.cache_resource
- def load_model():
- model = (AutoModelForCausalLM.from_pretrained('/root/ft/final_model',
- trust_remote_code=True).to(
- torch.bfloat16).cuda())
- tokenizer = AutoTokenizer.from_pretrained('/root/ft/final_model',
- trust_remote_code=True)
- return model, tokenizer
-
-
- def prepare_generation_config():
- with st.sidebar:
- max_length = st.slider('Max Length',
- min_value=8,
- max_value=32768,
- value=2048)
- top_p = st.slider('Top P', 0.0, 1.0, 0.75, step=0.01)
- temperature = st.slider('Temperature', 0.0, 1.0, 0.1, step=0.01)
- st.button('Clear Chat History', on_click=on_btn_click)
-
- generation_config = GenerationConfig(max_length=max_length,
- top_p=top_p,
- temperature=temperature)
-
- return generation_config
-
-
- user_prompt = '<|im_start|>user\n{user}<|im_end|>\n'
- robot_prompt = '<|im_start|>assistant\n{robot}<|im_end|>\n'
- cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
- <|im_start|>assistant\n'
-
-
- def combine_history(prompt):
- messages = st.session_state.messages
- meta_instruction = ('')
- total_prompt = f"<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"
- for message in messages:
- cur_content = message['content']
- if message['role'] == 'user':
- cur_prompt = user_prompt.format(user=cur_content)
- elif message['role'] == 'robot':
- cur_prompt = robot_prompt.format(robot=cur_content)
- else:
- raise RuntimeError
- total_prompt += cur_prompt
- total_prompt = total_prompt + cur_query_prompt.format(user=prompt)
- return total_prompt
-
-
- def main():
- # torch.cuda.empty_cache()
- print('load model begin.')
- model, tokenizer = load_model()
- print('load model end.')
-
-
- st.title('InternLM2-Chat-1.8B')
-
- generation_config = prepare_generation_config()
-
- # Initialize chat history
- if 'messages' not in st.session_state:
- st.session_state.messages = []
-
- # Display chat messages from history on app rerun
- for message in st.session_state.messages:
- with st.chat_message(message['role'], avatar=message.get('avatar')):
- st.markdown(message['content'])
-
- # Accept user input
- if prompt := st.chat_input('What is up?'):
- # Display user message in chat message container
- with st.chat_message('user'):
- st.markdown(prompt)
- real_prompt = combine_history(prompt)
- # Add user message to chat history
- st.session_state.messages.append({
- 'role': 'user',
- 'content': prompt,
- })
-
- with st.chat_message('robot'):
- message_placeholder = st.empty()
- for cur_response in generate_interactive(
- model=model,
- tokenizer=tokenizer,
- prompt=real_prompt,
- additional_eos_token_id=92542,
- **asdict(generation_config),
- ):
- # Display robot response in chat message container
- message_placeholder.markdown(cur_response + '▌')
- message_placeholder.markdown(cur_response)
- # Add robot response to chat history
- st.session_state.messages.append({
- 'role': 'robot',
- 'content': cur_response, # pylint: disable=undefined-loop-variable
- })
- torch.cuda.empty_cache()
-
-
- if __name__ == '__main__':
- main()
Run
streamlit run /root/ft/web_demo/InternLM/chat/web_demo.py --server.address 127.0.0.1 --server.port 6006
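The demo binds to 127.0.0.1:6006 on the development machine, so to open it in a local browser you typically need an SSH tunnel first. A sketch for an InternStudio-style setup (the host name and port placeholder below are assumptions; substitute your own machine's SSH details):
ssh -CNg -L 6006:127.0.0.1:6006 root@ssh.intern-ai.org.cn -p <your dev machine SSH port>
Then open http://127.0.0.1:6006 locally.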
Model (OpenXLab deployment)
Install dependencies
- apt install git-lfs
- git lfs install
Clone the model repository
git clone https://code.openxlab.org.cn/JeffDing/xtuner_demo_1_8b.git
Copy the model into the repository
cp -r /root/ft/final_model/* /root/openxlab/xtuner_demo_1_8b/
Upload the model
- cd xtuner_demo_1_8b
- git add .
- git commit -m "init"
- git push
For the detailed steps, refer to the OpenXLab documentation: 上传模型文件 | OpenXLab浦源 - 文档中心
Application (OpenXLab deployment)
Create a Git repository, then upload a slightly modified version of the web_demo.py code above.
The full code is as follows:
- # isort: skip_file
- import copy
- import warnings
- import os
- from dataclasses import asdict, dataclass
- from typing import Callable, List, Optional
-
- import streamlit as st
- import torch
- from torch import nn
- from transformers.generation.utils import (LogitsProcessorList,
- StoppingCriteriaList)
- from transformers.utils import logging
-
- from transformers import AutoTokenizer, AutoModelForCausalLM # isort: skip
-
- logger = logging.get_logger(__name__)
-
-
- @dataclass
- class GenerationConfig:
- # this config is used for chat to provide more diversity
- max_length: int = 32768
- top_p: float = 0.8
- temperature: float = 0.8
- do_sample: bool = True
- repetition_penalty: float = 1.005
-
-
- @torch.inference_mode()
- def generate_interactive(
- model,
- tokenizer,
- prompt,
- generation_config: Optional[GenerationConfig] = None,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor],
- List[int]]] = None,
- additional_eos_token_id: Optional[int] = None,
- **kwargs,
- ):
- inputs = tokenizer([prompt], padding=True, return_tensors='pt')
- input_length = len(inputs['input_ids'][0])
- for k, v in inputs.items():
- inputs[k] = v.cuda()
- input_ids = inputs['input_ids']
- _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
- if generation_config is None:
- generation_config = model.generation_config
- generation_config = copy.deepcopy(generation_config)
- model_kwargs = generation_config.update(**kwargs)
- bos_token_id, eos_token_id = ( # noqa: F841 # pylint: disable=W0612
- generation_config.bos_token_id,
- generation_config.eos_token_id,
- )
- if isinstance(eos_token_id, int):
- eos_token_id = [eos_token_id]
- if additional_eos_token_id is not None:
- eos_token_id.append(additional_eos_token_id)
- has_default_max_length = kwargs.get(
- 'max_length') is None and generation_config.max_length is not None
- if has_default_max_length and generation_config.max_new_tokens is None:
- warnings.warn(
- f"Using 'max_length''s default ({repr(generation_config.max_length)}) \
- to control the generation length. "
- 'This behaviour is deprecated and will be removed from the \
- config in v5 of Transformers -- we'
- ' recommend using `max_new_tokens` to control the maximum \
- length of the generation.',
- UserWarning,
- )
- elif generation_config.max_new_tokens is not None:
- generation_config.max_length = generation_config.max_new_tokens + \
- input_ids_seq_length
- if not has_default_max_length:
- logger.warn( # pylint: disable=W4902
- f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) "
- f"and 'max_length'(={generation_config.max_length}) seem to "
- "have been set. 'max_new_tokens' will take precedence. "
- 'Please refer to the documentation for more information. '
- '(https://huggingface.co/docs/transformers/main/'
- 'en/main_classes/text_generation)',
- UserWarning,
- )
-
- if input_ids_seq_length >= generation_config.max_length:
- input_ids_string = 'input_ids'
- logger.warning(
- f"Input length of {input_ids_string} is {input_ids_seq_length}, "
- f"but 'max_length' is set to {generation_config.max_length}. "
- 'This can lead to unexpected behavior. You should consider'
- " increasing 'max_new_tokens'.")
-
- # 2. Set generation parameters if not already defined
- logits_processor = logits_processor if logits_processor is not None \
- else LogitsProcessorList()
- stopping_criteria = stopping_criteria if stopping_criteria is not None \
- else StoppingCriteriaList()
-
- logits_processor = model._get_logits_processor(
- generation_config=generation_config,
- input_ids_seq_length=input_ids_seq_length,
- encoder_input_ids=input_ids,
- prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
- logits_processor=logits_processor,
- )
-
- stopping_criteria = model._get_stopping_criteria(
- generation_config=generation_config,
- stopping_criteria=stopping_criteria)
- logits_warper = model._get_logits_warper(generation_config)
-
- unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
- scores = None
- while True:
- model_inputs = model.prepare_inputs_for_generation(
- input_ids, **model_kwargs)
- # forward pass to get next token
- outputs = model(
- **model_inputs,
- return_dict=True,
- output_attentions=False,
- output_hidden_states=False,
- )
-
- next_token_logits = outputs.logits[:, -1, :]
-
- # pre-process distribution
- next_token_scores = logits_processor(input_ids, next_token_logits)
- next_token_scores = logits_warper(input_ids, next_token_scores)
-
- # sample
- probs = nn.functional.softmax(next_token_scores, dim=-1)
- if generation_config.do_sample:
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
- else:
- next_tokens = torch.argmax(probs, dim=-1)
-
- # update generated ids, model inputs, and length for next step
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
- model_kwargs = model._update_model_kwargs_for_generation(
- outputs, model_kwargs, is_encoder_decoder=False)
- unfinished_sequences = unfinished_sequences.mul(
- (min(next_tokens != i for i in eos_token_id)).long())
-
- output_token_ids = input_ids[0].cpu().tolist()
- output_token_ids = output_token_ids[input_length:]
- for each_eos_token_id in eos_token_id:
- if output_token_ids[-1] == each_eos_token_id:
- output_token_ids = output_token_ids[:-1]
- response = tokenizer.decode(output_token_ids)
-
- yield response
- # stop when each sentence is finished
- # or if we exceed the maximum length
- if unfinished_sequences.max() == 0 or stopping_criteria(
- input_ids, scores):
- break
-
-
- def on_btn_click():
- del st.session_state.messages
-
-
- @st.cache_resource
- def load_model():
- base_path = './xtuner_demo_1_8b'
- # download repo to the base_path directory using git
- os.system('apt install git')
- os.system('apt install git-lfs')
- os.system(f'git clone https://code.openxlab.org.cn/JeffDing/xtuner_demo_1_8b.git {base_path}')
- os.system(f'cd {base_path} && git lfs pull')
-
- model = (AutoModelForCausalLM.from_pretrained(base_path,
- trust_remote_code=True).to(
- torch.bfloat16).cuda())
- tokenizer = AutoTokenizer.from_pretrained(base_path,
- trust_remote_code=True)
- return model, tokenizer
-
-
- def prepare_generation_config():
- with st.sidebar:
- max_length = st.slider('Max Length',
- min_value=8,
- max_value=32768,
- value=2048)
- top_p = st.slider('Top P', 0.0, 1.0, 0.75, step=0.01)
- temperature = st.slider('Temperature', 0.0, 1.0, 0.1, step=0.01)
- st.button('Clear Chat History', on_click=on_btn_click)
-
- generation_config = GenerationConfig(max_length=max_length,
- top_p=top_p,
- temperature=temperature)
-
- return generation_config
-
-
- user_prompt = '<|im_start|>user\n{user}<|im_end|>\n'
- robot_prompt = '<|im_start|>assistant\n{robot}<|im_end|>\n'
- cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
- <|im_start|>assistant\n'
-
-
- def combine_history(prompt):
- messages = st.session_state.messages
- meta_instruction = ('')
- total_prompt = f"<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"
- for message in messages:
- cur_content = message['content']
- if message['role'] == 'user':
- cur_prompt = user_prompt.format(user=cur_content)
- elif message['role'] == 'robot':
- cur_prompt = robot_prompt.format(robot=cur_content)
- else:
- raise RuntimeError
- total_prompt += cur_prompt
- total_prompt = total_prompt + cur_query_prompt.format(user=prompt)
- return total_prompt
-
-
- def main():
- # torch.cuda.empty_cache()
- print('load model begin.')
- model, tokenizer = load_model()
- print('load model end.')
-
-
- st.title('InternLM2-Chat-1.8B')
-
- generation_config = prepare_generation_config()
-
- # Initialize chat history
- if 'messages' not in st.session_state:
- st.session_state.messages = []
-
- # Display chat messages from history on app rerun
- for message in st.session_state.messages:
- with st.chat_message(message['role'], avatar=message.get('avatar')):
- st.markdown(message['content'])
-
- # Accept user input
- if prompt := st.chat_input('What is up?'):
- # Display user message in chat message container
- with st.chat_message('user'):
- st.markdown(prompt)
- real_prompt = combine_history(prompt)
- # Add user message to chat history
- st.session_state.messages.append({
- 'role': 'user',
- 'content': prompt,
- })
-
- with st.chat_message('robot'):
- message_placeholder = st.empty()
- for cur_response in generate_interactive(
- model=model,
- tokenizer=tokenizer,
- prompt=real_prompt,
- additional_eos_token_id=92542,
- **asdict(generation_config),
- ):
- # Display robot response in chat message container
- message_placeholder.markdown(cur_response + '▌')
- message_placeholder.markdown(cur_response)
- # Add robot response to chat history
- st.session_state.messages.append({
- 'role': 'robot',
- 'content': cur_response, # pylint: disable=undefined-loop-variable
- })
- torch.cuda.empty_cache()
-
-
- if __name__ == '__main__':
- main()
Then create the application on OpenXLab.
Multimodal fine-tuning with LLaVA
Haotian Liu et al. used GPT-4V to generate descriptions for images, building a large number of <question text><image> -- <answer text> pairs. Using these pairs together with a text-only LLM, they trained an Image Projector. The text-only LLM and the trained Image Projector are collectively referred to as the LLaVA model.
Pretrain stage
In the Pretrain stage, a large number of image + simple text (caption, i.e. image title) pairs are used so that the LLM learns the general features of images; in effect, the model skims a huge number of pictures.
After the Pretrain stage the model already has visual capability. However, because the training data consists only of images and their captions, no matter what the user asks, the model will only answer with a caption for the input image; at this point it can only "write titles" for pictures.
Finetune stage
In the Finetune stage, image + complex text pairs are used to further train the Image Projector obtained from the Pretrain stage (iter_2181.pth).
Training data format
- [
-     {
-         "id": "any string",
-         "image": "Relative path of the image file. Relative to what? To the path given by the image_folder parameter in the config file below.",
-         "conversation": [
-             {
-                 "from": "human",
-                 "value": "<image>\nQuestion 1."
-             },
-             {
-                 "from": "gpt",
-                 "value": "Answer 1"
-             },
-             {
-                 "from": "human",
-                 "value": "Question 2."
-             },
-             {
-                 "from": "gpt",
-                 "value": "Answer 2"
-             },
-             # ......
-             {
-                 "from": "human",
-                 "value": "Question n."
-             },
-             {
-                 "from": "gpt",
-                 "value": "Answer n"
-             },
-         ]
-     },
-
-     # The second training sample starts here.
-
-     {
-         "id": "any string",
-         "image": "Relative path of the image file. Relative to what? To the path given by the image_folder parameter in the config file below.",
-         "conversation": [
-             {
-                 "from": "human",
-                 "value": "<image>\nQuestion 1."
-             },
-             # ......
-             {
-                 "from": "gpt",
-                 "value": "Answer n"
-             }
-         ]
-     }
- ]
The Q&A data for the example image is generated with the script below, repeating each pair 200 times to produce repeated_data.json:
- cd ~ && git clone https://github.com/InternLM/tutorial -b camp2 && conda activate xtuner0.1.17 && cd tutorial
-
- python /root/tutorial/xtuner/llava/llava_data/repeat.py \
-   -i /root/tutorial/xtuner/llava/llava_data/unique_data.json \
-   -o /root/tutorial/xtuner/llava/llava_data/repeated_data.json \
-   -n 200
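repeat.py ships with the tutorial repo; conceptually it just duplicates every entry of unique_data.json n times so that the tiny dataset is large enough to overfit. A minimal sketch of that logic (the actual script may differ in details):
- import argparse
- import json
-
- parser = argparse.ArgumentParser()
- parser.add_argument('-i', '--input', required=True)
- parser.add_argument('-o', '--output', required=True)
- parser.add_argument('-n', '--repeat', type=int, default=200)
- args = parser.parse_args()
-
- # Load the unique Q&A samples
- with open(args.input, encoding='utf-8') as f:
-     data = json.load(f)
-
- # Repeat every sample n times
- repeated = data * args.repeat
-
- with open(args.output, 'w', encoding='utf-8') as f:
-     json.dump(repeated, f, ensure_ascii=False, indent=4)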
Prepare the configuration file (copy the ready-made config provided by the tutorial repo)
cp /root/tutorial/xtuner/llava/llava_data/internlm2_chat_1_8b_llava_tutorial_fool_config.py /root/tutorial/xtuner/llava/llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_copy.py
Create the configuration file (alternatively, copy the built-in config and modify it yourself)
- # List XTuner's built-in configs matching the pattern
- xtuner list-cfg -p llava_internlm2_chat_1_8b
-
- # Copy the config file to the target directory
- xtuner copy-cfg \
-   llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune \
-   /root/tutorial/xtuner/llava
Modify the configuration file
Edit llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_copy.py so that it points at the local model, the pretrained Image Projector and the repeated dataset, as sketched below.
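The values to point at local files are roughly the following (paths taken from the commands used elsewhere in this walkthrough; the exact variable names may differ slightly between XTuner versions, so match them against the copied config):
- # PART 1 Settings of the finetune_copy config (excerpt)
- llm_name_or_path = '/root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b'
- visual_encoder_name_or_path = '/root/share/new_models/openai/clip-vit-large-patch14-336'
- pretrained_pth = '/root/share/new_models/xtuner/iter_2181.pth'  # Image Projector from the Pretrain stage
- data_path = '/root/tutorial/xtuner/llava/llava_data/repeated_data.json'
- image_folder = '/root/tutorial/xtuner/llava/llava_data'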
Start the Finetune
- cd /root/tutorial/xtuner/llava/
- xtuner train /root/tutorial/xtuner/llava/llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_copy.py --deepspeed deepspeed_zero2
Comparing performance before and after the Finetune
Test image: /root/tutorial/xtuner/llava/llava_data/test_img/oph.jpg
Before the Finetune
- # Work around an MKL threading conflict
- export MKL_SERVICE_FORCE_INTEL=1
- export MKL_THREADING_LAYER=GNU
-
- # Convert the .pth checkpoint to HuggingFace format
- xtuner convert pth_to_hf \
-   llava_internlm2_chat_1_8b_clip_vit_large_p14_336_e1_gpu8_pretrain \
-   /root/share/new_models/xtuner/iter_2181.pth \
-   /root/tutorial/xtuner/llava/llava_data/iter_2181_hf
-
- # Launch the chat
- xtuner chat /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b \
-   --visual-encoder /root/share/new_models/openai/clip-vit-large-patch14-336 \
-   --llava /root/tutorial/xtuner/llava/llava_data/iter_2181_hf \
-   --prompt-template internlm2_chat \
-   --image /root/tutorial/xtuner/llava/llava_data/test_img/oph.jpg
Result:
After the Finetune
- # Work around an MKL threading conflict
- export MKL_SERVICE_FORCE_INTEL=1
- export MKL_THREADING_LAYER=GNU
-
- # Convert the .pth checkpoint to HuggingFace format
- xtuner convert pth_to_hf \
-   /root/tutorial/xtuner/llava/llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_copy.py \
-   /root/tutorial/xtuner/llava/work_dirs/llava_internlm2_chat_1_8b_qlora_clip_vit_large_p14_336_lora_e1_gpu8_finetune_copy/iter_1200.pth \
-   /root/tutorial/xtuner/llava/llava_data/iter_1200_hf
-
- # Launch the chat
- xtuner chat /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b \
-   --visual-encoder /root/share/new_models/openai/clip-vit-large-patch14-336 \
-   --llava /root/tutorial/xtuner/llava/llava_data/iter_1200_hf \
-   --prompt-template internlm2_chat \
-   --image /root/tutorial/xtuner/llava/llava_data/test_img/oph.jpg
Result: