This article demonstrates how to run inference on DeepSeek-V2-Chat with `accelerate` (a pruned checkpoint is used; the point is only to show how to split the weights across multiple GPUs). The script below builds a device map with `infer_auto_device_map`, places the modules with `dispatch_model`, and additionally installs a `TorchDispatchMode` hook that moves each operator's tensor arguments onto the GPU with the highest device index among them, so inputs that ended up on different cards do not cause device-mismatch errors.
```python
import sys
from dataclasses import dataclass
from typing import Any

import torch
from torch.utils._python_dispatch import TorchDispatchMode
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import dispatch_model, infer_auto_device_map


@dataclass
class _ProfilerState:
    cls: Any
    object: Any = None


class TorchDumpDispatchMode(TorchDispatchMode):
    """Intercepts every ATen op and moves all of its tensor arguments onto the
    GPU with the highest device index among them, so that ops whose inputs were
    placed on different cards by the device map can still run."""

    def __init__(self, parent):
        super().__init__()
        self.parent = parent
        self.op_index = 0
        self.cvt_count = 0

    def get_max_gpu_id(self, tensors):
        # Find the argument on the highest-indexed GPU and record the
        # positions of all tensor arguments.
        max_gpu_id = -1
        max_index = -1
        tensor_index = []
        for i, tensor in enumerate(tensors):
            if not isinstance(tensor, torch.Tensor):
                continue
            tensor_index.append(i)
            if tensor.is_cuda:
                gpu_id = tensor.get_device()
                if gpu_id > max_gpu_id:
                    max_gpu_id = gpu_id
                    max_index = i
        if max_gpu_id == -1:
            return None, None, tensor_index
        return max_index, max_gpu_id, tensor_index

    def convert(self, op_type, tensor_list):
        # Move every other tensor argument onto that GPU, in place.
        index, gpu_id, tensor_index = self.get_max_gpu_id(tensor_list)
        if index is None:
            return
        keep_index = set(tensor_index) - {index}
        device = torch.device(f"cuda:{gpu_id}")
        for i in keep_index:
            if tensor_list[i].device != device:
                # print(f"{op_type} {i} {tensor_list[i].device} -> {device}")
                tensor_list[i].data = tensor_list[i].data.to(device, non_blocking=True)
                # Inter-GPU communication is serialized, so using multiple
                # streams would not substantially improve performance here.

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        op_type = f"{func}"
        self.op_index += 1
        if isinstance(args, (list, tuple)):
            self.convert(op_type, args)
        elif isinstance(args[0], (list, tuple)):
            self.convert(op_type, args[0])
        else:
            print(op_type)
        return func(*args, **kwargs)


class TorchDumper:
    """Context manager that installs TorchDumpDispatchMode for the duration
    of a `with` block."""

    def __init__(self, **kwargs):
        self.p = _ProfilerState(TorchDumpDispatchMode)
        self.kwargs = kwargs

    def __enter__(self):
        if self.p.object is None:
            o = self.p.cls(self, **self.kwargs)
            o.__enter__()
            self.p.object = o
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.p.object is not None:
            self.p.object.__exit__(exc_type, exc_val, exc_tb)
            del self.p.object


model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Cap usage at 23 GB on each of the 8 GPUs.
max_memory = {i: "23GB" for i in range(8)}

sys.path.insert(0, model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation="eager",
    torch_dtype=torch.bfloat16)
model = model.eval()

# Modules that must not be split across devices.
no_split_module_classes = ['DeepseekV2MLP', 'DeepseekV2Attention']
# no_split_module_classes = ['DeepseekV2DecoderLayer']

device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=no_split_module_classes,
    dtype='float16')

# Move each module to the device chosen by the device map.
model = dispatch_model(model, device_map=device_map)

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"}]
input_tensor = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt")

with TorchDumper():
    outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)
```
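To see how `infer_auto_device_map` actually splits the checkpoint, it can help to print the resulting map before dispatching. The snippet below is a minimal sketch, assuming the same `device_map` variable and 8-GPU setup as the script above; it groups module names by the device they were assigned to.

```python
from collections import defaultdict

# Group module names by assigned device; `device_map` comes from the script above.
per_device = defaultdict(list)
for module_name, device in device_map.items():
    per_device[device].append(module_name)

for device, modules in sorted(per_device.items(), key=lambda kv: str(kv[0])):
    # e.g. "0: 27 modules (model.embed_tokens ... model.layers.6.mlp)"
    print(f"{device}: {len(modules)} modules ({modules[0]} ... {modules[-1]})")
```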
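The effect of the `TorchDumper` context manager can also be observed without loading the model. The sketch below is a hypothetical check, assuming at least two CUDA devices are available and reusing the `TorchDumper` class defined above: adding a tensor on `cuda:0` to one on `cuda:1` would normally raise a device-mismatch error, but inside the context the hook moves the `cuda:0` operand to `cuda:1` before the kernel runs.

```python
# Hypothetical two-GPU check; assumes the TorchDumper class from the script above.
if torch.cuda.device_count() >= 2:
    a = torch.randn(4, device="cuda:0")
    b = torch.randn(4, device="cuda:1")
    with TorchDumper():
        c = a + b          # the hook relocates `a` to cuda:1, then runs the add
    print(c.device)        # cuda:1
    print(a.device)        # cuda:1 -- the operand was moved in place
```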