赞
踩
前言
日常没空,留着以后写
官网:https://docs.llamaindex.ai/en/stable/
简介也没空,以后再写
注:随着官方库的更新,下面的代码可能需要相应调整;如果运行不起来,可以到官网查阅最新文档。
如果提示找不到 llama_index.embeddings.huggingface 模块,
请先安装:pip install llama-index-embeddings-huggingface
如果仍然不行,请进入官网,输入 huggingface 进行搜索。
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Local directory of the embedding model; change it to your own location.
embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

# Register the local HuggingFace embedding model as the global embedder,
# running on the GPU.
Settings.embed_model = HuggingFaceEmbedding(
    model_name=f"{embed_model_path}", device='cuda'
)
还是那句话,如果以下代码运行不了,请进官网搜索 Custom LLM Model。
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
-
class GLMCustomLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by a locally loaded GLM-4 chat model.

    The tokenizer and model are loaded with ``transformers`` and every
    completion is produced by one blocking ``generate`` call.
    """

    context_window: int = 8192  # context window size reported in the metadata
    num_output: int = 8000  # maximum number of tokens generated per request
    model_name: str = "glm-4-9b-chat"  # model name reported in the metadata
    tokenizer: object = None  # tokenizer, populated in __init__
    model: object = None  # causal language model, populated in __init__
    dummy_response: str = "My response"  # not used by the completion methods

    def __init__(self, pretrained_model_name_or_path, device="cuda"):
        """Load the tokenizer and the model.

        Args:
            pretrained_model_name_or_path: Model location forwarded to
                ``from_pretrained`` for both the tokenizer and the model.
            device: Value passed as ``device_map`` when loading the model.
                Defaults to ``"cuda"``; pass ``"cpu"`` to run without a GPU.
        """
        super().__init__()

        # A tokenizer is not placed on a device, so it gets no device_map.
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            device_map=device,
            trust_remote_code=True,
        ).eval()
        # NOTE(review): this casts the weights to float32 on every device; it
        # may only be needed for CPU inference - confirm before removing.
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def _generate_text(self, prompt: str) -> str:
        """Run one blocking generation and return only the new text."""
        # Move the prompt to wherever the model lives, so the same code works
        # for both GPU and CPU loading.
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(
            self.model.device
        )
        # max_new_tokens bounds the generated part only; max_length would
        # also count the prompt and starve long prompts of output tokens.
        outputs = self.model.generate(inputs, max_new_tokens=self.num_output)
        # generate() returns the prompt ids followed by the new ids; drop the
        # prompt so the completion does not echo it back.
        new_tokens = outputs[0][inputs.shape[-1]:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Return the model's completion for ``prompt``.

        Args:
            prompt: Text to complete.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            A ``CompletionResponse`` whose ``text`` is the generated text.
        """
        print("完成函数")
        return CompletionResponse(text=self._generate_text(prompt))

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Yield the completion for ``prompt`` one character at a time.

        The whole answer is generated first and then replayed, so this is
        not incremental decoding.

        Args:
            prompt: Text to complete.
            **kwargs: Accepted for interface compatibility; ignored.

        Yields:
            ``CompletionResponse`` objects whose ``delta`` is the next
            character and whose ``text`` is everything produced so far.
        """
        print("流式完成函数")

        response = self._generate_text(prompt)
        for end, char in enumerate(response, start=1):
            # text carries the cumulative answer; delta is only the new piece.
            yield CompletionResponse(text=response[:end], delta=char)
- from typing import Any
-
- from llama_index.core.llms import (
- CustomLLM,
- CompletionResponse,
- CompletionResponseGen,
- LLMMetadata,
- )
- from llama_index.core.llms.callbacks import llm_completion_callback
- from transformers import AutoTokenizer, AutoModelForCausalLM
- from llama_index.core import Settings,VectorStoreIndex,SimpleDirectoryReader
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-
-
class GLMCustomLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by a locally loaded GLM-4 chat model.

    The tokenizer and model are loaded with ``transformers`` and every
    completion is produced by one blocking ``generate`` call.
    """

    context_window: int = 8192  # context window size reported in the metadata
    num_output: int = 8000  # maximum number of tokens generated per request
    model_name: str = "glm-4-9b-chat"  # model name reported in the metadata
    tokenizer: object = None  # tokenizer, populated in __init__
    model: object = None  # causal language model, populated in __init__
    dummy_response: str = "My response"  # not used by the completion methods

    def __init__(self, pretrained_model_name_or_path, device="cuda"):
        """Load the tokenizer and the model.

        Args:
            pretrained_model_name_or_path: Model location forwarded to
                ``from_pretrained`` for both the tokenizer and the model.
            device: Value passed as ``device_map`` when loading the model.
                Defaults to ``"cuda"``; pass ``"cpu"`` to run without a GPU.
        """
        super().__init__()

        # A tokenizer is not placed on a device, so it gets no device_map.
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path,
            device_map=device,
            trust_remote_code=True,
        ).eval()
        # NOTE(review): this casts the weights to float32 on every device; it
        # may only be needed for CPU inference - confirm before removing.
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def _generate_text(self, prompt: str) -> str:
        """Run one blocking generation and return only the new text."""
        # Move the prompt to wherever the model lives, so the same code works
        # for both GPU and CPU loading.
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(
            self.model.device
        )
        # max_new_tokens bounds the generated part only; max_length would
        # also count the prompt and starve long prompts of output tokens.
        outputs = self.model.generate(inputs, max_new_tokens=self.num_output)
        # generate() returns the prompt ids followed by the new ids; drop the
        # prompt so the completion does not echo it back.
        new_tokens = outputs[0][inputs.shape[-1]:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Return the model's completion for ``prompt``.

        Args:
            prompt: Text to complete.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            A ``CompletionResponse`` whose ``text`` is the generated text.
        """
        print("完成函数")
        return CompletionResponse(text=self._generate_text(prompt))

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Yield the completion for ``prompt`` one character at a time.

        The whole answer is generated first and then replayed, so this is
        not incremental decoding.

        Args:
            prompt: Text to complete.
            **kwargs: Accepted for interface compatibility; ignored.

        Yields:
            ``CompletionResponse`` objects whose ``delta`` is the next
            character and whose ``text`` is everything produced so far.
        """
        print("流式完成函数")

        response = self._generate_text(prompt)
        for end, char in enumerate(response, start=1):
            # text carries the cumulative answer; delta is only the new piece.
            yield CompletionResponse(text=response[:end], delta=char)
-
-
if __name__ == "__main__":
    # Locations of the local chat model and the local embedding model.
    pretrained_model_name_or_path = r"/home/nlp/model/LLM/THUDM/glm-4-9b-chat"
    embed_model_path = "/home/nlp/model/Embedding/BAAI/bge-m3"

    # Register the embedding model (on the GPU) and the custom LLM globally.
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=f"{embed_model_path}",
        device="cuda",
    )
    Settings.llm = GLMCustomLLM(pretrained_model_name_or_path)

    # Read the input directory and index its documents.
    documents = SimpleDirectoryReader(input_dir="home/xxxx/input").load_data()
    index = VectorStoreIndex.from_documents(documents)

    # Ask one question against the index and print the answer.
    query_engine = index.as_query_engine()
    response = query_engine.query("萧炎的表妹是谁?")
    print(response)
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Load the documents from the local "data" directory.
documents = SimpleDirectoryReader("data").load_data()

# Use the bge-base embedding model and the llama3 model through Ollama.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

# Build the vector index from the loaded documents.
index = VectorStoreIndex.from_documents(documents)
欢迎大家点赞或收藏
大家的点赞或收藏可以鼓励作者加快更新哟~
参考链接:
LlamaIndex中的CustomLLM(本地加载模型)
llamaIndex 基于GPU加载本地embedding模型
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。