
LLM: Deploying a local embedding model and GLM-4 with llama-index and building a first RAG (other LLMs work too; an Ollama variant is included)


Preface

Too busy these days; I'll fill this in later.

A quick intro to llama-index

Official docs: https://docs.llamaindex.ai/en/stable/

No time for the intro either; it will come later.

Note: as the official API evolves, this code may stop working. If something doesn't run for you, check the official docs.

Loading a local embedding model

If `llama_index.embeddings.huggingface` cannot be found, install it:

pip install llama-index-embeddings-huggingface

If that still doesn't work, go to the official docs and search for "huggingface".

```python
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# embed_model_path should point at a local embedding model directory,
# e.g. a downloaded copy of BAAI/bge-m3
Settings.embed_model = HuggingFaceEmbedding(
    model_name=f"{embed_model_path}", device="cuda"
)
```
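As a quick sanity check that the embedding model actually loaded, you can embed a short string directly. A minimal sketch (assuming `embed_model_path` points at a local model such as BAAI/bge-m3):

```python
from llama_index.core import Settings

# Embed a test string and inspect the vector dimensionality
vector = Settings.embed_model.get_text_embedding("hello world")
print(len(vector))  # e.g. 1024 for bge-m3
```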

Loading a local LLM

Same caveat as before: if the code below doesn't work, search the official docs for "Custom LLM Model".

```python
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM


class GLMCustomLLM(CustomLLM):
    context_window: int = 8192         # context window size
    num_output: int = 8000             # maximum number of tokens to generate
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None           # tokenizer
    model: object = None               # model

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        # Load the model on GPU
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        ).eval()
        # To load on CPU instead, use device_map="cpu" above and
        # cast the weights to float32:
        # self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # Non-streaming completion
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors="pt")  # CPU
        # note: max_length counts prompt tokens plus generated tokens, and the
        # decoded text includes the prompt, since generate() returns both
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        # Streaming completion: generate the full text, then yield it
        # character by character (text is cumulative, delta is the new piece)
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors="pt")  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        text = ""
        for token in response:
            text += token
            yield CompletionResponse(text=text, delta=token)
```
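Before wiring the model into an index, it's worth a quick smoke test on its own. A minimal sketch (the checkpoint path matches the one used later in this post; replace it with wherever your GLM-4 weights live):

```python
# Path to a local GLM-4 checkpoint (adjust to your own download location)
llm = GLMCustomLLM("/home/nlp/model/LLM/THUDM/glm-4-9b-chat")

# A plain completion, without any index involved
print(llm.complete("Hello").text)
```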

Building a simple RAG on top of the local models

```python
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


class GLMCustomLLM(CustomLLM):
    # identical to the class defined in the previous section
    context_window: int = 8192         # context window size
    num_output: int = 8000             # maximum number of tokens to generate
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None           # tokenizer
    model: object = None               # model

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        # Load the model on GPU (see the previous section for the CPU variant)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True
        ).eval()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # Non-streaming completion
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        # Streaming completion: generate the full text, then yield it
        # character by character (text is cumulative, delta is the new piece)
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        text = ""
        for token in response:
            text += token
            yield CompletionResponse(text=text, delta=token)


if __name__ == "__main__":
    # Point these at your local model directories
    pretrained_model_name_or_path = r"/home/nlp/model/LLM/THUDM/glm-4-9b-chat"
    embed_model_path = "/home/nlp/model/Embedding/BAAI/bge-m3"

    Settings.embed_model = HuggingFaceEmbedding(
        model_name=f"{embed_model_path}", device="cuda"
    )
    Settings.llm = GLMCustomLLM(pretrained_model_name_or_path)

    # Load the documents and build the vector index
    documents = SimpleDirectoryReader(input_dir="/home/xxxx/input").load_data()
    index = VectorStoreIndex.from_documents(
        documents,
    )

    # Query and print the result
    query_engine = index.as_query_engine()
    response = query_engine.query("萧炎的表妹是谁?")  # "Who is Xiao Yan's cousin?"
    print(response)
```
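One practical note: `VectorStoreIndex.from_documents` re-embeds every document each time the script runs. For anything beyond a toy corpus you will probably want to persist the index to disk and reload it on later runs. A minimal sketch (the `storage` directory name is arbitrary; `Settings.embed_model` and `Settings.llm` must still be configured before querying the reloaded index):

```python
from llama_index.core import StorageContext, load_index_from_storage

# First run: build the index as above, then save it to disk
index.storage_context.persist(persist_dir="storage")

# Later runs: reload the saved index instead of re-embedding everything
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context)
```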

ollama 
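If the `llama_index.llms.ollama` import fails, the integration ships as a separate package (`pip install llama-index-llms-ollama`). You also need the Ollama daemon running locally with the model already pulled, e.g. `ollama pull llama3`.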

```python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

documents = SimpleDirectoryReader("data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# ollama
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

index = VectorStoreIndex.from_documents(
    documents,
)
```
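The snippet above only builds the index; querying then works exactly as in the GLM-4 example. A minimal follow-up (the question string is just a placeholder):

```python
# Query the index through the Ollama-served LLM
query_engine = index.as_query_engine()
response = query_engine.query("What are these documents about?")
print(response)
```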

Likes and bookmarks are welcome

Your likes and bookmarks encourage the author to update faster~

Reference links:

CustomLLM in LlamaIndex (loading a local model)
llamaIndex: loading a local embedding model on GPU

Official documentation

Official docs: starter_example_loca

Official docs: usage_custom
