赞
踩
转载改编自:
# install required packages
!pip install langchain
!pip install unstructured
!pip install transformers_stream_generator
import os
from typing import List, Optional
from langchain.llms.base import LLM
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import GenerationConfig
# Load the Qwen-7B-Chat tokenizer, model and generation settings from ModelScope.
# All three loaders are pinned to the same revision and allow the repo's custom code.
_pinned = {"revision": 'v1.0.5', "trust_remote_code": True}
tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen-7B-Chat", **_pinned)
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat",
    device_map="auto",
    fp16=True,
    **_pinned,
)
model = model.eval()
# NOTE(review): this repo id is capitalised "Qwen/" while the two above use "qwen/" — confirm intentional.
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", **_pinned)
def torch_gc():
    """Release PyTorch's cached CUDA memory when a GPU is present.

    Sets ``TOKENIZERS_PARALLELISM=false`` in the process environment on every
    call. On hosts without CUDA nothing else happens.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Check availability first: moving a tensor to the GPU raises on CPU-only hosts.
    if not torch.cuda.is_available():
        return

    device = "cuda"
    device_id = "0"
    cuda_device = f"{device}:{device_id}" if device_id else device

    # Small probe tensor placed on the GPU and echoed to stdout.
    probe = torch.Tensor([1, 2]).cuda()
    print(probe)

    with torch.cuda.device(cuda_device):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
# wrap the qwen model with langchain LLM base class
class QianWenChatLLM(LLM):
    """LangChain ``LLM`` wrapper that forwards prompts to the module-level Qwen chat model."""

    # Generation settings declared as fields; `_call` below does not read them.
    # All three are annotated so they are declared the same way as `temperature`.
    max_length: int = 10000
    temperature: float = 0.01
    top_p: float = 0.9

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this LLM type."""
        return "ChatLLM"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send ``prompt`` to the chat model and return its reply.

        Args:
            prompt: Full prompt text; it is echoed to stdout before the call.
            stop: Accepted for interface compatibility; not applied to the reply.

        Returns:
            The response text returned by ``model.chat``.
        """
        print(prompt)
        # Each call starts a fresh conversation (history=None); the returned history is discarded.
        response, _history = model.chat(tokenizer, prompt, history=None)
        torch_gc()
        return response
# Create the single wrapper instance; it is passed as `llm` to the retrieval chain further down.
qwllm = QianWenChatLLM()
print('@@@ qianwen LLM created')
import os
import re
import torch
from typing import Any, List
from pydantic import BaseModel, Extra
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader,TextLoader
from langchain.embeddings.base import Embeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
# define Chinese text-splitting logic for dividing docs into reasonably sized pieces
class ChineseTextSplitter(CharacterTextSplitter):
    """Split Chinese text into sentences, breaking up any sentence longer than ``sentence_size``."""

    def __init__(self, pdf: bool = False, sentence_size: int = 100, **kwargs):
        """Configure the splitter.

        Args:
            pdf: When true, whitespace is normalised before sentence splitting.
            sentence_size: Length in characters above which a sentence is split further.
            **kwargs: Forwarded unchanged to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text(self, text: str) -> List[str]:
        """Return the non-empty sentences of ``text``.

        Sentence boundaries are marked by inserting newlines after terminators,
        then the text is split on newlines. Sentences longer than
        ``self.sentence_size`` are broken down further at commas, then at runs
        of spaces, then at single spaces.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            # Every whitespace character (newlines included) becomes a space,
            # so no blank lines remain to be removed afterwards.
            text = re.sub(r"\s", " ", text)

        # Both ASCII and full-width forms of each punctuation mark are listed.
        # Single-character sentence terminators.
        text = re.sub(r'([;；.!?。！？\?])([^”’])', r"\1\n\2", text)
        # English ellipsis (six dots).
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
        # Chinese ellipsis (two "…" characters).
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
        # When a terminator is followed by closing quotes, the quotes end the
        # sentence, so the break goes after them; the rules above deliberately
        # leave such quotes attached to the terminator.
        text = re.sub(
            r'([;；!?。！？\?]["’”」』]{0,2})([^;；!?,，。！？\?])', r'\1\n\2', text
        )
        # Drop any trailing whitespace/newline at the end of the passage.
        text = text.rstrip()
        # Dashes and similar marks are not treated as boundaries; extend the
        # patterns above if that is needed.

        result: List[str] = []
        for sentence in text.split("\n"):
            if not sentence:
                continue
            if len(sentence) > self.sentence_size:
                result.extend(self._split_oversized(sentence))
            else:
                result.append(sentence)
        return result

    def _split_oversized(self, sentence: str) -> List[str]:
        """Break one over-long sentence into non-empty pieces, in order.

        Pieces are built positionally rather than re-located with
        ``list.index``, so repeated fragments cannot be spliced into the wrong
        place.
        """
        pieces: List[str] = []
        # Level 1: break after commas / full stops (plus up to two closing quotes).
        by_comma = re.sub(r'([,，.]["’”」』]{0,2})([^,，.])', r'\1\n\2', sentence)
        for clause in by_comma.split("\n"):
            if len(clause) <= self.sentence_size:
                if clause:
                    pieces.append(clause)
                continue
            # Level 2: break after newline runs or runs of two or more spaces.
            by_gap = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', clause)
            for chunk in by_gap.split("\n"):
                if len(chunk) <= self.sentence_size:
                    if chunk:
                        pieces.append(chunk)
                    continue
                # Level 3: break after every single space.
                by_space = re.sub(r'( ["’”」』]{0,2})([^ ])', r'\1\n\2', chunk)
                pieces.extend(part for part in by_space.split("\n") if part)
        return pieces
class ModelScopeEmbeddings(BaseModel, Embeddings):
    """LangChain ``Embeddings`` implementation backed by a ModelScope sentence-embedding pipeline."""

    # Pipeline object; created in __init__ from `model_id`.
    embed: Any
    # Model name to use.
    model_id: str = "damo/nlp_corom_sentence-embedding_english-base"

    def __init__(self, **kwargs: Any):
        """Validate the fields and build the ModelScope pipeline.

        Raises:
            ValueError: If an ImportError occurs while importing modelscope
                or building the pipeline.
        """
        super().__init__(**kwargs)
        try:
            from modelscope.models import Model  # not referenced below
            from modelscope.pipelines import pipeline
            from modelscope.utils.constant import Tasks

            self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)
        except ImportError as e:
            raise ValueError(
                "Could not import some python packages. "
                "Please install it with `pip install modelscope`."
            ) from e

    class Config:
        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents.

        Args:
            texts: Documents to embed; newlines are replaced by spaces first.

        Returns:
            The pipeline's ``'text_embedding'`` result, or ``[]`` when
            ``texts`` is empty (the pipeline is not invoked in that case).
        """
        if not texts:
            return []
        cleaned = [t.replace("\n", " ") for t in texts]
        inputs = {"source_sentence": cleaned}
        return self.embed(input=inputs)['text_embedding']

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        Args:
            text: Query text; newlines are replaced by spaces first.

        Returns:
            The first entry of the pipeline's ``'text_embedding'`` result.
        """
        text = text.replace("\n", " ")
        inputs = {"source_sentence": [text]}
        return self.embed(input=inputs)['text_embedding'][0]
# define prompt template
# The retrieved context is fenced in triple backticks, matching the instruction
# on the first line of the prompt ("answer based on the content inside ```").
prompt_template = """请基于```内的内容回答问题。
```
{context}
```
我的问题是:{question}。
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# load the vector db and upsert docs with vector to db
print('@@@ reading docs ...')
sentence_size = 1600
embeddings = ModelScopeEmbeddings(model_id="damo/nlp_corom_sentence-embedding_chinese-tiny")

filepath = "../../../README_zh.md"
if filepath.lower().endswith(".md"):
    # Markdown is loaded element by element; the Chinese splitter is not applied here.
    loader = UnstructuredFileLoader(filepath, mode="elements")
    docs = loader.load()
elif filepath.lower().endswith(".txt"):
    loader = TextLoader(filepath, autodetect_encoding=True)
    textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
    docs = loader.load_and_split(textsplitter)
else:
    # Fail here with a clear message instead of a NameError on `docs` below.
    raise ValueError(f"Unsupported file type for {filepath!r}; expected .md or .txt")

db = FAISS.from_documents(docs, embeddings)
print('@@@ reading doc done, vec db created.')

# create knowledge chain: the 6 nearest chunks are retrieved and filled into `prompt`
kc = RetrievalQA.from_llm(llm=qwllm, retriever=db.as_retriever(search_kwargs={"k": 6}), prompt=prompt)

# test the knowledge chain
query = 'modelscope是什么?'
result = kc({"query": query})
print(result)
请基于```内的内容回答问题。"
\```
Context:
ModelScope Library为模型贡献者提供了必要的分层API,以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到ModelScope生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装,用户只需几行代码即可完成模型推理、微调和评估。同时,灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。
Context:
ModelScope 是一个“模型即服务”(MaaS)平台,旨在汇集来自AI社区的最先进的机器学习模型,并简化在实际应用中使用AI模型的流程。ModelScope库使开发人员能够通过丰富的API设计执行推理、训练和评估,从而促进跨不同AI领域的最先进模型的统一体验。
Context:
除了包含各种模型的实现之外,ModelScope Library还支持与ModelScope后端服务进行必要的交互,特别是与Model-Hub和Dataset-Hub的交互。这种交互促进了模型和数据集的管理在后台无缝执行,包括模型数据集查询、版本控制、缓存管理等。
\```
我的问题是:modelscope是什么?。
tensor([1., 2.], device='cuda:0')
2024-03-24
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。