RAG - LangChain + Qwen + ModelScope + Document Chunking


Adapted from:

https://github.com/modelscope/modelscope/blob/master/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain.ipynb


Installation

# install required packages
!pip install langchain
!pip install unstructured
!pip install transformers_stream_generator
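
The walkthrough also relies on modelscope, torch, and FAISS (used later for the vector store). If your environment does not already have them, they can be installed the same way; which faiss build to use (faiss-gpu vs. faiss-cpu) depends on your hardware:

# additional dependencies used later in this walkthrough
!pip install modelscope
!pip install faiss-gpu  # use faiss-cpu on machines without CUDA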

Import classes and initialize the tokenizer and model

import os
from typing import List, Optional
from langchain.llms.base import LLM
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import GenerationConfig

# initialize the Qwen-7B-Chat model from ModelScope
tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen-7B-Chat", revision='v1.0.5', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("qwen/Qwen-7B-Chat", revision='v1.0.5', device_map="auto", trust_remote_code=True, fp16=True).eval()
model.generation_config = GenerationConfig.from_pretrained("qwen/Qwen-7B-Chat", revision='v1.0.5', trust_remote_code=True)

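Before wiring the model into LangChain, it can help to confirm it generates correctly. This quick sanity check is added here for illustration and is not part of the original notebook:

# optional sanity check: call Qwen's chat interface directly
response, history = model.chat(tokenizer, "你好", history=None)
print(response)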

Torch garbage collection

import torch

def torch_gc():
    # release cached GPU memory; called after each generation in _call below
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    DEVICE = "cuda"
    DEVICE_ID = "0"
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE

    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


Wrap the model with the LangChain LLM base class

# wrap the qwen model with langchain LLM base class
class QianWenChatLLM(LLM):
    max_length: int = 10000
    temperature: float = 0.01
    top_p: float = 0.9

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self):
        return "ChatLLM"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        print(prompt)
        response, history = model.chat(tokenizer, prompt, history=None)
        torch_gc()  # free GPU cache after each generation
        return response


Create the Qwen LLM

qwllm = QianWenChatLLM()
print('@@@ qianwen LLM created')
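
As a quick check (not in the original notebook), the wrapper can be invoked through the standard LangChain LLM interface:

# hypothetical smoke test: call the wrapped model like any LangChain LLM
print(qwllm("用一句话介绍一下你自己"))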

Imports

import os
import re
import torch

from typing import Any, List
from pydantic import BaseModel, Extra
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader,TextLoader
from langchain.embeddings.base import Embeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


Define Chinese sentence-splitting logic to divide documents into reasonably sized chunks

class ChineseTextSplitter(CharacterTextSplitter):
    def __init__(self, pdf: bool = False, sentence_size: int = 100, **kwargs):
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text(self, text: str) -> List[str]:
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub(r"\s", " ", text)
            text = re.sub("\n\n", "", text)

        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # single-character sentence terminators
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # English ellipsis
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # Chinese ellipsis
        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
        # A closing quote ends the sentence only when a terminator precedes it, so the
        # split marker \n goes after the quote; the rules above deliberately keep quotes intact.
        text = text.rstrip()  # drop any trailing \n at the end of the paragraph
        # Many rule sets also split on semicolons, dashes, and English double quotes;
        # they are ignored here and can be added back with small adjustments if needed.
        ls = [i for i in text.split("\n") if i]
        for ele in ls:
            if len(ele) > self.sentence_size:
                # over-long sentences are split again on commas/periods, then on whitespace
                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
                ele1_ls = ele1.split("\n")
                for ele_ele1 in ele1_ls:
                    if len(ele_ele1) > self.sentence_size:
                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
                        ele2_ls = ele_ele2.split("\n")
                        for ele_ele2 in ele2_ls:
                            if len(ele_ele2) > self.sentence_size:
                                ele_ele3 = re.sub(r'( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                                ele2_id = ele2_ls.index(ele_ele2)
                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:]
                        ele_id = ele1_ls.index(ele_ele1)
                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]

                idx = ls.index(ele)
                ls = ls[:idx] + [i for i in ele1_ls if i] + ls[idx + 1:]
        return ls


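To see what the splitter produces, here is a small illustrative example (the sample sentence is made up, not from the original notebook):

# hypothetical usage: split a short Chinese passage on sentence terminators
splitter = ChineseTextSplitter(pdf=False, sentence_size=50)
for chunk in splitter.split_text("ModelScope是一个模型即服务平台。它汇集了社区的先进模型!开发者用几行代码即可完成推理。"):
    print(chunk)  # each sentence, split at 。 and !, prints on its own line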

Create an embedding helper with the ModelScope text-embedding pipeline

class ModelScopeEmbeddings(BaseModel, Embeddings):
    embed: Any
    model_id: str = "damo/nlp_corom_sentence-embedding_english-base"
    """Model name to use."""

    def __init__(self, **kwargs: Any):
        """Initialize the ModelScope sentence-embedding pipeline."""
        super().__init__(**kwargs)
        try:
            from modelscope.pipelines import pipeline
            from modelscope.utils.constant import Tasks
            self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)
        except ImportError as e:
            raise ValueError(
                "Could not import the modelscope package. "
                "Please install it with `pip install modelscope`."
            ) from e

    class Config:
        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        texts = list(map(lambda x: x.replace("\n", " "), texts))
        inputs = {"source_sentence": texts}
        embeddings = self.embed(input=inputs)['text_embedding']
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        text = text.replace("\n", " ")
        inputs = {"source_sentence": [text]}
        embedding = self.embed(input=inputs)['text_embedding'][0]
        return embedding
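
A minimal sketch of using the embedding class on its own (the query string is illustrative):

# hypothetical usage: embed one query and inspect the vector dimensionality
embeddings = ModelScopeEmbeddings(model_id="damo/nlp_corom_sentence-embedding_chinese-tiny")
vector = embeddings.embed_query("modelscope是什么?")
print(len(vector))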

Define the prompt template

# define prompt template: answer the question based only on the content
# between the ``` fences; {context} and {question} are filled in by the chain
prompt_template = """请基于```内的内容回答问题。
```
{context}
```
我的问题是:{question}。
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
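
A quick way to inspect the rendered prompt (a sketch with dummy values, not in the original notebook):

# render the template with placeholder values to see the final prompt
print(prompt.format(context="ModelScope是一个模型即服务平台。", question="modelscope是什么?"))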

Load the vector database

# load documents, embed them, and build the FAISS vector store

print('@@@ reading docs ...')
sentence_size = 1600
embeddings = ModelScopeEmbeddings(model_id="damo/nlp_corom_sentence-embedding_chinese-tiny")

filepath = "../../../README_zh.md"
if filepath.lower().endswith(".md"):
    loader = UnstructuredFileLoader(filepath, mode="elements")
    docs = loader.load()
elif filepath.lower().endswith(".txt"):
    loader = TextLoader(filepath, autodetect_encoding=True)
    textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
    docs = loader.load_and_split(textsplitter) 

db = FAISS.from_documents(docs, embeddings)
print('@@@ reading doc done, vec db created.')
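
Before building the QA chain, retrieval can be inspected directly on the FAISS store; this check is a sketch, not part of the original notebook:

# optional: eyeball the top retrieved chunks for a test query
for doc in db.similarity_search('modelscope是什么?', k=3):
    print(doc.page_content[:80])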

Build the knowledge chain

# create knowledge chain
kc = RetrievalQA.from_llm(llm=qwllm, retriever=db.as_retriever(search_kwargs={"k": 6}), prompt=prompt)
# test the knowledge chain
query = 'modelscope是什么?'
result = kc({"query": query})
print(result)
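
The chain returns a dict whose answer lives under the "result" key. If you also want the retrieved chunks alongside the answer, RetrievalQA accepts return_source_documents (a sketch under the same LangChain version as the rest of this post):

# the generated answer is under result["result"]
print(result["result"])

# variant that also returns the retrieved source chunks
kc_with_sources = RetrievalQA.from_llm(
    llm=qwllm,
    retriever=db.as_retriever(search_kwargs={"k": 6}),
    prompt=prompt,
    return_source_documents=True,
)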

Sample run: the print(prompt) inside _call shows the assembled prompt, with the retrieved chunks filled into {context}:

请基于```内的内容回答问题。
```
Context:
ModelScope Library为模型贡献者提供了必要的分层API,以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到ModelScope生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装,用户只需几行代码即可完成模型推理、微调和评估。同时,灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。

Context:
ModelScope 是一个“模型即服务”(MaaS)平台,旨在汇集来自AI社区的最先进的机器学习模型,并简化在实际应用中使用AI模型的流程。ModelScope库使开发人员能够通过丰富的API设计执行推理、训练和评估,从而促进跨不同AI领域的最先进模型的统一体验。

Context:
除了包含各种模型的实现之外,ModelScope Library还支持与ModelScope后端服务进行必要的交互,特别是与Model-Hub和Dataset-Hub的交互。这种交互促进了模型和数据集的管理在后台无缝执行,包括模型数据集查询、版本控制、缓存管理等。
```
我的问题是:modelscope是什么?。

2024-03-24
