Fine-tuning
Large Language Models - ChatGLM-Tuning
Large Language Models - Fine-tuning chatglm6b
Large Language Models - Fine-tuning Chinese chatGLM / LLAMA
Large Language Models - alpaca-lora
Local knowledge base
Large Language Models 2 - document ai explained
Large Language Models - DocumentSearch explained
Large Language Models - Chinese Langchain
Using chatGLM6b + langchain to implement local knowledge-base retrieval and intelligent answer generation
https://github.com/yanqiangmiffy/Chinese-LangChain
class LangChainCFG:
    llm_model_name = 'chatglm-6b'  # local model file or HuggingFace remote repo
    embedding_model_name = 'text2vec-large-chinese'  # retrieval model file or HuggingFace remote repo
    vector_store_path = '.'
    docs_path = './docs'
    kg_vector_stores = None  # optional mapping of knowledge-base names to vector-store paths; LangChainApplication below expects this attribute
The documents are vectorized and then persisted to a local store. It is not obvious from the API what the FAISS wrapper inside the langchain package changes relative to raw FAISS; a rough sketch of what it boils down to follows the class below.
import os

from duckduckgo_search import ddg
from duckduckgo_search.utils import SESSION
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


class SourceService(object):
    def __init__(self, config):
        self.vector_store = None
        self.config = config
        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.embedding_model_name)
        self.docs_path = self.config.docs_path
        self.vector_store_path = self.config.vector_store_path

    def init_source_vector(self):
        """Build the local knowledge-base vector store from every .txt file in docs_path."""
        docs = []
        for doc in os.listdir(self.docs_path):
            if doc.endswith('.txt'):
                print(doc)
                loader = UnstructuredFileLoader(f'{self.docs_path}/{doc}', mode="elements")
                doc = loader.load()
                docs.extend(doc)
        self.vector_store = FAISS.from_documents(docs, self.embeddings)
        self.vector_store.save_local(self.vector_store_path)

    def add_document(self, document_path):
        loader = UnstructuredFileLoader(document_path, mode="elements")
        doc = loader.load()
        self.vector_store.add_documents(doc)
        self.vector_store.save_local(self.vector_store_path)

    def load_vector_store(self, path):
        if path is None:
            self.vector_store = FAISS.load_local(self.vector_store_path, self.embeddings)
        else:
            self.vector_store = FAISS.load_local(path, self.embeddings)
        return self.vector_store

    def search_web(self, query):
        # Route DuckDuckGo traffic through a local SOCKS5 proxy.
        SESSION.proxies = {
            "http": "socks5h://localhost:7890",
            "https": "socks5h://localhost:7890"
        }
        results = ddg(query)
        web_content = ''
        if results:
            for result in results:
                web_content += result['body']
        return web_content
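To demystify the wrapper a little: `FAISS.from_documents` is essentially a thin convenience layer. Here is a minimal sketch of what it boils down to, assuming the plain `faiss` package (the function name and docstore layout are illustrative, not langchain's actual internals):

import faiss
import numpy as np

def from_documents_sketch(docs, embeddings):
    # Embed every chunk, index the vectors for exact L2 search,
    # and keep an id -> Document mapping so hits can be turned back into text.
    texts = [d.page_content for d in docs]
    vectors = np.array(embeddings.embed_documents(texts), dtype='float32')
    index = faiss.IndexFlatL2(vectors.shape[1])  # flat index: no training step, brute-force search
    index.add(vectors)
    docstore = {i: doc for i, doc in enumerate(docs)}
    return index, docstore

So retrieval later is just: embed the query, ask the index for the top-k nearest vectors, and look their ids up in the docstore.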
Calling chatGLM
from typing import List, Optional

from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer


class ChatGLMService(LLM):
    max_token: int = 10000
    temperature: float = 0.1
    top_p = 0.9
    history = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatGLM"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response, _ = self.model.chat(
            self.tokenizer,
            prompt,
            history=self.history,
            max_length=self.max_token,
            temperature=self.temperature,
        )
        if stop is not None:
            response = enforce_stop_tokens(response, stop)
        self.history = self.history + [[None, response]]
        return response

    def load_model(self, model_name_or_path: str = "THUDM/chatglm-6b"):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(
            model_name_or_path, trust_remote_code=True
        ).half().cuda()
        self.model = self.model.eval()
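A quick smoke test of this wrapper might look like the following (assumes a CUDA GPU with enough memory for the fp16 weights; the prompt is just an example):

llm = ChatGLMService()
llm.load_model("THUDM/chatglm-6b")  # downloads the weights on first run
print(llm("你好,请介绍一下你自己"))  # LLM.__call__ dispatches to _call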
The principle is the same as before: wrap the retrieved (or web-searched) content into a prompt, then call the LLM.
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


class LangChainApplication(object):
    def __init__(self, config):
        self.config = config
        self.llm_service = ChatGLMService()
        self.llm_service.load_model(model_name_or_path=self.config.llm_model_name)
        self.source_service = SourceService(config)
        if self.config.kg_vector_stores is None:
            print("init a source vector store")
            self.source_service.init_source_vector()
        else:
            print("load zh_wikipedia source vector store ")
            try:
                # '初始化知识库' = 'initial knowledge base'
                self.source_service.load_vector_store(self.config.kg_vector_stores['初始化知识库'])
            except Exception as e:
                self.source_service.init_source_vector()

    def get_knowledge_based_answer(self, query,
                                   history_len=5,
                                   temperature=0.1,
                                   top_p=0.9,
                                   top_k=4,
                                   web_content='',
                                   chat_history=[]):
        # The (Chinese) prompt instructs the model to answer concisely from the
        # given context only, and to say it cannot answer when the context is insufficient.
        if web_content:
            prompt_template = f"""基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
已知网络检索内容:{web_content}""" + """
已知内容:
{context}
问题:
{question}"""
        else:
            prompt_template = """基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
已知内容:
{context}
问题:
{question}"""
        prompt = PromptTemplate(template=prompt_template,
                                input_variables=["context", "question"])
        self.llm_service.history = chat_history[-history_len:] if history_len > 0 else []
        self.llm_service.temperature = temperature
        self.llm_service.top_p = top_p
        knowledge_chain = RetrievalQA.from_llm(
            llm=self.llm_service,
            retriever=self.source_service.vector_store.as_retriever(
                search_kwargs={"k": top_k}),
            prompt=prompt)
        knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
            input_variables=["page_content"], template="{page_content}")
        knowledge_chain.return_source_documents = True
        result = knowledge_chain({"query": query})
        return result

# if __name__ == '__main__':
#     config = LangChainCFG()
#     application = LangChainApplication(config)
#     result = application.get_knowledge_based_answer('马保国是谁')
#     print(result)
#     application.source_service.add_document('/home/searchgpt/yq/Knowledge-ChatGLM/docs/added/马保国.txt')
#     result = application.get_knowledge_based_answer('马保国是谁')
#     print(result)
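Tying the pieces together, the web_content argument is how search_web feeds the prompt. A hypothetical end-to-end call (query and flow borrowed from the commented-out example above):

config = LangChainCFG()
application = LangChainApplication(config)

query = '马保国是谁'
web_content = application.source_service.search_web(query)  # '' when the search returns nothing
result = application.get_knowledge_based_answer(query, web_content=web_content)

print(result['result'])  # the generated answer
for doc in result['source_documents']:  # evidence chunks, since return_source_documents=True
    print(doc.page_content[:80])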
from duckduckgo_search import ddg
from duckduckgo_search.utils import SESSION
SESSION.proxies = {
"http": f"socks5h://localhost:7890",
"https": f"socks5h://localhost:7890"
}
r = ddg("马保国")
print(r[:2])
"""
[{'title': '马保国 - 维基百科,自由的百科全书', 'href': 'https://zh.wikipedia.org/wiki/%E9%A9%AC%E4%BF%9D%E5%9B%BD', 'body': '马保国(1951年 — ) ,男,籍贯 山东 临沂,出生及长大于河南,中国大陆太极拳师,自称"浑元形意太极门掌门人" 。 马保国因2017年约战mma格斗家徐晓冬首次出现在大众视野中。 2020年5月,马保国在对阵民间武术爱好者王庆民的比赛中,30秒内被连续高速击倒三次,此事件成为了持续多日的社交 ...'}, {'title': '馬保國的主页 - 抖音', 'href': 'https://www.douyin.com/user/MS4wLjABAAAAW0E1ziOvxgUh3VVv5FE6xmoo3w5WtZalfphYZKj4mCg', 'body': '6.3万. #马马国教扛打功 最近有几个人模芳我动作,很危险啊,不可以的,朋友们不要受伤了。. 5.3万. #马保国直播带货榜第一 朋友们周末愉快,本周六早上湿点,我本人在此号进行第一次带货直播,活到老,学到老,越活越年轻。. 7.0万. #马保国击破红牛罐 昨天 ...'}]
"""
The web demo is built with Gradio, the demo-UI library commonly paired with transformers; a minimal sketch follows.
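A minimal sketch of such a Gradio front end (the component wiring and function names here are assumptions, not the repo's actual demo code):

import gradio as gr

config = LangChainCFG()
application = LangChainApplication(config)

def answer_fn(query, history):
    # Run retrieval-augmented QA and append the turn to the chat history.
    result = application.get_knowledge_based_answer(query, chat_history=history or [])
    history = (history or []) + [[query, result['result']]]
    return history, history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    state = gr.State([])
    box = gr.Textbox(placeholder="输入问题……")
    box.submit(answer_fn, inputs=[box, state], outputs=[chatbot, state])

demo.launch()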