赞
踩
目录
目录
txt 有多行,我的这份数据有 67 行,样例如下:
字段1\t值1\n
字段2\t值2\n
...
pip install langchain-chroma
在本地下载了 embedding 模型,使用它进行向量化,并检索 top3
指定向量化后的数据库保存到哪里 persist_directory
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma


# Load the raw text file (one "字段\t值" record per line).
filepath = 'data/专业描述.txt'
raw_documents = TextLoader(filepath, encoding='utf8').load()

# Split on newlines so each record/line becomes (roughly) one chunk.
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    separator="\n",
    length_function=len,
    is_separator_regex=True,
)
documents = text_splitter.split_documents(raw_documents)

# Locally downloaded embedding model.
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# Build the vector store. With langchain-chroma, passing persist_directory
# persists the embeddings automatically -- the old db.persist() call was
# removed from the langchain-chroma Chroma wrapper and is not needed.
db = Chroma.from_documents(documents, embedding, persist_directory=r"./chroma/")
'''
If the store was already built, load it directly instead:
db = Chroma(persist_directory=r"./chroma/", embedding_function=embedding)
'''

# Query with raw text, return the top-3 most similar chunks.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"
docs = db.similarity_search(query, k=3)
# docs = db.similarity_search_with_score(query, k=3)  # same, but with scores
print(docs[0].page_content)

# Query with a precomputed embedding vector instead of text.
embedding_vector = embedding.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector, k=3)
print(docs[0].page_content)
pip install faiss-cpu
感觉 faiss 向量化要快一些
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS


# Load the raw text file (one "字段\t值" record per line).
filepath = 'data/专业描述.txt'
raw_documents = TextLoader(filepath, encoding='utf8').load()

# Split on newlines so each record/line becomes (roughly) one chunk.
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    separator="\n",
    length_function=len,
    is_separator_regex=True,
)
documents = text_splitter.split_documents(raw_documents)

# Locally downloaded embedding model.
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# Build the FAISS index and save it to disk.
db = FAISS.from_documents(documents, embedding)
db.save_local("./faiss_index")
'''
If the index was already built, load it directly instead (the variable must
be the `embedding` object defined above, and loading a pickled index
requires an explicit opt-in):
db = FAISS.load_local("./faiss_index", embedding, allow_dangerous_deserialization=True)
'''

# Query with raw text, return the top-3 most similar chunks.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"
docs = db.similarity_search(query, k=3)
# docs = db.similarity_search_with_score(query, k=3)  # same, but with scores
print(docs[0].page_content)

# Query with a precomputed embedding vector instead of text.
embedding_vector = embedding.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector, k=3)
print(docs[0].page_content)
在上面默认情况下,向量存储检索器使用相似性搜索
我们在用上面的例子,使用 faiss 已经创建好了向量数据库,我们在最后面修改检索的代码
选取 top30
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


# Same embedding model the index was built with.
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# The FAISS index was already built and saved by the previous script, so
# load it straight from disk -- re-loading and re-splitting the source
# text here would be dead work (the split documents were never used).
db = FAISS.load_local("./faiss_index", embedding, allow_dangerous_deserialization=True)

# Query text.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"

# Build a retriever that returns the top-30 results.
retriever = db.as_retriever(search_kwargs={'k': 30})
docs = retriever.get_relevant_documents(query)
print(docs)
直接使用相似性(相似度)方法检索,可能会有重复数据;使用 mmr 则不会有重复的检索结果
retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 30}) # 构建检索器
会发现我指定 top30,只返回了 20 个
fetch_k 默认是 20,数据库提取的候选文档数量,理解为 mmr 算法使用时内部操作的参数就可以了
想取出 30 个的话,只需要把 fetch_k 设置为大于 30 即可
retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 30, 'fetch_k': 50}) # 构建检索器
相似度大于 0.5 的拿出来
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5}) # 构建检索器
基于向量距离的检索可能因微小的询问词变化或向量无法准确表达语义而产生不同结果;
使用大语言模型自动从不同角度生成多个查询,实现提示词优化;
对用户查询生成表达其不同方面的多个新查询(也就是对 query 利用大模型生成多个表述),对每个表述进行检索,取结果的并集;
优点是生成的查询多角度,可以覆盖更全面的语义和信息需求;
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Tongyi
from langchain_community.vectorstores import FAISS

# Load the DashScope key from key.env; Tongyi reads DASHSCOPE_API_KEY
# from the environment itself, so there is no need to copy the key into
# extra local variables.
load_dotenv('key.env')
model = Tongyi(temperature=1)

# Same embedding model the index was built with.
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# Load the prebuilt FAISS index from disk (no need to re-split the text).
db = FAISS.load_local("./faiss_index", embedding, allow_dangerous_deserialization=True)

# Query text.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"

# MultiQueryRetriever: the LLM rewrites the query from several angles,
# runs a retrieval for each rewrite, and returns the union of the results.
from langchain.retrievers.multi_query import MultiQueryRetriever
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=db.as_retriever(search_kwargs={'k': 8}), llm=model
)
unique_docs = retriever_from_llm.get_relevant_documents(query=query)

print(unique_docs)
使用给定查询的上下文来压缩检索的输出,以便只返回相关信息,而不是立即按照原样返回检索到的文档
相当于提取每个检索结果的核心,简化每个文档,利用大模型的能力
这里我们就选择 top1,可以看到检索结果跟 query 一模一样了,是同一句话
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Tongyi
from langchain_community.vectorstores import FAISS

# Load the DashScope key from key.env; Tongyi reads DASHSCOPE_API_KEY
# from the environment itself, so no extra key variables are needed.
load_dotenv('key.env')
model = Tongyi(temperature=1)

# Same embedding model the index was built with.
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# Load the prebuilt FAISS index from disk (no need to re-split the text).
db = FAISS.load_local("./faiss_index", embedding, allow_dangerous_deserialization=True)

# Query text.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"

# Contextual compression: LLMChainExtractor uses the LLM to extract only
# the parts of each retrieved document relevant to the query, instead of
# returning the documents verbatim.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
retriever = db.as_retriever(search_kwargs={'k': 1})
compressor = LLMChainExtractor.from_llm(model)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)
unique_docs = compression_retriever.get_relevant_documents(query)

print(unique_docs)
上面这个我是只取了 top1,但是我把全部结果打出来,发现有重复的,我用了下面检索代码,就去重了;官网的意思是:
LLMChainFilter 使用 LLM 链来决定过滤掉最初检索到的文档中的哪些以及返回哪些文档,而无需操作文档内容。
# Retrieval: LLMChainFilter lets the LLM decide which of the initially
# retrieved documents to keep and which to drop, WITHOUT modifying the
# document contents (unlike LLMChainExtractor, which is not needed here).
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter

_filter = LLMChainFilter.from_llm(model)
retriever = db.as_retriever(search_kwargs={'k': 10})
compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=retriever
)
unique_docs = compression_retriever.get_relevant_documents(query)

print(unique_docs)
对每个检索到的文档进行额外的 LLM 调用既昂贵又缓慢。EmbeddingsFilter
通过嵌入文档和查询并仅返回那些与查询具有足够相似嵌入的文档
相当于少调用 llm 去判断相关的文档,改用 embedding 模型
# Retrieval: EmbeddingsFilter keeps only the documents whose embedding is
# similar enough to the query embedding, so relevance filtering costs one
# embedding comparison per document instead of one LLM call.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

base_retriever = db.as_retriever(search_kwargs={'k': 10})
sim_filter = EmbeddingsFilter(embeddings=embedding, similarity_threshold=0.76)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=sim_filter, base_retriever=base_retriever
)
compressed_docs = compression_retriever.get_relevant_documents(query)
print(compressed_docs)
还有一种,是把文档分割为再小块一些的,再去做 embedding
def contextual_compression_by_embedding_split(cls, db, query, embedding_model, topk=5, similarity_threshold=0.76,
                                              chunk_size=300, chunk_overlap=0, separator=". "):
    """
    Contextual-compression retriever driven by an embedding pipeline.

    Splits the retrieved documents into smaller chunks, drops redundant
    (near-duplicate) chunks, then keeps only the chunks relevant to the
    query -- so only relevant information is returned instead of the raw
    retrieved documents. Relevance is computed with embeddings only (no LLM).
    https://python.langchain.com/docs/modules/data_connection/retrievers/contextual_compression/

    :param db: vector store exposing as_retriever()
    :param query: query text
    :param embedding_model: embedding model used by both the redundancy and relevance filters
    :param topk: NOTE(review): original author reports this has no effect and
        the default of 4 documents is used -- TODO confirm against the
        installed langchain version
    :param similarity_threshold: minimum query similarity for a chunk to be kept
    :param chunk_size: chunk size for the secondary (finer) split
    :param chunk_overlap: overlap for the secondary split
    :param separator: separator for the secondary split
    :return: list of compressed, relevant document chunks
    """
    retriever = db.as_retriever(search_kwargs={'k': topk})
    # Pipeline order matters: split finer -> drop redundant chunks -> keep relevant chunks.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator=separator)
    redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)
    relevant_filter = EmbeddingsFilter(embeddings=embedding_model, similarity_threshold=similarity_threshold)
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[splitter, redundant_filter, relevant_filter]
    )
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=pipeline_compressor, base_retriever=retriever
    )

    retriever_docs = compression_retriever.get_relevant_documents(query)
    return retriever_docs
通过利用不同算法的优势, EnsembleRetriever
可以获得比任何单一算法更好的性能
最常见的模式是将稀疏检索器(如 BM25)与密集检索器(如嵌入相似性)相结合,因为它们的优势是互补的。它也被称为“混合搜索”。稀疏检索器擅长根据关键词查找相关文档,而密集检索器擅长根据语义相似度查找相关文档。
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Corpus for the sparse (keyword-based) retriever.
sparse_corpus = [
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
]

# BM25 retriever: strong at keyword matching, returns up to 2 hits.
sparse_retriever = BM25Retriever.from_texts(
    sparse_corpus, metadatas=[{"source": 1}] * len(sparse_corpus)
)
sparse_retriever.k = 2

# Corpus for the dense (embedding-based) retriever.
dense_corpus = [
    "You like apples",
    "You like oranges",
]

embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')
dense_store = FAISS.from_texts(
    dense_corpus, embedding, metadatas=[{"source": 2}] * len(dense_corpus)
)
dense_retriever = dense_store.as_retriever(search_kwargs={"k": 2})

# "Hybrid search": combine the sparse and dense retrievers, weighting
# their rankings equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=[sparse_retriever, dense_retriever], weights=[0.5, 0.5]
)
docs = ensemble_retriever.invoke("apples")
print(docs)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Same embedding model the index was built with. (LongContextReorder needs
# no LLM, so the unused Tongyi/dotenv setup from the earlier examples and
# the dead text-splitting code are dropped here.)
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# Load the prebuilt FAISS index from disk.
db = FAISS.load_local("./faiss_index", embedding, allow_dangerous_deserialization=True)

# Query text.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"

# Retrieve with MMR, then reorder the results: LongContextReorder moves
# the most relevant documents to the beginning and end of the list.
from langchain_community.document_transformers import LongContextReorder
retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 10, 'fetch_k': 50})
docs = retriever.get_relevant_documents(query)
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

print(reordered_docs)
大文档拆分成小文档(比如大文档指多个 txt 或文件)
小文档块通过向量空间建模,实现更准确的语义检索,大块提供更完整的语义内容
检索小的,最后返回大的对应 id 进行返回
from langchain.storage import InMemoryStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever


# Two parent documents: retrieval runs over small child chunks, but the
# retriever returns the full parent document each matching chunk belongs to.
loaders = [
    TextLoader("data/专业描述.txt", encoding="utf-8"),
    TextLoader("data/专业描述_copy.txt", encoding="utf-8"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

# Locally downloaded embedding model.
embedding = HuggingFaceEmbeddings(model_name='bge-small-zh-v1.5')

# Splitter that creates the small child chunks to index.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# Vector store indexing the child chunks.
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=embedding
)
# Storage layer holding the parent documents, keyed by id.
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

retriever.add_documents(docs, ids=None)

# Two keys -- one per parent document added.
# print(list(store.yield_keys()))

# Query text.
query = "材料科学与工程是一门研究材料的组成、性质、制备、加工及应用的多学科交叉领域。它涵盖了金属、无机非金属"

# Search the small child chunks directly.
sub_docs = vectorstore.similarity_search(query)
print(sub_docs[0].page_content)

# Retrieve the parent documents for the SAME query. (The original passed
# the leftover "justice breyer" example string from the LangChain docs
# here, so the query defined above was never used for parent retrieval.)
retrieved_docs = retriever.get_relevant_documents(query)
print(retrieved_docs)
如果文档还是太大,可先把父文档文档分割,参考:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。