赞
踩
接:redis向量数据库中文使用手册-快速开始(二)-CSDN博客
具体见:Redis as a vector database quick start guide | Docs
# Index schema: maps JSON paths of each "bikes:" document to indexed fields.
vector_options = {
    "TYPE": "FLOAT32",
    "DIM": VECTOR_DIMENSION,
    "DISTANCE_METRIC": "COSINE",
}
schema = (
    TextField("$.model", no_stem=True, as_name="model"),
    TextField("$.brand", no_stem=True, as_name="brand"),
    NumericField("$.price", as_name="price"),
    TagField("$.type", as_name="type"),
    TextField("$.description", as_name="description"),
    VectorField(
        "$.description_embeddings", "FLAT", vector_options, as_name="vector"
    ),
)

# Only JSON documents whose key starts with "bikes:" belong to the index.
definition = IndexDefinition(prefix=["bikes:"], index_type=IndexType.JSON)
bikes_index = client.ft("idx:bikes_vss")
res = bikes_index.create_index(fields=schema, definition=definition)
# >>> 'OK'
- import json
- import time
-
- import numpy as np
- import pandas as pd
- import redis
- import requests
- from redis.commands.search.field import (
- NumericField,
- TagField,
- TextField,
- VectorField,
- )
- from redis.commands.search.indexDefinition import IndexDefinition, IndexType
- from redis.commands.search.query import Query
- from sentence_transformers import SentenceTransformer
-
-
# Download the sample bikes dataset used throughout this guide.
url = "https://raw.githubusercontent.com/bsbodden/redis_vss_getting_started/main/data/bikes.json"
# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the script indefinitely.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of later, while trying to parse an
# error page as JSON.
response.raise_for_status()
bikes = response.json()

# Render the first document as indented JSON to inspect its structure
# (as a bare expression, the value is only displayed in an interactive session).
json.dumps(bikes[0], indent=2)
-
# Connect to a local Redis server and verify the connection.
client = redis.Redis(host="localhost", port=6379, decode_responses=True)

res = client.ping()
# >>> True

# Queue one JSON.SET per bike (keys bikes:001, bikes:002, ...) and send
# them together through a pipeline.
pipeline = client.pipeline()
for position, bike in enumerate(bikes, start=1):
    pipeline.json().set(f"bikes:{position:03}", "$", bike)
res = pipeline.execute()
# >>> [True, True, True, True, True, True, True, True, True, True, True]

# Read a single attribute back with a JSONPath expression.
res = client.json().get("bikes:010", "$.model")
# >>> ['Summit']

# Collect the description of every stored bike, in key order.
keys = sorted(client.keys("bikes:*"))
# >>> ['bikes:001', 'bikes:002', ..., 'bikes:011']
nested_descriptions = client.json().mget(keys, "$.description")
descriptions = []
for matches in nested_descriptions:
    # Each entry is itself a list of JSONPath matches; flatten it.
    descriptions.extend(matches)

# Turn the descriptions into float32 embeddings.
embedder = SentenceTransformer("msmarco-distilbert-base-v4")
float32_embeddings = embedder.encode(descriptions).astype(np.float32)
embeddings = float32_embeddings.tolist()
VECTOR_DIMENSION = len(embeddings[0])
# >>> 768

# Attach each embedding to its document under $.description_embeddings.
pipeline = client.pipeline()
for key, embedding in zip(keys, embeddings):
    pipeline.json().set(key, "$.description_embeddings", embedding)
pipeline.execute()
# >>> [True, True, True, True, True, True, True, True, True, True, True]

# Inspect one complete document, now including its embedding.
res = client.json().get("bikes:010")
# >>>
# {
#   "model": "Summit",
#   "brand": "nHill",
#   "price": 1200,
#   "type": "Mountain Bike",
#   "specs": {
#     "material": "alloy",
#     "weight": "11.3"
#   },
#   "description": "This budget mountain bike from nHill performs well..."
#   "description_embeddings": [
#     -0.538114607334137,
#     -0.49465855956077576,
#     -0.025176964700222015,
#     ...
#   ]
# }
-
# Index schema: maps JSON paths of each "bikes:" document to indexed fields.
vector_options = {
    "TYPE": "FLOAT32",
    "DIM": VECTOR_DIMENSION,
    "DISTANCE_METRIC": "COSINE",
}
schema = (
    TextField("$.model", no_stem=True, as_name="model"),
    TextField("$.brand", no_stem=True, as_name="brand"),
    NumericField("$.price", as_name="price"),
    TagField("$.type", as_name="type"),
    TextField("$.description", as_name="description"),
    VectorField(
        "$.description_embeddings", "FLAT", vector_options, as_name="vector"
    ),
)

# Only JSON documents whose key starts with "bikes:" belong to the index.
definition = IndexDefinition(prefix=["bikes:"], index_type=IndexType.JSON)
bikes_index = client.ft("idx:bikes_vss")
res = bikes_index.create_index(fields=schema, definition=definition)
# >>> 'OK'

# Read the index statistics: indexed documents and indexing failures.
info = bikes_index.info()
num_docs = info["num_docs"]
indexing_failures = info["hash_indexing_failures"]
# print(f"{num_docs} documents indexed with {indexing_failures} failures")
# >>> 11 documents indexed with 0 failures

summary_fields = ("id", "brand", "model", "price")

# Match on brand with no restriction on the returned fields.
query = Query("@brand:Peaknetic")
res = bikes_index.search(query).docs
# print(res)
# >>> [Document {'id': 'bikes:008', 'payload': None, 'brand': 'Peaknetic', 'model': 'Soothe Electric bike', 'price': '1950', 'description_embeddings': ...

# Same match, returning only the summary fields.
query = Query("@brand:Peaknetic").return_fields(*summary_fields)
res = bikes_index.search(query).docs
# print(res)
# >>> [Document {'id': 'bikes:008', 'payload': None, 'brand': 'Peaknetic', 'model': 'Soothe Electric bike', 'price': '1950'}, Document {'id': 'bikes:009', 'payload': None, 'brand': 'Peaknetic', 'model': 'Secto', 'price': '430'}]

# Brand match combined with a numeric price range.
query = Query("@brand:Peaknetic @price:[0 1000]").return_fields(*summary_fields)
res = bikes_index.search(query).docs
# print(res)
# >>> [Document {'id': 'bikes:009', 'payload': None, 'brand': 'Peaknetic', 'model': 'Secto', 'price': '430'}]

# Natural-language prompts used for the vector searches.
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]

# Embed the prompts with the same model that embedded the descriptions.
encoded_queries = embedder.encode(queries)
len(encoded_queries)
# >>> 11
-
def create_query_table(query, queries, encoded_queries, extra_params=None):
    """Run ``query`` once per encoded prompt and tabulate the matches.

    Args:
        query: Search query passed to ``search``. It is expected to take a
            ``$query_vector`` parameter and to return the ``vector_score``,
            ``brand``, ``model`` and ``description`` fields.
        queries: Prompt texts; ``queries[i]`` labels the rows produced for
            ``encoded_queries[i]``.
        encoded_queries: One embedding per prompt, each convertible to a
            float32 NumPy array.
        extra_params: Optional dict of additional query parameters (for
            example ``{"range": 0.55}``) merged over ``query_vector``.

    Returns:
        The results rendered as a Markdown table, or an empty string when
        no document matched any prompt.
    """
    # Use None rather than a shared mutable ``{}`` as the default value.
    params = {} if extra_params is None else extra_params

    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        # The query vector must be sent as raw float32 bytes.
        query_vector = np.array(encoded_query, dtype=np.float32).tobytes()
        result_docs = (
            client.ft("idx:bikes_vss")
            .search(query, {"query_vector": query_vector} | params)
            .docs
        )
        for doc in result_docs:
            # Report 1 - distance so that a higher score means a closer match.
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "id": doc.id,
                    "brand": doc.brand,
                    "model": doc.model,
                    "description": doc.description,
                }
            )

    # Without any rows the frame has no "query"/"score" columns and the
    # sort below would raise KeyError.
    if not results_list:
        return ""

    # Optional: convert the table to Markdown using Pandas
    queries_table = pd.DataFrame(results_list)
    queries_table.sort_values(
        by=["query", "score"], ascending=[True, False], inplace=True
    )
    # Show each prompt only on the first row of its group.
    queries_table["query"] = queries_table.groupby("query")["query"].transform(
        lambda x: [x.iloc[0]] + [""] * (len(x) - 1)
    )
    # Keep long descriptions to at most 500 characters.
    queries_table["description"] = queries_table["description"].apply(
        lambda x: (x[:497] + "...") if len(x) > 500 else x
    )
    # Return the rendered table instead of discarding it.
    return queries_table.to_markdown(index=False)
-
-
-
result_fields = ("vector_score", "id", "brand", "model", "description")

# Plain KNN: the three nearest neighbours among all documents.
query = Query("(*)=>[KNN 3 @vector $query_vector AS vector_score]")
query = query.sort_by("vector_score").return_fields(*result_fields).dialect(2)

create_query_table(query, queries, encoded_queries)
# >>> | Best Mountain bikes for kids     |    0.54 | bikes:003... (+ 32 more results)

# Hybrid: pre-filter on brand, then KNN among the filtered documents.
hybrid_query = Query(
    "(@brand:Peaknetic)=>[KNN 3 @vector $query_vector AS vector_score]"
)
hybrid_query = (
    hybrid_query.sort_by("vector_score").return_fields(*result_fields).dialect(2)
)
create_query_table(hybrid_query, queries, encoded_queries)
# >>> | Best Mountain bikes for kids     |    0.3 | bikes:008... (+22 more results)

# Range: documents within $range of the query vector, first four results.
range_query = Query(
    "@vector:[VECTOR_RANGE $range $query_vector]=>{$YIELD_DISTANCE_AS: vector_score}"
)
range_query = (
    range_query.sort_by("vector_score")
    .return_fields(*result_fields)
    .paging(0, 4)
    .dialect(2)
)
first_prompt, first_embedding = queries[:1], encoded_queries[:1]
create_query_table(range_query, first_prompt, first_embedding, {"range": 0.55})
# >>> | Bike for small kids | 0.52 | bikes:001 | Velorim    |... (+1 more result)
关于字段的详细描述:
1)$.description_embeddings AS vector: 矢量字段的json路径和其别名;
2)FLAT:指定索引方法
3)TYPE FLOAT32:设置矢量分量的类型;
4)DIM 768:指定嵌入向量的长度(即维度);
5)DISTANCE_METRIC COSINE: 距离函数:余弦;
一旦执行FT.CREATE命令,索引过程就会在后台运行。在很短的时间内,所有JSON文档都应该被索引并准备好被查询。要验证这一点,可以使用FT.INFO命令,该命令提供有关索引的详细信息和统计信息。特别令人感兴趣的是成功索引的文档数量和失败的数量:
# Read the index statistics: indexed documents and indexing failures.
bikes_index = client.ft("idx:bikes_vss")
info = bikes_index.info()
num_docs, indexing_failures = info["num_docs"], info["hash_indexing_failures"]
# print(f"{num_docs} documents indexed with {indexing_failures} failures")
# >>> 11 documents indexed with 0 failures

# Natural-language prompts used for the vector searches.
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]

# One embedding per prompt.
encoded_queries = embedder.encode(queries)
len(encoded_queries)
# >>> 11
KNN是一种基础算法,旨在找到与给定输入最相似的项目。KNN算法基于所选择的距离函数来计算查询向量与数据库中每个向量之间的距离,然后返回与查询向量距离最小的K个项目,即最相似的项目。
以下示例显示了一个不应用预过滤器的查询。预筛选表达式(*)表示全部,但您可以将其替换为按其他元数据进行筛选的查询表达式。
然后查询的KNN部分搜索三个最近的邻居。到查询向量的距离返回为vector_score。结果按此分数排序。最后,它返回结果集中的字段vector_score、id、$.brand、$.model和$.description。
# Plain KNN template: the three nearest neighbours among all documents,
# with the distance exposed as vector_score.
result_fields = ('vector_score', 'id', 'brand', 'model', 'description')
query = Query('(*)=>[KNN 3 @vector $query_vector AS vector_score]')
query = query.sort_by('vector_score').return_fields(*result_fields).dialect(2)
必须将矢量化后的查询以字节数组的形式作为$query_vector参数传递。以下代码演示了如何将矢量化的查询提示(encoded_query)转换为单精度浮点类型的NumPy数组,再将其转换为紧凑的字节表示,以便作为参数传递给查询:
# The query vector is passed as raw float32 bytes under the name used in the query.
query_params = {'query_vector': np.array(encoded_query, dtype=np.float32).tobytes()}
client.ft(INDEX_NAME).search(query, query_params).docs
有了查询模板,就可以通过传递矢量化的查询提示来执行循环中的所有查询提示。请注意,脚本将每个结果的vector_score计算为1-doc.vector_score。因为使用余弦距离作为度量,所以距离最小的项目更接近,因此更类似于查询。
然后,在匹配的文档上循环,并创建一个结果列表,该列表可以转换为Pandas表以可视化结果:
def create_query_table(query, queries, encoded_queries, extra_params=None):
    """Run ``query`` once per encoded prompt and tabulate the matches.

    Args:
        query: Search query passed to ``search``. It is expected to take a
            ``$query_vector`` parameter and to return the ``vector_score``,
            ``brand``, ``model`` and ``description`` fields.
        queries: Prompt texts; ``queries[i]`` labels the rows produced for
            ``encoded_queries[i]``.
        encoded_queries: One embedding per prompt, each convertible to a
            float32 NumPy array.
        extra_params: Optional dict of additional query parameters (for
            example ``{"range": 0.55}``) merged over ``query_vector``.

    Returns:
        The results rendered as a Markdown table, or an empty string when
        no document matched any prompt.
    """
    # Use None rather than a shared mutable ``{}`` as the default value.
    params = {} if extra_params is None else extra_params

    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        # The query vector must be sent as raw float32 bytes.
        query_vector = np.array(encoded_query, dtype=np.float32).tobytes()
        result_docs = (
            client.ft("idx:bikes_vss")
            .search(query, {"query_vector": query_vector} | params)
            .docs
        )
        for doc in result_docs:
            # Report 1 - distance so that a higher score means a closer match.
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "id": doc.id,
                    "brand": doc.brand,
                    "model": doc.model,
                    "description": doc.description,
                }
            )

    # Without any rows the frame has no "query"/"score" columns and the
    # sort below would raise KeyError.
    if not results_list:
        return ""

    # Optional: convert the table to Markdown using Pandas
    queries_table = pd.DataFrame(results_list)
    queries_table.sort_values(
        by=["query", "score"], ascending=[True, False], inplace=True
    )
    # Show each prompt only on the first row of its group.
    queries_table["query"] = queries_table.groupby("query")["query"].transform(
        lambda x: [x.iloc[0]] + [""] * (len(x) - 1)
    )
    # Keep long descriptions to at most 500 characters.
    queries_table["description"] = queries_table["description"].apply(
        lambda x: (x[:497] + "...") if len(x) > 500 else x
    )
    # Return the rendered table instead of discarding it.
    return queries_table.to_markdown(index=False)
查询结果显示了各个查询的前三个匹配项(我们的K参数),以及每个查询的自行车id、品牌和型号。例如,对于查询“儿童最佳山地自行车”,相似度最高(0.54),因此最接近的匹配是“Nord”品牌的“Chook air 5”自行车型号,描述为:
Chook Air 5为六岁及以上的孩子提供了一辆耐用且轻便的山地自行车,让他们第一次体验在赛道上骑行,并轻松穿越森林和田野。较低的上管让孩子在任何情况下都能轻松上下车,在小径上骑行更安全。Chook Air 5是山地自行车的完美入门之选。
从描述来看,这辆自行车非常适合年幼的孩子,所使用的嵌入准确地捕捉到了描述的语义。
# Plain KNN: the three nearest neighbours among all documents.
result_fields = ("vector_score", "id", "brand", "model", "description")
query = Query("(*)=>[KNN 3 @vector $query_vector AS vector_score]")
query = query.sort_by("vector_score").return_fields(*result_fields).dialect(2)

# Run the template for every prompt and tabulate the matches.
create_query_table(query, queries, encoded_queries)
# >>> | Best Mountain bikes for kids     |    0.54 | bikes:003... (+ 32 more results)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。