当前位置:   article > 正文

使用 ElasticSearch 作为知识库,存储向量及相似性搜索_elasticsearch cosinesimilarity(2)_python elasticsearch cosinesimilarity 查询

python elasticsearch cosinesimilarity 查询
数据存入 ElasticSearch

引入 ElasticSearch 依赖库(下面的脚本还用到 transformers、torch 和 pandas,如未安装需一并安装):

pip install elasticsearch -i https://pypi.tuna.tsinghua.edu.cn/simple

from elasticsearch import Elasticsearch
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd

def embeddings_doc(doc, tokenizer, model, max_length=300):
    """Encode ``doc`` and return the last-layer [CLS] vector for it.

    Args:
        doc: Text to embed.
        tokenizer: Object exposing ``encode_plus``; the caller in this file
            passes a ``BertTokenizer``.
        model: Callable accepting ``input_ids`` and ``attention_mask`` and
            returning an object with a ``last_hidden_state`` attribute; the
            caller in this file passes a ``BertModel``.
        max_length: Length the tokenized input is padded or truncated to.

    Returns:
        The first-position vector of the last hidden state for the single
        input, i.e. ``last_hidden_state[:, 0, :][0]``.
    """
    encoded_dict = tokenizer.encode_plus(
        doc,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_id = encoded_dict["input_ids"]
    attention_mask = encoded_dict["attention_mask"]

    # Forward pass; gradients are not needed when only reading embeddings.
    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)

    # Use the last layer's [CLS] (first token) vector as the text representation.
    last_hidden_state = outputs.last_hidden_state
    cls_embeddings = last_hidden_state[:, 0, :]
    return cls_embeddings[0]

def add_doc(index_name, id, embedding_ask, ask, answer, es):
    """Create one question/answer document, with its vector, in Elasticsearch.

    Args:
        index_name: Name of the target index.
        id: Document id passed through to ``es.create``.
        embedding_ask: Embedding of ``ask``; must expose ``tolist()``.
        ask: Question text, stored under ``"ask"``.
        answer: Answer text, stored under ``"answer"``.
        es: Client object exposing ``create(index=..., id=..., body=...)``.

    Returns:
        Whatever ``es.create`` returns.
    """
    body = {
        # Convert to a plain list so the vector can be serialized as JSON.
        "ask_vector": embedding_ask.tolist(),
        "ask": ask,
        "answer": answer,
    }
    # No ``doc_type``: mapping types are gone from newer clients, and the
    # typeless form is accepted by the 7.x client as well.
    return es.create(index=index_name, id=id, body=body)

def main():
    """Embed the ``ask`` column of the dialogue CSV and write rows to Elasticsearch.

    Loads the tokenizer and model from a local directory, connects to
    Elasticsearch, then for each of the first ``max_rows`` CSV rows computes
    the embedding of ``ask`` and stores it together with ``ask`` and
    ``answer``, printing each response.
    """
    # Local directory the model was downloaded to. Raw strings keep the
    # Windows backslashes literal instead of relying on invalid escapes.
    model_name = r"D:\AIGC\model\chinese-roberta-wwm-ext-large"

    # Elasticsearch connection settings.
    es_host = "http://127.0.0.1"
    es_port = 9200
    es_user = "elastic"
    es_password = "elastic"
    index_name = "medical_index"

    # Location of the source data.
    path = r"D:\AIGC\dataset\Chinese-medical-dialogue-data\Chinese-medical-dialogue-data\Data_数据\IM_内科\内科5000-33000.csv"

    # Only the first rows are written, as a test run. The original note said
    # 5000 while the code stopped at 500; the code's limit is kept.
    max_rows = 500

    # Tokenizer and model.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Elasticsearch connection. The port is part of the URL because newer
    # clients no longer accept a separate ``port`` keyword.
    es = Elasticsearch(
        [f"{es_host}:{es_port}"],
        http_auth=(es_user, es_password),
    )

    # Read the data and write it to Elasticsearch.
    # NOTE(review): the "ANSI" codec name is only available on Windows and
    # follows the system code page — confirm the file's real encoding.
    data = pd.read_csv(path, encoding="ANSI")
    for index, row in data.iterrows():
        if index >= max_rows:
            break
        ask = row["ask"]
        answer = row["answer"]

        # Turn the question text into a vector, then store the document.
        embedding_ask = embeddings_doc(ask, tokenizer, model)
        result = add_doc(index_name, index, embedding_ask, ask, answer, es)
        print(result)

# Run the ingestion only when this file is executed as a script.
if __name__ == "__main__":
    main()

在这里插入图片描述

五、相似性搜索

1.
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小桥流水78/article/detail/1004685
推荐阅读
相关标签
  

闽ICP备14008679号