赞
踩
SentenceTransformer 主要用于对句子、文本和图像进行嵌入(embedding),可用于文本和图像的相似度对比、查找等。
# SentenceTransformer官网地址
https://www.sbert.net/
# 安装SentenceTransformer
pip install -U sentence-transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
# Basic usage: encode a list of sentences and print each embedding.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Our sentences we like to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)

# Print the embeddings, pairing each sentence with its own embedding
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
# Pairwise example: embed two sentences and print their cosine similarity.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# The two sentences to compare.
first_sentence = "This is a red cat with a hat."
second_sentence = "Have you seen my red cat?"

# Each sentence is encoded with its own model.encode() call.
emb1 = model.encode(first_sentence)
emb2 = model.encode(second_sentence)

# Score the pair with util.cos_sim and report the result.
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)
它还可以跟Elasticsearch结合使用。
# Semantic search: for each query, print the corpus sentences with the
# highest cosine similarity to it.
from sentence_transformers import SentenceTransformer, util
import torch

# The model is loaded from a local directory rather than by model name.
embedder = SentenceTransformer("E:\\model\\distiluse-base-multilingual-cased-v1")

# Corpus with example sentences
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]
# Encode the corpus once, outside the query loop, so it is reused per query.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = [
    "A man is eating pasta.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah chases prey on across a field.",
]

# Find the closest 5 sentences of the corpus for each query sentence based on
# cosine similarity. min() keeps k valid when the corpus has fewer than 5 items.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores.
    # cos_sim returns a matrix; row 0 holds this query's score per corpus item.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    # top_results[0] are the scores, top_results[1] the matching corpus indices.
    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。