Keyword search engine: treat the search query itself as a document, so it gets its own TF-IDF-based vector representation. Then find the documents whose vectors have the highest cosine similarity to the query vector, and return those documents as the search results.
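In symbols, and following the conventions of the listing below (which uses a raw inverse document frequency N/df rather than the more common log-scaled variant, and normalizes document term counts by the lexicon size):

```
tf(t, d)    = count(t, d) / |lexicon|   # the query vector divides by the query's token count instead
idf(t)      = |docs| / df(t)            # df(t): number of documents containing t
tfidf(t, d) = tf(t, d) * idf(t)
cos(q, d)   = (q · d) / (‖q‖ ‖d‖)
```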
Our corpus consists of three documents, and the query is "How long does it take to get to the store?", as shown in the code below:
```python
from collections import Counter, OrderedDict
import copy
import math

from nltk.tokenize import TreebankWordTokenizer


def cosine_sim(vec1, vec2):
    """Cosine similarity between two term-frequency vectors.

    The vectors are dicts keyed by the same lexicon, so converting
    them to lists keeps the dimensions aligned.
    """
    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]
    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
    mag_1 = math.sqrt(sum([x ** 2 for x in vec1]))
    mag_2 = math.sqrt(sum([x ** 2 for x in vec2]))
    if mag_1 == 0 or mag_2 == 0:  # guard against an all-zero vector
        return 0.0
    return dot_prod / (mag_1 * mag_2)


# The corpus: three documents
docs = ["The faster Harry got to the store, the faster and faster Harry would get home."]
docs.append("Harry is hairy and faster than Jill.")
docs.append("Jill is not as hairy as Harry.")

# Tokenize each document, then take the union of all tokens as the lexicon
tokenizer = TreebankWordTokenizer()
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]
print(len(doc_tokens[0]))
all_doc_tokens = sum(doc_tokens, [])
print(len(all_doc_tokens))
lexicon = sorted(set(all_doc_tokens))
print(len(lexicon))
print(lexicon)

zero_vector = OrderedDict((token, 0) for token in lexicon)
print(zero_vector)

# Build a term-frequency (TF) vector for each document.
# copy.copy() creates an independent copy of the zero vector -- a fresh
# instance, not another reference to the same object's memory. Without it,
# every iteration would overwrite the one shared zero_vector in place, so
# no iteration would actually start from a clean zero vector.
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        vec[key] = value / len(lexicon)
    doc_vectors.append(vec)
print(doc_vectors)

# Replace TF with TF-IDF in each document vector, so the vectors reflect
# the meaning or topic of the documents more fully.
document_tfidf_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        docs_containing_key = 0
        for _doc in docs:
            # Substring test on the lowercased raw text: a rough
            # document-frequency heuristic that can also match inside
            # longer words (e.g. "as" inside "faster").
            if key in _doc.lower():
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0
        vec[key] = tf * idf
    document_tfidf_vectors.append(vec)

# A basic TF-IDF search: treat the query itself as a document to get its
# TF-IDF vector, then return the documents whose vectors have the highest
# cosine similarity to it.
query = "How long does it take to get to the store?"
query_vec = copy.copy(zero_vector)  # again, an independent copy, not a shared reference
tokens = tokenizer.tokenize(query.lower())
token_counts = Counter(tokens)
for key, value in token_counts.items():
    docs_containing_key = 0
    for _doc in docs:
        if key in _doc.lower():
            docs_containing_key += 1
    if docs_containing_key == 0:
        # Skip query terms that appear in no document; this also keeps
        # query_vec's keys aligned with the lexicon-based document vectors.
        continue
    tf = value / len(tokens)
    idf = len(docs) / docs_containing_key
    query_vec[key] = tf * idf

# Cosine similarity of the query against each document:
# document 0 is the most relevant to this query.
print(cosine_sim(query_vec, document_tfidf_vectors[0]))
print(cosine_sim(query_vec, document_tfidf_vectors[1]))
print(cosine_sim(query_vec, document_tfidf_vectors[2]))
```
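For comparison, the same query-against-corpus ranking can be reproduced with scikit-learn. This is a sketch assuming scikit-learn is installed; note that TfidfVectorizer uses a smoothed, log-scaled idf and L2-normalizes its rows, so the absolute scores will differ from the hand-rolled listing above even though the relative ranking of the documents should be similar:

```python
# A sketch of the same ranking with scikit-learn's TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "The faster Harry got to the store, the faster and faster Harry would get home.",
    "Harry is hairy and faster than Jill.",
    "Jill is not as hairy as Harry.",
]
query = "How long does it take to get to the store?"

vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(docs)         # learn vocabulary and idf from the corpus
query_matrix = vectorizer.transform([query])        # vectorize the query with the same vocabulary
print(cosine_similarity(query_matrix, doc_matrix))  # one similarity score per document
```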