Morvan (莫烦) NLP Study Notes Series: An Introduction to Search Engines
Further reading: multimodal search, e.g. searching images with text. See Alibaba's exploration of video search: "Multimodal Search Algorithms in Practice" (多模态搜索算法实践).
Suppose you run a consulting firm with 100 reference documents on hand. When a client comes to you with an NLP question, how do you find the relevant material among those 100 documents?
The inverted-index approach: when we first receive the documents, we read through them all once and build a mapping from keywords to documents. When a user later searches for a particular term, say "red", we directly return the list of documents stored under the keyword-index entry for "red". The benefit of building the index up front is that it can be reused by every subsequent search; searching then reduces to matching terms and returning the indexed documents.
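A minimal sketch of an inverted index in Python (the function and variable names here are illustrative, not from the original post):

```python
from collections import defaultdict

def build_inverted_index(docs):
    # Build the index once, up front: keyword -> set of document ids.
    index = defaultdict(set)
    for doc_id, text in enumerate(docs):
        for word in text.replace(",", "").split(" "):
            index[word].add(doc_id)
    return index

docs = ["it is a good day", "today is a good day", "I like coffee"]
index = build_inverted_index(docs)

# Searching is now just a dictionary lookup plus returning the indexed docs.
print(index["good"])    # {0, 1}
print(index["coffee"])  # {2}
```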
TF-IDF (term frequency-inverse document frequency, 词频-逆文本频率指数) represents a document's keyword information. TF is a local statistic (a term's frequency within a single document), while IDF is a global statistic used to judge whether a word is uninformative noise across the whole collection.
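In its most common form (one standard variant; the code below offers several weighted TF and smoothed IDF alternatives):

$$\mathrm{tfidf}(w, d) = \mathrm{tf}(w, d) \times \mathrm{idf}(w), \qquad \mathrm{idf}(w) = \log \frac{N}{\mathrm{df}(w)}$$

where $\mathrm{tf}(w, d)$ is the frequency of word $w$ in document $d$, $N$ is the total number of documents, and $\mathrm{df}(w)$ is the number of documents containing $w$.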
The retrieval pipeline:

1. Build a term-count vector for each document, [{word: count}, ..., {word: count}], and compute the documents' TF-IDF matrix.
2. Compute the TF-IDF vector of the user's query.
3. Compute the cosine similarity between the query and every document, and return the best matches (see the formula below).
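A sketch of the scoring formula (note the code below computes cosine *similarity*, where higher means more relevant; "cosine distance" is 1 minus this value):

$$\cos(\mathbf{q}, \mathbf{d}) = \frac{\mathbf{q}^\top \mathbf{d}}{\lVert \mathbf{q} \rVert \, \lVert \mathbf{d} \rVert}$$

Documents are then returned in descending order of this score.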
docs = [ "it is a good day, I like to stay here", "I am happy to be here", "I am bob", "it is sunny today", "I have a party today", "it is a dog and that is a cat", "there are dog and cat on the tree", "I study hard this morning", "today is a good day", "tomorrow will be a good day", "I like coffee, I like book and I like apple", "I do not like it", "I am kitty, I like bob", "I do not care who like bob, but I like kitty", "It is coffee time, bring your cup", ]
#将文档的单词转换成ID形式,这样便于后续通过ID进行统计。
docs_words = [d.replace(",", "").split(" ") for d in docs]
vocab = set(itertools.chain(*docs_words))
v2i = {v: i for i, v in enumerate(vocab)}
i2v = {i: v for v, i in v2i.items()}
```python
def safe_log(x):
    mask = x != 0
    x[mask] = np.log(x[mask])
    return x

tf_methods = {
    "log": lambda x: np.log(1 + x),
    "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True),
    "boolean": lambda x: np.minimum(x, 1),
    "log_avg": lambda x: (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))),
}

def get_tf(method="log"):
    # Term frequency: how often a word appears in a doc.
    # _tf is an [n_vocab, n_doc] table: 47 distinct words x 15 documents for this corpus.
    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)  # [n_vocab, n_doc]
    for i, d in enumerate(docs_words):
        # counter maps each word in the i-th document to its count.
        counter = Counter(d)
        for v in counter.keys():
            # counter.most_common(1)[0][1] is the count of the most frequent word;
            # dividing by it normalizes the counts within the document.
            _tf[v2i[v], i] = counter[v] / counter.most_common(1)[0][1]
    weighted_tf = tf_methods.get(method, None)
    if weighted_tf is None:
        raise ValueError
    # Apply the chosen weighting to the [n_vocab, n_doc] matrix.
    return weighted_tf(_tf)
```
```python
idf_methods = {
    # log variant: IDF = log(number of documents / number of documents containing word w), smoothed.
    "log": lambda x: 1 + np.log(len(docs) / (x + 1)),
    "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x + 1))),
    "len_norm": lambda x: x / (np.sum(np.square(x)) + 1),
}

def get_idf(method="log"):
    # Inverse document frequency: a word appearing in more docs gets a lower idf,
    # meaning it is less informative.
    # df counts, for each word in the vocabulary, how many documents contain it.
    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        d_count = 0
        for d in docs_words:
            # Count the document if the i-th word occurs in it.
            d_count += 1 if i2v[i] in d else 0
        df[i, 0] = d_count
    idf_fn = idf_methods.get(method, None)
    if idf_fn is None:
        raise ValueError
    # Map the per-word document counts to an [n_vocab, 1] idf vector.
    return idf_fn(df)
```
```python
tf = get_tf()      # [n_vocab, n_doc]
idf = get_idf()    # [n_vocab, 1]
tf_idf = tf * idf  # [n_vocab, n_doc]
```
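A quick sanity check of the shapes for this toy corpus (15 documents containing 47 distinct words):

```python
print(tf.shape)      # (47, 15)
print(idf.shape)     # (47, 1)
print(tf_idf.shape)  # (47, 15), column j is the TF-IDF vector of document j
```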
q = "I get a coffee cup"
对输入词组进行词频(TF)的计算,结合文档数据库的IDF,计算出词组的TF-IDF
【注】此处代码为docs_score()的一部分,计算搜索语句与数据库文档相似的时候,顺便把搜索词组的TF-IDF给计算了
q_words = q.replace(",", "").split(" ") # add unknown words unknown_v = 0 for v in set(q_words): if v not in v2i: v2i[v] = len(v2i) i2v[len(v2i)-1] = v unknown_v += 1 if unknown_v > 0: _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=np.float)), axis=0) _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float)), axis=0) else: _idf, _tf_idf = idf, tf_idf counter = Counter(q_words) q_tf = np.zeros((len(_idf), 1), dtype=np.float) # [n_vocab, 1] # 求q的tf for v in counter.keys(): q_tf[v2i[v], 0] = counter[v] q_vec = q_tf * _idf # [n_vocab, 1]
```python
def cosine_similarity(q, _tf_idf):
    unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity

def docs_score(q, len_norm=False):
    """Score query q against every document by the cosine similarity of their TF-IDF vectors.

    :param q: query string
    :param len_norm: if True, divide each score by the document length
    :return: one relevance score per document
    """
    q_words = q.replace(",", "").split(" ")

    # Add query words unseen in the corpus to the vocabulary,
    # padding idf and tf_idf with zero rows for them.
    unknown_v = 0
    for v in set(q_words):
        if v not in v2i:
            v2i[v] = len(v2i)
            i2v[len(v2i) - 1] = v
            unknown_v += 1
    if unknown_v > 0:
        _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=float)), axis=0)
        _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=float)), axis=0)
    else:
        _idf, _tf_idf = idf, tf_idf

    counter = Counter(q_words)
    q_tf = np.zeros((len(_idf), 1), dtype=float)  # [n_vocab, 1]
    # Term frequency of the query.
    for v in counter.keys():
        q_tf[v2i[v], 0] = counter[v]
    q_vec = q_tf * _idf  # [n_vocab, 1]

    # Compare by cosine similarity.
    q_scores = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        len_docs = [len(d) for d in docs_words]
        q_scores = q_scores / np.array(len_docs)
    return q_scores
```
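A minimal usage sketch (the complete script below runs the same test):

```python
q = "I get a coffee cup"
scores = docs_score(q)
# Indices of the three highest scores, best match first.
d_ids = scores.argsort()[-3:][::-1]
print([docs[i] for i in d_ids])
```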
Full version, adapted from Morvan's GitHub:
```python
import itertools
from collections import Counter

import numpy as np
from visual import show_tfidf  # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)

docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Convert the documents' words into IDs so later statistics can be keyed by ID.
docs_words = [d.replace(",", "").split(" ") for d in docs]
vocab = set(itertools.chain(*docs_words))
v2i = {v: i for i, v in enumerate(vocab)}
i2v = {i: v for v, i in v2i.items()}


def safe_log(x):
    mask = x != 0
    x[mask] = np.log(x[mask])
    return x


tf_methods = {
    "log": lambda x: np.log(1 + x),
    "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True),
    "boolean": lambda x: np.minimum(x, 1),
    "log_avg": lambda x: (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))),
}

idf_methods = {
    # log variant: IDF = log(number of documents / number of documents containing word w), smoothed.
    "log": lambda x: 1 + np.log(len(docs) / (x + 1)),
    "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x + 1))),
    "len_norm": lambda x: x / (np.sum(np.square(x)) + 1),
}


def get_tf(method="log"):
    # Term frequency: how often a word appears in a doc.
    # _tf is an [n_vocab, n_doc] table: 47 distinct words x 15 documents for this corpus.
    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)  # [n_vocab, n_doc]
    for i, d in enumerate(docs_words):
        # counter maps each word in the i-th document to its count.
        counter = Counter(d)
        for v in counter.keys():
            # Normalize by the count of the document's most frequent word.
            _tf[v2i[v], i] = counter[v] / counter.most_common(1)[0][1]
    weighted_tf = tf_methods.get(method, None)
    if weighted_tf is None:
        raise ValueError
    return weighted_tf(_tf)


def get_idf(method="log"):
    # Inverse document frequency: a word appearing in more docs gets a lower idf,
    # meaning it is less informative.
    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        d_count = 0
        for d in docs_words:
            # Count the document if the i-th word occurs in it.
            d_count += 1 if i2v[i] in d else 0
        df[i, 0] = d_count
    idf_fn = idf_methods.get(method, None)
    if idf_fn is None:
        raise ValueError
    # Map the per-word document counts to an [n_vocab, 1] idf vector.
    return idf_fn(df)


def cosine_similarity(q, _tf_idf):
    unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity


def docs_score(q, len_norm=False):
    """Score query q against every document by the cosine similarity of their TF-IDF vectors.

    :param q: query string
    :param len_norm: if True, divide each score by the document length
    :return: one relevance score per document
    """
    q_words = q.replace(",", "").split(" ")

    # Add query words unseen in the corpus, padding idf and tf_idf with zero rows.
    unknown_v = 0
    for v in set(q_words):
        if v not in v2i:
            v2i[v] = len(v2i)
            i2v[len(v2i) - 1] = v
            unknown_v += 1
    if unknown_v > 0:
        _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=float)), axis=0)
        _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=float)), axis=0)
    else:
        _idf, _tf_idf = idf, tf_idf

    counter = Counter(q_words)
    q_tf = np.zeros((len(_idf), 1), dtype=float)  # [n_vocab, 1]
    # Term frequency of the query.
    for v in counter.keys():
        q_tf[v2i[v], 0] = counter[v]
    q_vec = q_tf * _idf  # [n_vocab, 1]

    # Compare by cosine similarity.
    q_scores = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        len_docs = [len(d) for d in docs_words]
        q_scores = q_scores / np.array(len_docs)
    return q_scores


def get_keywords(n=2):
    for c in range(3):
        col = tf_idf[:, c]
        # argsort returns the indices that would sort the values ascending; [-n:] keeps the top n.
        idx = np.argsort(col)[-n:]
        print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))


tf = get_tf()      # [n_vocab, n_doc]
idf = get_idf()    # [n_vocab, 1]
tf_idf = tf * idf  # [n_vocab, n_doc]
print("tf shape(vocab in each docs): ", tf.shape)
print("\ntf samples:\n", tf[:2])
print("\nidf shape(vocab in all docs): ", idf.shape)
print("\nidf samples:\n", idf[:2])
print("\ntf_idf shape: ", tf_idf.shape)
print("\ntf_idf sample:\n", tf_idf[:2])

# test: print the top keywords of the first documents, then run a query
get_keywords()
q = "I get a coffee cup"
scores = docs_score(q)
d_ids = scores.argsort()[-3:][::-1]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))

show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix")
```
The same pipeline with scikit-learn's TfidfVectorizer:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from visual import show_tfidf  # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)

docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

vectorizer = TfidfVectorizer()
# tf_idf is stored as a csr_matrix, i.e. a sparse matrix.
# fit_transform: learn the vocabulary and idf, return the document-term matrix.
tf_idf = vectorizer.fit_transform(docs)

# zip() pairs corresponding elements of its iterables into tuples.
print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names_out())])
print("v2i: ", vectorizer.vocabulary_)

q = "I get a coffee cup"
# transform: map documents onto the learned document-term matrix.
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf)
res = res.ravel().argsort()[-3:]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))

i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
dense_tfidf = tf_idf.todense()
show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix")
```
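One design difference worth noting: TfidfVectorizer's default token_pattern keeps only tokens of two or more word characters and lowercases by default, so single-letter words such as "I" and "a" are dropped, and its rankings can differ slightly from the hand-rolled version above.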