赞
踩
import numpy as np
import pandas as pd
# Query keywords and the texts they are matched against.
# All files are headerless and tab-separated.

# Texts to match against, indexed by the 'doc' column.
corpus_data = pd.read_csv(
    './data/ecom/corpus.txt', sep='\t', names=['doc', 'title']
).set_index('doc')

# Dev-set query keywords, indexed by the 'query' column.
dev_data = pd.read_csv(
    './data/ecom/dev.query.txt', sep='\t', names=['query', 'title']
).set_index('query')

# Training-set query keywords, indexed by the 'query' column.
train_data = pd.read_csv(
    './data/ecom/train.query.txt', sep='\t', names=['query', 'title']
).set_index('query')

# Mapping from training queries to rows of corpus_data. The file has four
# columns; only 'query' and 'doc' are kept.
qrels = pd.read_csv(
    './data/ecom/qrels.train.txt', sep='\t', names=['query', 'b', 'doc', 'd']
)[['query', 'doc']].set_index('query')
# Show the actual query/document correspondence: for queries 1..19, print the
# query title next to the title of the corpus document qrels maps it to.
for idx in range(1, 20):
    query_title = train_data.loc[idx]['title']
    matched_doc = qrels.loc[idx]['doc']
    doc_title = corpus_data.loc[matched_doc]['title']
    print(query_title, '\t', doc_title)
# Sample output of the loop above (query title, then matched corpus title):
# 减速器链条 高品质KMC工业单双链条06B35B4008B5010A6010B12A8012B16A16B415
# tasco望远镜 库存美国西蒙斯望远镜高倍望远镜SIMMONS 望远镜10*32
# 焦糖色美背薄款 美背吊带背心女网红爆款带胸垫文胸一体抹胸夏季薄款裹胸学生内衣
import jieba
# Segment a sample sentence and show its tokens joined by single spaces.
sample_tokens = jieba.cut('见天天气真好')
' '.join(sample_tokens)
# A custom dictionary can also be built for jieba.
# Output: '见天 天气 真 好'
# Materialize the tokens of a second sample sentence as a list.
[*jieba.cut('今天年后天气')]
# Output: ['今天', '年', '后', '天气']
# Tokenization used for the training, dev and corpus titles.
def title_cut(x):
    """Segment the title ``x`` with jieba and return its tokens as a list."""
    return [*jieba.cut(x)]
from joblib import Parallel, delayed
# Tokenize the titles of the corpus, training and dev frames (in that order),
# each with 4 joblib workers.
corpus_title, train_title, dev_title = (
    Parallel(n_jobs=4)(delayed(title_cut)(title) for title in frame['title'])
    for frame in (corpus_data, train_data, dev_data)
)
# Peek at a few tokenized training titles.
train_title[1:5]
# Output:
# [['tasco', '望远镜'],
#  ['焦', '糖色', '美背', '薄款'],
#  ['灌', '香肠', '红曲米'],
#  ['统', '机手', '表壳']]
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import os
# Train word2vec on all tokenized titles the first time; afterwards reuse the
# model saved in 'word2vec.model'.
if not os.path.exists('word2vec.model'):
    # https://blog.csdn.net/qq_27586341/article/details/90025288
    model = Word2Vec(
        # `sentences` is passed as a list (of token lists).
        sentences=[*corpus_title, *train_title, *dev_title],
        vector_size=128,
        window=5,
        min_count=1,
        workers=4,
    )
    model.save('word2vec.model')
else:
    model = Word2Vec.load('word2vec.model')
# Existing pretrained embeddings (e.g. Tencent's word vectors) could be tried instead.
# Look up the nearest neighbours of '格力' in the trained embedding space.
model.wv.most_similar('格力')
# The string below appears to be the recorded output of the call above.
'''
[('变频空调', 0.8377534747123718),
('柜机', 0.8231720924377441),
('海尔', 0.8123170137405396),
('奥克斯', 0.8062556385993958),
('3p', 0.80381178855896),
('KFR', 0.8033701777458191),
('美的', 0.7991029620170593),
('格力空调', 0.7952483892440796),
('中央空调', 0.7899882197380066),
('GREE', 0.7736614346504211)]
'''
# Print the first five vocabulary entries, then look up the vocabulary index of '女'.
print(model.wv.index_to_key[:5])
model.wv.key_to_index['女']
# The string below appears to be the recorded output of the two statements above.
'''
[' ', '/', '-', '家用', '儿童']
29
'''
# Turn every tokenized title into the list of its words' vocabulary ids.
word_to_id = model.wv.key_to_index
train_w2v_ids = [[word_to_id[word] for word in words] for words in train_title]
dev_w2v_ids = [[word_to_id[word] for word in words] for words in dev_title]
corpus_w2v_ids = [[word_to_id[word] for word in words] for words in corpus_title]
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit IDF scores on the already-tokenized titles; the identity analyzer makes
# the vectorizer take each token list as-is.
idf = TfidfVectorizer(analyzer=lambda x: x)
idf.fit(train_title + corpus_title)
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
_feature_names = getattr(idf, 'get_feature_names_out', None) or idf.get_feature_names
token = np.array(_feature_names())
# idf.idf_ holds one score per entry of `token`. Tokens scoring below 10 are
# treated as common words and collected (de-duplicated) for dropping.
drop_token = list(set(token[idf.idf_ < 10]))
# Vocabulary ids of the common words. A set keeps the per-token
# `in drop_token_ids` checks O(1) instead of scanning a list every time.
drop_token_ids = {model.wv.key_to_index[word] for word in drop_token}
def unsuper_w2v_encoding(s, pooling='max'):
    """Encode one title as a sentence vector by pooling its word vectors.

    Ids found in the module-level ``drop_token_ids`` (the common words) are
    skipped; the vectors of the remaining ids are read from ``model.wv`` and
    pooled element-wise.

    Args:
        s: Iterable of word ids of one title, usable as keys of ``model.wv``.
        pooling: ``'max'`` for element-wise maximum, ``'avg'`` for the mean.

    Returns:
        A 1-D numpy array with ``model.wv.vector_size`` entries. It is all
        zeros when no id is left after dropping the common words.

    Raises:
        ValueError: If ``pooling`` is neither ``'max'`` nor ``'avg'`` and at
            least one id is left to pool.
    """
    # Keep only the ids that are not common words.
    kept_ids = [word_id for word_id in s if word_id not in drop_token_ids]
    if not kept_ids:
        # Size the zero vector from the model so it always matches the
        # embedding dimension instead of a hard-coded 128.
        return np.zeros(model.wv.vector_size)
    # N x D matrix: one embedding row per remaining word.
    feat = np.asarray(model.wv[kept_ids])
    # Collapse the word axis (axis 0) to get a single sentence vector.
    if pooling == 'max':
        return feat.max(0)
    if pooling == 'avg':
        return feat.mean(0)
    # Previously an unknown mode fell through and returned None silently.
    raise ValueError(f"pooling must be 'max' or 'avg', got {pooling!r}")
from tqdm import tqdm_notebook
# Sentence vectors for the first 1000 corpus titles and the first 100 training
# and dev titles. The variables are named *_mean_feat, but the encoder is
# called with its default pooling.
corpus_mean_feat = list(
    map(unsuper_w2v_encoding, tqdm_notebook(corpus_w2v_ids[:1000]))
)
train_mean_feat = list(
    map(unsuper_w2v_encoding, tqdm_notebook(train_w2v_ids[:100]))
)
dev_mean_feat = list(
    map(unsuper_w2v_encoding, tqdm_notebook(dev_w2v_ids[:100]))
)
# vstack stacks the vectors as rows (hstack would concatenate them column-wise).
dev_mean_feat = np.vstack(dev_mean_feat)
from sklearn.preprocessing import normalize
# Normalize every sentence vector with sklearn's default settings.
corpus_mean_feat, train_mean_feat, dev_mean_feat = map(
    normalize, (corpus_mean_feat, train_mean_feat, dev_mean_feat)
)
# For every encoded training query, score all encoded corpus rows by dot
# product and store the corpus row positions sorted from highest to lowest score.
# NOTE(review): despite its name, `mrr` collects full rankings (row positions in
# corpus_mean_feat, not doc ids); no reciprocal rank is computed here.
mrr = []
# idx runs 1..N so that idx - 1 visits every row of train_mean_feat. The
# previous hard-coded range(1, 100) stopped at row 98 and never ranked the
# last of the 100 encoded queries.
for idx in tqdm_notebook(range(1, len(train_mean_feat) + 1)):
    dis = np.dot(train_mean_feat[idx - 1], corpus_mean_feat.T)
    ids = np.argsort(dis)[::-1]
    mrr.append(ids)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。