当前位置:   article > 正文

【Python】gensim TF-IDF 词向量->句子向量_tf-idf表示词向量 gensim

tf-idf表示词向量 gensim
import numpy as np
import pandas as pd
  • 1
  • 2

读取数据

# Query keywords and the candidate texts they are matched against.
# All files are tab-separated; column names are supplied explicitly.

# Candidate texts to match against (columns: doc, title).
corpus_data = pd.read_csv(
    './data/ecom/corpus.txt',
    sep='\t',
    names=['doc', 'title'],
)
# Dev/test-set query keywords (columns: query, title).
dev_data = pd.read_csv(
    './data/ecom/dev.query.txt',
    sep='\t',
    names=['query', 'title'],
)
# Training-set query keywords (columns: query, title).
train_data = pd.read_csv(
    './data/ecom/train.query.txt',
    sep='\t',
    names=['query', 'title'],
)
# Mapping from training queries to corpus_data documents; only the
# 'query' and 'doc' columns are kept, 'b' and 'd' are discarded.
qrels = pd.read_csv(
    './data/ecom/qrels.train.txt',
    sep='\t',
    names=['query', 'b', 'doc', 'd'],
)[['query', 'doc']]
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
# Re-index every table by its id column so that rows can be fetched with
# .loc[<id>]: corpus_data by 'doc', the three query tables by 'query'.
corpus_data, dev_data, train_data, qrels = (
    frame.set_index(key_column)
    for frame, key_column in (
        (corpus_data, 'doc'),
        (dev_data, 'query'),
        (train_data, 'query'),
        (qrels, 'query'),
    )
)

  • 1
  • 2
  • 3
  • 4
  • 5
# Show the ground-truth pairs: for query ids 1..19, print the query title
# next to the title of the corpus document that qrels maps it to.
for idx in range(1, 20):
    query_title = train_data.loc[idx]['title']
    matched_doc = qrels.loc[idx]['doc']
    doc_title = corpus_data.loc[matched_doc]['title']
    print(query_title, '\t', doc_title)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

减速器链条 高品质KMC工业单双链条06B35B4008B5010A6010B12A8012B16A16B415
tasco望远镜 库存美国西蒙斯望远镜高倍望远镜SIMMONS 望远镜10*32
焦糖色美背薄款 美背吊带背心女网红爆款带胸垫文胸一体抹胸夏季薄款裹胸学生内衣

词向量

import jieba

# Segment a sample sentence with jieba and join the tokens with spaces.
' '.join(jieba.cut('见天天气真好'))
# A custom dictionary can also be built and used.
  • 1
  • 2
  • 3
  • 4

'见天 天气 真 好'

# Collect the tokens produced by jieba.cut for a sample string into a list.
list(jieba.cut('今天年后天气'))
  • 1

['今天', '年', '后', '天气']

# Word segmentation for the training-set and test-set titles.
def title_cut(x):
    """Return the jieba tokens of the title string ``x`` as a list."""
    return [*jieba.cut(x)]

from joblib import Parallel, delayed

# Tokenise the 'title' column of each table with title_cut, using a
# 4-worker joblib pool per table. Each result is a list of token lists.
corpus_title, train_title, dev_title = (
    Parallel(n_jobs=4)(delayed(title_cut)(text) for text in frame['title'])
    for frame in (corpus_data, train_data, dev_data)
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
# Peek at the tokenised training titles at positions 1 through 4.
train_title[1:5]
  • 1

[['tasco', '望远镜'],
 ['焦', '糖色', '美背', '薄款'],
 ['灌', '香肠', '红曲米'],
 ['统', '机手', '表壳']]

from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import os

# Train a Word2Vec model on all tokenised titles and cache it on disk;
# when a cached 'word2vec.model' already exists, load it instead.
if not os.path.exists('word2vec.model'):
    # Parameter reference:
    # https://blog.csdn.net/qq_27586341/article/details/90025288
    model = Word2Vec(
        # `sentences` is passed as one list of token lists.
        sentences=[*corpus_title, *train_title, *dev_title],
        vector_size=128,
        window=5,
        min_count=1,
        workers=4,
    )
    model.save('word2vec.model')
else:
    model = Word2Vec.load('word2vec.model')
# Existing pre-trained embeddings (e.g. Tencent's word vectors) could be
# tried instead.
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
# Query the trained vectors for the words most similar to '格力'.
model.wv.most_similar('格力')

'''

    [('变频空调', 0.8377534747123718),
     ('柜机', 0.8231720924377441),
     ('海尔', 0.8123170137405396),
     ('奥克斯', 0.8062556385993958),
     ('3p', 0.80381178855896),
     ('KFR', 0.8033701777458191),
     ('美的', 0.7991029620170593),
     ('格力空调', 0.7952483892440796),
     ('中央空调', 0.7899882197380066),
     ('GREE', 0.7736614346504211)]
'''

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
# Print the first five vocabulary entries, then look up the integer index
# that the model assigns to one word.
print(model.wv.index_to_key[:5])
model.wv.key_to_index['女']

'''
   [' ', '/', '-', '家用', '儿童']
      29
'''
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
# Turn every tokenised title into the list of its words' vocabulary ids
# (one id list per title), for the train, dev and corpus sets.
train_w2v_ids, dev_w2v_ids, corpus_w2v_ids = (
    [[model.wv.key_to_index[tok] for tok in words] for words in titles]
    for titles in (train_title, dev_title, corpus_title)
)
  • 1
  • 2
  • 3
  • 4

TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the already-tokenised titles. The identity analyzer makes
# the vectorizer use each token list as-is instead of re-tokenising text.
idf = TfidfVectorizer(analyzer=lambda x: x)
idf.fit(train_title + corpus_title)

# Vocabulary terms, aligned with idf.idf_.
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(idf, 'get_feature_names_out'):
    token = np.array(idf.get_feature_names_out(), dtype=str)
else:
    token = np.array(idf.get_feature_names())

# idf.idf_ holds one IDF weight per vocabulary term. Terms with a weight
# below 10 are treated as common words and excluded from sentence encoding.
# Vocabulary terms are already unique, so no de-duplication is needed and
# the resulting order is deterministic.
drop_token = list(token[idf.idf_ < 10])
# Word2Vec vocabulary ids of the common words.
drop_token_ids = [model.wv.key_to_index[word] for word in drop_token]
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15

句子编码

def unsuper_w2v_encoding(s, pooling='max'):
    """Encode one title, given as vocabulary ids, into a sentence vector.

    Ids listed in the module-level ``drop_token_ids`` (common words) are
    skipped. The vectors of the remaining ids are looked up in ``model.wv``
    and pooled element-wise across words.

    Args:
        s: Iterable of vocabulary ids indexing into ``model.wv``.
        pooling: ``'max'`` for the element-wise maximum, ``'avg'`` for the
            element-wise mean.

    Returns:
        A 1-D array of length ``model.wv.vector_size``. It is all zeros when
        no id is left after filtering.

    Raises:
        ValueError: If ``pooling`` is neither ``'max'`` nor ``'avg'``
            (previously this case silently returned ``None``).
    """
    # Keep only the ids that are not common words.
    kept_ids = [word_id for word_id in s if word_id not in drop_token_ids]
    if not kept_ids:
        # Take the width from the model rather than hard-coding 128, so the
        # function stays correct if the model is retrained with another size.
        return np.zeros(model.wv.vector_size)

    # N x D matrix: one row per remaining word.
    feat = np.asarray(model.wv[kept_ids])

    # Reduce over axis 0 (the words) to get a single D-dimensional vector.
    if pooling == 'max':
        return feat.max(axis=0)
    if pooling == 'avg':
        return feat.mean(axis=0)
    raise ValueError(
        "pooling must be 'max' or 'avg', got {!r}".format(pooling)
    )
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
from tqdm import tqdm_notebook

# Encode titles into sentence vectors with unsuper_w2v_encoding (default
# pooling). Only subsets are processed: the first 1000 corpus titles and
# the first 100 train / dev queries.
corpus_mean_feat = list(
    map(unsuper_w2v_encoding, tqdm_notebook(corpus_w2v_ids[:1000]))
)
train_mean_feat = list(
    map(unsuper_w2v_encoding, tqdm_notebook(train_w2v_ids[:100]))
)
# The dev vectors are additionally stacked row-wise into one 2-D array
# (vstack concatenates along rows, hstack along columns).
dev_mean_feat = np.vstack(
    list(map(unsuper_w2v_encoding, tqdm_notebook(dev_w2v_ids[:100])))
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

检索

from sklearn.preprocessing import normalize

# L2-normalise every row (the defaults of sklearn's normalize, spelled
# out here), so the dot products used for retrieval act as cosine scores.
corpus_mean_feat, train_mean_feat, dev_mean_feat = (
    normalize(matrix, norm='l2', axis=1)
    for matrix in (corpus_mean_feat, train_mean_feat, dev_mean_feat)
)
  • 1
  • 2
  • 3
  • 4
  • 5
# Rank the encoded corpus documents for every encoded training query.
# mrr[i] holds corpus row indices ordered from the highest to the lowest
# dot-product score for query row i. No MRR value is computed here; the
# list only stores the rankings.
mrr = []
# idx is 1-based while feature rows are 0-based. The upper bound is derived
# from train_mean_feat so every encoded query is ranked: the former
# hard-coded range(1, 100) skipped the last of the 100 encoded queries and
# would raise IndexError if fewer than 99 were available.
for idx in tqdm_notebook(range(1, len(train_mean_feat) + 1)):
    # Scores of this query against all corpus rows.
    dis = np.dot(train_mean_feat[idx - 1], corpus_mean_feat.T)
    # argsort is ascending; reverse it to put the best match first.
    ids = np.argsort(dis)[::-1]
    mrr.append(ids)
    
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/从前慢现在也慢/article/detail/354430
推荐阅读
相关标签
  

闽ICP备14008679号