Below are concrete implementations of TF (bag-of-words), TF-IDF, and skip-gram (Word2vec) text representations.
from sklearn.feature_extraction.text import CountVectorizer

# Corpus
train_x = ['build fails due publication-tests.xml build target', 'due to sb']
test_x = ['build one to ']

# Convert the words in the corpus into a term-frequency matrix, keeping the
# top 256 words, i.e. each sentence vector has at most 256 dimensions
cv_ = CountVectorizer(max_features=256)

# Count each word's occurrences; estimators like this are fit first, then used to transform
X = cv_.fit_transform(train_x)

# Print the corpus
print('corpus', train_x)
# Print the feature names (use get_feature_names() on scikit-learn < 1.0)
print('feature_names', cv_.get_feature_names_out())
# Print the vocabulary (word -> column index)
print('vocabulary_', cv_.vocabulary_)
# Print the model parameters
print('params', cv_.get_params(deep=True))
# Print the term frequencies in sparse form
print(X)
# Inspect the term-frequency result as a dense array
print(X.toarray())
Result:
(0, 0) 1 means: in sentence 0 of the corpus, the word with column index 0 appears the given number of times.
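For this toy corpus the full output is easy to verify by hand (assuming the default tokenizer, which lowercases and splits 'publication-tests.xml' into 'publication', 'tests', 'xml'):

# feature_names, in alphabetical order:
# ['build' 'due' 'fails' 'publication' 'sb' 'target' 'tests' 'to' 'xml']
# X.toarray(), one row per sentence, one column per word:
# [[2 1 1 1 0 1 1 0 1]
#  [0 1 0 0 1 0 0 1 0]]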
Wrapped as a function:
from sklearn.feature_extraction.text import CountVectorizer

def count(train_x, test_x):
    cv_ = CountVectorizer(max_features=256)
    # Fit on the training data train_x to build the vocabulary and get the
    # training set's term-frequency representation as a dense matrix
    data_train_tf = cv_.fit_transform(train_x).toarray()
    # Transform the test data test_x with the vocabulary fitted on the training set
    data_test_tf = cv_.transform(test_x).toarray()
    return data_train_tf, data_test_tf
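A minimal usage sketch with the toy corpus from above; note that words unseen during fitting (such as 'one' in test_x) are simply dropped by transform:

train_tf, test_tf = count(train_x, test_x)
print(train_tf.shape, test_tf.shape)  # (2, 9) and (1, 9): test rows reuse the training vocabulary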
from sklearn.feature_extraction.text import TfidfVectorizer

# Same pattern as above, but with TF-IDF weighting instead of raw counts
def tfidf(train_x, test_x):
    tr_ = TfidfVectorizer(max_features=256)
    data_train_tfidf = tr_.fit_transform(train_x).toarray()
    data_test_tfidf = tr_.transform(test_x).toarray()
    return data_train_tfidf, data_test_tfidf
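Unlike raw counts, TfidfVectorizer down-weights words that appear in many sentences. With scikit-learn's defaults (smooth_idf=True, norm='l2'), each entry is tf * (ln((1+n)/(1+df)) + 1), where n is the number of sentences and df is the number of sentences containing the word, and each row is then L2-normalized:

data_train_tfidf, data_test_tfidf = tfidf(train_x, test_x)
print(data_train_tfidf.shape)  # (2, 9): same layout as the count matrix, but with float weights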
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np

# Get skip-gram word vectors; a sentence's vector is the mean of its word vectors
def get_embed(skip_, data_path, dim_size=300):
    data_res = []
    for s_ in LineSentence(data_path):
        vec_res = np.zeros(dim_size)
        for t_ in s_:
            if t_ in skip_.wv:
                vec_res += skip_.wv[t_]
            # words absent from the vocabulary contribute a zero vector
        data_res.append(vec_res / max(len(s_), 1))
    return np.array(data_res)

# Produce skip-gram sentence vectors of dimension dim_size (default 300);
# retrain indicates whether to retrain from scratch, overwriting earlier results
def word2vec(train_x, test_x, soft, dim_size=300, retrain=True):
    data_path_train = 'F:/shiyan/bert/dataset/generate/' + soft + '/train_x.txt'
    save_path_train = 'F:/shiyan/bert/dataset/generate/' + soft + '/skip_.model'
    if retrain:
        # Write the training data train_x to a txt file, one sentence per line
        train_x.to_csv(data_path_train, sep='\t', index=False, header=None)
        # Train the skip-gram model (sg=1); `vector_size` was named `size` in gensim < 4
        skip_ = Word2Vec(LineSentence(data_path_train), vector_size=dim_size, window=10, sg=1)
        # Save the model
        skip_.save(save_path_train)
        # Save the word -> vector mapping in text format
        skip_.wv.save_word2vec_format('F:/shiyan/bert/dataset/generate/' + soft + '/skip_keys.model', binary=False)
    else:
        # If trained before, load the saved model directly
        skip_ = Word2Vec.load(save_path_train)
    data_train_skip = get_embed(skip_, data_path_train, dim_size=dim_size)
    # Add the test set and continue training (fine-tune the model)
    data_path_test = 'F:/shiyan/bert/dataset/generate/' + soft + '/test_x.txt'
    save_path_test = 'F:/shiyan/bert/dataset/generate/' + soft + '/skip_latter.model'
    test_x.to_csv(data_path_test, sep='\t', index=False, header=None)
    skip_.build_vocab(LineSentence(data_path_test), update=True)  # register new test-set words
    skip_.train(LineSentence(data_path_test), total_examples=skip_.corpus_count, epochs=skip_.epochs)
    skip_.save(save_path_test)
    skip_.wv.save_word2vec_format('F:/shiyan/bert/dataset/generate/' + soft + '/skip_keys_latter.model', binary=False)
    data_test_skip = get_embed(skip_, data_path_test, dim_size=dim_size)
    return data_train_skip, data_test_skip
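A hypothetical call, assuming train_x and test_x are pandas Series (to_csv is used above) and the hard-coded F:/shiyan/... directory exists for the given soft name; 'demo' is purely illustrative:

import pandas as pd

train_s = pd.Series(['build fails due publication-tests.xml build target', 'due to sb'])
test_s = pd.Series(['build one to'])
# 'demo' selects the output subdirectory under the hard-coded path
train_vec, test_vec = word2vec(train_s, test_s, soft='demo', dim_size=300, retrain=True)
print(train_vec.shape, test_vec.shape)  # (2, 300) and (1, 300)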