These are study notes for Chapter 7, Text Vectorization, of 《Python自然语言处理实战:核心技术与算法》 (Python NLP in Action: Core Technologies and Algorithms) by 涂铭 et al.
(1) Data preprocessing
# -*- coding: utf-8 -*-
from gensim.corpora import WikiCorpus
import jieba
from langconv import *

def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    # Read the training corpus out of the XML dump and write segmented text here
    f = open('./data/reduce_zhiwiki.txt', 'w', encoding='UTF-8')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # Convert traditional Chinese in the corpus to simplified Chinese
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            # Segment the sentence with the Jieba tokenizer
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if (i % 200 == 0):
            print('Saved ' + str(i) + ' articles')
    f.close()

if __name__ == '__main__':
    my_function()
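A quick sanity check on the output (a minimal sketch; it assumes the script above has already written ./data/reduce_zhiwiki.txt, one space-separated article per line):

# Minimal sanity check of the preprocessed corpus (assumes ./data/reduce_zhiwiki.txt exists).
# Each line holds one article as space-separated Jieba tokens in simplified Chinese.
with open('./data/reduce_zhiwiki.txt', 'r', encoding='UTF-8') as f:
    for i, line in enumerate(f):
        print(line.strip()[:80])  # print the first 80 characters of the article
        if i >= 2:                # only look at the first three articles
            break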
(2) Training
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def my_function():
    wiki_news = open('./data/reduce_zhiwiki.txt', 'r', encoding='UTF-8')
    # The first argument to Word2Vec is the preprocessed training corpus.
    # sg=0 trains word vectors with the CBOW model; sg=1 trains with Skip-gram.
    # size is the dimensionality of the word vectors.
    # window is the maximum distance between the current word and a predicted word;
    # a larger window means more context words to consider and longer training time.
    # min_count is the minimum frequency: words occurring fewer than min_count times are ignored.
    # workers is the number of threads used for training.
    model = Word2Vec(LineSentence(wiki_news), sg=0, size=192, window=5, min_count=5, workers=9)
    model.save('zhiwiki_news.word2vec')

if __name__ == '__main__':
    my_function()
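The parameter names above follow the gensim 3.x API used in the book. If you run gensim 4.x instead, size was renamed to vector_size (and iter to epochs); a sketch of the equivalent call under that assumption:

# Equivalent training call for gensim >= 4.0, where `size` was renamed to `vector_size`.
# A sketch only; it assumes the same preprocessed corpus file as above.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec(
    LineSentence('./data/reduce_zhiwiki.txt'),
    sg=0,             # CBOW
    vector_size=192,  # `size` in gensim 3.x
    window=5,
    min_count=5,
    workers=9,
)
model.save('zhiwiki_news.word2vec')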
(3) Testing: computing word similarity
#coding=utf-8
import gensim
def my_function():
    model = gensim.models.Word2Vec.load('./data/zhiwiki_news.word2vec')
    print(model.similarity('汽车', '自行车'))  # car vs. bicycle, similarity ≈ 0.63
    print(model.similarity('汽车', '摩托车'))  # car vs. motorcycle, similarity ≈ 0.44
    word = '中国'
    if word in model.wv.index2word:
        print(model.most_similar(word))

if __name__ == '__main__':
    my_function()
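On gensim 4.x the same queries go through model.wv, and index2word was replaced by key_to_index; a sketch assuming the model saved above:

# Querying the model with the gensim >= 4.0 API (a sketch; assumes the model saved above).
import gensim

model = gensim.models.Word2Vec.load('./data/zhiwiki_news.word2vec')
print(model.wv.similarity('汽车', '自行车'))  # cosine similarity between the two words
word = '中国'
if word in model.wv.key_to_index:             # replaces model.wv.index2word of gensim 3.x
    print(model.wv.most_similar(word))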
Word vectors can also be used to compare whole documents: first extract keywords from each document with TF-IDF, then sum the keyword vectors and measure the cosine similarity of the sums. The keyword extraction script (keyword_extract.py):

# -*- coding: utf-8 -*-
import jieba.posseg as pseg
from jieba import analyse

def keyword_extract(data, file_name):
    # Extract keywords from one line of text with Jieba's TF-IDF interface
    tfidf = analyse.extract_tags
    keywords = tfidf(data)
    return keywords

def getKeywords(docpath, savepath):
    # Read the document line by line and write its keywords, space-separated, to savepath
    with open(docpath, 'r', encoding='utf-8') as docf, open(savepath, 'w', encoding='utf-8') as outf:
        for data in docf:
            data = data[:len(data) - 1]
            keywords = keyword_extract(data, savepath)
            for word in keywords:
                outf.write(word + ' ')
            outf.write('\n')
# -*- coding: utf-8 -*-
import codecs
import gensim
import numpy as np
from keyword_extract import *

wordvec_size = 192

def get_char_pos(string, char):
    # Return the positions of every occurrence of char in string
    chPos = []
    try:
        chPos = list(((pos) for pos, val in enumerate(string) if (val == char)))
    except:
        pass
    return chPos

def word2vec(file_name, model):
    # Sum the word vectors of all keywords in file_name (one space-separated line per document)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        word_vec_all = np.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            first_word = data[0:space_pos[0]]
            if model.__contains__(first_word):
                word_vec_all = word_vec_all + model[first_word]
            for i in range(len(space_pos) - 1):
                # start one character after the space so the word matches the model vocabulary
                word = data[space_pos[i] + 1:space_pos[i + 1]]
                if model.__contains__(word):
                    word_vec_all = word_vec_all + model[word]
        return word_vec_all

def simlarityCalu(vector1, vector2):
    # Cosine similarity of the two summed keyword vectors
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity

if __name__ == '__main__':
    model = gensim.models.Word2Vec.load('data/zhiwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    getKeywords(p1, p1_keywords)
    getKeywords(p2, p2_keywords)
    p1_vec = word2vec(p1_keywords, model)
    p2_vec = word2vec(p2_keywords, model)
    print(simlarityCalu(p1_vec, p2_vec))
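simlarityCalu above is plain cosine similarity. A tiny self-contained check with made-up vectors (illustrative values only, not real word vectors):

# Toy check of cosine similarity with made-up 3-dimensional vectors.
import numpy as np

def cosine(a, b):
    # Same formula as simlarityCalu: dot product divided by the product of the norms
    return a.dot(b) / (np.sqrt(a.dot(a)) * np.sqrt(b.dot(b)))

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([2.0, 4.0, 6.0])   # parallel to v1
v3 = np.array([-3.0, 0.0, 1.0])  # orthogonal to v1

print(cosine(v1, v2))  # 1.0
print(cosine(v1, v3))  # 0.0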
Documents can also be vectorized directly with Doc2Vec, trained on the same Chinese Wikipedia dump.
(1) Preprocessing and training
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gensim.models as g
from gensim.corpora import WikiCorpus
import logging
from langconv import *

# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

docvec_size = 192

class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        import jieba
        for content, (page_id, title) in self.wiki.get_texts():
            yield g.doc2vec.LabeledSentence(
                words=[w for c in content for w in jieba.cut(Converter('zh-hans').convert(c))],
                tags=[title])

def my_function():
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    documents = TaggedWikiDocument(wiki)
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, size=docvec_size, window=8, min_count=19, iter=5, workers=8)
    model.save('data/zhiwiki_news.doc2vec')

if __name__ == '__main__':
    my_function()
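LabeledSentence was later replaced by TaggedDocument, and gensim 4.x renamed size/iter to vector_size/epochs. A sketch of the same wrapper and training call under those assumptions (it reuses the langconv module and WikiCorpus setup from the script above):

# Sketch for gensim >= 4.0: TaggedDocument replaces LabeledSentence,
# and `size`/`iter` become `vector_size`/`epochs`.
import jieba
import gensim.models as g
from gensim.corpora import WikiCorpus
from gensim.models.doc2vec import TaggedDocument
from langconv import *  # same converter module as in the preprocessing script

class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            words = [w for c in content for w in jieba.cut(Converter('zh-hans').convert(c))]
            yield TaggedDocument(words=words, tags=[title])

wiki = WikiCorpus('./data/zhwiki-latest-pages-articles.xml.bz2', dictionary={})
documents = TaggedWikiDocument(wiki)
model = g.Doc2Vec(documents, dm=0, dbow_words=1, vector_size=192,
                  window=8, min_count=19, epochs=5, workers=8)
model.save('data/zhiwiki_news.doc2vec')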
(2) Testing: similarity calculation
import gensim.models as g
import codecs
import numpy as np

model_path = './data/zhiwiki_news.doc2vec'
start_alpha = 0.01
infer_epoch = 1000
docvec_size = 192

def simlarityCalu(vector1, vector2):
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity

def doc2vec(file_name, model):
    import jieba
    doc = [w for x in codecs.open(file_name, 'r', 'utf-8').readlines() for w in jieba.cut(x.strip())]
    doc_vec_all = model.infer_vector(doc, alpha=start_alpha, steps=infer_epoch)
    return doc_vec_all

if __name__ == '__main__':
    model = g.Doc2Vec.load(model_path)
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    P1_doc2vec = doc2vec(p1, model)
    P2_doc2vec = doc2vec(p2, model)
    print(simlarityCalu(P1_doc2vec, P2_doc2vec))
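On gensim 4.x, infer_vector's steps argument was renamed to epochs; a sketch of the inference step under that assumption, reusing the same trained model and input file:

# Inference with the gensim >= 4.0 API, where infer_vector's `steps` became `epochs`.
import codecs
import jieba
import gensim.models as g

model = g.Doc2Vec.load('./data/zhiwiki_news.doc2vec')
with codecs.open('./data/P1.txt', 'r', 'utf-8') as f:
    doc = [w for line in f for w in jieba.cut(line.strip())]
vec = model.infer_vector(doc, alpha=0.01, epochs=1000)
print(vec[:5])  # first few dimensions of the inferred paragraph vector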