赞
踩
目录
Word2Vec是Google在2013年开源的一款词向量计算工具,它的特点是将所有的词向量化,这样就可以定量地度量词与词之间的关系,挖掘词之间的联系。
Word2vec源码 Word2vec论文 Word2Vec原理参考
import math

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
-
# Build the word2vec model: train the word vectors and save them to disk
def get_dataset_vec(dataset, n_dim=300, model_path='data/w2v/w2v_model_300.pkl'):
    """Train a skip-gram Word2Vec model on ``dataset`` and save it.

    Args:
        dataset: Iterable of tokenised sentences (each one a list of words).
        n_dim: Dimensionality of the trained word vectors.
        model_path: File the trained model is written to.

    Returns:
        The trained ``Word2Vec`` model (previously nothing was returned).
    """
    # sg=1 selects skip-gram, hs=0 disables hierarchical softmax, and
    # min_count=10 ignores words that occur fewer than 10 times.
    # NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 calls it
    # `vector_size`.
    w2v_model = Word2Vec(dataset, sg=1, size=n_dim, min_count=10, hs=0)

    # Optional follow-ups, kept for reference:
    #   continue training on additional sentences
    #   w2v_model.train(x_test, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter)
    #   inspect a single word vector
    #   print(w2v_model['会议'])

    w2v_model.save(model_path)  # persist the training result
    return w2v_model
-
if __name__ == '__main__':
    # Load the cleaned train/test sets; every cell is forced to str so that
    # the contents can be split into tokens below.
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    test_data = pd.read_csv('data/clean_data_test.csv', sep=',', names=['contents', 'labels']).astype(str)

    # The contents are already segmented: tokens are separated by whitespace.
    train_data['words'] = train_data['contents'].apply(str.split)
    # Tokenise the *test* contents (the original read train_data here, so the
    # test rows were never used and could not even be aligned by index).
    test_data['words'] = test_data['contents'].apply(str.split)
    dataset = pd.concat([train_data, test_data])

    # Train the word2vec word vectors on train + test sentences
    get_dataset_vec(dataset['words'])

    # Loading the trained model later:
    # w2v_model = Word2Vec.load('data/w2v/w2v_model_300.pkl')
word2vec 相关的API都在包gensim.models.word2vec中 ,主要参数如下:
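计算公式如下(其中 $s'$ 为句子 $s$ 中出现在词表内的词,$\mathbf{v}(w)$ 为词 $w$ 的词向量;若 $s'$ 为空,则句向量为零向量):$$\mathbf{v}(s)=\frac{1}{|s'|}\sum_{w\in s'}\mathbf{v}(w)$$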
实现代码如下:
-
# Average the vectors of all words in a sentence to get one sentence vector
def build_sentence_vector(sentence, size, w2v_model):
    """Return the mean word vector of ``sentence`` as a ``(1, size)`` array.

    Args:
        sentence: Iterable of words (tokens).
        size: Dimensionality of the word vectors.
        w2v_model: Object indexable by word that raises ``KeyError`` for
            unknown words (e.g. a trained Word2Vec model).

    Returns:
        ``numpy.ndarray`` of shape ``(1, size)``. Words missing from
        ``w2v_model`` are ignored; if no word is known the result is all
        zeros.
    """
    sen_vec = np.zeros((1, size))
    count = 0
    for word in sentence:
        # Only the lookup may legitimately fail, so only it is guarded.
        try:
            word_vec = w2v_model[word]
        except KeyError:
            continue  # out-of-vocabulary word: skip it
        sen_vec += np.asarray(word_vec).reshape((1, size))
        count += 1
    if count:
        sen_vec /= count
    return sen_vec
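计算公式如下(其中 $s'$ 为句子 $s$ 中出现在词表内的词,$k_w$ 为关键词 $w$ 的权重,非关键词的系数为 1):$$\mathbf{v}(s)=\frac{1}{|s'|}\sum_{w\in s'}\lambda_w\,\mathbf{v}(w),\qquad \lambda_w=\begin{cases}e^{k_w}, & w\ \text{是关键词}\\ 1, & \text{否则}\end{cases}$$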
实现代码如下:
-
# Weighted average of the word vectors of a sentence -> one sentence vector
def build_sentence_vector_weight(sentence, size, w2v_model, key_weight):
    """Return a keyword-weighted mean word vector as a ``(1, size)`` array.

    Each known word contributes its vector; words that are keys of
    ``key_weight`` are scaled by ``exp(key_weight[word])`` first.

    Args:
        sentence: Iterable of words (tokens).
        size: Dimensionality of the word vectors.
        w2v_model: Object indexable by word that raises ``KeyError`` for
            unknown words (e.g. a trained Word2Vec model).
        key_weight: Mapping from keyword to its numeric weight.

    Returns:
        ``numpy.ndarray`` of shape ``(1, size)``. Words missing from
        ``w2v_model`` are ignored; if no word is known the result is all
        zeros.
    """
    sen_vec = np.zeros((1, size))
    count = 0
    for word in sentence:
        try:
            word_vec = w2v_model[word]
        except KeyError:
            continue  # out-of-vocabulary word: skip it
        # Membership is tested on the mapping itself (O(1)) rather than on a
        # list copy of its keys.
        weight = math.exp(key_weight[word]) if word in key_weight else 1.0
        sen_vec += (np.asarray(word_vec) * weight).reshape((1, size))
        count += 1
    if count:
        # The sum is normalised by the number of words used, not by the sum
        # of the weights.
        sen_vec /= count
    return sen_vec
-
# Convert the text data into document vectors
def doc_vec():
    """Turn the train/test CSV files into averaged word-vector features.

    Reads ``data/clean_data_train.csv`` and ``data/clean_data_test.csv``
    (columns ``contents`` and ``labels``), loads the saved Word2Vec model and
    represents every document by the mean of its word vectors.

    Returns:
        Tuple ``(train_docvec_list, y_train, test_docvec_list, y_test)``:
        one 300-dimensional row per document for the feature arrays, and
        integer label arrays.
    """
    n_dim = 300  # same dimensionality as used when training the saved model

    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    test_data = pd.read_csv('data/clean_data_test.csv', sep=',', names=['contents', 'labels']).astype(str)
    w2v_model = Word2Vec.load('data/w2v/w2v_model_300.pkl')  # trained Word2Vec model

    # To use build_sentence_vector_weight instead, load the word-weight dict
    # here. The original draft used eval() on the file contents; prefer a safe
    # parser such as ast.literal_eval:
    # with open('data/key_words_importance', 'r') as f:
    #     key_words_importance = eval(f.read())

    # Labels were read as strings; convert them back to integers.
    y_train = np.array(train_data['labels'].apply(int))
    y_test = np.array(test_data['labels'].apply(int))

    # Training set -> vectors: split each document into tokens and average.
    train_data_list = [str(text).split() for text in train_data['contents']]
    train_docvec_list = np.concatenate(
        [build_sentence_vector(sen, n_dim, w2v_model) for sen in train_data_list]
    )

    # Test set -> vectors
    test_data_list = [str(text).split() for text in test_data['contents']]
    test_docvec_list = np.concatenate(
        [build_sentence_vector(sen, n_dim, w2v_model) for sen in test_data_list]
    )

    return train_docvec_list, y_train, test_docvec_list, y_test
现有词向量模型中,第一种是全局的词-文本矩阵分解(LSA),该方法能有效收集每一个词的统计信息,但却不能捕捉到词的上下文信息(语义的表达能力不够);第二种是基于局部窗口信息的方法(Word2Vec),这种方法虽然能在词的语义上有更丰富的表达,但却不能很好地捕捉词的全局统计信息。
GloVe词向量模型融合了全局矩阵分解方法(Matrix Factorization)和局部上下文窗口方法(Local Context Window,如word2vec),是一种用于获得单词矢量表示的无监督学习算法。
GloVe论文 GloVe源码 GloVe介绍 斯坦福NLP课程
首先,下载GloVe源码
命令:cd到GloVe源码对应目录下,vim demo.sh
demo.sh文件如下:
#!/bin/bash

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

make
# Fetch the text8 sample corpus unless it is already present.
if [ ! -e text8 ]; then
  if hash wget 2>/dev/null; then
    wget http://mattmahoney.net/dc/text8.zip
  else
    curl -O http://mattmahoney.net/dc/text8.zip
  fi
  unzip text8.zip
  rm text8.zip
fi

CORPUS=text8
VOCAB_FILE=vocab.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors
VERBOSE=2
MEMORY=4.0
VOCAB_MIN_COUNT=5
VECTOR_SIZE=50
MAX_ITER=15
WINDOW_SIZE=15
BINARY=2
NUM_THREADS=8
X_MAX=10

# Each stage runs only if the previous one exited with status 0.
$BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
if [[ $? -eq 0 ]]
then
  $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
  if [[ $? -eq 0 ]]
  then
    $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
    if [[ $? -eq 0 ]]
    then
      $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
      if [[ $? -eq 0 ]]
      then
        # Evaluate with the tool selected by the first argument.
        if [ "$1" = 'matlab' ]; then
          matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
        elif [ "$1" = 'octave' ]; then
          octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
        else
          python eval/python/evaluate.py
        fi
      fi
    fi
  fi
fi
对demo.sh文件进行修改
修改后的demo.sh文件如下
#!/bin/bash

# Makes programs, trains a GloVe model on your own corpus, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

# Keep `make` enabled: it builds the programs that the steps below run from
# $BUILDDIR. Only the sample-data download needs to be disabled.
make || exit 1

# Download of the text8 demo corpus, disabled because we train on our own data.
#if [ ! -e text8 ]; then
#  if hash wget 2>/dev/null; then
#    wget http://mattmahoney.net/dc/text8.zip
#  else
#    curl -O http://mattmahoney.net/dc/text8.zip
#  fi
#  unzip text8.zip
#  rm text8.zip
#fi

CORPUS=counts.txt          # your own training corpus
VOCAB_FILE=vocab.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors
VERBOSE=2
MEMORY=4.0
VOCAB_MIN_COUNT=5          # minimum number of occurrences for a word to be kept
VECTOR_SIZE=300            # dimensionality of the trained word vectors
MAX_ITER=15                # number of training iterations
WINDOW_SIZE=15             # context window size
BINARY=2                   # 2 = save vectors as both text and binary (see GloVe docs)
NUM_THREADS=8
X_MAX=10

# Each stage depends on the previous one, so stop with a non-zero status as
# soon as a stage fails. Plain `||` guards are used instead of `[[ ... ]]`
# so the script also runs when started as `sh demo.sh`.
"$BUILDDIR/vocab_count" -min-count "$VOCAB_MIN_COUNT" -verbose "$VERBOSE" < "$CORPUS" > "$VOCAB_FILE" || exit 1

"$BUILDDIR/cooccur" -memory "$MEMORY" -vocab-file "$VOCAB_FILE" -verbose "$VERBOSE" -window-size "$WINDOW_SIZE" < "$CORPUS" > "$COOCCURRENCE_FILE" || exit 1

"$BUILDDIR/shuffle" -memory "$MEMORY" -verbose "$VERBOSE" < "$COOCCURRENCE_FILE" > "$COOCCURRENCE_SHUF_FILE" || exit 1

"$BUILDDIR/glove" -save-file "$SAVE_FILE" -threads "$NUM_THREADS" -input-file "$COOCCURRENCE_SHUF_FILE" -x-max "$X_MAX" -iter "$MAX_ITER" -vector-size "$VECTOR_SIZE" -binary "$BINARY" -vocab-file "$VOCAB_FILE" -verbose "$VERBOSE" || exit 1

# Evaluate with the tool selected by the first argument.
if [ "$1" = 'matlab' ]; then
  matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
elif [ "$1" = 'octave' ]; then
  octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
else
  python eval/python/evaluate.py
fi
训练文档为counts.txt,部分内容如下:
- 紧急通知 通知 七点 50 准时 武陵 楼 开会 早 开 早 散 十一 十一点 十一点半 一点 一点半 准时 散会 时间 会议
- 党 服 成员 明天 中午 12 点 30 分 第一 第一次 例会 会地 地点 待定 请 预留 留出 时间 收到 请 回复 全体成员 成员 时间 会议
- 通知 宣传 宣传部 部将 明天 中午 1230 周三 j4101 开本 本次 部门 例会 请 带好 笔 本次 例会 考评 部 本次 例会 主席 主席团 发 邀请 梁 淑 楠 收到 请 回复 全体成员 成员 时间 会议 议会 会议 议会 会议
- 本周 周六 叉叉 裤 聚会 聚会时间 时间 会议
- 总 通知 当班 班长 眼 长 表工 930 中控室 开会 开会时间 时间 会议
命令:bash demo.sh(脚本以 #!/bin/bash 开头并使用了 [[ ]] 等 bash 语法,建议用 bash 而不是 sh 运行),运行后会得到vectors.txt,其中每一行对应一个词及其向量表示。
vectors.txt部分内容如下:
- 时间 -0.192653 -0.027064 0.157989 -0.959652 -0.249349 -0.695960 -0.290393 -0.320663 0.850561 1.123161 -0.820032 -0.031310 -0.220525 0.391944 0.363133 0.433053 0.133621 0.590930 -1.096903 -0.346236 0.211173 1.074069 1.840458 -0.619614 -0.047013 -0.482248 -0.980474 -0.052697 -0.764487 -1.042479 0.310801 1.027933 -0.703576 -0.121927 0.096973 -0.389311 0.467025 0.532758 -0.626837 0.689972 0.020686 0.654299 -0.463879 0.430281 -0.206859 -0.253277 -0.091177 -0.616631 0.082589 -0.110885 0.858724 1.039171 -0.215845 0.597427 -0.564434 1.393491 -0.340165 0.079653 -0.093698 0.238416 0.800182 0.019553 -0.275385 0.314701 0.126838 -0.891766 -0.598391 -0.339991 0.278942 0.150008 0.169043 -0.106491 0.129413 0.154625 -1.077015 0.636639 0.760766 0.291428 -0.304663 -0.709912 0.578007 0.400317 0.498921 0.299688 -1.924125 -0.233359 -0.593315 -0.064116 0.141521 0.708793 0.896055 -0.439493 0.040833 0.089314 -0.004766 0.501943 0.239141 -0.368876 0.016068 -0.074727 -0.526003 -0.683604 1.052587 -0.511065 -1.180889 -0.136742 -0.571856 0.072116 -0.220324 -0.180080 0.959551 0.180546 0.292177 0.048728 -0.535014 0.014530 0.631147 -0.088323 0.668570 -0.310500 0.363473 -0.564667 0.857510 -0.148112 0.238181 0.135603 -0.073686 0.755832 0.572558 0.281101 0.297423 -0.748391 -0.020244 -0.557294 -0.441473 0.106799 -0.297012 0.233237 0.218440 -0.034616 0.410545 -0.245199 0.695645 0.192339 -1.024643 -0.628238 0.357365 0.426126 0.290365 -0.421078 -0.207671 -0.393413 0.527472 -0.465794 0.134838 -0.118478 -0.358524 0.733155 0.181742 -0.357532 -0.425675 0.262033 0.192739 0.320736 -0.073291 -0.603243 -0.214388 0.251359 -0.157907 0.639871 -0.322949 0.153679 0.128943 -0.079536 -1.243106 0.405427 -0.679834 0.706061 -0.556909 0.317398 -0.972522 -0.323571 0.082381 0.952649 0.470793 -0.255887 -0.072567 0.154703 0.553813 0.444109 -0.955931 -0.506380 -0.185121 0.436316 -0.166778 -0.070471 0.134493 -0.041048 0.371387 -0.466888 -0.322176 -0.139643 -0.469103 -0.297749 -0.341331 -0.182869 -1.138910 -0.840458 -1.005790 
-1.030163 0.435325 -0.218241 -1.017183 -0.249351 0.060505 0.819773 -1.382254 -0.520934 -0.643470 0.369318 0.390774 -0.547370 -0.003034 -0.119246 0.397086 -0.720554 -0.267369 -0.626506 0.650304 0.491474 0.097891 0.035371 0.471716 0.680434 -0.168340 -0.088465 0.200047 -0.021552 -0.849985 0.090245 1.103723 0.242453 -0.830361 -0.132980 0.391840 -0.074169 -0.254151 0.175168 -0.482801 0.787707 1.463750 -0.666707 -0.324269 0.170228 -0.455849 -0.454797 0.485042 0.906702 -0.069496 -0.017353 0.824049 0.567829 -0.075913 -0.094140 -0.010110 -0.428374 0.830554 -0.249833 0.168137 0.541595 -1.243334 0.625043 -0.149822 0.004912 -0.021771 -0.491535 0.503651 -0.315409 0.031236 0.328521 -0.264316 -0.191875 0.681480 -0.782049 0.641517 0.380847 0.347847 0.043671 0.636368 -0.315160 -0.576198 -0.348432 0.281816 -0.623102 -0.564527 0.902504 0.016528 -0.069687 -0.144267 0.006254
- 月 0.282719 0.152989 0.277989 -0.409890 -0.100124 -0.562012 0.054540 -0.302864 0.340999 0.332058 -0.036765 -0.328907 -0.370917 0.120894 -0.257750 -0.695093 0.201110 -0.020349 -0.200095 0.133917 0.161953 1.409806 0.930571 0.444746 -0.639931 0.444198 -0.495933 0.229397 -0.739602 -1.055825 -0.318828 1.522916 0.004156 0.343912 0.006537 0.388347 0.600495 0.239206 -0.028486 0.552802 -0.278926 0.373066 -0.426863 0.489640 -0.323016 -0.080631 -1.074515 -0.507549 0.665036 0.391942 1.328556 1.073624 -0.177601 -0.018996 0.348987 0.525943 0.005171 -0.306071 -0.512433 -0.289054 0.337168 -0.289363 -0.288751 -0.316822 -0.342586 0.074602 -0.808880 0.993347 -0.027332 0.564592 -0.264008 0.547095 0.713716 0.306165 -0.489996 0.403748 -0.368705 -0.457879 0.797570 -0.495398 0.355519 1.163623 0.563982 -0.054508 -0.902991 -0.064551 -0.314185 -0.710030 -0.444326 -0.638333 0.392305 -0.397702 -0.002391 -0.020851 0.107497 0.182052 0.570882 -0.650250 0.326903 -0.717934 0.280774 -0.278874 0.623692 -0.195263 -0.542201 -0.003978 0.787480 0.940788 0.495447 0.390056 0.547322 -0.223878 -0.775611 0.423554 0.227717 0.880800 0.295964 0.302781 0.979723 -0.007546 -0.030809 -0.124463 1.068195 -0.250004 -0.743149 0.287037 -0.627813 0.902924 -0.019548 -0.008901 0.716885 -0.080450 -0.078079 -0.380885 0.595999 -0.006114 0.456368 0.310282 -0.593804 -0.097739 -0.533996 0.129195 -0.206451 0.854950 -0.736590 -0.373819 -0.171245 0.822648 0.022098 -0.041053 0.133135 -0.221786 -0.370930 -0.073366 -0.076312 -0.141111 0.754814 0.311759 -0.487563 0.148415 -0.327943 1.227186 1.104577 0.584394 0.866440 -0.155514 0.571861 0.225291 -0.596585 0.341632 -0.075625 0.459881 0.492822 -0.856172 -0.467885 1.009984 -0.509890 -0.289279 0.198226 0.779081 -0.505615 0.262267 -0.692102 -0.631975 0.688475 -0.343992 0.473175 0.662025 0.846393 -0.026749 -0.032720 -0.642198 -0.022452 -0.492827 0.074650 -0.200605 0.142318 -0.030474 -0.021498 -0.059870 -0.558756 -0.568076 -0.125238 0.087478 -0.520128 -0.406735 -0.211258 -0.481394 -0.664873 
-0.277975 0.412186 0.010863 -0.240466 -0.070284 0.405342 0.066035 -1.461064 -1.013241 -0.072386 0.634662 0.716899 -0.185302 -0.030074 -0.113286 0.790407 -0.334036 0.145996 0.457602 0.061558 0.303261 0.445127 0.128329 0.371765 0.705406 -0.924010 0.312624 -0.264194 0.308366 0.098177 0.532446 -0.651350 0.452719 -0.764295 -0.191709 0.274529 -0.663116 -0.442445 -0.332471 -0.566812 0.441788 0.991128 0.062422 0.332908 0.114607 -0.790051 0.140813 -0.078728 0.221595 -0.313071 0.172639 0.681872 0.530355 -0.548417 0.459590 -0.557620 -0.267441 0.221526 -0.000649 0.288101 0.463587 -0.214246 0.953989 -0.281264 0.129526 -0.206172 -0.171313 -0.152700 0.777293 -0.521110 0.491064 0.467925 0.892538 -0.254171 -0.500852 0.145604 -0.039736 -0.016100 0.545795 -0.154586 0.515680 0.105515 -0.214572 -0.341872 0.009775 0.009587 1.174931 -0.101082 0.310132 -0.541959 -0.693216
- 元 0.306025 0.322643 1.077880 -0.600908 0.172935 -0.737794 0.255050 -0.157029 0.342953 -0.706849 -0.381400 0.061180 -0.953065 -0.095882 -0.368000 0.420154 0.951357 0.767554 -0.143634 -0.126899 0.307859 -0.300399 1.556839 0.364413 0.250135 -0.566936 -0.995738 -0.113883 -1.406691 -1.014822 -0.687600 1.741928 -0.615409 -0.387166 0.027234 1.281103 0.351747 -0.227581 -0.232469 0.271432 -0.415515 0.631718 -0.894145 0.733428 -1.230473 0.682133 0.075777 0.649245 0.289526 -0.467158 0.919363 0.351625 0.222911 0.504691 -0.181684 -0.378320 -0.687206 0.148117 0.729295 -0.105381 0.695989 -0.754219 -0.440530 0.118117 -0.883886 0.492664 0.079025 1.197542 -0.610626 0.157875 -0.443919 0.746728 1.468872 -0.575745 -0.470869 0.846473 -0.617876 -0.127038 0.003786 -0.001282 0.176192 0.778630 -0.308934 -0.249084 -0.301070 -0.087675 0.103930 -0.340054 0.051219 0.025974 0.446019 -0.389392 0.933938 0.164278 0.435473 0.114423 0.302819 -0.724311 -0.338384 -0.036844 0.040301 -0.075052 0.496395 0.007586 -0.240168 -0.644824 -0.383305 -0.000070 -0.135047 -0.441491 -0.397919 -0.251733 0.320516 -0.316922 -0.176393 0.178300 0.372434 0.236957 -0.165925 0.870158 -0.275455 -0.283281 0.730676 0.013333 0.467073 -0.715855 1.007568 0.391907 -0.500897 1.030966 0.698415 0.478425 -0.691962 0.789402 0.111583 0.636666 -0.887124 -0.272008 -0.584863 -0.175891 0.452481 0.355332 0.007005 0.805152 -0.325006 -0.249743 0.031626 0.073259 -0.453138 0.277470 -0.478755 0.575895 0.355874 0.035219 0.079009 0.054281 0.201341 -0.105456 0.624423 0.419490 0.422743 0.886797 0.058878 0.218280 0.783062 -0.278714 0.244279 0.244416 -0.558108 -0.538741 -0.626169 0.796777 0.313345 -0.594111 -0.612010 0.543808 0.326743 -0.068215 -0.284259 0.111663 -1.129291 0.464670 0.355996 0.317091 0.101179 -0.515349 -0.268207 1.036510 0.211519 0.545480 -0.140292 -0.338422 0.350183 0.160976 -0.810460 -0.581156 -0.083485 0.175628 -0.395788 -0.154374 1.656540 -0.149448 -0.250864 0.027195 -0.000960 0.097104 -0.385608 0.388430 -0.219321 0.263330 
-0.339932 0.169096 -1.619655 -0.260263 0.345896 0.407777 -1.409953 -0.595680 -0.820713 -0.197955 0.132537 0.609146 0.432836 -0.071146 0.582325 -0.670098 0.034255 -0.291316 0.500515 1.061890 0.178922 -0.471838 -0.328243 -0.209035 -0.135114 -0.890339 -0.736016 -0.594963 -0.932622 -0.414669 0.308166 -0.405150 0.466669 0.111795 0.032414 -0.055596 -0.688865 -0.334978 -0.041121 0.252767 0.603890 0.051988 0.886804 -0.175748 0.236281 0.094143 0.452659 0.917740 0.303790 -0.453451 0.630142 -0.170290 -0.083950 0.310024 -0.560198 -0.581944 0.530984 -0.194871 -0.052266 0.184277 -0.055074 -0.165853 -0.889973 -0.130048 0.556185 0.093815 0.077349 0.786506 0.017906 -0.209109 -0.610210 0.882594 0.242138 0.931476 -0.261045 -0.497909 0.045525 -0.391315 -0.737701 0.693279 0.625252 0.169180 -1.320762 0.212941 0.034001 1.308421 0.299258 0.197882 -0.742295 -0.070419
- 1 0.888034 -0.269472 0.416726 -0.396031 0.035506 -0.622379 -0.440201 -0.753203 0.398985 0.105516 0.040577 0.550698 -1.196764 0.480859 -0.343003 -0.207138 0.024000 0.383471 -0.265794 0.036858 -0.098424 -0.568933 0.755142 -0.054576 -0.848028 -0.005950 -0.107111 -1.310041 -0.301031 -0.132762 -0.114449 1.529382 0.480697 -0.317541 0.098669 0.736248 0.507810 0.586016 0.222051 -0.894592 -0.685444 0.937100 -0.727683 0.896437 0.331013 -0.252685 -0.648829 0.198743 0.434566 -1.018625 0.604905 0.406292 -0.378488 -0.145570 0.143134 0.145109 0.082009 -0.334453 -0.308911 -0.548263 -0.306763 -0.239198 -0.025372 -1.286177 -0.418609 -0.941808 0.073460 0.799592 -0.180774 0.470524 -0.339510 0.074398 1.134824 -0.534680 -0.852488 0.674400 -0.772466 0.522004 0.311741 0.318874 0.701483 0.355505 0.916747 -0.391279 -0.043185 -0.035573 0.124245 -0.511008 -0.028164 0.396926 0.310719 -0.058037 0.415305 -0.124527 0.795043 -0.048217 -0.204688 -0.545567 -0.332399 0.533343 -0.196562 -0.587495 0.613407 0.339769 -1.312807 -0.425666 0.550779 -0.078751 0.106345 -0.597031 0.303927 0.367110 -0.479566 -0.280796 -0.077603 0.089361 0.057174 0.680617 -0.232158 -0.380329 -0.353017 0.261309 0.214358 -0.436605 0.726974 0.686759 -0.190239 0.569367 -0.357779 0.217572 0.577694 -0.200133 0.319214 -0.238840 0.142662 0.019461 -0.504774 -0.493205 0.434241 -0.077088 0.492905 0.143735 -0.385636 0.110834 -0.760336 -1.083028 0.465417 0.823979 -0.218171 0.331168 -0.171310 -0.281734 0.110895 0.805839 0.391114 -0.155342 0.366512 -0.215544 0.573037 0.302591 -0.715427 1.287327 -0.233937 1.247682 0.546919 -0.104151 0.185894 0.783857 -0.716013 0.745546 -0.484780 -0.330844 0.012054 -0.701770 -0.930120 0.304442 -0.101639 0.141293 0.213534 1.119318 -0.159903 0.034759 -0.237266 -0.027194 1.521587 0.373965 -0.609322 1.010848 0.325201 -0.254094 -1.082284 0.966625 0.455843 -0.249176 -1.112331 -0.351975 0.052057 -0.396617 0.975547 -0.350484 -0.519251 -0.116380 0.263111 -0.679581 -0.509177 -0.611612 -1.014958 -1.130154 -0.559550 
-0.558102 0.844820 -0.233135 -1.107969 -0.443999 0.112733 -0.204075 -0.698707 -1.276953 0.029345 0.273015 0.776428 0.283971 1.089718 -0.201296 -0.199805 -0.994360 0.001024 0.000260 0.042881 0.442563 -0.074997 -0.225658 -0.127922 -0.043150 -0.446940 0.630705 0.122018 0.889063 -1.137543 0.602873 -0.074088 0.591213 -0.055548 -0.744662 -0.415526 0.587157 -0.649435 -0.512667 -0.191861 0.295901 1.263322 -0.303088 0.406485 0.562705 -0.070011 0.483705 -0.296791 -0.003154 0.232777 -0.667530 0.346206 -0.297299 -0.304221 -0.251610 -0.225808 -1.008746 1.032771 -0.805736 0.350475 0.072382 1.180248 0.153301 -0.149750 -0.278651 -0.837156 -0.453207 0.156093 -0.189694 -0.123239 -0.640496 0.453016 -0.009482 -0.390708 -0.866602 0.362385 -0.031449 0.210468 0.231915 -0.285665 0.196669 0.176305 0.104762 -0.411070 -0.084570 -0.912936 0.740259 1.085935 0.331030 -0.355904 -0.241756
- 会议 1.771033 0.176118 0.379083 -1.038539 -0.159619 -0.810625 -0.211994 0.767950 0.113235 0.995838 0.142457 0.287684 -0.035386 0.464643 -0.773942 -0.403471 0.569797 0.253740 0.426339 0.071171 -0.311686 0.865899 0.530907 -0.161541 -0.488560 -0.129126 -0.448615 -0.926315 -0.088304 -0.902380 -0.259576 0.434504 -0.757882 -1.191447 0.162485 0.108723 0.197529 0.581506 -1.198933 0.218464 0.503901 1.417884 -0.579565 0.713137 -0.258934 -0.476144 -0.548408 -0.012146 0.578107 0.107231 1.058120 0.882515 0.756237 -0.384693 0.165540 0.411003 0.348020 -0.023861 -0.259647 0.186697 0.023265 0.225137 -0.692051 -0.037526 0.435802 -0.293929 -1.062105 -0.157920 0.217099 -0.734419 -0.667745 -0.096092 0.113397 -0.059034 0.295290 0.334193 -0.308422 -0.613945 0.205872 -0.658714 0.406285 0.736497 -0.066900 -0.202817 -0.440141 0.164648 -0.142157 -0.441622 -0.122423 0.034315 0.195609 0.276887 1.015851 0.545567 -0.872668 -0.008203 -0.217614 -0.203448 -0.353687 0.261102 0.208269 -0.262863 0.746820 -0.594605 -1.031103 0.288283 0.015618 0.884347 -0.138220 0.333664 0.230371 0.605099 0.317958 0.915120 -0.128864 0.209937 0.466525 0.142983 0.987952 -0.338007 -0.503637 -1.041105 0.585695 0.133922 -0.283604 -0.389145 -0.513809 0.927723 0.178702 -0.179315 0.077576 -0.665182 -0.031870 0.141394 0.240645 0.393327 -0.711886 0.200745 -0.061365 -1.052649 0.527113 -0.460608 0.246856 -0.320261 -0.142330 -0.609020 0.796626 0.201946 -0.246875 0.225949 0.785565 -0.603375 0.355625 -0.179703 -0.322463 0.631364 -0.112776 0.125427 -1.077946 -0.163022 0.206300 -0.000588 -0.276898 -0.698788 0.289042 -0.117832 -0.423394 0.628158 -0.965626 0.353545 0.179244 -0.491377 0.257370 0.089590 -0.951331 -0.250147 -0.444209 0.185769 -0.132900 -0.006498 0.112555 -0.392332 -0.012566 -0.171049 0.412430 -0.784530 0.597930 0.654582 0.305766 -0.196717 -0.108986 -0.882588 0.133782 0.114576 -0.491082 0.697926 0.155321 0.534372 0.424671 0.057960 -0.566005 -0.098149 0.216918 -0.223675 0.078227 0.377749 -1.097760 -0.621768 -0.174247 -0.364606 
1.090865 -0.314739 -1.034864 -0.132283 -0.111963 0.465461 -0.824262 -0.782546 0.374973 -0.286167 0.011139 -0.383112 0.354855 -0.439314 0.381809 -0.474764 0.159545 -1.221441 0.666809 -0.709793 0.239210 -0.138918 0.590115 0.215989 0.275636 0.433601 -0.156793 -0.382559 -1.424089 -0.417850 -0.116381 -0.344665 0.489679 0.626358 0.052841 0.577256 0.230637 -0.600817 -0.635985 1.089865 0.722713 0.494734 -0.428877 0.334609 -0.017065 0.336209 0.419211 1.363705 0.044493 0.188291 0.569913 -0.441831 0.071900 0.495701 -1.016109 0.061804 -0.363644 -0.767242 0.558061 0.567890 -0.662831 0.129088 -0.273309 -0.404180 -0.102150 -0.352034 0.208647 -0.281785 -0.134028 0.287623 -0.592395 -0.741625 -0.765389 -0.062867 1.148501 -0.574797 0.111045 -0.296249 0.241325 -0.732370 0.038897 -0.928541 0.283815 -0.379969 -1.189722 0.377329 0.206329 -0.463671 -0.644053 -0.356224
- 日 0.629690 0.297326 0.232061 0.184713 0.456603 -1.360426 -0.161957 -0.327643 0.379252 0.329272 -0.132093 -0.592853 0.369421 1.037765 0.194027 -0.582120 -0.030273 -0.433952 -0.491450 0.534424 0.298695 0.749407 0.961532 -0.013265 -0.160156 0.327203 0.178737 0.260298 0.023090 -0.661755 -0.069589 1.252100 0.274427 0.240321 0.676330 0.472751 0.397922 0.450867 -0.418843 0.434151 -0.318744 0.347155 -0.379078 -0.420713 -1.057411 0.168308 -0.487032 -0.069009 0.791306 0.176923 0.595739 0.527158 -0.328227 -0.121685 0.394574 0.216304 -0.410889 -0.465544 -0.511710 -0.106813 0.241581 -0.421071 -0.769960 0.508721 -0.518993 -0.582130 -0.122278 1.356947 -0.472147 0.219176 -0.637373 -0.379232 0.783050 -0.213542 -0.539033 0.326720 -0.806085 -0.636173 0.789530 -0.457049 0.013086 0.821359 0.303692 0.398572 -0.555042 -0.129366 -0.750004 -0.814094 -0.069134 -0.530091 0.515363 -0.586577 0.416111 -0.013937 -0.207776 0.584663 -0.013731 0.084024 -0.081933 -0.352394 0.082810 -0.485991 1.068765 -0.357893 -0.397341 -0.211468 0.420876 0.739444 0.097145 -0.032144 0.608081 0.445486 -0.453803 0.010084 0.063489 0.685313 0.311570 0.270301 0.373394 -0.579335 0.392215 -0.846188 1.196801 -0.524513 -1.024028 0.281200 -0.396088 0.811682 0.015575 0.444197 1.280806 -0.460657 -0.160046 -0.448320 0.348771 0.186178 -0.179169 -0.506007 -0.467586 0.232892 -0.132603 0.303326 0.532450 0.307448 -0.503961 -0.096622 -0.799405 0.108548 0.766091 -0.692119 -0.166302 -0.499754 -0.297644 -0.890542 -0.026685 -0.066082 0.513476 -0.234157 0.065489 0.236738 -0.698255 1.164697 0.689472 1.031871 0.946376 -0.528323 0.608606 -0.244125 -0.698136 0.663297 -0.315753 -0.024955 0.280253 -0.422331 -0.580495 0.018098 -0.057022 -0.306539 -0.084533 0.808313 -0.383531 0.320171 -0.399376 -0.477695 0.049535 -0.240776 0.169495 1.111698 0.712204 -0.209212 -0.668397 -0.478881 1.045714 -1.103660 0.234638 -0.466507 -0.287628 -0.133884 -0.486556 -0.456381 -0.277966 -0.006142 -0.013448 -0.202422 0.082728 -0.331743 -0.455172 -0.096092 -0.726063 
-0.445307 0.451012 -0.380199 -0.300324 -0.068178 0.028520 0.143256 -0.972176 -1.223306 -0.187028 0.516180 -0.013776 -0.332469 -0.086373 -0.002370 0.777023 -0.041362 0.118836 -0.213045 0.513897 0.059171 0.371966 0.248342 -0.023563 0.700690 -0.921861 -0.051847 -0.109556 0.310796 -0.321488 0.309024 -0.368767 0.219925 -0.691066 -0.023356 0.030746 -0.452083 0.313607 -0.559715 -0.557504 0.528424 0.607529 -0.136493 0.152444 0.168481 -0.786771 0.006343 0.041056 0.642219 -0.609352 0.189559 0.682013 -0.063358 -0.391703 0.779091 -0.527853 0.707578 0.406660 0.103047 0.234198 0.038635 0.302829 1.064635 0.117944 -0.125693 -0.097658 -0.065264 0.228236 0.545942 0.338116 0.127504 0.171728 0.730071 -0.496038 0.158599 0.477939 0.339338 0.073317 0.780151 -0.409706 -0.080721 0.052942 0.037625 -0.642805 -0.008267 0.050677 1.016556 0.096010 0.497020 0.160714 -0.086144
- 12 0.060987 0.220731 -0.132685 -0.820905 -0.146994 -0.353599 -0.018432 -0.725461 0.737936 -0.019994 -0.154258 -0.419405 -0.338631 -0.291629 -0.022446 -0.251117 0.267038 0.434467 0.078770 -0.025413 0.500651 0.854019 0.395952 -0.051031 -0.435698 0.237282 -0.583161 0.168157 -0.601435 -1.348934 -0.164784 1.616810 -0.478824 -0.017075 -0.366454 0.365084 0.612911 0.161186 0.276902 0.375643 -0.112530 0.241358 -0.396695 -0.194163 -0.237141 0.697846 -0.526700 -0.531520 0.772570 -0.322104 1.315113 0.933782 -0.300440 0.198085 0.157843 0.812475 0.187118 -0.254738 0.139663 -0.073107 0.528683 -0.647160 -0.423146 0.006624 -0.894147 -0.726237 -0.881079 0.825501 0.080627 -0.561961 -0.475129 0.047358 0.167624 0.359150 -0.428893 -0.098055 -0.787332 -0.403023 0.472614 -0.063268 0.086656 0.310264 0.610745 -0.221438 -0.421589 0.778696 0.256653 -0.560248 -0.649633 -0.102870 0.586092 0.003949 0.076587 -0.567913 0.704371 0.364411 0.761299 -0.051966 0.385057 -0.367773 0.355770 -0.647424 0.923446 -0.051832 -0.543764 0.588581 0.554076 0.719280 0.339640 0.226296 0.472695 0.397156 -0.489635 0.773813 -0.117013 0.493147 0.093860 0.815739 0.779326 0.310038 -0.783332 -0.620891 0.599204 0.011149 -0.107909 0.348672 -0.498058 0.057396 -0.031033 0.168456 0.991581 -0.478983 -0.781825 -0.271430 0.955011 -0.478941 0.302897 -0.093343 -1.225087 0.255701 -0.017678 -0.452325 0.377625 0.635914 -1.060914 -0.345057 -0.322616 0.805557 0.755067 0.543725 0.030418 -0.496267 -0.373508 -0.095644 -0.015342 0.230422 0.639305 0.210231 0.053320 0.272379 0.220056 0.767136 0.343294 0.690162 1.038597 -0.527260 0.498455 0.133132 -0.857316 -0.088142 -0.121594 0.524317 0.324666 -0.600952 -0.612691 1.126837 -0.052147 -0.754994 -0.371588 0.592034 -0.199333 -0.271918 -0.781767 -0.777349 0.545457 -0.369653 0.403855 0.507788 0.341316 0.256601 -0.652804 -0.737112 -0.248066 -0.641719 0.033328 -0.249865 0.328787 0.311992 -0.004281 0.216633 -0.081194 -0.126300 -0.272642 -0.001156 -0.393779 -0.830402 0.065871 -0.295060 -0.708441 
-0.639450 -0.084257 0.074900 -0.591158 -0.031511 0.261493 -0.149527 -1.486934 -0.835603 -0.202950 0.765206 0.159304 -0.304259 0.363361 -0.466972 1.143983 -0.557980 -0.238127 -0.099868 0.267013 0.080638 0.565594 -0.164587 0.111982 0.391523 -0.656658 0.514244 -0.385658 -0.082066 -0.306449 0.810565 -0.694959 -0.296500 0.137355 -0.246647 0.456403 -0.376321 0.404851 -0.319968 -0.270184 0.835729 0.550599 0.198025 1.204497 -0.237713 -0.441482 -0.020244 0.109327 -0.164325 -0.587616 0.205860 0.452751 0.599642 -0.033190 0.569953 0.012436 -0.112856 0.127171 -0.183683 0.785418 0.581380 -0.613472 0.544212 0.165229 0.089152 -0.788505 -0.047272 -0.276138 -0.010708 -0.105783 0.323865 0.158936 0.329226 -0.172630 0.240504 0.143898 0.473886 -0.170357 0.875988 -0.070156 0.038928 0.481665 0.419615 0.238390 0.543345 0.468181 0.671512 -0.106844 -0.193693 -0.246399 -0.186887
- 2 0.361149 0.704917 0.291457 -0.080728 -0.572933 -0.589067 -0.793803 -1.000979 0.820175 0.295928 0.184207 0.885769 -0.399009 0.637581 0.318221 -0.022845 0.259705 -0.114552 -0.054940 0.182685 0.221006 -0.218122 0.865297 -0.441181 0.185419 0.447350 -0.213860 -1.769530 -0.623723 -1.066396 0.099660 1.458072 0.450007 -0.789303 -0.121537 0.312682 0.246968 0.901696 0.033775 0.563942 0.521539 0.790706 -0.875751 0.932463 -0.507096 -0.130464 1.109031 -0.469966 0.884996 -0.271595 0.670886 0.471352 -0.666085 0.080534 0.530477 0.371931 -0.060190 -0.320351 0.219629 -0.268545 0.640168 -0.545242 0.073721 -0.646921 0.468086 -0.719434 -0.026094 0.861088 0.010164 0.028802 -0.204849 0.493036 0.357690 -0.472914 -0.582728 0.376733 0.621382 -0.667704 -0.412258 -0.081961 0.306849 -0.197246 0.451879 -1.593393 0.159934 -0.159739 0.726716 0.039497 -0.330342 -0.193373 -0.045038 -0.785062 0.898486 -0.217732 1.028231 0.260610 0.865236 -0.474029 -0.726025 -0.554919 -1.216791 -0.570384 0.724477 -0.346603 -0.662520 0.868426 0.021664 -0.019214 0.271993 -0.176794 0.194711 -0.084973 -1.092003 -0.006232 -0.776445 -0.021545 0.000348 -0.076073 0.193496 0.259513 -0.696651 -0.119337 0.141526 0.255560 0.426479 0.253425 -0.899710 0.643363 -0.486986 0.184396 1.035831 0.141304 0.019429 0.155365 0.002902 -0.109107 0.055377 -0.024775 -0.693244 0.206938 0.179723 -0.798811 0.141440 -0.525572 -0.577044 -0.878470 0.477901 0.946455 -0.141953 -0.503441 -0.478369 0.069927 -0.136817 0.606908 -0.076950 0.324349 0.192879 0.114894 -0.919087 0.352348 -1.414676 0.916139 0.328463 1.186773 0.745050 -0.434486 -0.267733 -0.199708 -0.534591 0.888049 -0.429322 0.363039 -0.040896 -0.322206 -0.337510 -0.026441 -0.142639 0.213708 -0.124947 0.622512 -1.049125 -0.706038 -0.208899 0.447170 0.536233 -0.231709 0.489767 0.514644 0.048299 -0.557416 0.273464 -1.164850 0.695038 0.409650 -0.814939 -0.453800 0.442305 -0.609455 0.602072 0.006255 -0.069054 -0.450125 0.125254 -0.478271 -0.413480 -1.055180 -0.893434 -0.556728 -1.093960 -0.768353 
0.271325 -0.388704 -0.630777 0.779149 -0.211849 -0.288888 -0.145470 -0.570022 -0.030645 -0.831691 0.284689 0.429069 0.362861 -0.749234 0.465644 0.248452 -0.246630 -0.053088 0.302568 0.058695 -0.025171 0.224125 -0.313906 0.595948 -0.747329 0.141962 -0.211617 0.277275 -0.510860 0.348968 0.048324 -0.095582 0.024356 -0.532523 -0.415193 0.493446 0.292916 -0.496475 0.292169 -0.555822 0.702531 -0.081132 -0.078505 0.356687 0.408023 0.532309 -0.100873 0.750387 0.157968 -0.626308 0.372636 0.901865 0.156366 -0.398677 -0.315714 0.024024 -0.440340 0.041518 0.937736 0.692113 -0.201351 -0.068569 -0.755074 -0.320450 -0.319768 -0.102927 0.009273 0.043572 -0.163320 -0.071635 0.635202 0.503277 0.106900 -0.362000 0.350847 -0.668718 -0.516442 -0.376715 -0.380673 0.517315 0.964409 -0.042094 -0.786081 -0.959971 -0.023261 1.323840 1.026073 -0.052168 -0.350861 0.062560
- 年 -0.025680 -0.148341 0.260403 0.435279 0.340753 -0.370165 0.692524 0.405896 0.703198 0.419036 0.065059 1.252715 -0.546617 0.288202 0.012702 0.090569 -0.643207 -0.240639 0.319901 -0.293442 0.487747 1.345064 0.846493 0.117547 -1.066390 0.662450 0.008567 -0.178598 -0.069384 -0.303476 -0.419415 0.074236 -0.287813 -0.494250 -1.147110 -0.279624 0.231568 0.129380 -0.133671 0.771361 0.169357 0.685386 -0.558743 -0.012061 -0.356053 0.066274 -0.478152 0.360300 0.584327 -0.111067 0.741198 0.946980 -0.227466 0.026979 0.199299 -0.356406 0.466501 -0.262172 -0.271103 -0.312378 0.475935 0.135873 -0.576739 -0.376040 0.230084 0.311886 -0.885410 -0.017326 0.172162 -0.166069 -0.669513 -0.079418 0.324018 0.229297 0.349141 1.200803 -0.836550 -0.131899 0.847725 0.495316 0.118726 0.463500 0.874937 -0.116077 -0.785283 0.009747 -1.198582 -0.528437 -1.039171 0.715792 0.459527 -0.346323 0.090036 0.272467 -0.146408 -0.137629 0.630577 0.404533 0.325585 -0.837493 0.497690 -0.118868 0.341744 0.342129 -0.100156 0.656184 0.002508 0.282706 0.801364 -0.268771 0.100605 0.126436 -0.148326 0.836686 0.522442 0.337463 0.759250 0.411933 0.206825 -0.341759 -1.007716 -1.160924 0.417559 0.168041 -0.196686 -0.114311 -0.240378 0.511523 -0.102376 0.243313 0.841565 -0.114063 -0.525006 -0.811578 -0.088705 0.243166 0.499764 -0.087099 -0.166266 -0.083989 -0.240918 0.481979 -0.541293 0.408957 -0.772534 -0.699110 0.168422 0.308715 -0.136194 0.043910 -0.489641 -1.030214 0.028125 0.707278 -0.245171 0.421922 -0.013471 0.309093 -0.260784 -0.311308 0.071559 0.550136 0.403435 0.467421 1.336632 0.114131 -0.127869 0.211865 -0.463383 0.807332 -0.462692 -0.031325 0.557738 -0.334642 -0.955832 1.272109 -0.304985 -0.560109 0.276666 0.388646 -1.179766 0.460541 -1.091042 -0.190026 0.188740 -0.575042 -0.085519 0.385787 0.031602 -0.232948 -0.306679 -0.166243 0.281174 -0.706293 -0.701480 -0.668203 -0.032880 0.267294 -0.275263 0.358886 -0.889863 -0.086739 0.058511 -0.305190 0.250048 -0.219450 -0.859118 -0.484297 -0.703766 -0.833164 
0.497965 -0.961938 -0.814127 0.056959 -0.110192 0.812637 -0.936772 -0.882466 -0.594758 0.620133 -0.045738 -0.646158 0.377140 0.196992 0.951233 0.108737 0.363643 0.312669 0.062553 0.577432 0.085247 0.433714 -0.561889 1.315280 -0.046503 1.064645 -0.535019 -0.173368 -0.653338 0.681322 -0.250618 0.710895 0.240201 -0.413712 -0.050010 0.009633 -0.610238 -0.342928 0.126470 0.558831 0.700774 0.322080 0.095641 0.450095 -0.186828 0.389571 0.042733 -0.028652 0.072356 -0.591279 0.306309 1.063946 -0.136931 -0.370101 -0.133079 -0.435311 -0.852584 0.450015 0.808773 0.596418 -0.594216 0.450295 0.264785 0.314351 -0.079821 0.326998 -0.191889 0.554631 0.185501 0.550335 0.010618 0.740444 -0.118180 -0.088854 0.899637 -0.636072 -0.017279 -0.687941 -0.471457 0.413444 0.003071 -0.017811 0.194908 -0.023812 -0.580133 0.906154 -0.002010 0.835716 -0.461930 -1.249504
- 请
- from gensim.models import KeyedVectors
- from gensim.scripts.glove2word2vec import glove2word2vec
-
- # Input: the GloVe-format vector file
- glove_file = 'data/glove/vectors.txt'
- # Output: where the word2vec-format copy is written
- w2v_file = 'data/glove/w2v.txt'
- # Run the conversion
- glove2word2vec(glove_file, w2v_file)
- # Load the converted file
- model = KeyedVectors.load_word2vec_format(w2v_file) # the file being loaded must be UTF-8 encoded
- print(model['时间']) # from here on, usage is the same as for a model trained with Word2Vec
Word2Vec表示的词向量不仅考虑了词之间的语义信息,还压缩了维度。但是,有时候当我们需要得到Sentence/Document的向量表示,虽然可以直接将Sentence/Document中所有词的向量取均值作为Sentence/Document的向量表示,但是这样会忽略单词之间的排列顺序对句子或文本信息的影响。
Doc2vec是在Word2vec的基础上做出的改进,它不仅考虑了词和词之间的语义,也考虑了词序。
Doc2Vec有两种模型,分别为:句向量的分布记忆模型(PV-DM: Distributed Memory Model of Paragraph Vectors)和句向量的分布词袋(PV-DBOW: Distributed Bag of Words version of Paragraph Vector)。
DM模型在给定上下文和文档向量的情况下预测单词的概率。即在训练时,首先将每个文档ID和语料库中的所有词初始化一个K维的向量,然后将文档向量和上下文词的向量输入模型,隐层将这些向量累加(或取均值、或直接拼接起来)得到中间向量,作为输出层softmax的输入。在一个文档的训练过程中,文档ID保持不变,共享着同一个文档向量,相当于在预测单词的概率时,都利用了这个句子的语义。
DBOW模型在给定文档向量的情况下预测文档中一组随机抽样的单词的概率。
注:Doc2vec的DM模型跟Word2vec的CBOW很像,DBOW模型跟Word2vec的Skip-gram很像。Doc2Vec为不同长度的段落训练出同一长度的向量;不同段落的段落向量不共享;而词向量在所有段落之间是一致的,可以共享。
- from gensim.models.doc2vec import Doc2Vec, TaggedDocument
- import pandas as pd
- import numpy as np
-
- # 构建Doc2vec模型,获得句子向量
- def get_sentence_vec(datasets):
- # gemsim里Doc2vec模型需要的输入为固定格式,输入样本为[句子,句子序号]
- documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(datasets)]
- # 初始化和训练模型
- model = Doc2Vec(documents, vector_size=500, dm=1, window=4, min_count=5,epochs=50)
- #model = Doc2Vec(vector_size=300, dm=1, window=4, min_count=5, epochs=50)
- #model.build_vocab(documents)
- #model.train(documents,total_examples=model.corpus_count,epochs=model.epochs)
-
- model.save('data/w2v/doc2vec_model.pkl') # 将模型保存到磁盘
- # 获得数据集的句向量
- documents_vecs = np.concatenate([np.array(model.docvecs[sen.tags[0]].reshape(1, 300)) for sen in documents])
- return documents_vecs
-
- if __name__=='__main__':
- #准备数据
- train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
- test_data = pd.read_csv('data/clean_data_test.csv', sep=',', names=['contents', 'labels']).astype(str)
- cw = lambda x: str(x).split()
- train_data['words'] = train_data['contents'].apply(cw)
- test_data['words'] = train_data['contents'].apply(cw)
- datasets = pd.concat([train_data, test_data])
-
- #doc2vec句向量训练和生成
- documents_vec=get_sentence_vec(list(datasets['words']))
-
- #加载训练好的模型
- doc2vec_model=Doc2Vec.load('data/w2v/doc2vec_model.pkl')
- #推断新文档向量
- doc2vec_model.infer_vector(['绝望','快递','说','收到','快递','中奖','开心'])
Fasttext可以实现高效学习单词表示和句子分类;Fasttext是一个快速文本分类算法,与基于神经网络的分类算法相比有两大优点:(1)在保持较高精度的同时,大幅加快了训练和预测速度;(2)不需要预训练好的词向量,Fasttext会自己训练词向量。
Fasttext论文 Fasttext源码 Fasttext官网 Fasttext原理和实现 Fasttext源码解析 Fasttext源码解析
使用fasttext.train_unsupervised函数训练词向量模型
代码实现如下:
- import fasttext
-
- # Skipgram model
- model = fasttext.train_unsupervised('data.txt', model='skipgram')
-
- # cbow model (rebinds `model`, replacing the skipgram model above)
- model = fasttext.train_unsupervised('data.txt', model='cbow')
-
- print(model.words) # list of words in dictionary
- print(model['king']) # get the vector of the word 'king'
-
- model.save_model("model_filename.bin") # save the trained model object by calling save_model
-
- model = fasttext.load_model("model_filename.bin") # load the trained model back
其中,data.txt是包含utf-8编码文本的训练文件;返回的model对象表示学习的词向量模型,可以使用该模型检索信息。
使用fasttext.train_supervised函数训练文本分类器
代码实现如下:
- # coding=utf-8
- import pandas as pd
- import numpy as np
- import fasttext
- from sklearn import metrics
-
- # CSV文件转换为fasttext格式的文件
- def fasttext_file():
- train_data = pd.read_csv('data/clean_train_data.csv', sep=',', names=['contents', 'labels']).astype(str)
- test_data = pd.read_csv('data/clean_test_data.csv', sep=',', names=['contents', 'labels']).astype(str)
- train_rows = len(train_data)
- test_rows = len(test_data)
- with open('data/fasttext_train.txt', 'a') as f:
- for i in range(train_rows):
- f.write(train_data['contents'][i] + '\t' + '__label__' + train_data['labels'][i] + '\n')
- f.close()
-
- with open('data/fasttext_test.txt', 'a') as f:
- for i in range(test_rows):
- f.write(train_data['contents'][i] + '\t' + '__label__' + train_data['labels'][i] + '\n')
- f.close()
-
- if __name__ == '__main__':
- # fasttext_file()
- # 第一个参数是前面得到的 fasttex_train.txt ,第二个参数是将要保存模型的路径,默认会加上.bin
- # label_prefix 就是标签或类别的起始符号
- classifier = fasttext.supervised("data/fasttext_train.txt", "data/fasttext.model", label_prefix="__label__",dim=5000, epoch=100, min_count=10)
- # 加载模型
- classifier = fasttext.load_model("data/fasttext.model.bin", label_prefix="__label__")
-
- # 测试模型 其中 fasttext_test.txt 就是测试数据,格式和 fasttext_train.txt 一样
- result = classifier.test("data/fasttext_test.txt")
- print(result)
- print("准确率:", result.precision)
- print("召回率:", result.recall)
其中,fasttext_train.txt 是一个文本文件,每行包含一个训练语句及其标签。默认情况下,标签是以字符串 __label__ 为前缀的单词。
- import fasttext
- model = fasttext.train_supervised('fasttext_train.txt')
-
- #我们还可以预测特定文本的标签
- model.predict("Which baking dish is best to bake a banana bread ?")
-
- #默认情况下,predict仅返回一个标签:概率最高的标签。您还可以通过指定参数来预测多个标签k:
- model.predict("Which baking dish is best to bake a banana bread ?", k=3)
-
- #如果要预测多个句子,可以传递一个字符串数组:
- model.predict(["Which baking dish is best to bake a banana bread ?",
- "Why not put knives in the dishwasher?"], k=3)
-
- #model_filename.ftz将比一个小得多的尺寸model_filename.bin
- model.save_model("model_filename.ftz")
Elmo,Bert,Flair……
参考:
本人博文NLP学习内容目录:
一、NLP基础学习
二、NLP项目实战
交流学习资料共享欢迎入群:955817470(群一),801295159(群二)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。