赞
踩
- #encoding=utf-8
- import sys
- import re
- import codecs
- import os
- import shutil
- import jieba
- import jieba.analyse
-
- # Load the user-defined dictionary file "dict_all.txt" into jieba before any
- # segmentation is performed.
- jieba.load_userdict("dict_all.txt")
-
# Read the numbered text files, segment them with jieba and merge the result
def read_file_cut(path_baidu="BaiduSpiderCountry",
                  res_name="Result_Country.txt",
                  file_count=100):
    """Segment numbered text files with jieba into one space-separated corpus.

    Files ``0001.txt`` .. ``<file_count>.txt`` (zero-padded to four digits)
    are read from ``path_baidu``. Every line is segmented with jieba in
    precise mode (``cut_all=False``) and the tokens are written to
    ``res_name`` separated by single spaces. Line breaks inside an input
    file are replaced by a space, and each input file ends with ``'\\r\\n'``,
    so the output holds exactly one line per input file.

    Args:
        path_baidu: Directory containing the numbered ``NNNN.txt`` files.
        res_name: Path of the output file; it is overwritten if it exists.
        file_count: Number of input files to process, starting at 1. The
            original note suggests 200 for the "5A" corpus and 100 for
            the others.

    Raises:
        IOError / OSError: If an input file is missing or unreadable, or
            the output file cannot be written.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Mode 'w' truncates an existing file, so no explicit remove is needed.
    # The context managers guarantee both files are closed even on errors.
    with codecs.open(res_name, 'w', 'utf-8') as result:
        for num in range(1, file_count + 1):
            # os.path.join keeps the path portable (no hard-coded backslash).
            file_name = os.path.join(path_baidu, "%04d.txt" % num)
            # Decode explicitly as UTF-8 so jieba always receives text and
            # the result does not depend on the platform default encoding.
            with codecs.open(file_name, 'r', 'utf-8') as source:
                for line in source:
                    # codecs.open does no newline translation: strip both
                    # '\r' and '\n' so Windows line endings do not leak
                    # into the tokens.
                    line = line.rstrip('\r\n')
                    seglist = jieba.cut(line, cut_all=False)  # precise mode
                    # A trailing space replaces the original line break.
                    result.write(' '.join(seglist) + ' ')
            print('End file: ' + str(num))
            # One output line per input file.
            result.write('\r\n')
    print('End Baidu')
-
- # Script entry point: run the segmentation only when this file is executed
- # directly, not when it is imported as a module.
- if __name__ == '__main__':
- read_file_cut()
# Build the binaries invoked below (./word2vec and ./distance).
make
# Download of the stock text8 demo corpus, left disabled because the
# training input here is Result_Country.txt instead.
#if [ ! -e text8 ]; then
# wget http://mattmahoney.net/dc/text8.zip -O text8.gz
# gzip -d text8.gz -f
#fi
# Train on Result_Country.txt and write the model to vectors.bin; `time`
# reports how long training took. Per the word2vec usage text (confirm
# against your build): -cbow 1 selects the CBOW model, -size 200 the vector
# dimensionality, -window 8 the context window, -negative 25 the number of
# negative samples, -hs 0 disables hierarchical softmax, -sample 1e-4 is the
# frequent-word subsampling threshold, -threads 20 the worker threads,
# -binary 1 writes binary output, -iter 15 the number of training iterations.
time ./word2vec -train Result_Country.txt -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
# Run the distance tool on the trained vectors.
./distance vectors.bin
# Change to the word2vec directory (machine-specific path; adjust as needed).
cd C:/Users/dell/Desktop/word2vec
# Run the demo script with sh.
sh demo-word.sh
# Run the distance tool again on the existing vectors.bin.
./distance vectors.bin
最后,希望这篇文章对你有所帮助。本文主要介绍的是word2vec的使用方法,更多的应用还需要你自己去研究学习。
word2vec源码、语料下载地址:
http://download.csdn.net/detail/eastmount/9434889
(By:Eastmount 2016-02-18 深夜1点 http://blog.csdn.net/eastmount/ )
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。