赞
踩
data 文件夹
pip install gensim
数据集大概 1.2G,下载完成后放在 data 文件夹下。
data_pre_process.py
# -*- coding: utf-8 -*-
from gensim.corpora import WikiCorpus
import jieba
from langconv import *
def my_function():
space = ' '
i = 0
l = []
zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
f = open('./data/reduce_zhiwiki.txt', 'w')
wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={
})
for text in
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。