赞
踩
import chardet

if __name__ == '__main__':
    # Demo script: segment a Chinese sentence with jieba, drop stop words,
    # and write the remaining tokens (space-separated) to test.txt.
    # NOTE(review): `file2file` and `jieba` are not imported in the visible
    # part of this file -- confirm they are brought into scope elsewhere.
    f = file2file()
    s = '中国是个好地方,我住在这里。'

    # sum(..., []) concatenates the lists returned by readtxt() into one flat
    # list before building the set (presumably one list of words per line of
    # the stop-word file -- TODO confirm against file2file.readtxt).
    stopwords = set(sum(f.readtxt('../data/HITstopwords.txt'), []))

    # Inspect the character set of s. chardet.detect() only accepts
    # bytes/bytearray, so a text string is encoded first while a byte string
    # is passed through unchanged.
    s_charset = chardet.detect(s if isinstance(s, bytes) else s.encode('utf-8'))

    # Segment the sentence with jieba.
    cut = jieba.lcut(s)

    # The following call always raises (presumably because jieba returns text
    # strings rather than bytes -- see the note on chardet.detect above):
    # cut_charset = chardet.detect(cut[0])

    # Optional encoding step, kept for reference: encode every token to
    # UTF-8 bytes.
    # k = []
    # for each in cut:
    #     k.append(each.encode('utf-8'))

    # Remove stop words. If the encoding step above is enabled, filter `k`
    # instead of `cut`:
    # cut__stop_data = [word for word in k if word not in stopwords]
    cut__stop_data = [word for word in cut if word not in stopwords]

    # Write the result to a local file; the context manager guarantees the
    # handle is flushed and closed even if the write fails.
    # NOTE(review): no explicit encoding is given, so the platform default
    # is used -- confirm that it can represent Chinese text.
    with open('test.txt', 'w') as out_file:
        out_file.write(' '.join(cut__stop_data))

    print('------------------Run over-----------------')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。