# (Web-page residue removed: "like" / "dislike" vote button labels.)
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
import codecs
import sys
import os
def analyze(path, tokenizer=None):
    """Write the unique base forms of the words in a UTF-8 text file.

    Every line of ``path`` is tokenized with janome.  Tokens whose
    part-of-speech tag contains 記号 (symbol) or 人名 (person name) are
    skipped, and the base form of each remaining token is collected.  The
    distinct base forms are then written, sorted and one per CRLF-terminated
    line, to a UTF-8 file named ``path + "x"``.

    Args:
        path: Path of the UTF-8 encoded input text file.
        tokenizer: Optional object exposing a janome-compatible
            ``tokenize(text)`` method whose tokens have ``part_of_speech``
            and ``base_form`` attributes.  A new ``Tokenizer`` is created
            when omitted; pass one in to reuse it across several calls.

    Returns:
        None.  The result is the file written next to the input.

    Raises:
        IOError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    if tokenizer is None:
        tokenizer = Tokenizer()

    with codecs.open(path, encoding="UTF-8") as source:
        lines = source.readlines()

    words = set()
    for line in lines:
        for token in tokenizer.tokenize(line):
            # Look at the part-of-speech field only: matching against the
            # whole printed token would also drop ordinary words whose
            # surface or reading happens to contain 記号 or 人名.
            pos = token.part_of_speech
            if "記号" in pos or "人名" in pos:
                continue
            # Read the attribute instead of splitting the printed token on
            # commas, which picks the wrong field when the surface itself
            # contains a comma.
            words.add(token.base_form)

    # The output is opened only after tokenizing succeeded, so a failure
    # above does not leave a truncated file behind.  Giving an explicit
    # encoding makes codecs open the file in binary mode: the text is always
    # UTF-8 (not the locale code page) and "\r\n" is written verbatim rather
    # than being expanded to "\r\r\n" on Windows.
    with codecs.open(path + "x", "w", encoding="UTF-8") as target:
        # Sorted so that repeated runs produce identical files.
        target.writelines(word + "\r\n" for word in sorted(words))
# Script entry point: process the author's sample mail dump; the word list is
# written next to it as "mymail.txtx".
analyze(path="C:\\Users\\70485528\\mymail.txt")
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.