当前位置:   article > 正文

python tokenizer

ByteLevelBPETokenizer 在 Python 的哪个库中

# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
import codecs
import sys
import os

def analyze(path):
    """Write the unique base forms of the words in a Japanese text file.

    The UTF-8 file at ``path`` is tokenized line by line with janome.
    Tokens whose part of speech contains 記号 (symbol) or 人名 (person
    name) are skipped; the base form of every other token is collected
    into a set. The collected words are then written in sorted order, one
    per CRLF-terminated line, to a UTF-8 file named ``path + "x"``.

    Args:
        path: Path of the UTF-8 encoded input text file.

    Returns:
        None. The result is written to ``path + "x"``.
    """
    with codecs.open(path, encoding="UTF-8") as source:
        lines = source.readlines()

    tokenizer = Tokenizer()
    words = set()
    for line in lines:
        for token in tokenizer.tokenize(line):
            # Inspect only the part-of-speech field: searching the whole
            # stringified token would also discard words whose own text
            # happens to contain 記号 or 人名.
            pos = token.part_of_speech
            if "記号" in pos or "人名" in pos:
                continue
            words.add(token.base_form)

    # An explicit encoding keeps the output independent of the locale, and
    # codecs writes the "\r\n" terminator verbatim on every platform.
    with codecs.open(path + "x", "w", encoding="UTF-8") as sink:
        # Sorting makes the output reproducible; set order varies per run.
        for word in sorted(words):
            sink.write(word + "\r\n")

if __name__ == "__main__":
    # Run the extraction only when executed as a script, so that importing
    # this module has no filesystem side effects.
    analyze("C:\\Users\\70485528\\mymail.txt")

转载于:https://www.cnblogs.com/corgiwmh/p/6604732.html

声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号