赞
踩
基于规则的分词主要依靠维护一个词典:在切分语句时,将语句中的候选字符串与词典中的词逐一匹配,匹配成功则切分,否则不切分。
class MM(object):
    """Forward maximum-matching (MM) word segmenter backed by a word set.

    The text is scanned from left to right. At each position the longest
    dictionary word starting there is emitted; when no dictionary word
    matches, a single character is emitted instead.
    """

    def __init__(self, dic_path):
        """Load the dictionary used for matching.

        Args:
            dic_path: Path to a UTF-8 text file holding one word per line.
                Lines that are empty after stripping whitespace are skipped.
        """
        self.dictionary = set()
        # Length of the longest dictionary word; bounds the match window.
        self.maximum = 0
        # Read the dictionary.
        with open(dic_path, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def cut(self, text):
        """Split ``text`` into tokens by forward maximum matching.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in reading order. Every character of ``text``
            appears in exactly one token; an empty ``text`` gives ``[]``.
        """
        result = []
        index = 0
        length = len(text)
        while index < length:
            # Never try a window longer than what is left of the text.
            longest = min(self.maximum, length - index)
            for size in range(longest, 0, -1):
                piece = text[index:index + size]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word starts here: fall back to one character.
                piece = text[index]
            result.append(piece)
            index += len(piece)
        return result


def main():
    """Segment a sample sentence with the forward matcher and print it."""
    text = "南京市长江大桥"
    tokenizer = MM('./data/imm_dic.utf8')
    print(tokenizer.cut(text))


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
class IMM(object):
    """Reverse (inverse) maximum-matching word segmenter backed by a word set.

    The text is scanned from right to left. At each position the longest
    dictionary word ending there is emitted; when no dictionary word
    matches, a single character is emitted instead.
    """

    def __init__(self, dic_path):
        """Load the dictionary used for matching.

        Args:
            dic_path: Path to a UTF-8 text file holding one word per line.
                Lines that are empty after stripping whitespace are skipped.
        """
        self.dictionary = set()
        # Length of the longest dictionary word; bounds the match window.
        self.maximum = 0
        # Read the dictionary.
        with open(dic_path, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def cut(self, text):
        """Split ``text`` into tokens by reverse maximum matching.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in reading order. Every character of ``text``
            appears in exactly one token; an empty ``text`` gives ``[]``.
        """
        result = []
        index = len(text)  # exclusive end of the part still to be segmented
        while index > 0:
            # Never try a window longer than what is left of the text.
            longest = min(self.maximum, index)
            for size in range(longest, 0, -1):
                piece = text[index - size:index]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word ends here: keep the single character
                # as its own token so that no input is lost.
                piece = text[index - 1]
            result.append(piece)
            index -= len(piece)
        # Tokens were collected right-to-left; restore reading order.
        return result[::-1]


def main():
    """Segment a sample sentence with the reverse matcher and print it."""
    text = "南京市长江大桥"
    tokenizer = IMM('./data/imm_dic.utf8')
    print(tokenizer.cut(text))


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
class BMM(object):
    """Bidirectional maximum-matching word segmenter.

    The text is segmented once left-to-right and once right-to-left against
    the same dictionary, and one of the two results is chosen by the rules
    described in :meth:`cut`.
    """

    def __init__(self, dic_path):
        """Load the dictionary used for matching.

        Args:
            dic_path: Path to a UTF-8 text file holding one word per line.
                Lines that are empty after stripping whitespace are skipped.
        """
        self.dictionary = set()
        # Length of the longest dictionary word; bounds the match window.
        self.maximum = 0
        self.dic_path = dic_path
        # Read the dictionary.
        with open(dic_path, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def _forward_cut(self, text):
        """Segment ``text`` left-to-right, longest dictionary word first."""
        tokens = []
        start = 0
        total = len(text)
        while start < total:
            longest = min(self.maximum, total - start)
            for size in range(longest, 0, -1):
                piece = text[start:start + size]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word starts here: emit one character.
                piece = text[start]
            tokens.append(piece)
            start += len(piece)
        return tokens

    def _backward_cut(self, text):
        """Segment ``text`` right-to-left, longest dictionary word first."""
        tokens = []
        end = len(text)
        while end > 0:
            longest = min(self.maximum, end)
            for size in range(longest, 0, -1):
                piece = text[end - size:end]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word ends here: emit one character.
                piece = text[end - 1]
            tokens.append(piece)
            end -= len(piece)
        # Tokens were collected right-to-left; restore reading order.
        return tokens[::-1]

    def cut(self, text):
        """Split ``text`` into tokens by bidirectional maximum matching.

        Selection rules, applied in order:

        1. If the two scans yield a different number of tokens, return the
           result with fewer tokens.
        2. If both results are identical, return it.
        3. Otherwise return the forward result only when it contains
           strictly fewer single-character tokens; in every other case
           (including a tie) return the backward result.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in reading order.
        """
        # Both scans reuse the dictionary loaded in __init__, so the
        # dictionary file is not read again on each call.
        forward = self._forward_cut(text)
        backward = self._backward_cut(text)

        # Prefer the result with fewer tokens.
        if len(forward) != len(backward):
            return forward if len(forward) < len(backward) else backward

        # Same number of tokens: look closer.
        if forward == backward:
            return forward

        # Count single-character tokens in each result.
        forward_single = sum(1 for token in forward if len(token) == 1)
        backward_single = sum(1 for token in backward if len(token) == 1)
        return forward if forward_single < backward_single else backward


def main():
    """Segment a sample sentence with the bidirectional matcher and print it."""
    text = "南京市长江大桥"
    tokenizer = BMM('./data/imm_dic.utf8')
    print(tokenizer.cut(text))


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
以上总结了三种基于规则的匹配方法:正向最大匹配法(MM)、逆向最大匹配法(IMM)和双向最大匹配法(BMM),并给出了相应的代码实现。准备好词典文件(如 ./data/imm_dic.utf8,每行一个词)后,以上代码均可直接运行出结果。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。