Current approaches to Chinese word segmentation fall into three broad categories: dictionary-based methods, statistics-based methods, and hybrid methods. A dictionary-based method matches substrings of the input against dictionary entries, and a segment is accepted whenever a corresponding entry is found. It is a rather primitive and comparatively inefficient segmentation strategy. As a simple example, to find the keyword "论文" ("paper") in the sentence "我要认真看论文" ("I want to read the paper carefully"), any matching scheme has to scan from left to right or from right to left, one character or one word at a time (the step length depends on how the segmentation granularity is controlled), and only after several rounds does it hit the word "论文" and count the match as successful. Depending on the scanning direction, this kind of segmentation is divided into forward matching and reverse matching; depending on which length is given priority, it is divided into maximum (longest-first) matching and minimum (shortest-first) matching.
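Before the full implementation below, here is a minimal sketch of forward maximum matching to make the scanning process concrete. The toy dictionary (我, 要, 认真, 看, 论文) is an assumption chosen only so the example sentence above segments end to end; a real system loads its dictionary from files, as the full code that follows does.

    # Minimal forward-maximum-matching sketch with an assumed toy dictionary.
    toy_dict = {"我", "要", "认真", "看", "论文"}
    max_len = max(len(w) for w in toy_dict)

    def fmm(text):
        """Scan left to right, always trying the longest dictionary match first."""
        i, result = 0, []
        while i < len(text):
            for size in range(min(max_len, len(text) - i), 0, -1):
                piece = text[i:i + size]
                if piece in toy_dict or size == 1:  # fall back to a single character
                    result.append(piece)
                    i += size
                    break
        return result

    print(fmm("我要认真看论文"))  # -> ['我', '要', '认真', '看', '论文']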
import os
import sys
import time


class IMM(object):
    def __init__(self, *dic_path):
        self.dictionary = set()
        self.maximum = 0
        # Load every dictionary file that was passed in
        for path in dic_path:
            self.load_dic(path)

    # Load one dictionary file: one entry per line, word in the first column
    def load_dic(self, dic_path):
        with open(dic_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                parts = line.strip().split()
                if not parts:
                    continue
                word = parts[0]
                self.dictionary.add(word)
                self.maximum = max(self.maximum, len(word))

    # Forward maximum matching
    def FMM_cut(self, text):
        result = []
        index = 0
        while index < len(text):  # index has not passed the end of the sentence
            match = False
            for size in range(self.maximum, 0, -1):
                if index + size > len(text):
                    continue
                piece = text[index:index + size]
                if piece in self.dictionary:
                    match = True
                    result.append(piece)
                    index += size
                    break
            if not match:
                # No dictionary word starts here; emit the single character
                result.append(text[index])
                index += 1
        return result

    # Reverse maximum matching
    def RMM_cut(self, text):
        result = []
        index = len(text)
        while index > 0:
            match = False
            for size in range(self.maximum, 0, -1):
                if index - size < 0:
                    continue
                piece = text[index - size:index]  # candidate word
                # On a match, move index back by the length of the word
                if piece in self.dictionary:
                    match = True
                    result.append(piece)
                    index -= size
                    break
            if not match:
                result.append(text[index - 1])
                index -= 1
        return result[::-1]

    # Bidirectional maximum matching
    def BMM_cut(self, text):
        words_FMM = self.FMM_cut(text)
        words_RMM = self.RMM_cut(text)
        print("FMM:", words_FMM)
        print("RMM:", words_RMM)
        # If the forward and reverse results agree, return either one
        if words_FMM == words_RMM:
            return words_FMM
        # Number of single-character words
        f_single_word = 0
        r_single_word = 0
        # Total number of words
        fmm_count = len(words_FMM)
        rmm_count = len(words_RMM)
        # Number of out-of-vocabulary words
        fmm_oov = 0
        rmm_oov = 0
        # Each criterion adds one penalty point; lower is better
        fmm_score = 0
        rmm_score = 0
        # The two results differ: prefer the one with fewer single-character
        # words, fewer out-of-vocabulary words and fewer words overall
        for each_word in words_FMM:
            if len(each_word) == 1:
                f_single_word += 1
            if each_word not in self.dictionary:
                fmm_oov += 1
        for each_word in words_RMM:
            if len(each_word) == 1:
                r_single_word += 1
            if each_word not in self.dictionary:
                rmm_oov += 1
        # Fewer out-of-vocabulary words is better
        fmm_score = fmm_oov + fmm_count + f_single_word
        rmm_score = rmm_oov + rmm_count + r_single_word
        # Return the result with the lower penalty
        if fmm_score < rmm_score:
            return words_FMM
        else:
            return words_RMM


def main():
    dict1_path = os.path.join(sys.path[0], r'.\data\dict.txt.big')
    dict2_path = os.path.join(sys.path[0], r'.\data\THUOCL_animal.txt')
    test_path = os.path.join(sys.path[0], r'.\data\CTBtestingset.txt')
    output_path = os.path.join(sys.path[0], r'.\data\output.txt')
    tokenizer = IMM(dict1_path, dict2_path)
    # tokenizer = IMM(r'./data/THUOCL_animal.txt')
    try:
        with open(test_path, 'r', encoding='utf-8') as input_text, \
                open(output_path, 'w', encoding='utf-8', newline='') as output:
            for line in input_text:
                words = tokenizer.BMM_cut(line.strip())
                print(words)
                out_line = ' '.join(words) + os.linesep
                print(out_line)
                output.write(out_line)
    except Exception:
        print("failed to open file", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    start = time.time()
    main()
    end = time.time()
    print("elapsed time:", end - start)
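To try the class without the dictionary and test files referenced in main() (dict.txt.big, THUOCL_animal.txt, CTBtestingset.txt), one can write a couple of assumed entries to a temporary dictionary file and segment the example sentence directly. This is only an illustrative sketch; the actual segmentation depends entirely on which words the dictionary contains.

    import tempfile

    # Write an assumed two-word dictionary to a temporary file, one entry per line.
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='utf-8') as f:
        f.write("认真\n论文\n")
        tmp_dict_path = f.name

    tokenizer = IMM(tmp_dict_path)
    print(tokenizer.BMM_cut("我要认真看论文"))
    # BMM_cut first prints the FMM and RMM results; with this dictionary both agree,
    # so the returned list is ['我', '要', '认真', '看', '论文'].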