当前位置:   article > 正文

Python---正向、逆向和双向最大匹配算法_双向最大匹配算法 python

双向最大匹配算法 python

使用python实现正向、逆向和双向最大匹配算法
正向最大匹配

class leftMax(object):
    """Forward (left-to-right) maximum-matching word segmenter.

    The vocabulary is read from a text file in which every non-blank line is
    split on tab characters and the second field is taken as the word.
    """

    def __init__(self, dict_path):
        """Load the vocabulary from ``dict_path`` (UTF-8 text).

        Args:
            dict_path: Path of the dictionary file.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a non-blank line has no second tab-separated field.
        """
        self.dictionary = set()  # known words
        self.maximum = 0  # length of the longest known word

        with open(dict_path, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                word = line.split('\t')[1]
                self.dictionary.add(word)
                # Measure the word itself, not the whole line: the other
                # tab-separated fields would inflate the matching window and
                # make cut() probe sizes that can never match.
                if len(word) > self.maximum:
                    self.maximum = len(word)

    def cut(self, text):
        """Segment ``text`` greedily from left to right.

        At each position the longest dictionary word starting there is taken;
        if no dictionary word matches, a single character is emitted.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in text order; ``[]`` for an empty string.
        """
        result = []
        total = len(text)
        index = 0
        while index < total:
            # Never try a window longer than what is left of the text.
            longest = min(self.maximum, total - index)
            # Sizes stop at 2: a single character is emitted whether or not
            # it is in the dictionary, so it needs no lookup.
            for size in range(longest, 1, -1):
                piece = text[index:index + size]
                if piece in self.dictionary:
                    break
            else:
                piece = text[index]
            result.append(piece)
            index += len(piece)
        return result
def main():
    """Segment a sample sentence with forward maximum matching and print the tokens."""
    segmenter = leftMax('XXX/ChineseDic.txt')
    sentence = "北京大学生前来应聘算法工程师岗位"
    print(segmenter.cut(sentence))


main()
运行结果:
['北京大学', '生前', '来', '应聘', '算法', '工程师', '岗位']

逆向最大匹配

class rightMax(object):
    """Backward (right-to-left) maximum-matching word segmenter.

    The vocabulary is read from a text file in which every non-blank line is
    split on tab characters and the second field is taken as the word.
    """

    def __init__(self, dict_path):
        """Load the vocabulary from ``dict_path`` (UTF-8 text).

        Args:
            dict_path: Path of the dictionary file.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a non-blank line has no second tab-separated field.
        """
        self.dictionary = set()  # known words
        self.maximum = 0  # length of the longest known word

        with open(dict_path, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                word = line.split('\t')[1]
                self.dictionary.add(word)
                # Measure the word itself, not the whole line: the other
                # tab-separated fields would inflate the matching window and
                # make cut() probe sizes that can never match.
                if len(word) > self.maximum:
                    self.maximum = len(word)

    def cut(self, text):
        """Segment ``text`` greedily from right to left.

        At each position the longest dictionary word ending there is taken;
        if no dictionary word matches, a single character is emitted.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in text order; ``[]`` for an empty string.
        """
        result = []
        index = len(text)  # exclusive end of the unsegmented prefix
        while index > 0:
            # Never try a window longer than what is left of the text.
            longest = min(self.maximum, index)
            # Sizes stop at 2: a single character is emitted whether or not
            # it is in the dictionary, so it needs no lookup.
            for size in range(longest, 1, -1):
                piece = text[index - size:index]
                if piece in self.dictionary:
                    break
            else:
                piece = text[index - 1:index]
            result.append(piece)
            index -= len(piece)
        # Tokens were collected from the end of the text backwards.
        result.reverse()
        return result
    
def main():
    """Segment a sample sentence with backward maximum matching and print the tokens."""
    segmenter = rightMax('XXX/ChineseDic.txt')
    sentence = "北京大学生前来应聘算法工程师岗位"
    print(segmenter.cut(sentence))


main()
运行结果:
['北京', '大学生', '前来', '应聘', '算法', '工程师', '岗位']

双向最大匹配

def doubleMax(text, path):
    """Segment ``text`` with bidirectional maximum matching.

    Runs forward (``leftMax``) and backward (``rightMax``) maximum matching
    with the dictionary at ``path`` and picks one of the two results:

    1. the result with fewer tokens;
    2. if the counts are equal and the results are identical, the forward
       result;
    3. otherwise the forward result only if it has strictly fewer
       single-character tokens, else the backward result.

    Args:
        text: The string to segment.
        path: Path of the dictionary file handed to both segmenters.

    Returns:
        The selected list of tokens.
    """
    leftMatch = leftMax(path).cut(text)
    rightMatch = rightMax(path).cut(text)

    # Prefer the segmentation with fewer tokens.
    if len(leftMatch) != len(rightMatch):
        if len(leftMatch) < len(rightMatch):
            return leftMatch
        # The original returned the undefined name `rightMatcht` here,
        # which raised NameError whenever the backward result was shorter.
        return rightMatch

    # Same token count and same tokens: both results are equivalent.
    if leftMatch == rightMatch:
        return leftMatch

    # Same token count, different tokens: compare single-character tokens.
    leftsingle = sum(1 for word in leftMatch if len(word) == 1)
    rightsingle = sum(1 for word in rightMatch if len(word) == 1)
    if leftsingle < rightsingle:
        return leftMatch
    return rightMatch

# Demo: segment a sample sentence with bidirectional maximum matching and print the tokens.
text = "北京大学生前来应聘算法工程师岗位"
print(doubleMax(text,'XXX/ChineseDic.txt'))
运行结果:
['北京', '大学生', '前来', '应聘', '算法', '工程师', '岗位']

ChineseDic.txt 词典可从该处下载:中文分词 词库。注意:上述代码假定词典文件为 UTF-8 编码,每行以制表符(\t)分隔,词语位于第二列。
完!

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小小林熬夜学编程/article/detail/374886
推荐阅读
相关标签
  

闽ICP备14008679号