#-------------------------------------------------------------------------------
# Name:        n-gram segmentation of Chinese
# Purpose:     Assignment for Chapter 5 of the NLP course
#              My ability is limited; this is for reference only
# Author:      nkenen
#
# Created:     22/02/2020
# Copyright:   (c) Administrator 2020
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import re

symbol = ',.!?。,?!0123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'

# This routine is not part of the segmenter itself; it only converts the tagged
# 1998 People's Daily corpus into an untagged one (a format sketch follows the function).
def Makenomarkedcorpus():
    file = open('F:/自然语言处理/1980pd.txt', 'w', encoding='utf-8')
    filer = open('F:/自然语言处理/199801_people_s_daily.txt', 'r', encoding='utf-8', errors='ignore')

    for line in filer:
        out = ''
        flag = 0
        i = 0
        while i < len(line):
            if line[i] == '/':
                # skip the part-of-speech tag that follows '/' up to the next space
                while line[i] != ' ':
                    i += 1
                    if i >= len(line):
                        break
            elif line[i] == ']':
                out += ']'
                flag = 1
            elif flag == 1:
                out += line[i]
            i += 1
        file.write(out + '\n')

    file.close()
    filer.close()

# Overall segmentation procedure:
# Step 1: load the corpus
# Step 2: list all possible segmentations of the sentence
# Step 3: score each segmented sentence against the corpus with an n-gram model and
#         take the maximum probability (see the sketch just below)
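
# A sketch of the score computed in step 3 (assuming the add-one smoothing used in
# ComputeProb below, with the corpus size as the denominator):
#   P(w1 ... wm) ≈ ∏_i (count(history_i + wi) + 1) / len(corpus)
# where history_i is up to N preceding words of wi.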

# Corpus class
class Corpus(object):
    def __init__(self):
        self.content = ''
        self.size = 0

    # Load the corpus; file.readlines() is flattened into one big string via str()
    def loadlib(self, path):
        with open(path, 'r', encoding='utf-8', errors='ignore') as file:
            self.content = str(file.readlines())
            self.size = len(self.content)

# n-gram model
class N_gram(object):
    def __init__(self, N, corpus):
        self.N = N
        self.corpus = corpus.content
        self.problist = []

    # Compute the probability of each sentence: the product of
    # p(wi | wi-1 ... wi-N), conditioning on the preceding history
    def ComputeProb(self, entlist):
        for sen in entlist:
            lst = sen.split(" ")
            lens = len(lst)
            prob = 1.0

            if self.N > 1:
                # In my runs every model with N > 2 produced identical probabilities;
                # this may be a corpus or algorithm issue, or a gap in my understanding.
                prob *= (self.corpus.count(lst[0]) + 1) / len(self.corpus)  # add-one smoothing; nothing fancier
                for i in range(1, len(lst)):
                    strh = ''
                    stra = ''
                    k = 0
                    if i <= self.N:
                        k = 0
                    else:
                        k = i - self.N
                    for j in range(k, i):  # collect the preceding history of up to N words (may be fewer)
                        strh += lst[j] + ' '
                    stra = strh + lst[i]
                    #print(stra)
                    # multiply in p(wi | history)
                    #prob *= (self.corpus.count(stra)+1)/(self.corpus.count(strh))
                    # the proper denominator count(history) can be zero, so the corpus size is used instead
                    prob *= (self.corpus.count(stra) + 1) / len(self.corpus)

            else:
                for i in range(len(lst)):
                    prob *= (self.corpus.count(lst[i]) + 1) / len(self.corpus)

            self.problist.append(prob)
        print(dict(zip(entlist, self.problist)))
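
    # A hypothetical usage sketch (the corpus string and sentences below are made up
    # for illustration; real use goes through Corpus.loadlib):
    #   corpus = Corpus()
    #   corpus.content = '我 爱 北京 天安门 '
    #   ng = N_gram(2, corpus)
    #   ng.ComputeProb(['我 爱 北京', '我 爱 北 京'])
    #   ng.findBiggestP(['我 爱 北京', '我 爱 北 京'])   # returns the split with the higher score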

    # Find the segmentation with the largest probability
    def findBiggestP(self, entlist):
        maxp = 0.0
        index = 0
        for i in range(len(self.problist)):
            if maxp < self.problist[i]:
                maxp = self.problist[i]
                index = i
        print([str(entlist[index]), maxp])
        return str(entlist[index]), maxp

    def clearplst(self):
        self.problist.clear()

class Segmentation(object):
    def __init__(self, content, corpus):
        self.content = content
        self.entlist = []
        self.endindex = 0
        self.worddict = corpus.content
        self.n_gram = N_gram(2, corpus)

    def loadContent(self, path):
        with open(path, 'r', encoding='utf-8', errors='ignore') as file:
            self.content = str(file.readlines())

    def processSeg(self):
        strA = ''
        seg_suc_str = ''
        prob_suc = 1.0
        global symbol
        for word in self.content:
            if word in symbol:
                # segment the characters accumulated so far
                self.entlist.clear()
                self.endindex = 0
                self.n_gram.clearplst()
                # step 2: list all possible segmentations
                self.__findAllSentence(strA)
                # compute the probability of every candidate sentence
                if len(strA) > 1:
                    self.n_gram.ComputeProb(self.entlist)
                    # after segmenting, append the following symbol and continue
                    strA, prob = self.n_gram.findBiggestP(self.entlist)
                    prob_suc *= prob

                #else: probabilities of symbols and digits are not computed
                    #prob *= (self.corpus.count(word)+1)/len(len(self.corpus))

                if strA != '':  # for a non-empty segment, add a trailing space to mark the boundary
                    strA += ' '
                seg_suc_str += strA + word + ' '
                strA = ''
                prob = 0
            elif word not in '\n\t\r':
                strA += word

        return seg_suc_str, prob_suc
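
    # Flow of processSeg on a hypothetical input: for content '我爱北京!好', the characters
    # before '!' accumulate in strA and are segmented as one unit when '!' is reached;
    # '!' itself is then appended to the result and accumulation starts over with '好'.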

    def __findAllSentence(self, strA):
        # extract every word that exists in the corpus; each entry is [word, start, stop]
        exword = self.__findExistofword(strA)
        if exword != [] and exword != None:
            for i in range(len(exword)):  # this loop is important: find every word that starts at position 0
                if exword[i][1] == 0:  # starts at position 0
                    self.endindex = 0
                    strs = exword[i][0]
                    # recursively walk all possible following nodes, much like traversing a tree
                    self.__findSentence(exword, strs, i, len(strA))
        print(self.entlist)

    # Find all substrings that occur in the corpus (the corpus is already segmented)
    def __findExistofword(self, strA):
        if len(strA) <= 1:
            return
        exword = []
        for i in range(len(strA)):
            for j in range(i + 1, len(strA) + 1):
                if strA[i:j] in self.worddict:
                    exword.append([strA[i:j], i, j])
        print('exword is ', exword)
        return list(exword)

    # Core routine: enumerate all possible segmented sentences
    def __findSentence(self, exword, strs, i, tlen):
        if self.endindex == tlen:  # the last node has been reached; record this segmentation
            self.entlist.append(strs)
            return
        else:
            for j in range(i + 1, len(exword)):
                if exword[i][2] == exword[j][1]:  # the next contiguous node
                    strs1 = strs + ' ' + exword[j][0]  # extend the sentence, separating words with a space
                    self.endindex = exword[j][2]  # record the end position reached so far
                    # keep searching downward
                    self.__findSentence(exword, strs1, j, tlen)
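
    # Illustration with hypothetical data: if strA were '今天天气' and the substring lookups
    # matched '今', '今天', '天' (at both positions), '天气' and '气', then exword would be
    #   [['今', 0, 1], ['今天', 0, 2], ['天', 1, 2], ['天', 2, 3], ['天气', 2, 4], ['气', 3, 4]]
    # and the enumerated sentences would include '今天 天气', '今天 天 气' and '今 天 天 气'.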


def main():
    path = 'F:/自然语言处理/1980pd.txt'
    #Makenomarkedcorpus()
    corpus = Corpus()
    corpus.loadlib(path)

    seg_str = '我正在学习自然语言处理!'
    segmentation = Segmentation(seg_str, corpus)
    seg_suc_str, prob_suc = segmentation.processSeg()

    print(seg_suc_str, prob_suc)
    pass

if __name__ == '__main__':
    main()