赞
踩
# -*- coding: cp936 -*-
###libraries:
import sys

###global variables:
# The notes below state only what the training routine in this file is seen
# to do with each name.
# "word,property" key -> occurrence count (filled by chnsegtager_training).
freqdic = {}
# word -> comma-separated string of the properties recorded for that word.
dic = {}
# NOTE(review): not written in the visible part of the file; presumably
# holds property-transition data -- confirm against the rest of the file.
transferdic = {}
inputfilename = ''
outputfilename = ''
# Path of the file opened by chnsegtager_training.
trainingfilename = ''
marklist = []


###classes:
class nodeinfotable:
    """Record that stores the three constructor arguments unchanged.

    NOTE(review): the meaning of the fields is not shown in this part of the
    file; the names suggest an index, a length and a mapping of
    predecessors -- confirm against the code that builds these objects.
    """

    def __init__(self, No, length, previousdict):
        self.No = No
        self.length = length
        self.previousdict = previousdict


class node:
    """Container for an out-pointer dict and a table list.

    Both arguments are stored by reference; no copy is made.
    """

    def __init__(self, outpointerdict, tablelist):
        self.outpointerdict = outpointerdict
        self.tablelist = tablelist

    def addOutPointer(self, targetnum, wordname):
        """Set outpointerdict[targetnum] to wordname.

        An existing entry under the same key is overwritten.
        """
        self.outpointerdict[targetnum] = wordname


class wordnode:
    """Record pairing a word value with a list of property nodes.

    Both arguments are stored by reference; no copy is made.
    """

    def __init__(self, wordval, propertynodelist):
        self.wordval = wordval
        self.propertynodelist = propertynodelist


class propertynode:
    """Record holding a property value, a probability and a 'best stack'.

    NOTE(review): how beststack is used is not visible in this part of the
    file -- confirm before relying on its shape.
    """

    def __init__(self, propertyval, probability, beststack):
        self.propertyval = propertyval
        self.probability = probability
        self.beststack = beststack


###functions:
def findinstr(valuestr, cmpstr):
    """Return True if cmpstr equals one of the ','-separated fields of valuestr.

    Fields are compared exactly: no whitespace stripping and no substring
    matching, so findinstr("nr,v", "n") is False.  An empty valuestr has
    exactly one field, the empty string.
    """
    return cmpstr in valuestr.split(',')


# NOTE(review): chnsegtager_training is cut off in this view.  Its text is
# kept below as found -- still collapsed onto one line, and therefore still a
# comment -- except that its one non-English inline comment is translated and
# wrapped in parentheses to mark where it ends.  Restore the indentation once
# the whole function is available.
# generate the dictionary def chnsegtager_training(): global dic global freqdic global transferdic transfercounter=0 wordcounter=0 orgname = '' f=file(trainingfilename) while True: line = f.readline() if len(line)==0: break line=line.strip() linewordindex=0 prewordproperty='' currwordproperty='' if line.find('/') != -1: lineset = line.split(' ') for w in lineset:#(for each letter in a line) wordcounter+=1 singlewordset = w.split('/') formerword = singlewordset[0] laterword = singlewordset[1] if formerword.find('[') != -1: #has '[' token formerword=formerword[1:] orgname=formerword else: if orgname != '': orgname += formerword rbracketpos = laterword.find(']') if rbracketpos != -1: #has ']' token orgproperty=laterword[rbracketpos+1:] laterword=laterword[:rbracketpos] if dic.has_key(orgname): if freqdic.has_key(orgname+','+orgproperty): freqdic[orgname+','+orgproperty]+=1 else: freqdic[orgname+','+orgproperty]=1 orgvalueStr = dic[orgname] if findinstr(orgvalueStr,orgproperty)==False: dic[orgname] += ','+orgproperty orgname = '' else:
dic[orgname] = orgproperty freqdic[orgname+','+orgproperty]=1 orgname = '' if dic.has_key(formerword):#字典里存在此单词 if freqdic.has_key(formerword+','+laterword): freqdic[formerword+','+laterword]+=1 else: freqdic[formerword+','+laterword]=1 valueStr = dic[formerword]