Scope: spelling-level correction only; grammar-level errors are not handled here.
"I like play football." — this kind of error is not corrected in this article.
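The pipeline below follows the standard noisy channel formulation (the same scoring the code later writes as log p(correct) + log p(mistake|correct)): for an observed misspelling s, pick the candidate correction c that maximizes

\hat{c} = \arg\max_{c \in \text{candidates}} P(s \mid c)\, P(c) = \arg\max_{c} \left[ \log P(s \mid c) + \log P(c) \right]

where P(s|c) is the channel (error) probability estimated from spell-errors.txt and P(c) comes from a bigram language model estimated on the Reuters corpus.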
Data files:
spell-errors.txt: one correct word per line with its observed misspellings, in the format "correct: mistake1, mistake2, ..."; misspellings not listed are later given a small fallback probability.
vocab.txt: the dictionary (one valid word per line).
testdata.txt: the test data, one line per sentence in the format "id <TAB> error count <TAB> text".
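A few illustrative lines of each format (the spell-errors.txt entries mirror the channel_prob output shown further below; the vocab.txt and testdata.txt lines are made-up examples of the formats just described):

spell-errors.txt:
raining: rainning, raning
writings: writtings
yellow: yello

vocab.txt:
apple
apply
ample

testdata.txt:
1<TAB>1<TAB>They told Reuter correspondents in Asian capitals a U.S. Move against Japan might boost protectionst sentiment in the U.S.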
# Vocabulary (dictionary)
# Store it as a set: membership tests are O(1) on average, versus O(n) for a list
vocab = set([line.rstrip() for line in open('vocab.txt')])
print(vocab)
# Generate all candidate corrections
def generate_candidates(word):
    """
    word: the given (misspelled) input
    Returns all valid candidates found in the vocabulary.
    """
    # Generate words at edit distance 1
    # 1. insert  2. delete  3. replace
    # appl -> replace: bppl, cppl, aapl, abpl...
    #         insert:  bappl, cappl, abppl, acppl...
    #         delete:  ppl, apl, app
    # Assume the 26 lowercase letters
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    # insert
    inserts = [L+c+R for L, R in splits for c in letters]
    # delete
    deletes = [L+R[1:] for L, R in splits if R]
    # replace
    replaces = [L+c+R[1:] for L, R in splits if R for c in letters]
    candidates = set(inserts + deletes + replaces)
    # Filter out words that are not in the vocabulary
    return [word for word in candidates if word in vocab]

print(generate_candidates("apple"))
['apple', 'apples', 'ample', 'apply']
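The TODO in the correction loop further below notes that when no edit-distance-1 candidate survives the vocabulary filter, more candidates could be generated. A minimal sketch (hypothetical helpers, not part of the original code) that applies the same three edit operations twice to cover edit distance 2:

# Sketch only: candidates within edit distance 2
def edits1(word):
    # Same insert/delete/replace edits as generate_candidates, but without the vocabulary filter
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    inserts = [L+c+R for L, R in splits for c in letters]
    deletes = [L+R[1:] for L, R in splits if R]
    replaces = [L+c+R[1:] for L, R in splits if R for c in letters]
    return set(inserts + deletes + replaces)

def generate_candidates_dist2(word):
    # Apply the edit-distance-1 operations twice, then keep only dictionary words
    dist2 = set(e2 for e1 in edits1(word) for e2 in edits1(e1))
    return [w for w in dist2 if w in vocab]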
from nltk.corpus import reuters
# Load the corpus (NLTK Reuters)
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
print(corpus)
[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]
# Build the language model: bigram
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc
    for i in range(0, len(doc)-1):
        # bigram: [i, i+1]
        term = doc[i]
        bigram = doc[i:i+2]
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        bigram = ' '.join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1

print(term_count)
{'<s>': 54716, 'ASIAN': 12, 'EXPORTERS': 46, 'FEAR': 2, 'DAMAGE': 13, 'FROM': 208, 'U': 6388, '.': 45900, 'S': 6382, '.-': 167, 'JAPAN': 295, 'RIFT': 1, 'Mounting': 1, 'trade': 2271, 'friction': 28, 'between': 1053, 'the': 58251, 'And': 227, 'Japan': 1594, 'has': 4679, 'raised': 334, 'fears': 65, 'among': 229, 'many': 297, 'of': 35979, 'Asia': 71, "'": 11272, 's': 9298, 'exporting': 51, 'nations': 399, 'that': 7377, 'row': 49, 'could': 1431, 'inflict': 1, ...}

print(bigram_count)
{'<s> ASIAN': 4, 'ASIAN EXPORTERS': 1, 'EXPORTERS FEAR': 1, 'FEAR DAMAGE': 1, 'DAMAGE FROM': 2, 'FROM U': 4, 'U .': 6350, '. S': 5809, 'S .-': 120, '.- JAPAN': 8, 'JAPAN RIFT': 1, 'RIFT Mounting': 1, 'Mounting trade': 1, 'trade friction': 17, 'friction between': 4, 'between the': 237, 'the U': 1959, ...}

# sklearn also provides ready-made tools for building such counts
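The correction loop further below combines these counts with add-one smoothing. A small helper sketch (hypothetical, not part of the original code) wrapping the same formula the loop uses, P(w2 | w1) = (count(w1 w2) + 1) / (count(w1) + V):

import numpy as np

V = len(term_count.keys())  # vocabulary size of the language model (the loop below defines the same V)

# Hypothetical helper: add-one smoothed log probability of the bigram "w1 w2"
def bigram_logprob(w1, w2):
    bigram = w1 + " " + w2
    if bigram in bigram_count and w1 in term_count:
        return np.log((bigram_count[bigram] + 1.0) / (term_count[w1] + V))
    return np.log(1.0 / V)

print(np.exp(bigram_logprob('<s>', 'ASIAN')))  # (4 + 1) / (54716 + V), using the counts printed above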
# Channel probability: p(mistake | correct), uniform over the misspellings listed for each correct word
channel_prob = {}
for line in open('spell-errors.txt'):
    items = line.split(":")
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].strip().split(",")]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1.0/len(mistakes)
print(channel_prob)
{'raining': {'rainning': 0.5, 'raning': 0.5}, 'writings': {'writtings': 1.0}, 'disparagingly': {'disparingly': 1.0}, 'yellow': {'yello': 1.0}, 'four': {'forer': 0.2, 'fours': 0.2, 'fuore': 0.2, 'fore*5': 0.2, 'for*4': 0.2}, 'woods': {'woodes': 1.0}, 'hanging': {'haing': 1.0}, 'aggression': {'agression': 1.0},...}
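Looking up a channel probability with the small fallback the correction loop below uses for (correct, mistake) pairs not listed in spell-errors.txt. A hypothetical helper, shown only to make the lookup explicit:

import numpy as np

# Hypothetical helper mirroring the fallback used in the correction loop below
def channel_logprob(correct, mistake):
    if correct in channel_prob and mistake in channel_prob[correct]:
        return np.log(channel_prob[correct][mistake])
    return np.log(0.0001)  # small probability for unlisted misspellings

print(channel_logprob('raining', 'rainning'))  # log(0.5) ≈ -0.693, per the output above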
import numpy as np

V = len(term_count.keys())

file = open("testdata.txt", "r")
for line in file:
    items = line.rstrip().split('\t')
    line = items[2].split()
    j = 0
    for word in line:
        if word not in vocab:
            # Need to replace word with a correct word
            # Step 1: generate all (valid) candidates
            candidates = generate_candidates(word)
            # One option: if candidates == [], generate more candidates, e.g. within edit distance 2
            # TODO: generate more candidates accordingly
            if len(candidates) < 1:
                j += 1  # keep the position index in sync even when no candidate is found
                continue
            probs = []
            # For each candidate, compute its score
            # score = p(correct) * p(mistake|correct)
            #       = log p(correct) + log p(mistake|correct)
            # Return the candidate with the highest score
            for candi in candidates:
                prob = 0
                # channel probability
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)
                # language model probability: add-one smoothed bigram with the previous word
                # note: when j == 0, line[j-1] is line[-1]; prepending '<s>' as in the LM would be more consistent
                pre_word = line[j-1] + " " + candi
                if pre_word in bigram_count and line[j-1] in term_count:
                    prob += np.log((bigram_count[pre_word] + 1.0) / (term_count[line[j-1]] + V))
                else:
                    prob += np.log(1.0 / V)
                # bigram with the following word
                if j + 1 < len(line):
                    pos_word = candi + " " + line[j+1]
                    if pos_word in bigram_count and candi in term_count:
                        prob += np.log((bigram_count[pos_word] + 1.0) / (term_count[candi] + V))
                    else:
                        prob += np.log(1.0 / V)
                probs.append(prob)
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
        j += 1

export. exports Natonal National exports, exports dlrs, dlrs remainder. remainder statementa statement security, security countries. countries political, political negotiations, negotiations statemant statement said. said Justice. Justice ananounced announced oil. oil avirted averted waters. waters histodric historic countries, countries said. said Athens, Athens Akiman, Akiman Ozal. Ozal disclosed. disclosed Zeeuw, Zeeuw Trade's Trades agriculture, agriculture prirce price supports. supports rates, rates cmmittee committee said. said can't cant eliminati eliminate unilaterally, unilaterally said. said lawmakers' lawmakers policy, policy daliry dairy policy, policy herds, herds said. said Wallis. Wallis Tomorrow, Tomorrow Lyng. Lyng week. week states. states May. Mayo transaction, transaction stock, stock share, share said. said said. said name, named GGHF, GGHF reporter's reporters aftere after bank's banks but, but are. area hand, hand that, that dollar, dollar higher. higher rates. rates grwth growth year. year consaquence consequence dollar's dollars rises, rises 1987. 1987 30, 30 dlrs, dlrs dlrs. dlrs period, period dlrs, dlrs dlrs. dlrs chairman, chairman line. lined shar share 1988. 1988 copany company dividends. dividends repurchases, repurchases said. said Medtronic's Medtronics maket market 1988, 1988 1987. 1987 products, products pacemakers, pacemakers valves, valves lasers, lasers 1995. 1995 said, said beg, beg past, past Medtronic's Medtronics paecemakers pacemakers recalls. recalls critixized criticized industry, industry pacemakers. pacemakers leads, leads said. said cempany company industry. industry Nelson, Nelson presidelnt president Medtronics, Medtronics Medtronic's Medtronics expertise, expertise systems. systems Watllin Wallin acquisitions. acquisitions plovisions provisions won't wont acquisition. acquisition responsivve responsive 1988. 1988 Activitrax, Activitrax activity. activity AG, AG pacemaker. pacemakers Batignolles, Batignolles ltSCHN.PA, ltSCHN.PA shares. shares 1986. 1986 bonds, bonds 30, 30 sources. sources extension, extension Britain, Britain udertook undertook later. later deilare declare extension. extension tomorrow, tomorrow delegartes delegates session. session Cntral Central future, future reported. reported Baluchp Baluch bales, bales year's years production. productions Morlin Marlin tariffs. tariffs ..........