赞
踩
def gender_features(word):
    """Extract a single feature -- the last letter -- from a name.

    The last letter of an English first name is a strong gender cue
    (e.g. names ending in 'a' are mostly female).
    """
    return {'last_letter': word[-1]}

gender_features('shrek')  # -> {'last_letter': 'k'}
import random

import nltk  # was missing in the original; nltk.NaiveBayesClassifier is used below
from nltk.corpus import names

# Build a labelled corpus of (name, gender) pairs from the names corpus.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)  # shuffle so the train/test split is not ordered by gender

# Pair the last-letter feature of each name with its gender label.
featuresets = [(gender_features(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Classify a name that never appeared in training.
classifier.classify(gender_features('Neo'))  # 'male'

# Evaluate the classifier on a large amount of unseen data.
print(nltk.classify.accuracy(classifier, test_set))  # ~0.785

# Inspect the classifier.
classifier.show_most_informative_features(5)
# The ratio between these counts is called the likelihood ratio; it can be
# used to compare different feature-outcome relationships.
def gender_features(word):
    """Richer feature set for a name: last letter, first letter, and length."""
    return {'last_word': word[-1], 'first_word': word[0], 'length': len(word)}
import random

import nltk
from nltk.corpus import names

names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

# Build the training and test sets and train a naive Bayes classifier.
featuresets = [(gender_features(n), g) for (n, g) in names]
train_sets, test_sets = featuresets[500:], featuresets[:500]
# Fixed: 'nltk.NaiveByvesClassifier(train_sets)' -> 'nltk.NaiveBayesClassifier.train(train_sets)'.
classifier = nltk.NaiveBayesClassifier.train(train_sets)

# Test the classifier on an unseen name (fixed typo: calssify).
classifier.classify(gender_features('Neo'))

# Evaluate the classifier (fixed typos: accurcy -> accuracy, and the
# undefined 'test_set' -> 'test_sets' defined above).
print(nltk.classify.accuracy(classifier, test_sets))

# Inspect the classifier.
classifier.show_most_informative_features(5)
from nltk.classify import apply_features

# apply_features builds a lazy feature-set view, so every feature dict does
# not have to be held in memory at once.  Fixed: 'name' -> 'names'.
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])
def gender_features2(name):
    """Overfitting-prone feature extractor: first/last letter plus a count
    and a presence flag for every letter of the alphabet (54 features).
    """
    features = {}
    # Fixed: the original mixed the names 'feature' and 'features'.
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features  # fixed: originally returned the undefined name 'feature'

gender_features2('John')
# e.g. {'count(j)': 1, 'has(d)': False, 'count(b)': 0, ...}
# Train/evaluate with the large gender_features2 feature set.
featuresets = [(gender_features2(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
# Fixed typo: NaiveBayvesClassifier -> NaiveBayesClassifier.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# Three-way split: training, dev-test (for error analysis), and final test.
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]

# Fixed typos: NaivNayvesClassifier -> NaiveBayesClassifier, and
# 'pritn nltk.calssity.accuracy(classifier, dectest_set)' below.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

# Collect the dev-test misclassifications for inspection.
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
from nltk.corpus import movie_reviews

# One (word list, category) pair per review.  Fixed typos:
# movie_review/moive_review -> movie_reviews, catogiry -> category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Use the 2000 most frequent corpus words as the feature vocabulary.
# Fixed: 'all_words.keys()[:2000]' is Python-2-only; most_common() makes
# the intended frequency ordering explicit.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for w, _count in all_words.most_common(2000)]

def document_features(document):
    """Binary bag-of-words features: does each vocabulary word occur?

    Fixed: the function was defined as 'features' but called as
    'document_features' below.
    """
    document_words = set(document)  # set membership is O(1) per lookup
    features = {}
    for word in word_features:
        # Fixed: 'cotains(%S)' -> 'contains(%s)'.
        features['contains(%s)' % word] = (word in document_words)
    return features

print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
# Fixed typos: calssifier, NaiveBayvesClassifier.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
from nltk.corpus import brown

# Count the frequencies of 1-, 2-, and 3-character word suffixes in Brown.
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    # NOTE(review): FreqDist.inc() is the NLTK 2 API this code was written
    # against; NLTK 3 replaced it with 'suffix_fdist[suffix] += 1'.
    suffix_fdist.inc(word[-1:])
    suffix_fdist.inc(word[-2:])
    suffix_fdist.inc(word[-3:])  # fixed typo: sufix_fdist -> suffix_fdist

# The 100 most frequent suffixes.  Fixed: 'keys()[:100]' is not sliceable
# on Python 3; most_common() keeps the frequency ordering explicit.
common_suffixes = [suffix for suffix, _count in suffix_fdist.most_common(100)]
print(common_suffixes)
def pos_features(word):
    """POS-guessing features for a single word: one boolean per common
    suffix, true when the word ends with that suffix.

    Reads the module-level ``common_suffixes`` list computed above.
    """
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features
tagged_words = brown.tagged_words(categories='news')
# Fixed: the list was built as 'features' but consumed as 'featuresets',
# and the split fraction '01.' was a typo for 0.1.
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
# A decision tree classifier is required here: pseudocode() below is a
# DecisionTreeClassifier method, not a naive Bayes one.
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)  # fixed: nltk.Classify
classifier.classify(pos_features('cast'))
print(classifier.pseudocode(depth=4))  # fixed typo: pseudicode
def pos_features(sentence, i):
    """Contextual POS features for the word at position ``i``.

    ``sentence`` is a list of word strings.  Returns the 1-/2-/3-character
    suffixes of the word plus the previous word ("<START>" at position 0).
    Fixed: the original dict literal had the colons inside the key strings
    ("suffix(2):"sentence...), which is a syntax error.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"  # sentence-initial marker
    else:
        features["prev-word"] = sentence[i - 1]
    return features
pos_features(brown.sents()[0], 8)

# Fixed: the loop consumed 'tagged_sents' but the original assigned
# 'tagged_words' (and misspelled categories); pos_features needs whole
# sentences, so use tagged_sents.
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        # Fixed typos: apend -> append, untagged_seng -> untagged_sent.
        featuresets.append((pos_features(untagged_sent, i), tag))

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
# Fixed typos: NaiveBayvesClassifier, nltk.classity.
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
def pos_features(sentence, i, history):
    """Contextual POS features using tag history.

    ``sentence`` is a list of words; ``history`` is the list of tags
    already predicted for positions 0..i-1.  The original assigned
    "pre_word" twice in each branch and returned the undefined name
    'reatures'; per the NLTK book this feature set is the previous word
    plus the previous *tag*.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
        features["prev-tag"] = history[i - 1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    """Greedy left-to-right sequence tagger.

    Each predicted tag is appended to a history list that is fed back
    into the feature extractor for the following word.
    """

    def __init__(self, train_sents):
        train_set = []
        # Fixed: iterate over the 'train_sents' parameter (the original
        # looped over undefined names 'tagged_sents' / 'tagger_sent').
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                # Fixed typo: pos_reatures -> pos_features.
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)  # gold tags are used as history at train time
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)  # fixed typo: pos_feaures
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
# Fixed: 'brown.tagged_sent' -> 'brown.tagged_sents', and the variable
# 'tagged_sent' -> 'tagged_sents' so the split below resolves.
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))
# Sentence segmentation: flatten the corpus into one token stream and
# record the offset of each sentence-final token.
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:  # reuse 'sents' instead of re-reading the corpus
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)  # index of the last token of this sentence
def punct_features(tokens, i):
    """Features describing the context of the punctuation token at ``i``.

    Assumes 0 < i < len(tokens) - 1 (callers filter to interior positions).
    """
    # Fixed typos: 'netx-word-capitalized' -> 'next-word-capitalized',
    # 'token[i-1]' -> 'tokens[i-1]'.
    return {'next-word-capitalized': tokens[i + 1][0].isupper(),
            'prevword': tokens[i - 1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i - 1]) == 1}
# Candidate boundaries: every '.', '?' or '!' token, excluding the first
# and last positions since the features look one token to either side.
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens) - 1)
               if tokens[i] in '.?!']
def rte_features(rtepair):
    """Word- and named-entity-overlap features for a recognizing-textual-
    entailment pair.

    Fixed: the parameter was misspelled 'retpair' while the body used
    'rtepair', and RTEFeatuerExtractor -> RTEFeatureExtractor.
    """
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。