赞
踩
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('all') #建议使用梯子
set(stopwords.words('english')) #如果使用中文可改为'chinese'
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sent = 'This is a sample sentence, showing off the stop words filtration.'
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if w not in stop_words]
filtered_sentence = []
for w in word_tokens:
if w not in stop_words:
filtered_sentence.append(w)
print(word_tokens)
print(filtered_sentence)
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
print(ps.stem(w))
# 我们尝试对一个典型的句子,而不是一些单词提取词干:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
# print(words)
for w in words:
print(ps.stem(w))
import nltk from nltk.corpus import state_union from nltk.tokenize import PunktSentenceTokenizer train_text = state_union.raw('2005-GWBush.txt') sample_text = state_union.raw('2006-GWBush.txt') # 训练 Punkt 标记器 custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # 实际分词 tokenized = custom_sent_tokenizer.tokenize(sample_text) # 通过创建一个函数,来完成这个词性标注脚本 def process_content(): try: for i in tokenized[:5]: words = nltk.word_tokenize(i) tagged = nltk.pos_tag(words) print(tagged) print('\n') except Exception as e: print(str(e)) process_content()
import nltk from nltk.corpus import state_union from nltk.tokenize import PunktSentenceTokenizer train_text = state_union.raw("2005-GWBush.txt") sample_text = state_union.raw("2006-GWBush.txt") custom_sent_tokenizer = PunktSentenceTokenizer(train_text) tokenized = custom_sent_tokenizer.tokenize(sample_text) def process_content(): try: for i in tokenized[5:]: words = nltk.word_tokenize(i) tagged = nltk.pos_tag(words) namedEnt = nltk.ne_chunk(tagged, binary=True) namedEnt.draw() except Exception as e: print(str(e)) process_content()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))
import nltk import random from nltk.corpus import movie_reviews documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(documents) print(documents[1]) all_words = [] for w in movie_reviews.words(): all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) print(all_words.most_common(15)) print(all_words["stupid"])
import nltk import random from nltk.corpus import movie_reviews documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(documents) all_words = [] for w in movie_reviews.words(): all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:3000] def find_features(document): words = set(document) features = {} for w in word_features: features[w] = (w in words) return features print(find_features(movie_reviews.words('neg/cv000_29416.txt'))) #打印特征集 featuresets = [(find_features(rev), category) for (rev, category) in documents] #保存特征存在性布尔值,以及它们各自的正面或负面的类别: print(featuresets)
import nltk import random from nltk.corpus import movie_reviews from nltk.classify.scikitlearn import SklearnClassifier import pickle from sklearn.naive_bayes import MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.svm import SVC, LinearSVC, NuSVC from nltk.classify import ClassifierI from statistics import mode from nltk.tokenize import word_tokenize class VoteClassifier(ClassifierI): def __init__(self, *classifiers): self._classifiers = classifiers def classify(self, features): votes = [] for c in self._classifiers: v = c.classify(features) votes.append(v) return mode(votes) def confidence(self, features): votes = [] for c in self._classifiers: v = c.classify(features) votes.append(v) choice_votes = votes.count(mode(votes)) conf = choice_votes / len(votes) return conf short_pos = open("short_reviews/positive.txt","r",encoding='utf-8').read() short_neg = open("short_reviews/negative.txt","r",encoding='utf-8').read() documents = [] for r in short_pos.split('\n'): documents.append( (r, "pos") ) for r in short_neg.split('\n'): documents.append( (r, "neg") ) all_words = [] short_pos_words = word_tokenize(short_pos) short_neg_words = word_tokenize(short_neg) for w in short_pos_words: all_words.append(w.lower()) for w in short_neg_words: all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:5000] def find_features(document): words = word_tokenize(document) features = {} for w in word_features: features[w] = (w in words) return features #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) featuresets = [(find_features(rev), category) for (rev, category) in documents] random.shuffle(featuresets) # positive data example: training_set = featuresets[:10000] testing_set = featuresets[10000:] ## ### negative data example: training_set = featuresets[100:] testing_set = featuresets[:100] classifier = nltk.NaiveBayesClassifier.train(training_set) print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) classifier.show_most_informative_features(15) MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) # LinearSVC_classifier = SklearnClassifier(LinearSVC()) # LinearSVC_classifier.train(training_set) # print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) voted_classifier = VoteClassifier( # NuSVC_classifier, LinearSVC_classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier) print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。