# ===== Text processing =====
from nltk.corpus import movie_reviews

# Each document is a (word_list, category) pair, category in {"pos", "neg"}.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so the upcoming train/test split is not ordered by class.
import random
random.shuffle(documents)

# Select the 2000 most frequent (lowercased) words as candidate features;
# after removing stopwords and punctuation roughly 1800 remain.
import nltk
from nltk.corpus import stopwords
import string

# BUG FIX: the original called most_common(200) although the surrounding
# comments (and the ~1800-word estimate) clearly intend the top 2000.
word_fd = nltk.FreqDist(w.lower() for w in movie_reviews.words()).most_common(2000)

# Hoist the stopword list into a set once: O(1) membership tests instead of
# re-building and linearly scanning the list for every candidate word.
stop_set = set(stopwords.words("english"))
feature_words = [w for (w, _) in word_fd
                 if w not in stop_set and w not in string.punctuation]

# Binary bag-of-words matrix: one row per document, one column per feature word.
import numpy as np
features = np.zeros([len(documents), len(feature_words)], dtype=float)
for i in range(len(documents)):
    document_words = set(documents[i][0])
    for j in range(len(feature_words)):
        features[i, j] = 1 if (feature_words[j] in document_words) else 0


def get_document_feature(document):
    """Return a 1 x len(feature_words) binary feature row for one document.

    `document` must be a list (or iterable) of word tokens; the result is a
    single-row 2-D array suitable for sklearn's predict().
    """
    document_words = set(document)
    # Local name `row` avoids shadowing the module-level `features` matrix.
    row = np.zeros([1, len(feature_words)], dtype=float)
    for j in range(len(feature_words)):
        row[0, j] = 1 if (feature_words[j] in document_words) else 0
    return row


# movie_reviews holds 2000 documents: first 1200 train, remaining 800 test.
target = [c for (d, c) in documents]
train_X = features[:1200, :]
train_Y = target[:1200]
test_X = features[1200:, :]
test_Y = target[1200:]
# ===== SVM training & testing =====
from sklearn import svm

# RBF (radial basis function) kernel — also the SVC default.
classifier = svm.SVC(kernel='rbf')
classifier.fit(train_X, train_Y)
print('支持向量机(SVM)的测试集正确率为', classifier.score(test_X, test_Y))

text = input('请输入影评文本(一句话也行>_<): ')
# get_document_feature expects a token list.  FIX: lowercase the input so it
# matches the lowercased feature vocabulary, and use split() (any whitespace)
# instead of split(' '), which emits empty tokens on repeated spaces.
print('情感分析结果为(pos/neg): ', classifier.predict(get_document_feature(text.lower().split())))
# ===== KNN training & testing =====
from sklearn.neighbors import KNeighborsClassifier

# k = 7 neighbors (sklearn's default is 5).
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(train_X, train_Y)
print('k最邻近(KNN)的测试集正确率为', classifier.score(test_X, test_Y))

text = input('请输入影评文本(一句话也行>_<): ')
# get_document_feature expects a token list.  FIX: lowercase the input so it
# matches the lowercased feature vocabulary, and use split() (any whitespace)
# instead of split(' '), which emits empty tokens on repeated spaces.
print('情感分析结果为(pos/neg): ', classifier.predict(get_document_feature(text.lower().split())))
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。