赞
踩
"""Train a linear text classifier on medical-department documents and evaluate it.

Documents are read from one sub-folder per category under ``category/train``
and ``category/test``. Every document is passed through ``ner`` and re-joined
with spaces before being vectorised with a bag-of-words + TF-IDF pipeline
feeding a hinge-loss SGD classifier (a linear SVM).
"""
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

from nerutils import ner


def _load_segmented(container_path, categories):
    """Load the given category folders and space-join the ``ner`` output of each document.

    Args:
        container_path: Directory containing one sub-folder per category.
        categories: Names of the sub-folders to load; other folders are ignored.

    Returns:
        The ``Bunch`` produced by ``load_files`` whose ``data`` entries have
        been replaced by ``' '.join(ner(text))``.
    """
    bunch = load_files(
        container_path,
        categories=categories,
        load_content=True,
        encoding='utf-8',
        decode_error='strict',
        shuffle=True,
        random_state=42,
    )
    # NOTE(review): ner() is a project helper not visible here; the code only
    # requires that it returns an iterable of strings. Joining with spaces gives
    # CountVectorizer explicit token boundaries to split on.
    bunch.data = [' '.join(ner(document)) for document in bunch.data]
    return bunch


# Categories that take part in the analysis.
categories = ['呼吸内科', '心内科', '消化内科']
train_path = 'category/train'

# Read the raw training data from disk.
twenty_train = _load_segmented(train_path, categories)

# Build the pipeline: term counts -> TF-IDF weights -> linear classifier.
# `n_iter` was removed from SGDClassifier in scikit-learn 0.21; `max_iter=5`
# together with `tol=None` (no early stopping) runs the same five epochs.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, random_state=42)),
])

# Train the classifier.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# Print the classifier configuration.
print(text_clf)

# Read the test data.
categories = ['呼吸内科']
test_path = 'category/test'
test_train = _load_segmented(test_path, categories)

# load_files numbers labels by position in its own target_names, so the test
# labels are re-expressed in the training label space before being compared
# with the predictions. A test category that was never trained on raises
# ValueError here instead of silently producing a meaningless accuracy.
test_train.target = np.array([
    twenty_train.target_names.index(test_train.target_names[label])
    for label in test_train.target
])
docs_test = test_train.data

# Classify the test documents.
predicted = text_clf.predict(docs_test)
print("分类数据:" + str(predicted))

# Accuracy of the predictions, as a percentage.
print("准确率为:")
print(np.mean(predicted == test_train.target) * 100)
分类数据:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]准确率为:100.0
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。