Field description: the Announce_ID field is the user ID, User_Name is the user name, topic is the post subject, Body is the post content, post_type indicates whether the post topic is work-related, sentiment indicates the emotional tone of the post, and IP is the user's IP address.
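As a quick sanity check before modeling, here is a minimal sketch (assuming classify.xls sits in the working directory with a sheet named classify, as in the script below) that confirms the schema above and the class balance of the sentiment target:

import pandas as pd

# Load the sheet and confirm the fields described above are present
df = pd.read_excel('classify.xls', sheet_name='classify')
print(df.columns.tolist())             # column names as used in the script below (e.g. 'Body')
print(df['sentiment'].value_counts())  # class balance of the target label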
The classify.xls file (extraction code: fkwq)
The stopwords.txt file (extraction code: 05z1)
Related code
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import model_selection, metrics
from sklearn.naive_bayes import BernoulliNB

# Read the classify file; the sheet name is 'classify'
df = pd.read_excel('classify.xls', sheet_name='classify')
# print(df)

# Frequency counts for the post_type feature
# print(df['post_type'].value_counts())

# Drop the User_Name, post_type, and IP features
df.drop(columns=['User_Name', 'post_type', 'IP'], inplace=True)
# print(df)

# Tokenize the Body feature with jieba (Chinese word segmentation)
df['Body'] = df['Body'].map(lambda x: jieba.lcut(x))
# print(df['Body'].head())

# Read the stopword list into a set and filter stopwords out
# (a set gives exact word matching; concatenating the words into one
# string and testing membership would match substrings and over-filter)
stopwords = set()
with open('stopwords.txt', 'r', encoding='utf8') as r:
    for line in r.readlines():
        stopwords.add(line.strip())
df['Body'] = df['Body'].map(lambda x: [w for w in x if (w not in stopwords) and (len(w) > 1)])
# print(df['Body'].head())

# Convert the filtered Body feature into a document-term matrix
list1 = df['Body'].map(lambda line: ' '.join(line))
cv = CountVectorizer()
w = cv.fit_transform(list1)

# Reduce dimensionality to get the feature matrix X
svd = TruncatedSVD(50)
X = svd.fit_transform(w)

# Slice out the sentiment feature as the target label Y
Y = df['sentiment']

# Split the data 8:2 into training and test sets
train_x, test_x, train_y, test_y = model_selection.train_test_split(X, Y, test_size=0.2, random_state=0)

# Build a naive Bayes classification model and train it
model = BernoulliNB()
model.fit(train_x, train_y)

# Predict on the test set
h = model.predict(test_x)

# Print the classification report and confusion matrix
print('classification_report:\n', metrics.classification_report(test_y, h))
print('confusion_matrix:\n', metrics.confusion_matrix(test_y, h))
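The script stops at evaluation. As a minimal sketch of how the fitted objects (cv, svd, model, and the stopwords set from the script above) could be reused to score a new, unseen post; the sample string new_post is a made-up placeholder:

# Hypothetical new post to classify (placeholder text)
new_post = '今天团队合作非常顺利,大家都很开心'

# Apply the same preprocessing: jieba tokenization, then the stopword/length filter
tokens = [w for w in jieba.lcut(new_post) if (w not in stopwords) and (len(w) > 1)]

# Vectorize with the already-fitted CountVectorizer, then project with the fitted SVD
vec = svd.transform(cv.transform([' '.join(tokens)]))

# Predict the sentiment label
print(model.predict(vec))

One design note: BernoulliNB binarizes its inputs at a threshold of 0.0 by default, so each SVD component is effectively reduced to its sign; GaussianNB (from sklearn.naive_bayes) is often a more natural fit for continuous SVD features, though the original script uses BernoulliNB.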