import pandas as pd
import pickle
First import the pandas library, which provides the data-analysis functionality for CSV files and is a library we must know how to use for data mining.
Then import the pickle library, which will be used to serialize the dictionary that collects the characters and words appearing more than 10 times in the Chinese sentences.
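As a quick illustration (this cell is not in the original notebook), a minimal pickle round-trip for such a dictionary might look like the following; the file name demo.dict and the toy dictionary are placeholders:
import pickle

word_freq = {"她们": 12, "蹑手蹑脚": 3}          # toy word-frequency dictionary

with open('demo.dict', 'wb') as handle:        # serialize the dictionary to disk
    pickle.dump(word_freq, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('demo.dict', 'rb') as handle:        # load it back later, e.g. when processing the test set
    restored = pickle.load(handle)

print(restored == word_freq)                   # True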
train_df = pd.read_csv('../yangqf/Desktop/Train.csv',encoding='gb18030')
train_df.head()
Read the Train.csv file, decoding it with gb18030, and display the first five rows of the dataset.
jieguoList = ['Love', 'Sorrow', 'Hate', 'Anxiety', 'Surprise', 'Expect', 'Joy', 'Anger']
train_df.info()
The jieguoList list stores the classification categories; info() displays basic information about the dataset.
train_df["Text"][0]
Display the text in the Text column of the first record.
length = len(train_df)   # size of the dataset
Total_dict = {}          # assigns an index to every character/word that appears in the dataset
icount = 0               # counter
ziShu_dict = {}          # word-count dictionary

for i in range(length):
    # initialize the count of every word in the word-count dictionary to 0
    Str = ""
    for j in range(len(train_df["Text"][i])):
        if train_df["Text"][i][j] != ' ':
            Str = "{0}{1}".format(Str, train_df["Text"][i][j])
            continue
        ziShu_dict[Str] = 0   # the dictionary looks like: {"她们": 0, "都": 0, "睡": 0, "蹑手蹑脚": 0, ...}
        Str = ""

for i in range(length):
    # count how often each word occurs across the whole dataset, and collect every word
    # into Total_dict with a running index, e.g. {"她们": 0, "都": 1, "睡": 2, "蹑手蹑脚": 3, ...}
    Str = ""
    for j in range(len(train_df["Text"][i])):
        if train_df["Text"][i][j] != ' ':
            Str = "{0}{1}".format(Str, train_df["Text"][i][j])
            continue
        if Str in Total_dict:
            ziShu_dict[Str] += 1
        else:
            Total_dict[Str] = icount
            icount += 1
        Str = ""

print(len(Total_dict))   # length of the overall dictionary
print(len(ziShu_dict))   # length of the word-count dictionary; same as the overall dictionary
icount = 0
for i in range(len(Total_dict)):
    # remove from the word-count dictionary every word that occurs fewer than 10 times;
    # this guards against overfitting, shrinks the feature-vector space and reduces the
    # complexity of the learning model. The surviving words are re-indexed so that
    # ziShu_dict becomes an index-ordered dictionary, like Total_dict.
    List = []
    temp = list(Total_dict.keys())[list(Total_dict.values()).index(i)]
    # this statement looks up the key that corresponds to a given value in the dictionary
    if ziShu_dict[temp] < 10:
        del ziShu_dict[temp]
    else:
        ziShu_dict[temp] = icount
        icount += 1

for i in range(len(jieguoList)):
    # create one binary column per category. Because this is a multi-label problem, we
    # first turn it into several binary classification problems: for the Love category,
    # records whose Labels contain Love are marked 1 and the rest 0, so a dedicated Love
    # model can be trained. Doing this for every category gives 8 learning models in total.
    List = []
    for j in range(length):
        if jieguoList[i] in train_df["Labels"][j]:
            List.append(1)
        else:
            List.append(0)
    train_df[jieguoList[i]] = List

icount = 0
for i in range(len(ziShu_dict)):
    # digitize the Chinese sentences: for example, if the word '他们' appears more than
    # 10 times in the whole dataset, a column (named by its index, e.g. 0) is created;
    # every sentence that contains the word gets 1 in that column, otherwise 0. Doing this
    # for every surviving word turns the Chinese text into numbers.
    List = []
    temp = list(ziShu_dict.keys())[list(ziShu_dict.values()).index(i)]
    for j in range(length):
        if temp in train_df["Text"][j]:
            List.append(1)
        else:
            List.append(0)
    icount += 1
    print(len(ziShu_dict) - icount)
    train_df[i] = List

with open('Total_dict.dict', 'wb') as handle:
    # serialize Total_dict and save it to disk, so it can be reused later to digitize
    # the Chinese text of the test set
    pickle.dump(Total_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('ziShu_dict.dict', 'wb') as handle:
    # same idea as above
    pickle.dump(ziShu_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(ziShu_dict)
print(len(ziShu_dict))
train_df.to_csv('../yangqf/Desktop/zhongyaoshuju.csv', index=False)
Print the word-count dictionary and its length, then save the digitized dataset to disk so it can be read and processed later.
train_df.head()
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import pickle

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
Import the libraries for data analysis and wrangling, visualization, and machine learning.
# read the data
train_df = pd.read_csv('../yangqf/Desktop/zhongyaoshuju.csv')
x = train_df.drop("ID", axis=1)      # drop the ID column, which is not needed
x = x.drop("Text", axis=1)           # then drop the unneeded Text column
x = x.drop("Labels", axis=1)         # and finally the unneeded Labels column
x.drop(['Sorrow', 'Hate', 'Anxiety', 'Surprise', 'Expect', 'Joy', 'Anger'], axis=1, inplace=True)
# keep only the Love column among the categories; with inplace=True the drop modifies x
# directly and returns None, so the result is not assigned to a new variable
x_love_train = x.drop("Love", axis=1)   # after removing the Love label column, the rest is the training feature matrix
y_love_train = x['Love']                # use the Love column as the training labels
# split the features into X and the labels into Y
# use train_test_split to split the dataset (75% training, 25% test)
x_love_train, x_love_test, y_love_train, y_love_test = train_test_split(x_love_train, y_love_train, test_size=0.25)
# Logistic Regression
love_logreg = LogisticRegression()
love_logreg.fit(x_love_train, y_love_train)
y_love_predict = love_logreg.predict(x_love_test)
acc_log = round(love_logreg.score(x_love_test, y_love_test) * 100, 2)
print(acc_log)
After comparing it with other machine learning models, the Logistic Regression model turned out to perform better, so logistic regression is used to classify the Love category.
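The comparison itself is not shown in the original cells; a minimal sketch of how it might have been done on the same Love split (the candidate list is illustrative, and the split variables from above are assumed to exist) is:
# hypothetical comparison of a few candidate models on the Love split
candidates = {
    'LogisticRegression': LogisticRegression(),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'LinearSVC': LinearSVC(),
    'KNN': KNeighborsClassifier(),
}
for name, model in candidates.items():
    model.fit(x_love_train, y_love_train)
    acc = round(model.score(x_love_test, y_love_test) * 100, 2)
    print(name, acc)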
x = train_df.drop(['ID','Text','Labels','Love','Hate','Anxiety','Surprise','Expect','Joy','Anger'],axis=1)
x
x_sorrow_train = x.drop("Sorrow",axis=1)
y_sorrow_train = x['Sorrow']
# use train_test_split to split the dataset (75% training, 25% test)
x_sorrow_train, x_sorrow_test, y_sorrow_train, y_sorrow_test = train_test_split(x_sorrow_train, y_sorrow_train,test_size=0.25)
# Random Forest
sorrow_random_forest = RandomForestClassifier(n_estimators=100)
sorrow_random_forest.fit(x_sorrow_train, y_sorrow_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(sorrow_random_forest.score(x_sorrow_test, y_sorrow_test) * 100, 2)
acc_random_forest
Following the same idea, the Sorrow column is picked out and classified on its own; the random forest model turned out to perform better here, so it is chosen.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Anxiety','Surprise','Expect','Joy','Anger'],axis=1)
x
x_hate_train = x.drop("Hate",axis=1)
y_hate_train = x['Hate']
# use train_test_split to split the dataset (75% training, 25% test)
x_hate_train, x_hate_test, y_hate_train, y_hate_test = train_test_split(x_hate_train, y_hate_train,test_size=0.25)
# Random Forest
hate_random_forest = RandomForestClassifier(n_estimators=100)
hate_random_forest.fit(x_hate_train, y_hate_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(hate_random_forest.score(x_hate_test, y_hate_test) * 100, 2)
print(acc_random_forest)
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Surprise','Expect','Joy','Anger'],axis=1)
x
x_anxiety_train = x.drop("Anxiety",axis=1)
y_anxiety_train = x['Anxiety']
# use train_test_split to split the dataset (75% training, 25% test)
x_anxiety_train, x_anxiety_test, y_anxiety_train, y_anxiety_test = train_test_split(x_anxiety_train, y_anxiety_train,test_size=0.25)
# Random Forest
anxiety_random_forest = RandomForestClassifier(n_estimators=100)
anxiety_random_forest.fit(x_anxiety_train, y_anxiety_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(anxiety_random_forest.score(x_anxiety_test, y_anxiety_test) * 100, 2)
print(acc_random_forest)
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Expect','Joy','Anger'],axis=1)
x
x_surprise_train = x.drop("Surprise",axis=1)
y_surprise_train = x['Surprise']
# use train_test_split to split the dataset (75% training, 25% test)
x_surprise_train, x_surprise_test, y_surprise_train, y_surprise_test = train_test_split(x_surprise_train, y_surprise_train,test_size=0.25)
# Random Forest
surprise_random_forest = RandomForestClassifier(n_estimators=100)
surprise_random_forest.fit(x_surprise_train, y_surprise_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(surprise_random_forest.score(x_surprise_test, y_surprise_test) * 100, 2)
print(acc_random_forest)
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Surprise','Joy','Anger'],axis=1)
x
x_expect_train = x.drop("Expect",axis=1)
y_expect_train = x['Expect']
# use train_test_split to split the dataset (75% training, 25% test)
x_expect_train, x_expect_test, y_expect_train, y_expect_test = train_test_split(x_expect_train, y_expect_train,test_size=0.25)
# Random Forest
expect_random_forest = RandomForestClassifier(n_estimators=100)
expect_random_forest.fit(x_expect_train, y_expect_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(expect_random_forest.score(x_expect_test, y_expect_test) * 100, 2)
print(acc_random_forest)
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Surprise','Expect','Anger'],axis=1)
x
x_joy_train = x.drop("Joy",axis=1)
y_joy_train = x['Joy']
# use train_test_split to split the dataset (75% training, 25% test)
x_joy_train, x_joy_test, y_joy_train, y_joy_test = train_test_split(x_joy_train, y_joy_train,test_size=0.25)
# Random Forest
joy_random_forest = RandomForestClassifier(n_estimators=100)
joy_random_forest.fit(x_joy_train, y_joy_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(joy_random_forest.score(x_joy_test, y_joy_test) * 100, 2)
print(acc_random_forest)
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Surprise','Expect','Joy'],axis=1)
x
x_anger_train = x.drop("Anger",axis=1)
y_anger_train = x['Anger']
# use train_test_split to split the dataset (75% training, 25% test)
x_anger_train, x_anger_test, y_anger_train, y_anger_test = train_test_split(x_anger_train, y_anger_train,test_size=0.25)
# Random Forest
anger_random_forest = RandomForestClassifier(n_estimators=100)
anger_random_forest.fit(x_anger_train, y_anger_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
acc_random_forest = round(anger_random_forest.score(x_anger_test, y_anger_test) * 100, 2)
print(acc_random_forest)
Repeating the above process yields 8 machine learning models, which will later be used to classify the test set.
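Since the per-category cells above all follow the same pattern, a compact sketch of the same procedure written as a single loop (illustrative only; it assumes train_df from zhongyaoshuju.csv and the imports above, and mirrors the model choices made above) could look like this:
# train one binary model per category in a loop instead of repeating the cells
jieguoList = ['Love', 'Sorrow', 'Hate', 'Anxiety', 'Surprise', 'Expect', 'Joy', 'Anger']
models = {}
for label in jieguoList:
    features = train_df.drop(['ID', 'Text', 'Labels'] + jieguoList, axis=1)
    target = train_df[label]
    x_tr, x_te, y_tr, y_te = train_test_split(features, target, test_size=0.25)
    # Love uses logistic regression, the other categories use random forests (as above)
    clf = LogisticRegression() if label == 'Love' else RandomForestClassifier(n_estimators=100)
    clf.fit(x_tr, y_tr)
    print(label, round(clf.score(x_te, y_te) * 100, 2))
    models[label] = clf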
joblib.dump(love_logreg, '../yangqf/Desktop/xueximoxing/love_Logistic.model')   # a file object could also be used
joblib.dump(sorrow_random_forest, '../yangqf/Desktop/xueximoxing/sorrow_random_forest.model')   # a file object could also be used
joblib.dump(hate_random_forest, '../yangqf/Desktop/xueximoxing/hate_random_forest.model')   # a file object could also be used
joblib.dump(anxiety_random_forest, '../yangqf/Desktop/xueximoxing/anxiety_random_forest.model')   # a file object could also be used
joblib.dump(surprise_random_forest, '../yangqf/Desktop/xueximoxing/surprise_random_forest.model')   # a file object could also be used
joblib.dump(expect_random_forest, '../yangqf/Desktop/xueximoxing/expect_random_forest.model')   # a file object could also be used
joblib.dump(joy_random_forest, '../yangqf/Desktop/xueximoxing/joy_random_forest.model')   # a file object could also be used
joblib.dump(anger_random_forest, '../yangqf/Desktop/xueximoxing/anger_random_forest.model')   # a file object could also be used
Serialize all of the trained models and save them to disk.
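To show how these saved artifacts would later be used on the test set, here is a hedged sketch (not part of the original notebook; the vectorize helper, the sample sentence and the restriction to two models are illustrative assumptions):
# hypothetical later use: load the serialized dictionary and models and classify new text
with open('ziShu_dict.dict', 'rb') as handle:
    ziShu_dict = pickle.load(handle)

love_logreg = joblib.load('../yangqf/Desktop/xueximoxing/love_Logistic.model')
sorrow_random_forest = joblib.load('../yangqf/Desktop/xueximoxing/sorrow_random_forest.model')
# ... the remaining six models would be loaded the same way

def vectorize(text):
    # binary bag-of-words row, with words ordered by the index they were given during training
    return [1 if word in text else 0 for word in sorted(ziShu_dict, key=ziShu_dict.get)]

row = [vectorize('她们 都 睡 了')]
print('Love:', love_logreg.predict(row)[0])
print('Sorrow:', sorrow_random_forest.predict(row)[0])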
Note that the models here were only trained and scored for validation; remember to use all of the data afterwards to build the final models.
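A minimal sketch of that final retraining step (not in the original notebook; the '_final.model' file names are placeholders) might be:
# refit the chosen models on the full dataset before serializing the final versions
features = train_df.drop(['ID', 'Text', 'Labels', 'Love', 'Sorrow', 'Hate', 'Anxiety',
                          'Surprise', 'Expect', 'Joy', 'Anger'], axis=1)
love_logreg.fit(features, train_df['Love'])
joblib.dump(love_logreg, '../yangqf/Desktop/xueximoxing/love_Logistic_final.model')
sorrow_random_forest.fit(features, train_df['Sorrow'])
joblib.dump(sorrow_random_forest, '../yangqf/Desktop/xueximoxing/sorrow_random_forest_final.model')
# ... repeat for the remaining six category models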