import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')
# Read the data file
attrition = pd.read_csv('./data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
attrition.head()
|  | Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
From this sample of the employee attrition data, we can see the dataset's features:

* Age: the employee's age;
* Attrition: whether the employee has left the company; Yes means the employee has left, No means they are still employed. This is the target we want to predict;
* BusinessTravel: frequency of business travel; Non-Travel means no travel, Travel_Rarely means occasional travel, Travel_Frequently means frequent travel;
* Department: the employee's department; Sales, Research & Development, or Human Resources;
* DistanceFromHome: distance between home and the workplace, from 1 (closest) to 29 (farthest);
* Education: education level, from 1 to 5, with 5 the highest;
* EducationField: field of study; Life Sciences, Medical, Marketing, Technical Degree, Human Resources, or Other;
* EmployeeNumber: employee ID;
* EnvironmentSatisfaction: satisfaction with the work environment, from 1 (lowest) to 4 (highest);
* Gender: Male or Female;
* JobInvolvement: job involvement, from 1 (lowest) to 4 (highest);
* JobLevel: job level, from 1 (lowest) to 5 (highest);
* JobRole: job role; Sales Executive, Research Scientist, Laboratory Technician, Manufacturing Director, Healthcare Representative, Manager, Sales Representative, Research Director, or Human Resources;
* JobSatisfaction: job satisfaction, from 1 (lowest) to 4 (highest);
* MaritalStatus: marital status; Single, Married, or Divorced;
* MonthlyIncome: monthly income, ranging from 1009 to 19999;
* NumCompaniesWorked: number of companies the employee has worked for;
* Over18: whether the employee is over 18 years old;
* OverTime: whether the employee works overtime, Yes or No;
* PercentSalaryHike: percentage salary increase;
* PerformanceRating: performance rating;
* RelationshipSatisfaction: relationship satisfaction, from 1 (lowest) to 4 (highest);
* StandardHours: standard working hours;
* StockOptionLevel: stock option level;
* TotalWorkingYears: total years of work experience;
* TrainingTimesLastYear: training time last year, from 0 (no training) to 6 (the most);
* WorkLifeBalance: work-life balance, from 1 (worst) to 4 (best);
* YearsAtCompany: years at the current company;
* YearsInCurrentRole: years in the current role;
* YearsSinceLastPromotion: years since the last promotion;
* YearsWithCurrManager: years working with the current manager.

In the machine learning models we are about to build, 'Attrition' is the target column for training.
# Check whether the table contains any null values
attrition.isnull().sum()
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
Question

Does the dataset contain any null values? If so, how should they be handled?

No, it does not. If it did, there are three common approaches (see the sketch below):
1. Drop the affected rows or columns outright; check the proportion of missing values first, since dropping is not recommended when too much data is missing.
2. Inspect the data and, depending on the column type, fill in the mean, median, or mode.
3. Impute values with an algorithm, such as Lagrange interpolation.
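A minimal sketch of the three approaches, assuming a hypothetical toy DataFrame df with missing values (pandas for the first two; the third uses scipy's interpolate.lagrange):

```python
import numpy as np
import pandas as pd
from scipy.interpolate import lagrange

df = pd.DataFrame({'x': [1.0, np.nan, 3.0, 4.0, np.nan, 6.0]})

# 1. Drop rows containing nulls (only advisable when few values are missing)
dropped = df.dropna()

# 2. Fill with a summary statistic (median here; mean or mode are also common)
filled = df['x'].fillna(df['x'].median())

# 3. Lagrange interpolation: fit a polynomial through the nearby known points
#    and evaluate it at each missing position
def lagrange_fill(s, k=2):
    s = s.copy()
    for i in s[s.isnull()].index:
        known = s[(s.index >= i - k) & (s.index <= i + k)].dropna()
        s[i] = lagrange(known.index, known.values)(i)
    return s

interpolated = lagrange_fill(df['x'])
```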
import seaborn as sns
# TODO
# Convert the 'Attrition' column of the DataFrame attrition to int values and
# store them in a new column named Attrition_numerical.
# The mapping between 'Attrition' and Attrition_numerical is: 'Yes' => 1, 'No' => 0
def function(a):
    if 'Yes' in a:
        return 1
    else:
        return 0
attrition['Attrition_numerical'] = attrition.apply(lambda x: function(x['Attrition']), axis=1)
# Columns whose pairwise relationships we want to plot
numerical = [u'Age', u'DailyRate', u'JobSatisfaction', u'MonthlyIncome',
             u'PerformanceRating', u'WorkLifeBalance', u'YearsAtCompany',
             u'Attrition_numerical']
# Draw the pairwise relationship plot
g = sns.pairplot(attrition[numerical], hue='Attrition_numerical',
                 palette='seismic', diag_kind='kde', diag_kws=dict(shade=True))
g.set(xticklabels=[])
# TODO
# Store the values of attrition's 'Attrition_numerical' column in the variable y
y = attrition['Attrition_numerical']
# TODO
# Drop the 'Attrition' and 'Attrition_numerical' columns from attrition
attrition=attrition.drop(['Attrition','Attrition_numerical'],axis=1)
attrition.head()
|  | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
# The list categoricals will record the names of all non-numeric columns
categoricals = []
# TODO
# Append the names of the non-numeric columns to the list categoricals, and print them
cols = attrition.columns
for col in cols:
if str(attrition[col].dtype) == 'object':
categoricals.append(col)
print(categoricals)
['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
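As an aside, pandas can produce the same list in one line; a minimal equivalent sketch:

```python
# Equivalent: select every column whose dtype is 'object'
categoricals = attrition.select_dtypes(include='object').columns.tolist()
```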
Why do we need to encode the non-numeric attributes as numbers?

Because most algorithms operate on numeric values: they cannot compute distances, dot products, or splits directly on strings, so categorical values must first be converted to numbers (for example, by one-hot encoding).
# Use pandas' get_dummies method to convert the non-numeric columns to numbers,
# and replace the original columns with the converted ones

# Extract the non-numeric columns
# TODO
# Extract from attrition the columns whose names are recorded in categoricals,
# and store them in the DataFrame variable attrition_cat
attrition_cat = attrition.loc[:, categoricals]

# TODO
# Use pd.get_dummies to convert attrition_cat to numeric form,
# overwriting the original variable attrition_cat
attrition_cat = pd.get_dummies(attrition_cat)

# Extract the numeric columns
# TODO
# Store all numeric columns of attrition in the DataFrame variable attrition_num
numeric_cols = [col for col in attrition.columns if str(attrition[col].dtype) != 'object']
attrition_num = attrition.loc[:, numeric_cols]

# TODO
# Concatenate attrition_num and attrition_cat into a new DataFrame attrition_final,
# whose columns are those of attrition_num followed by those of attrition_cat.
# Note: the original code used pd.merge(..., left_on='Age', right_index=True), which
# joins on the Age *values* against the index and duplicates rows; pd.concat along
# axis=1 is the correct column-wise concatenation.
attrition_final = pd.concat([attrition_num, attrition_cat], axis=1)
attrition_final.head()
(attrition_final.head() shows the first five rows across 55 columns: the numeric columns such as Age, DailyRate and DistanceFromHome, followed by the one-hot encoded columns such as JobRole_Research Director, MaritalStatus_Single, Over18_Y and OverTime_Yes.)

5 rows × 55 columns
Hint: see the material on normalization.

For tree-based algorithms such as decision trees, feature scaling has little effect on the result, but for linear models such as linear or logistic regression it can matter a great deal. Take height and weight: height measured in metres (e.g. 1.5) and weight in kilograms (e.g. 50) differ by more than an order of magnitude, so the analysis will be dominated by the numerically larger weight feature. If we rescale every numeric value into the range 0-1 (min-max normalization), no single feature dominates purely because of its units.
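A minimal sketch of min-max normalization applied to the attrition_final built above; MinMaxScaler is already imported at the top of the notebook, though the notebook itself never actually applies it, so this is illustrative only:

```python
# Min-max normalization: x' = (x - min(x)) / (max(x) - min(x)), column by column
scaler = MinMaxScaler()
attrition_scaled = pd.DataFrame(scaler.fit_transform(attrition_final),
                                columns=attrition_final.columns)

# Every column now lies in [0, 1]
print(attrition_scaled['MonthlyIncome'].min(), attrition_scaled['MonthlyIncome'].max())
```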
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# TODO
# Use train_test_split to split the data, with 80% for training and 20% for testing
# (no random_state is fixed here, so the exact accuracies below vary from run to run)
X_train, X_test, y_train, y_test = train_test_split(attrition_final,y, test_size=0.2)
Why do we need to split the data into a training set and a test set?

Because we tune hyperparameters to find the best configuration, and if we evaluate only on the data the model was trained on, the results will not generalize. We need data the model has never seen to test how well it fits new data; otherwise we cannot detect overfitting.
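When the dataset is small, a single split also gives a noisy estimate. A hedged sketch of k-fold cross-validation as a sturdier alternative (illustrative, not part of the original exercise):

```python
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation: average accuracy over five different train/test splits
scores = cross_val_score(LogisticRegression(max_iter=1000), attrition_final, y, cv=5)
print('mean accuracy: %.4f (+/- %.4f)' % (scores.mean(), scores.std()))
```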
Build an SVM model and compute its accuracy
from sklearn.svm import SVC
# Accuracy scores
svm_acc=[]
# TODO
# Build the SVM model and call fit to train it
# (the exercise says default parameters are fine; this solution sets C, kernel and gamma explicitly)
clf = SVC(C=1.0, kernel='rbf', gamma=0.1)
clf.fit(X_train, y_train)
# TODO
# Compute the SVM's accuracy on the test set and append it to svm_acc
svm_acc.append(clf.score(X_test, y_test))
# Print the accuracy
print('Accuracy:', svm_acc[0])
Accuracy: 0.8231292517006803
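SVMs with an RBF kernel are sensitive to feature scale, and the features here range from 0/1 dummies to monthly incomes near 20000. A hedged sketch of the same model wrapped with scaling (illustrative; the resulting accuracy will differ from the run above):

```python
from sklearn.pipeline import make_pipeline

# Scale all features into [0, 1] before the SVM sees them
scaled_svm = make_pipeline(MinMaxScaler(), SVC(C=1.0, kernel='rbf', gamma=0.1))
scaled_svm.fit(X_train, y_train)
print('Accuracy with scaling:', scaled_svm.score(X_test, y_test))
```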
Build decision tree models with different tree depths and compute their accuracies
from sklearn.tree import DecisionTreeClassifier
# Tree depths to try
depths=[1,3,5,7,9]
# Accuracy scores
dt_acc=[]
# TODO
# Train a decision tree for every depth listed in depths.
# For each depth, compute the accuracy on the test set, print it,
# and append it to the list dt_acc.
for i in depths:
model = DecisionTreeClassifier(max_depth=i)
model.fit(X_train, y_train)
dt_acc.append(model.score(X_test, y_test))
print(dt_acc)
[0.8231292517006803, 0.8231292517006803, 0.7891156462585034, 0.7959183673469388, 0.7551020408163265]
A greater tree depth does not necessarily mean higher accuracy on the test set. As the depth increases, test accuracy drops, which suggests the best setting lies at the small end of the range (around depth 1 to 3 here); adding more levels only overfits the training data. A hedged tuning sketch follows.
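A minimal sketch of searching the depth systematically with cross-validation (illustrative; GridSearchCV refits on the full training set with the best depth found):

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Cross-validated search over tree depth, using the training set only
search = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [1, 2, 3, 4, 5, 7, 9]}, cv=5)
search.fit(X_train, y_train)
print('best depth:', search.best_params_['max_depth'])
print('test accuracy:', search.score(X_test, y_test))
```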
Build a naive Bayes model and compute its accuracy
from sklearn.naive_bayes import GaussianNB
# Accuracy scores
gnb_acc=[]
# TODO
# Build the GaussianNB model (default parameters are fine) and call fit to train it
model = GaussianNB()
model.fit(X_train, y_train)
# TODO
# Compute GaussianNB's accuracy on the test set and append it to gnb_acc
gnb_acc.append(model.score(X_test, y_test))
# Print the accuracy
print('Accuracy:', gnb_acc[0])
Accuracy: 0.7755102040816326
Build KNN models and compute their accuracies
from sklearn.neighbors import KNeighborsClassifier
# Values of K to try
neighbors = [1,3,5,7,9]
# Accuracy scores
knn_acc=[]
# TODO
# Train a KNeighborsClassifier for every K listed in neighbors.
# For each K, compute the accuracy on the test set, print it,
# and append it to the list knn_acc.
for i in neighbors:
model = KNeighborsClassifier(n_neighbors=i)
model.fit(X_train, y_train)
knn_acc.append(model.score(X_test, y_test))
print(knn_acc)
[0.7210884353741497, 0.7857142857142857, 0.826530612244898, 0.8231292517006803, 0.826530612244898]
Build a logistic regression model and compute its accuracy
from sklearn.linear_model import LogisticRegression
# Accuracy scores
lr_acc=[]
# TODO
# Build the LogisticRegression model (default parameters are fine) and call fit to train it
model = LogisticRegression()
model.fit(X_train, y_train)
# TODO
# Compute LogisticRegression's accuracy on the test set and append it to lr_acc
lr_acc.append(model.score(X_test, y_test))
# Print the accuracy
print('Accuracy:', lr_acc[0])
Accuracy: 0.8231292517006803
Why can we use a logistic regression model for classification?

Logistic regression passes a linear combination of the features through the sigmoid function, which squashes the output into the range (0, 1). This value can be read as the probability of the positive class, and a threshold (typically 0.5) turns the probability into a class label. A minimal sketch follows.
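A minimal sketch of how the probability and the threshold interact, reusing the logistic regression model trained just above:

```python
import numpy as np

# Probability of the positive class (attrition) for each test sample
proba = model.predict_proba(X_test)[:, 1]

# Thresholding at 0.5 should reproduce model.predict(X_test)
manual_pred = (proba >= 0.5).astype(int)
print(np.array_equal(manual_pred, model.predict(X_test)))
```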
# Present the results as a chart
results_df = pd.DataFrame(columns=['Accuracy (%)'],index=['svm','dt','gnb','knn','lr'])
results_df.index.name = 'Model'
results_df.loc['svm', 'Accuracy (%)'] = max(svm_acc) * 100
results_df.loc['dt', 'Accuracy (%)'] = max(dt_acc) * 100
results_df.loc['gnb', 'Accuracy (%)'] = max(gnb_acc) * 100
results_df.loc['knn', 'Accuracy (%)'] = max(knn_acc) * 100
results_df.loc['lr', 'Accuracy (%)'] = max(lr_acc) * 100
plt.figure(figsize=(10, 4))
ax1 = plt.subplot(1, 2, 1)
results_df.plot(y=['Accuracy (%)'], kind='bar', ylim=[70, 100], ax=ax1, title='Accuracy(%)', legend=False)
plt.tight_layout()
plt.show()
Compared with accuracy, the F1 score gives a more telling evaluation of these models. Compute the F1 score for each of the five models above and present them as a bar chart.
from sklearn.svm import SVC
svm_f1=[]
clf = SVC()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
svm_f1.append(f1_score(y_test,y_pred,average='binary'))
print('svm_f1',svm_f1[0])

from sklearn.tree import DecisionTreeClassifier
depths=[1,3,5,7,9]
dt_f1=[]
for i in depths:
    model = DecisionTreeClassifier(max_depth=i)
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    dt_f1.append(f1_score(y_test,y_pred,average='binary'))
print('dt_f1',dt_f1[0])

from sklearn.naive_bayes import GaussianNB
gnb_f1=[]
model = GaussianNB()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
gnb_f1.append(f1_score(y_test,y_pred,average='binary'))
print('gnb_f1:',gnb_f1[0])

from sklearn.neighbors import KNeighborsClassifier
neighbors = [1,3,5,7,9]
knn_f1=[]
for i in neighbors:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    knn_f1.append(f1_score(y_test,y_pred,average='binary'))
print('knn_f1:',knn_f1[0])

from sklearn.linear_model import LogisticRegression
lr_f1=[]
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
lr_f1.append(f1_score(y_test,y_pred,average='binary'))
print('lr_f1:', lr_f1[0])
svm_f1 0.0
dt_f1 0.0
gnb_f1: 0.10810810810810811
knn_f1: 0.10869565217391305
lr_f1: 0.0
# TODO
# Compute the F1 scores of the five models above, store them in the DataFrame
# variable results_df, and draw them as a bar chart
results_df = pd.DataFrame(columns=['f1'],index=['svm','dt','gnb','knn','lr'])
results_df.index.name = 'Model'
results_df.loc['svm', 'f1'] = max(svm_f1)*100
results_df.loc['dt', 'f1'] = max(dt_f1)*100
results_df.loc['gnb', 'f1'] = max(gnb_f1)*100
results_df.loc['knn', 'f1'] = max(knn_f1)*100
results_df.loc['lr', 'f1'] = max(lr_f1)*100
plt.figure(figsize=(10, 4))
ax1 = plt.subplot(1, 2, 1)
# Start the y-axis at 0 so that models with F1 = 0 remain visible
results_df.plot(y=['f1'], kind='bar', ylim=[0, 30], ax=ax1, title='f1', legend=False)
plt.tight_layout()
plt.show()
Among these models, what F1 score do you see for the SVM? Judging the model by its F1 score, do you reach the same good/bad verdict as when judging by accuracy?
Which metric do you consider more reasonable for this example, F1 or accuracy, and why?

**Answer:**

The SVM's F1 score is 0, so the verdicts differ. F1 better reflects classification quality here: because the classes are imbalanced (far more employees stay than leave), a model that always predicts "no attrition" still achieves roughly 82% accuracy while catching no leavers at all, so accuracy alone is misleadingly optimistic. A sketch making the imbalance explicit follows.
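A minimal sketch that makes the class imbalance and the per-class behaviour visible, reusing the target y and the SVM clf that scored F1 = 0 above:

```python
from sklearn.metrics import classification_report

# Class balance of the target: far more 0s (stayed) than 1s (left)
print(y.value_counts(normalize=True))

# Per-class precision/recall/F1 for the SVM
print(classification_report(y_test, clf.predict(X_test), digits=4))
```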
XGBoost
import xgboost as xgb
# Note: XGBRegressor treats the 0/1 labels as a regression target, so .score()
# returns R² rather than accuracy, which explains the low train and negative test
# scores below; XGBClassifier (sketched after the output) fits a binary label better.
model = xgb.XGBRegressor()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train score: {train_score:.6f}; test score: {test_score:.6f}'.format(
    train_score=train_score, test_score=test_score))
[19:42:00] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
train score: 0.357942; test score: -0.024752
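A hedged sketch of the classifier counterpart, which optimizes logistic loss and reports accuracy directly (illustrative; the hyperparameters loosely mirror the tuned run below and results will vary between runs):

```python
from xgboost import XGBClassifier

# Binary classifier: the objective defaults to binary:logistic, score() is accuracy
clf_xgb = XGBClassifier(max_depth=4, learning_rate=0.025, n_estimators=200)
clf_xgb.fit(X_train, y_train)
print('test accuracy:', clf_xgb.score(X_test, y_test))
```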
import xgboost as xgb
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 4,
          'lambda': 10,
          'subsample': 0.75,
          'colsample_bytree': 0.75,
          'min_child_weight': 2,
          'eta': 0.025,
          'seed': 0,
          'nthread': 8,
          'silent': 1}
plst = list(params.items())
dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 1000
model = xgb.train(plst, dtrain, num_rounds)
dtest = xgb.DMatrix(X_test)
ypred = model.predict(dtest)

# Set a threshold and report some evaluation metrics:
# predicted probabilities >= 0.5 become class 1, the rest class 0
y_pred = (ypred >= 0.5)*1

from sklearn import metrics
print('AUC: %.4f' % metrics.roc_auc_score(y_test, ypred))
print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
print('Precision: %.4f' % metrics.precision_score(y_test, y_pred))
AUC: 0.5970
ACC: 0.8197
Recall: 0.0000
F1-score: 0.0000
Precision: 0.0000
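Recall, precision and F1 are all zero because at the 0.5 threshold the model never predicts the minority class. A hedged sketch of one common remedy, up-weighting the positive class via XGBoost's scale_pos_weight (illustrative; the ratio used is the usual negative/positive count heuristic):

```python
# Weight positive (attrition) samples by the negative/positive class ratio
ratio = float((y_train == 0).sum()) / (y_train == 1).sum()
params_balanced = dict(params, scale_pos_weight=ratio)
model_balanced = xgb.train(list(params_balanced.items()), dtrain, num_rounds)
ypred_bal = (model_balanced.predict(dtest) >= 0.5) * 1
print('F1-score: %.4f' % metrics.f1_score(y_test, ypred_bal))
```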