赞
踩
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
pd.set_option('display.max_columns', None)
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
# Load the HR attrition dataset; the CSV is expected in the working directory.
df = pd.read_csv("HR-Employee-Attrition.csv")

# Print the number of distinct values per column.
for column in df.columns:
    print(f"{column}: 唯一值数量 {df[column].nunique()}")

# Notebook leftover: outside an interactive session this result is discarded.
df.head(1)

# Remove four columns before modelling.
# NOTE(review): presumably these are constant or ID-like columns (see the
# unique-value counts printed above) -- confirm against the data.
df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)
df.info()
# Replace each listed text column with integer codes, in place.
# A fresh LabelEncoder is fitted per column, so codes are independent
# between columns (the fitted encoders are not kept).
for column in [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
]:
    df[column] = LabelEncoder().fit_transform(df[column])
# Annotated correlation heatmap of every (now numeric) column, drawn on a
# dedicated 18x18 figure.
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(df.corr(), annot=True, linewidths=.5, fmt='.2f', ax=ax)

# Remove JobLevel from the feature set.
# NOTE(review): presumably dropped because the heatmap shows it strongly
# correlated with another feature -- confirm which one.
df.drop(['JobLevel'], axis="columns", inplace=True)
# Model building: train a random forest of 70 decision trees on all
# remaining features and plot its confusion matrix.
X = df.drop('Attrition', axis=1)
y = df.Attrition

# Hold out 30% of the rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf_rf = RandomForestClassifier(n_estimators=70, random_state=43)
# fit() returns the estimator itself; the clr_rf alias is used further down.
clr_rf = clf_rf.fit(x_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf_rf.predict(x_test)
ac = accuracy_score(y_test, y_pred)
print('正确率: ', ac)
cm = confusion_matrix(y_test, y_pred)

# Start a fresh figure so the matrix is not drawn on top of whatever Axes
# happens to be current (e.g. the correlation heatmap).
plt.figure()
sns.heatmap(cm, annot=True, fmt="d")
# Impurity-based feature importances of the fitted forest, printed from
# most to least important as "rank) name score".
importances = clr_rf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, descending by score
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 40, x_train.columns[idx], importances[idx]))
# Permutation importance measured on the held-out set (mean over repeats),
# printed from most to least important as "rank) name score".
# No random_state is passed, so the scores can differ between runs.
perm_importance = permutation_importance(clr_rf, x_test, y_test).importances_mean
indices = np.argsort(perm_importance)[::-1]  # feature positions, descending by score
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 40, x_test.columns[idx], perm_importance[idx]))
# Univariate feature selection followed by a second random forest.
#
# NOTE(review): the original script used `select_feature` without ever
# defining it (NameError). The file imports SelectKBest and chi2 and uses
# them nowhere else, so the selector is reconstructed here. k=10 is an
# assumption chosen to match the 10 features kept by the RFE step --
# confirm the intended value. chi2 requires non-negative feature values.
select_feature = SelectKBest(chi2, k=10).fit(x_train, y_train)

x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)

# No random_state here, so results vary from run to run.
clf_rf_2 = RandomForestClassifier(n_estimators=70)
clr_rf_2 = clf_rf_2.fit(x_train_2, y_train)

# Predict once and reuse the result for both metrics.
y_pred_2 = clf_rf_2.predict(x_test_2)
ac_2 = accuracy_score(y_test, y_pred_2)
print("正确率:", ac_2)
cm_2 = confusion_matrix(y_test, y_pred_2)

# Fresh figure so this matrix does not overdraw an earlier plot.
plt.figure()
sns.heatmap(cm_2, annot=True, fmt="d")
# Recursive feature elimination: repeatedly fit a 70-tree forest and drop
# one feature per step until 10 remain.
clf_rf_3 = RandomForestClassifier(n_estimators=70, random_state=42)
rfe = RFE(estimator=clf_rf_3, n_features_to_select=10, step=1)
rfe = rfe.fit(x_train, y_train)
print('由RFE选取最佳10个特征', x_train.columns[rfe.support_])

# Keep only the RFE-selected columns. The original transformed with
# `select_feature` here, so the RFE result was never actually used.
x_train_3 = rfe.transform(x_train)
x_test_3 = rfe.transform(x_test)

# Evaluate a new 50-tree forest on the reduced feature set. Rebinding
# clf_rf_3 is safe because RFE fitted its own clone of the estimator.
# No random_state here, so results vary from run to run.
clf_rf_3 = RandomForestClassifier(n_estimators=50)
clr_rf_3 = clf_rf_3.fit(x_train_3, y_train)

# Predict once and reuse the result for both metrics.
y_pred_3 = clf_rf_3.predict(x_test_3)
ac_3 = accuracy_score(y_test, y_pred_3)
# The value is accuracy_score, so label it as accuracy (the original label
# said "precision"), matching the label used for the baseline model.
print('正确率: ', ac_3)
cm_3 = confusion_matrix(y_test, y_pred_3)

# Fresh figure so this matrix does not overdraw an earlier plot.
plt.figure()
sns.heatmap(cm_3, annot=True, fmt="d")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。