当前位置:   article > 正文

Python数据分析实战之:特征重要性分析_python 特征重要性

python 特征重要性

提醒

  • pandas 读取 .xls 文件需要 xlrd >= 1.1.0;注意 xlrd 2.0 起已不再支持 .xlsx 格式,读取 .xlsx 文件请安装 openpyxl

代码

import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 混淆矩阵
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, f1_score, precision_score, recall_score
from sklearn.svm import SVC

## pandas 显示全部单元格

# Show DataFrames without any truncation (all rows, all columns, full cell text).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# -1 was the legacy "unlimited" sentinel for max_colwidth; it is deprecated
# since pandas 1.0 and rejected by current versions — None is the supported value.
pd.set_option('display.max_colwidth', None)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19

数据处理

path = "./2019-2020年.xlsx"

# df = pd.read_csv(path, error_bad_lines=False)
df = pd.read_excel(path)

# OTT and TOAST subtypes have too many missing values — exclude them from the analysis.
df = df.drop(['OTT', 'TOAST subtypes'], axis=1)


# The remaining columns have only a few missing values; drop those rows outright.
df = df.dropna(axis=0, how='any')

df = df.reset_index()

# Some columns contain strings; every feature must be numeric (int/float) to train.
# Inspect which columns are dtype object and convert them to int or float64.

# Only this single cell is malformed — set it to 0.
# Use .loc instead of chained indexing (df[col][row] = ...): chained assignment
# raises SettingWithCopyWarning and, under pandas copy-on-write, mutates a
# throwaway copy so the original DataFrame is left unchanged.
df.loc[74, 'Coronary heart disease'] = 0

# Data is clean now.
df.head(10)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
indexSexMedication before thrombolytic therapyAgeAge.1Periventricular White MatterDeep White MatterThe degree of WMHSmokingDrinkingAtrialFibrillationHypertensionDiabetesHyperlipidemiaCoronary heart diseaseHeart failureStrokeTIAWBCNLNLRHBPLTPCVPTINRAPTTTTFibrinogenEmergency blood sugarFasting blood glucoseCreatinineHDLLDLHDL/LDL比值CholesterolTriglycerideHBLACHCYDNTsBPdBPBaseline NIHSS scoreHemorrhagic Transformation(HT)Early neurological deterioration (END)Prognosis&0 (mRS0-2:0;3-6:1)90dmRSPrognosis&1(mRS0-1:0;2-6:1)
050258.00.02121.01.00.01000000.08.45.442.352.3100001412663810.70.9234.514.93.525.685.1166.20.923.150.2900004.271.994.611.025.0177.090.0800151
160253.00.02121.01.00.01000000.011.66.54.041.608911145259459.90.8630.615.33.346.386.1374.01.464.540.3215866.112.395.511.337.0166.098.0400010
2101277.01.03330.00.00.00000010.07.836.630.5911.237288103225309.20.802814.94.821.767.11107.91.191.031.1553402.280.415.013.030.0150.090.0900000
3130265.00.01011.00.00.00000000.013.8410.32.334.4206011563274711.81.0238.312.42.698.224.9885.00.722.270.3171813.511.345.712.043.0150.0102.0900021
4261266.00.02330.00.00.01100010.04.7311.62.564.5312501422814012.51.0826.118.22.496.364.0295.00.822.940.2789124.141.675.922.017.0147.075.0300021
5270274.01.03330.00.00.01001000.06.185.070.549.3888891101523212.31.0626.716.13.3310.407.0780.31.112.710.4095944.200.976.719.022.0125.080.0500000
6301270.01.01111.01.00.00000000.08.56.191.613.8447201362584011.30.9731.414.14.295.984.4054.31.133.590.3147635.281.005.415.015.0147.089.0200000
7340158.00.01111.01.00.01100000.09.486.851.733.9595381522034710.10.8731.212.74.4310.468.3772.81.423.550.4000004.961.518.613.443.0160.0105.0400010
8361265.00.01110.00.01.00000010.015.611.523.063.7647061422564311.20.9631.411.64.637.324.7550.61.082.020.5346533.521.026.521.015.0110.076.01100131
9380282.01.03331.00.00.01000000.010.318.121.246.5483871252473711.30.9731.412.54.796.254.8872.21.652.560.6445314.210.775.618.725.0220.0104.0400010

分离 data 和 label

# Inspect label distribution.
# The two outcome definitions live in fixed positions at the end of the sheet:
#   columns[-3] -> 'Prognosis&0 (mRS0-2:0;3-6:1)'
#   columns[-1] -> 'Prognosis&1(mRS0-1:0;2-6:1)'
label_1 = df.columns[-3]
label_2 = df.columns[-1]

# df[label_1].hist()

# df[label_2].hist()

# Split into training data and the two label vectors:
# data is the feature matrix; label1_data / label2_data hold the two outcome definitions.

label1_data = df[label_1]
label2_data = df[label_2]


# Exclude from the features: both label columns, the 'index' column left over
# from reset_index() (carries no information), and the raw '90dmRS' score
# (it directly encodes the outcome and would leak the label).
# Select once and .copy() so later mutations don't hit a view of df
# (drop(..., inplace=True) on a slice raises SettingWithCopyWarning).
excluded = {label_1, label_2, 'index', '90dmRS'}
data = df[[column for column in df.columns if column not in excluded]].copy()

data.head(10)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
SexMedication before thrombolytic therapyAgeAge.1Periventricular White MatterDeep White MatterThe degree of WMHSmokingDrinkingAtrialFibrillationHypertensionDiabetesHyperlipidemiaCoronary heart diseaseHeart failureStrokeTIAWBCNLNLRHBPLTPCVPTINRAPTTTTFibrinogenEmergency blood sugarFasting blood glucoseCreatinineHDLLDLHDL/LDL比值CholesterolTriglycerideHBLACHCYDNTsBPdBPBaseline NIHSS scoreHemorrhagic Transformation(HT)Early neurological deterioration (END)
00258.00.02121.01.00.01000000.08.45.442.352.3100001412663810.70.9234.514.93.525.685.1166.20.923.150.2900004.271.994.611.025.0177.090.0800
10253.00.02121.01.00.01000000.011.66.54.041.608911145259459.90.8630.615.33.346.386.1374.01.464.540.3215866.112.395.511.337.0166.098.0400
21277.01.03330.00.00.00000010.07.836.630.5911.237288103225309.20.802814.94.821.767.11107.91.191.031.1553402.280.415.013.030.0150.090.0900
30265.00.01011.00.00.00000000.013.8410.32.334.4206011563274711.81.0238.312.42.698.224.9885.00.722.270.3171813.511.345.712.043.0150.0102.0900
41266.00.02330.00.00.01100010.04.7311.62.564.5312501422814012.51.0826.118.22.496.364.0295.00.822.940.2789124.141.675.922.017.0147.075.0300
50274.01.03330.00.00.01001000.06.185.070.549.3888891101523212.31.0626.716.13.3310.407.0780.31.112.710.4095944.200.976.719.022.0125.080.0500
61270.01.01111.01.00.00000000.08.56.191.613.8447201362584011.30.9731.414.14.295.984.4054.31.133.590.3147635.281.005.415.015.0147.089.0200
70158.00.01111.01.00.01100000.09.486.851.733.9595381522034710.10.8731.212.74.4310.468.3772.81.423.550.4000004.961.518.613.443.0160.0105.0400
81265.00.01110.00.01.00000010.015.611.523.063.7647061422564311.20.9631.411.64.637.324.7550.61.082.020.5346533.521.026.521.015.0110.076.01100
90282.01.03331.00.00.01000000.010.318.121.246.5483871252473711.30.9731.412.54.796.254.8872.21.652.560.6445314.210.775.618.725.0220.0104.0400

训练

def train(model, dataset, labelset, random_state=None):
    """Fit *model* on a stratified 80/20 split and report its accuracy.

    Prints the held-out accuracy and the cross-validation accuracies,
    then displays the confusion matrix of the held-out test set as a heatmap.

    Parameters
    ----------
    model : estimator
        Any scikit-learn classifier exposing fit/score/predict.
    dataset : pandas.DataFrame
        Feature matrix.
    labelset : pandas.Series
        Target labels aligned with *dataset*.
    random_state : int or None, optional
        Seed for the train/test split. Pass an int for reproducible splits;
        the default None keeps the original nondeterministic behavior.
    """
    # Stratify so train and test keep the original class ratio
    # (the labels are imbalanced); train_size is implied by test_size=0.2.
    x_train, x_test, y_train, y_test = train_test_split(dataset.values
                                                        , labelset.values
                                                        , test_size=0.2
                                                        , shuffle=True
                                                        , stratify=labelset
                                                        , random_state=random_state)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    # Cross-validation on the full dataset for a less split-dependent estimate.
    accs = cross_val_score(model, dataset.values, labelset.values, verbose=0)
    print(f'validation acc is: {score}')
    print(f'cross validation accs are: {accs}')

    # Confusion matrix on the held-out split.
    y_pre = model.predict(x_test)
    metri = confusion_matrix(y_test, y_pre)
    sns.heatmap(metri, annot=True)
    plt.show()
    
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

训练结果 & 混淆矩阵

# Linear SVM with balanced class weights (the labels are imbalanced);
# a linear kernel is required so coef_ is available for the importance analysis below.
svc1 = SVC(class_weight='balanced', kernel='linear')
train(svc1, data, label1_data)
  • 1
  • 2
validation acc is: 0.8863636363636364
cross validation accs are: [0.90909091 0.69767442 0.76744186 0.88372093 0.88372093]
  • 1
  • 2

在这里插入图片描述

# Same linear SVM setup, fitted against the second label definition.
svc2 = SVC(class_weight='balanced', kernel='linear')
train(svc2, data, label2_data)
  • 1
  • 2
validation acc is: 0.8863636363636364
cross validation accs are: [0.81818182 0.6744186  0.72093023 0.76744186 0.76744186]
  • 1
  • 2

在这里插入图片描述

# Logistic regression on label 1; max_iter raised to 10000 so the solver
# converges on these unscaled features.
lr1 = LogisticRegression(class_weight='balanced', max_iter=10000)
train(lr1, data, label1_data)
  • 1
  • 2
validation acc is: 0.9090909090909091
cross validation accs are: [0.81818182 0.6744186  0.76744186 0.88372093 0.90697674]
  • 1
  • 2

在这里插入图片描述

# Logistic regression on label 2, same configuration as lr1.
lr2 = LogisticRegression(class_weight='balanced', max_iter=10000)
train(lr2, data, label2_data)
  • 1
  • 2
validation acc is: 0.8863636363636364
cross validation accs are: [0.79545455 0.69767442 0.74418605 0.6744186  0.81395349]
  • 1
  • 2

在这里插入图片描述

各种 feature 的重要性

def make_coef_dictNdf(data_columns, coef):
    """Pair each feature name with its fitted coefficient.

    Returns a (dict, DataFrame) tuple: the dict maps feature name to
    coefficient, and the DataFrame holds the same values in a single
    'influence' column indexed by feature name.
    """
    flat_coefs = coef.squeeze()
    name_influence_dic = dict(zip(data_columns, flat_coefs))
    name_influence_df = pd.DataFrame(data=name_influence_dic, index=['influence']).T
    return name_influence_dic, name_influence_df
## label2 结果各个特征的重要性
  • 1
  • 2
  • 3
  • 4
  • 5
def write(filename, name_df_dic):
    """Write each DataFrame in *name_df_dic* to *filename*, one sheet per key.

    Parameters
    ----------
    filename : str
        Target Excel file path.
    name_df_dic : dict[str, pandas.DataFrame]
        Mapping of sheet name -> DataFrame to write.
    """
    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
    with pd.ExcelWriter(filename) as writer:
        for k, v in name_df_dic.items():
            v.to_excel(writer, sheet_name=k)

# writer = pd.ExcelWriter("逻辑回归.xlsx")
# label1_df.to_excel(writer, sheet_name="label1")
# label2_df.to_excel(writer, sheet_name="label2")
# writer.save()
# writer.close()

def plot(figsize, name_influence_df_lst, img_label_lst, title):
    """Overlay bar charts of feature influences, one series per DataFrame.

    Each DataFrame must have an 'influence' column indexed by feature name;
    img_label_lst supplies the legend entry for each series.
    """
    plt.figure(figsize=figsize)
    for position, frame in enumerate(name_influence_df_lst):
        plt.bar(x=frame.index, height=frame['influence'], label=img_label_lst[position])
    plt.title(title)
    plt.legend()
    # Feature names are long — rotate the tick labels so they stay readable.
    plt.xticks(rotation=90)
# plt.figure(figsize=(20,10))
# plt.bar(x=label1_df.index, height=label1_df['influence'],label='label1')
# plt.bar(x=label1_df.index, height=label2_df['influence'], label='label2')
# plt.legend()
# plt.xticks(rotation=90) # 旋转90度
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
# Feature importances = fitted logistic-regression coefficients, one set per label definition.
lr1_dic, lr1_df = make_coef_dictNdf(data.columns, lr1.coef_)
lr2_dic, lr2_df = make_coef_dictNdf(data.columns, lr2.coef_)
  • 1
  • 2
# SVM coefficients (available because kernel='linear'), one set per label definition.
svc1_dic, svc1_df = make_coef_dictNdf(data.columns, svc1.coef_)
svc2_dic, svc2_df = make_coef_dictNdf(data.columns, svc2.coef_)
  • 1
  • 2
# Export the logistic-regression importances, one sheet per label definition.
write("逻辑回归.xlsx", {'label1': lr1_df, 'label2': lr2_df})
  • 1
# Export the SVM importances, one sheet per label definition.
write("SVM.xlsx", {'label1': svc1_df, 'label2': svc2_df})
  • 1
# Bar chart of logistic-regression importances for both label definitions.
plot((20,10), [lr1_df, lr2_df], ['label1', 'label2'], 'lr')
  • 1

在这里插入图片描述

# Bar chart of SVM importances for both label definitions.
plot((20,10), [svc1_df, svc2_df], ['label1', 'label2'], 'svm')
  • 1

在这里插入图片描述

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/433531
推荐阅读
相关标签
  

闽ICP备14008679号