Loading the data
In [27]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
# Show all rows and columns in output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
In [28]:
# Load the data
train_data = pd.read_csv('/home/mw/input/MLearn9130/训练营.csv')
test_data = pd.read_csv('/home/mw/input/MLearn9130/测试集.csv')
train_data.head(10)
test_data.head()
Out[28]:
(table output: the first five rows of test_data. The raw CSV carries bilingual headers — Age 年龄, Gender 性别, Region 区域, Weight 体重, Height 身高, Body Mass Index 体重指数, Obesity 肥胖腰围, Waist 腰围, Maximum Blood Pressure 最高血压, Minimum Blood Pressure 最低血压, Good Cholesterol 好胆固醇, Bad Cholesterol 坏胆固醇, Total Cholesterol 总胆固醇, Dyslipidemia 血脂异常, PVD, Physical Activity 体育活动, Education 教育, Unmarried 未婚, Income 收入, Source of Care 护理来源, PoorVision 视力不佳, Alcohol Consumption 饮酒, HyperTension 高血压, Family HyperTension 家庭高血压, Diabetes 糖尿病, Family Diabetes 家族糖尿病, Hepatitis 肝炎, Family Hepatitis 家族肝炎, Chronic Fatigue 慢性疲劳 — plus an id column.)
# Data preprocessing
# Rename the columns to keep only the Chinese part, dropping the English prefix
old_columns = train_data.columns.tolist()
new_columns = train_data.columns.str.split(
    r'\n').str[-1].str.split('\u2028').str[-1].tolist()
columns_dic = dict(zip(old_columns, new_columns))
train_data.rename(columns=columns_dic, inplace=True)
test_data.rename(columns=columns_dic, inplace=True)
train_data.columns
Out[29]:
Index(['年龄', '性别', '区域', '体重', '身高', '体重指数', '肥胖腰围', '腰围', '最高血压', '最低血压',
       '好胆固醇', '坏胆固醇', '总胆固醇', '血脂异常', 'PVD', '体育活动', '教育', '未婚', '收入', '护理来源',
       '视力不佳', '饮酒', '高血压', '家庭高血压', '糖尿病', '家族糖尿病', '肝炎', '家族肝炎', '慢性疲劳',
       'ALF', 'id'],
      dtype='object')
In [30]:
# Inspect the data for obvious anomalies
train_data.describe()
Out[30]:
(table output: train_data.describe() — count, mean, std, min, quartiles, and max for all 28 numeric columns)

From the summary above, there are 4,200 rows in total and nothing looks wildly anomalous, but quite a few columns have missing values (for example 收入 has only 3,654 non-null entries and 视力不佳 only 3,932, against 4,200 rows).
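To quantify that missingness before deciding how to fill it, a quick count works (a minimal sketch over the train_data frame loaded above):

# Number of missing values per column, largest first; columns with none are hidden
missing = train_data.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False))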
Feature enhancement
Feature inspection
In [31]:
# Separate the categorical columns from the numeric ones
cate_columns = [
    col for col in train_data.columns if train_data[col].max() == 1]
cate_columns.extend(['性别', '区域', '体育活动', '护理来源'])
number_columns = [col for col in train_data.columns if col not in cate_columns]
In [32]:
# Inspect the numeric features
for col in number_columns:
    # Patients diagnosed with acute liver failure
    plt.hist(train_data[train_data['ALF'] == 1][col], bins=10,
             alpha=0.8, label='确诊急性肝衰竭患者')
    # Patients without acute liver failure
    plt.hist(train_data[train_data['ALF'] == 0][col], bins=10,
             alpha=0.2, label='没有急性肝衰竭患者')
    plt.legend(loc='upper left')
    plt.xlabel(col)
    plt.ylabel('计数')
    plt.title('特征【{}】的分布图'.format(col))
    plt.show()
In [33]:
# Look at the distribution of the categorical variables
sns.set_palette('muted')
cate_features_columns = [x for x in cate_columns if x != 'ALF']
for col in cate_features_columns:
    # Count after grouping
    grouped = train_data[[col, 'ALF']].groupby([col, 'ALF']).size()
    # Percentage within each group; level=0 recomputes the percentage inside
    # each first-level category
    group_percentages = grouped.groupby(level=0).apply(
        lambda x: 100 * x / float(x.sum()))
    group_percentages.unstack().plot(kind='bar', stacked=True)
    plt.legend(loc='upper left')
    plt.xlabel(col)
    plt.ylabel('百分比')
    plt.title('特征【{}】的分布图'.format(col))
    plt.show()

The distribution plots above suggest the following rough observations:
Compared with patients who do not have acute liver failure, ALF patients on average are older, shorter, have a larger waist, a higher maximum blood pressure, a lower minimum blood pressure, and less good cholesterol.
They are also more likely to have an obese waist circumference, to have PVD, to be uneducated, to be married, to have no income, to have poor vision, to drink, to have hypertension, to have no family history of hypertension, to have diabetes, to have hepatitis, a family history of hepatitis, or chronic fatigue; patients from the east and north outnumber those from the west and south, and less physical activity also goes with higher incidence. For some features, however, the difference between the ALF and non-ALF groups is not obvious, and because the non-ALF group is so much larger it is hard to tell whether such a feature is genuinely unrelated to the outcome or the relationship is simply not visible; a quick association test (sketched below) can help.
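One way to put numbers on the unclear cases is a chi-square test of each categorical feature against ALF (a minimal sketch, assuming the train_data frame and cate_features_columns list defined above; this test is not part of the original pipeline):

from scipy.stats import chi2_contingency

for col in cate_features_columns:
    # Two-way contingency table of feature levels vs. ALF outcome
    table = pd.crosstab(train_data[col], train_data['ALF'])
    chi2, p, _, _ = chi2_contingency(table)
    # Small p-values suggest a real association despite the class imbalance
    print('{}: chi2={:.1f}, p={:.4f}'.format(col, chi2, p))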
Feature processing
Missing-value imputation, plus standardization of the numeric features
The numeric features above look roughly normally distributed, with no wildly implausible outliers, so for imputing missing values in the numeric columns I went with the mean.
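As a quick sanity check on that assumption (a sketch; a strongly skewed column would argue for median imputation instead):

# Skewness close to 0 supports mean imputation; |skew| well above 1 favors the median
print(train_data[number_columns].skew().sort_values())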
In [34]:
# Pull out the columns whose content is text
def find_object_columns(data):
    object_columns = []
    for x in data.columns:
        if data[x].dtype == 'object':
            object_columns.append(x)
    return object_columns


need_one_hot_list = find_object_columns(train_data)
cate_columns_not_need_onehot = [
    x for x in cate_columns if x not in need_one_hot_list]
cate_columns_not_need_onehot
Out[34]:
['肥胖腰围', '血脂异常', 'PVD', '教育', '未婚', '收入', '视力不佳', '饮酒', '高血压', '家庭高血压',
 '糖尿病', '家族糖尿病', '肝炎', '家族肝炎', '慢性疲劳', 'ALF', '体育活动']
In [35]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_cate_columns = [x for x in cate_columns_not_need_onehot if x != 'ALF']
num_transformer = Pipeline([('mean_imputer', SimpleImputer(strategy='mean')),
                            ('std_Scaler', StandardScaler())])
cate_transformer = Pipeline(
    [('most_frequent_imputer', SimpleImputer(strategy='most_frequent'))])
# The transformers receive and return ndarrays.
# The preprocessor concatenates its two transformed parts.
# This step handles the numerically typed columns (both the measurements and
# the already-encoded categorical columns).
preprocesser = ColumnTransformer(transformers=[
    ('num', num_transformer, number_columns),
    ('cate', cate_transformer, num_cate_columns)
])
cols_transformed = number_columns + num_cate_columns
train_data_X = train_data.drop('ALF', axis=1)
train_imputed_part1 = preprocesser.fit_transform(train_data_X)
train_imputed_part1_df = pd.DataFrame(
    train_imputed_part1, columns=cols_transformed)
# fit_transform above learned the statistics of train_data; the test data must
# be transformed with those same statistics, hence plain transform here
test_imputed_part1 = preprocesser.transform(test_data)
test_imputed_part1_df = pd.DataFrame(
    test_imputed_part1, columns=cols_transformed)
In [36]:
# Now handle the text-valued columns separately
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# One-hot encoding transformer
onehot_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])
# Define the preprocessor
preprocessor_onehot = ColumnTransformer(
    transformers=[
        ('onehot', onehot_transformer, need_one_hot_list),
    ])
train_cate_columns_onehotted = preprocessor_onehot.fit_transform(train_data_X)
test_cate_columns_onehotted = preprocessor_onehot.transform(test_data)
# Recover the column names produced by the OneHotEncoder
onehot_features = preprocessor_onehot.named_transformers_[
    'onehot'][0].get_feature_names(need_one_hot_list)
train_onehotted_part2 = pd.DataFrame(
    train_cate_columns_onehotted, columns=onehot_features)
test_onehotted_part2 = pd.DataFrame(
    test_cate_columns_onehotted, columns=onehot_features)
In [37]:
test_onehotted_part2.head()
Out[37]:
   性别_F  性别_M  区域_east  区域_north  区域_south  区域_west  护理来源_  护理来源_Governament Hospital  护理来源_Never Counsulted  护理来源_Private Hospital  护理来源_clinic
0   0.0    1.0      0.0       0.0       1.0      0.0     0.0                        0.0                    0.0                    1.0          0.0
1   1.0    0.0      1.0       0.0       0.0      0.0     0.0                        0.0                    0.0                    1.0          0.0
2   0.0    1.0      0.0       0.0       1.0      0.0     0.0                        0.0                    0.0                    0.0          1.0
3   1.0    0.0      0.0       0.0       0.0      1.0     0.0                        0.0                    0.0                    1.0          0.0
4   0.0    1.0      1.0       0.0       0.0      0.0     0.0                        0.0                    0.0                    1.0          0.0
# Merge the two processed parts
train_data_finished_X = pd.concat(
    [train_imputed_part1_df, train_onehotted_part2], axis=1)
test_data_finished = pd.concat(
    [test_imputed_part1_df, test_onehotted_part2], axis=1)
In [39]:
train_data_finished_X.head()
Out[39]:
(table output: the first five rows of train_data_finished_X — the standardized numeric columns such as 年龄, 体重, 身高, followed by the imputed binary/ordinal columns and the one-hot columns 性别_*, 区域_*, 护理来源_*)

Feature selection
In [40]:
# Heat map of the correlations between features
sns.heatmap(train_data_finished_X.corr(), cmap='Blues')
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f46e96fe908>
In [41]:
# Feature selection: keep only the features whose absolute correlation with
# ALF exceeds 0.04
train_data_finished = pd.concat(
    [train_data_finished_X, train_data['ALF']], axis=1)
feature_mask = train_data_finished.corr()['ALF'].abs() > 0.04
high_related_feature = train_data_finished.columns[feature_mask]
high_related_feature
# len(feature_mask)
Out[41]:
Index(['年龄', '身高', '腰围', '最高血压', '最低血压', 'PVD', '教育', '未婚', '收入', '视力不佳',
       '饮酒', '高血压', '家庭高血压', '糖尿病', '肝炎', '慢性疲劳', '体育活动', '区域_east',
       '区域_south', '护理来源_Never Counsulted', '护理来源_Private Hospital', 'ALF'],
      dtype='object')

Machine learning
Searching for a suitable model and parameters
In [42]:
# Null accuracy
train_data_finished['ALF'].value_counts(normalize=True)
# The null accuracy is already above 92%, so there is a class-imbalance problem
Out[42]:
0.0    0.92381
1.0    0.07619
Name: ALF, dtype: float64
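That 92.4% is exactly what a model that always predicts "no ALF" would score, which is why accuracy alone is misleading here. A minimal sketch with scikit-learn's DummyClassifier makes the baseline concrete:

from sklearn.dummy import DummyClassifier

# A baseline that ignores the features and always predicts the majority class
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(train_data_finished_X, train_data_finished['ALF'])
print(dummy.score(train_data_finished_X, train_data_finished['ALF']))  # ~0.924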
In [45]:
# Carve my own train and test splits out of the training data to rehearse the
# model's predictive ability
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE

# Try SMOTE to synthesize new minority-class samples and counter the imbalance
sm = SMOTE(random_state=2)
# Note: at this point high_related_feature still contains 'ALF' itself (it is
# only filtered out further down), so the label leaks into the features —
# which is why several models below score a perfect 1.0
X_train, X_test, y_train, y_test = train_test_split(
    train_data_finished[high_related_feature],
    train_data_finished['ALF'],
    test_size=0.3,
    random_state=123)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

For imbalanced data, the F1 score and AUC are two very commonly used metrics; both take the positive and the negative class into account.
The F1 score is the harmonic mean of precision and recall; its advantage is that it accounts both for how accurate the positive predictions are (precision) and for how much of the positive class is covered (recall).
AUC is the area under the Receiver Operating Characteristic curve (ROC curve); because it reflects the model's behavior at every classification threshold, it is likewise well suited to imbalanced data. A small illustration of both metrics follows.
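As an illustration on made-up labels (a minimal sketch, not the competition data), both metrics come straight from sklearn.metrics:

from sklearn.metrics import f1_score, roc_auc_score

# Imbalanced toy ground truth: 8 negatives, 2 positives
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0]   # hard labels, for the F1 score
y_score = [.1, .2, .1, .3, .2, .1, .4, .6, .9, .4]  # scores, for the AUC

# Precision = 1/2 and recall = 1/2, so F1 = 2*0.5*0.5/(0.5+0.5) = 0.5
print(f1_score(y_true, y_pred))
# AUC: the probability a random positive is ranked above a random negative
print(roc_auc_score(y_true, y_score))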
In [46]:
# Now search for the best-suited model
from sklearn.model_selection import GridSearchCV


def get_best_model_and_accuracy(
        model, params, X_train, y_train, X_test, y_test):
    grid = GridSearchCV(model,   # the model to search over
                        params,  # the parameters to try
                        error_score=0.)  # score 0 if a fit fails
    grid.fit(X_train, y_train)  # fit the model and parameters
    y_pre = grid.predict(X_test)
    accuracy = accuracy_score(y_test, y_pre)
    f1 = f1_score(y_test, y_pre)
    precision = precision_score(y_test, y_pre)
    recall = recall_score(y_test, y_pre)
    # The classic performance metric
    print("Best Accuracy: {}".format(accuracy))
    # The parameters that achieved the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # Average fit time in seconds
    print("Average Time to Fit (s): {}".format(
        round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # Average scoring time in seconds;
    # a hint at how the model would perform in the real world
    print("Average Time to Score (s): {}".format(
        round(grid.cv_results_['mean_score_time'].mean(), 3)))
    print('F1 score: ', f1)
    print('Precision: ', precision)
    print('Recall: ', recall)


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Set up the grid-search variables: one parameter grid per model
# Logistic regression
lr_params = {'C': [1e-1, 1e0, 1e1, 1e2], 'penalty': ['l1', 'l2']}
# KNN
knn_params = {'n_neighbors': [1, 3, 5, 7]}
# Decision tree
tree_params = {'max_depth': [None, 1, 3, 5, 7]}
# Random forest
forest_params = {'n_estimators': [10, 50, 100],
                 'max_depth': [None, 1, 3, 5, 7]}

# Instantiate the models
lr = LogisticRegression()
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
In [52]:
get_best_model_and_accuracy(
    lr, lr_params, X_train_res, y_train_res, X_test, y_test)

Best Accuracy: 1.0
Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Average Time to Fit (s): 0.349
Average Time to Score (s): 0.011
F1 score:  1.0
Precision:  1.0
Recall:  1.0

(The discussion below refers to an earlier run of this search, whose metrics were far weaker than the perfect scores shown above; the figures are still a useful vehicle for explaining the metrics.)
F1 score: a value between 0 and 1, where close to 1 means the model performs well and close to 0 means it performs badly. F1 is the harmonic mean of precision and recall and tries to balance the two. In that run the F1 score was 0.04255, a very low score indicating that both precision and recall were poor.
Precision: of the samples the model predicts as positive, the fraction that really are positive. In that run precision was 0.4, meaning 40% of the samples flagged as positive actually were positive.
Recall: of all actual positive samples, the fraction the model predicts correctly. In that run recall was 0.02247, meaning only about 2.25% of the true positives were caught.
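Those three figures are mutually consistent, since F1 is the harmonic mean of precision and recall (a minimal arithmetic check):

# F1 = 2 * P * R / (P + R)
p, r = 0.4, 0.02247
print(2 * p * r / (p + r))  # ~0.04255, matching the reported F1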
In [48]:
get_best_model_and_accuracy(
    knn, knn_params, X_train_res, y_train_res, X_test, y_test)

Best Accuracy: 0.9587301587301588
Best Parameters: {'n_neighbors': 1}
Average Time to Fit (s): 0.004
Average Time to Score (s): 0.385
F1 score:  0.7263157894736842
Precision:  0.6831683168316832
Recall:  0.7752808988764045
In [49]:
get_best_model_and_accuracy(
    d_tree, tree_params, X_train_res, y_train_res, X_test, y_test)

Best Accuracy: 1.0
Best Parameters: {'max_depth': None}
Average Time to Fit (s): 0.006
Average Time to Score (s): 0.002
F1 score:  1.0
Precision:  1.0
Recall:  1.0
In [50]:
get_best_model_and_accuracy(
    forest, forest_params, X_train_res, y_train_res, X_test, y_test)

Best Accuracy: 1.0
Best Parameters: {'max_depth': None, 'n_estimators': 50}
Average Time to Fit (s): 0.115
Average Time to Score (s): 0.008
F1 score:  1.0
Precision:  1.0
Recall:  1.0

This time around, the KNN model looks best at avoiding the overfitting problem (the other models all score a suspect, perfect 1.0 on the held-out set), so we continue with KNN.
In [60]:
high_related_feature
Out[60]:
Index(['年龄', '身高', '腰围', '最高血压', '最低血压', 'PVD', '教育', '未婚', '收入', '视力不佳',
       '饮酒', '高血压', '家庭高血压', '糖尿病', '肝炎', '慢性疲劳', '体育活动', '区域_east',
       '区域_south', '护理来源_Never Counsulted', '护理来源_Private Hospital', 'ALF'],
      dtype='object')
In [65]:
# Use cross-validation to see how the KNN model performs
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, recall_score

# Drop the label itself from the feature list, then resample the full
# training set
high_related_feature = [i for i in high_related_feature if i != 'ALF']
X_train_res_all, y_train_res_all = sm.fit_resample(
    train_data_finished[high_related_feature], train_data['ALF'])
test_imputed_test = test_data_finished[high_related_feature]
model = KNeighborsClassifier(n_neighbors=1)
f1 = cross_val_score(
    model, X_train_res_all, y_train_res_all, cv=5,
    scoring=make_scorer(f1_score, average='macro'))
recall = cross_val_score(
    model, X_train_res_all, y_train_res_all, cv=5,
    scoring=make_scorer(recall_score, average='macro'))
# Print the per-fold scores
print("f1: ", f1)
print("recall ", recall)

f1:  [0.9398706  0.93399325 0.9228865  0.93661314 0.9398706 ]
recall  [0.94007732 0.93427835 0.92332474 0.93685567 0.94007732]

Model training and prediction output
In [72]:
# Use the KNN model as the final predictor,
# fitted on the resampled full training set
model.fit(X_train_res_all, y_train_res_all)
result_y = model.predict(test_data_finished[high_related_feature])
In [73]:
# Write the results to a csv file
result = pd.concat([test_data['id'], pd.DataFrame(
    result_y, columns=['ALF'])], axis=1)
result.to_csv('logistic_test1.csv', encoding='utf-8', index=False)