2022讯飞——糖尿病遗传风险检测挑战赛解决方案

0. 赛事背景





  训练集说明

  测试集说明

  评估指标



1. 读取数据

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm

train_data = pd.read_csv('比赛训练集.csv',encoding='gbk')
test_data = pd.read_csv('比赛测试集.csv',encoding='gbk')
2. 数据探索及预处理

2.1 缺失值



# Fill missing diastolic blood pressure values with the TRAINING-set mean.
# The same statistic is applied to both splits so that train and test rows
# are preprocessed identically (the test split no longer uses its own mean).
dbp_train_mean = train_data['舒张压'].mean()
train_data['舒张压'] = train_data['舒张压'].fillna(dbp_train_mean)
test_data['舒张压'] = test_data['舒张压'].fillna(dbp_train_mean)

2.2 分析字段类型


train_data = train_data.drop(['编号'], axis=1)
test_data = test_data.drop(['编号'], axis=1)
2.3 计算字段相关性



train_corr = train_data.drop('糖尿病家族史',axis=1).corr()

import seaborn as sns

plt.subplots(figsize=(9,9),dpi=80,facecolor='w') # 设置画布大小,分辨率,和底色
plt.rcParams['font.sans-serif'] = ['SimHei'] # 黑体
plt.rcParams['axes.unicode_minus'] = False   # 解决无法显示符号的问题
sns.set(font='SimHei', font_scale=0.8)       # 解决Seaborn中文显示问题

fig=sns.heatmap(train_corr,annot=True, vmax=1, square=True, cmap="Blues", fmt='.2g')

3. 特征工程


  • 特征构造:尝试构建有价值的新变量;
  • 特征筛选:删除对因变量影响不大的冗余变量。


3.1 特征构造



  1. 特征的统计指标;
  2. 特征之间的四则运算;
  3. 交叉特征;
  4. 分解类别特征。如将三个颜色分解为“知道颜色”和“不知道颜色”。
  5. 特征分箱。将数值型特征变量按段划分,得到类别型特征。
  6. 重构特征。单位转换、整数部分与小数部分分离等。
  7. 根据已有经验构造新的特征变量,比如xx因子。

[1] 深度了解特征工程

# Convert birth year into age (relative to 2022) for BOTH splits, so the test
# set carries the same '年龄' feature column as the training set.
train_data['年龄'] = 2022 - train_data['出生年份']
test_data['年龄'] = 2022 - test_data['出生年份']

# The raw birth-year column is redundant once the age has been derived.
train_data = train_data.drop('出生年份', axis=1)
test_data = test_data.drop('出生年份', axis=1)

    if a<18.5:
        return 0
    elif 18.5<=a<=24:
        return 1
    elif 24<a<=27:
        return 2
    elif 27<a<=32:
        return 3
        return 4


# Convert diastolic blood pressure into a categorical variable
def DBP(a):
    """Bin a diastolic blood pressure reading into an ordinal category.

    The band 60-90 (both ends inclusive) is treated as the reference range.

    Args:
        a: A single numeric reading.

    Returns:
        0 if ``a < 60``, 1 if ``60 <= a <= 90``, 2 if ``a > 90``.
        A value that satisfies none of these comparisons (e.g. NaN) is
        returned unchanged, so a missing reading stays missing instead of
        silently becoming ``None``.
    """
    if a < 60:
        return 0
    elif 60 <= a <= 90:
        return 1
    elif a > 90:
        return 2
    else:
        # Only reachable for values that fail every comparison, such as NaN.
        return a

train_data['DBP'] = train_data['舒张压'].apply(DBP)
test_data['DBP'] = test_data['舒张压'].apply(DBP)

X_train = train_data.drop('患有糖尿病标识', axis=1)
Y_train = train_data['患有糖尿病标识']
X_train['年龄'] = X_train['年龄'].astype(float)
X_test = test_data

4. 模型训练


4.1 LightGBM (0.96206)


[1] Lightgbm原理、参数详解及python实例
[2] 深入理解LightGBM


from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV

def select_by_lgb(train_data,train_label,test_data,random_state=1234, n_splits=5,metric='auc',num_round=10000,early_stopping_rounds=100):
#    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    result0 = []
    for train_idx, val_idx in kfold.split(train_data, train_label):
        train_x = train_data.loc[train_idx]
        train_y = train_label.loc[train_idx]
        test_x = train_data.loc[val_idx]
        test_y = train_label.loc[val_idx]
        clf = lightgbm
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'learning_rate': 0.1,
                # 'max_depth': 7,
                # 'num_leaves': 10,
                'metric': metric,
                'seed': random_state,
                'silent': True,
                'nthread':-1 }

        pred_test = pd.DataFrame(result0).T

        # 将5次预测结果求平均值
        pred_test['average'] = pred_test.mean(axis=1)

        pred_test['label'] = pred_test['average'].apply(lambda x:1 if x>0.5 else 0)

        ## 导出结果
        result = pd.read_csv('提交示例.csv')

    return result
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

def SKFold(train_data,train_label,test_data, model, random_state=1234, n_splits=5,metric='auc',num_round=10000,early_stopping_rounds=100):
    """Train ``model`` with stratified k-fold CV and ensemble its test predictions.

    For every fold the model is fitted on the training part, scored on both
    the training and the validation part, and used to predict ``test_data``.
    The per-fold test predictions are averaged and thresholded at 0.5.

    Args:
        train_data: Training features (pandas DataFrame).
        train_label: Training labels (pandas Series or single-column DataFrame).
        test_data: Test features with the same columns as ``train_data``.
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        random_state: Seed for the shuffled stratified split.
        n_splits: Number of folds.
        metric, num_round, early_stopping_rounds: Unused here; kept so the
            signature stays parallel to ``select_by_lgb``.

    Returns:
        The frame read from '提交示例.csv' with its 'label' column replaced by
        the ensembled 0/1 prediction.
    """
    # Stratified k-fold keeps the class ratio the same in every fold.
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    pred_test = []
    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_data, train_label), start=1):
        # split() yields positional indices, so select rows by position.
        train_x = train_data.iloc[train_idx]
        train_y = train_label.iloc[train_idx]
        val_x = train_data.iloc[val_idx]
        val_y = train_label.iloc[val_idx]

        clf = model
        model_trained = clf.fit(train_x, train_y)

        # Collect this fold's test prediction for the final ensemble.
        pred_test.append(model_trained.predict(test_data))

        # Note: the scores below are computed from predict() output, i.e. from
        # whatever predict() returns for the given model (hard labels for
        # scikit-learn style classifiers).
        pred_train = model_trained.predict(train_x)
        pred_val = model_trained.predict(val_x)
        auc_train = roc_auc_score(train_y, pred_train)
        auc_val = roc_auc_score(val_y, pred_val)
        f_score_train = f1_score(train_y, pred_train)
        f_score_val = f1_score(val_y, pred_val)
        print('Fold: %d, AUC_train: %.4f, AUC_val: %.4f, F1-score_train: %.4f, F1-score_val: %.4f'%(fold, 
            auc_train, auc_val, f_score_train, f_score_val))

    # One column per fold, one row per test sample.
    pred_test = pd.DataFrame(pred_test).T
    # Average the per-fold predictions.
    pred_test['average'] = pred_test.mean(axis=1)

    pred_test['label'] = pred_test['average'].apply(lambda x:1 if x>0.5 else 0)

    ## Build the submission frame from the sample submission file.
    result = pd.read_csv('提交示例.csv')
    result['label'] = pred_test['label']

    return result
def evaluate(result_LightGBM, result_others):
    """Count how many predictions differ from the LightGBM baseline.

    Args:
        result_LightGBM: Baseline result with a 'label' column.
        result_others: Result of another model with a 'label' column.

    Returns:
        Number of entries whose label difference is non-zero. The count is
        also printed.
    """
    label_gap = result_LightGBM['label'] - result_others['label']

    # Every non-zero difference marks a sample on which the two models disagree.
    count = sum(1 for gap in label_gap if gap != 0)

    print('与LightGBM预测不同的样本数: ', count)

    return count
random_state = 1234

result_LightGBM = select_by_lgb(X_train, Y_train, X_test)   #baseline

# 试试网格搜索最优参数
import lightgbm as lgb
params_test = {
    'max_depth': range(4, 10, 1),
    'num_leaves': range(10, 60, 10)

skf = StratifiedKFold(n_splits=5)  

gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=325, 
    max_depth=8, bagging_fraction = 0.8,feature_fraction = 0.8),  param_grid=params_test,
        scoring='roc_auc', cv=skf, n_jobs=-1)

gsearch1.fit(X_train, Y_train)

# 用最优参数再训练一遍
model_lgb = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',
    learning_rate=0.1, n_estimators=200, num_leaves=10, silent=True,

result_SKFold_lgb = SKFold(X_train, Y_train, X_test, model_lgb, n_splits=5)

diff_lgb = evaluate(result_LightGBM, result_SKFold_lgb)
[LightGBM] [Warning] Unknown parameter: silent
[LightGBM] [Warning] Unknown parameter: silent
[LightGBM] [Info] Number of positive: 1549, number of negative: 2507
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1047
[LightGBM] [Info] Number of data points in the train set: 4056, number of used features: 10
[LightGBM] [Warning] Unknown parameter: silent
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381903 -> initscore=-0.481477
[LightGBM] [Info] Start training from score -0.481477
[1]	valid_0's auc: 0.979322
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.980354
[3]	valid_0's auc: 0.982351
[4]	valid_0's auc: 0.981993

Early stopping, best iteration is:
[40]	valid_0's auc: 0.989851
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
{'max_depth': 7, 'num_leaves': 10}
Fold: 1, AUC_train: 0.9917, AUC_val: 0.9461, F1-score_train: 0.9909, F1-score_val: 0.9358
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
Fold: 2, AUC_train: 0.9852, AUC_val: 0.9487, F1-score_train: 0.9837, F1-score_val: 0.9386
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
Fold: 3, AUC_train: 0.9863, AUC_val: 0.9457, F1-score_train: 0.9853, F1-score_val: 0.9328
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
Fold: 4, AUC_train: 0.9876, AUC_val: 0.9607, F1-score_train: 0.9860, F1-score_val: 0.9490
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
Fold: 5, AUC_train: 0.9858, AUC_val: 0.9505, F1-score_train: 0.9841, F1-score_val: 0.9403
与LightGBM预测不同的样本数:  7
24     1
35    -1
43     1
76     1
434    1
442    1
501   -1
Name: label, dtype: int64
4.2 随机森林(0.96324)

[1] Permutation Importance vs Random Forest Feature Importance (MDI)

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth=5, random_state=1234)
forest.fit(X_train, Y_train)
pred_forest = forest.predict(X_test)


feature_importance_forest = pd.Series(forest.feature_importances_, 

plt.figure(figsize=(10, 7), dpi=80)

ax = feature_importance_forest.plot.barh()
ax.set_title("Random Forest Feature Importances (MDI)")
# ax.figure.tight_layout()
## 网格搜索最优参数组合

params_test = {
    'max_depth': range(3, 20, 2),
    'n_estimators': range(100, 600, 100),
    'min_samples_leaf': [2, 4, 6]

skf = StratifiedKFold(n_splits=5)  

gsearch2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=200, max_depth=5, random_state=random_state), 
     param_grid=params_test, scoring='roc_auc', cv=skf, n_jobs=-1)

gsearch2.fit(X_train, Y_train)
model_forest = RandomForestClassifier(n_estimators=400, max_depth=13, random_state=random_state)
result_SKFold_forest = SKFold(X_train, Y_train, X_test, model_forest) 

diff_skold_forest = evaluate(result_LightGBM, result_SKFold_forest)
4.3 XGBoost (0.95981)

[1] XGBoost:在Python中使用XGBoost
[2] Python机器学习笔记:XgBoost算法
[3] python包xgboost安装和简单使用
[4] 深入理解XGBoost,优缺点分析,原理推导及工程实现
[5] XGBoost的原理、公式推导、Python实现和应用
[6] XGBoost官方文档

import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import warnings

# 分层k折交叉检验
skf = StratifiedKFold(n_splits=5)  

result_xgb = []
fold = 1
for train_idx, val_idx in skf.split(X_train, Y_train):
    train_x = X_train.loc[train_idx]
    train_y = Y_train.loc[train_idx]
    val_x = X_train.loc[val_idx]
    val_y = Y_train.loc[val_idx]
    d_train = xgb.DMatrix(train_x, train_y)
    d_val = xgb.DMatrix(val_x, val_y)
    d_test = xgb.DMatrix(X_test)

    params = {
        'eta': 0.1,  #学习率
        'gamma': 0.1, #后剪枝参数,取值在[0, 1],越大越保守
        'seed': 1234,
        'alpha': 1,  #L1正则项的惩罚系数
        'eval_metric': 'auc'
    num_round = 500

    # # 方式一:采用sklearn接口,采用fit 和 predict
    # model_xgb = xgb.XGBClassifier()
    # model_xgb.fit(train_x, train_y, verbose=False) 
    # pred_train = model_xgb.predict(train_x)
    # pred_val = model_xgb.predict(val_x)
    # pred_xgb = model_xgb.predict(X_test)

    # 方式二:采用xgboost原生接口,采用train和predict,方便调参
    model_xgb = xgb.train(params, d_train, num_round)
    pred_train = model_xgb.predict(d_train)
    pred_val = model_xgb.predict(d_val)
    pred_xgb = model_xgb.predict(d_test)

    auc_train = roc_auc_score(train_y, pred_train)
    auc_val = roc_auc_score(val_y, pred_val)
    f_score_train = f1_score(train_y, pred_train)
    f_score_val = f1_score(val_y, pred_val)
    print('Fold: %d, AUC_train: %.4f, AUC_val: %.4f, F1-score_train: %.4f, F1-score_val: %.4f'%(fold, 
        auc_train, auc_val, f_score_train, f_score_val))


    fold += 1

result_xgb = pd.DataFrame(result_xgb).T
print('result_xgb.shape = ', result_xgb.shape)

# 将5次预测结果求平均值
result_xgb['average'] = result_xgb.mean(axis=1)

# 最终预测结果
result_xgb['label'] = result_xgb['average'].apply(lambda x:1 if x>0.5 else 0)

# 特征重要性

# 导出结果
result = pd.read_csv('提交示例.csv')
result['label'] = result_xgb['label']

diff_xgb = evaluate(result_LightGBM, result_xgb)
4.4 CatBoost(0.95854)

CatBoost 是 GBDT 算法框架的一种改进实现,其主要创新点有:

  • 支持类别性变量。嵌入了自动将类别型特征处理为数值型特征的创新算法。
  • 使用了组合类别特征,丰富特征维度。
  • 采用排序提升的方法对抗训练集中的噪声点,从而避免梯度估计的偏差,进而解决预测偏移的问题
  • 采用了完全对称树作为基模型。

[1] 深入理解CatBoost
[2] Catboost 一个超级简单实用的boost算法

import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# 分层k折交叉检验
skf = StratifiedKFold(n_splits=5)  

categorical_features_index = np.where(X_train.dtypes != float)[0]

result_cat = []
fold = 1
for train_idx, val_idx in skf.split(X_train, Y_train):
    train_x = X_train.loc[train_idx]
    train_y = Y_train.loc[train_idx]
    val_x = X_train.loc[val_idx]
    val_y = Y_train.loc[val_idx]

    model_catboost = cb.CatBoostClassifier(eval_metric='AUC', cat_features=categorical_features_index, 
        depth=6, n_estimators=400, learning_rate=0.5, verbose=False)
    model_catboost.fit(train_x, train_y, eval_set=(val_x, val_y), plot=False)

    pred_train = model_catboost.predict(train_x)
    pred_val = model_catboost.predict(val_x)

    auc_train = roc_auc_score(train_y, pred_train)
    auc_val = roc_auc_score(val_y, pred_val)
    f_score_train = f1_score(train_y, pred_train)
    f_score_val = f1_score(val_y, pred_val)
    print('Fold: %d, AUC_train: %.4f, AUC_val: %.4f, F1-score_train: %.4f, F1-score_val: %.4f'%(fold, 
        auc_train, auc_val, f_score_train, f_score_val))

    pred_catboost = model_catboost.predict(X_test)

    fold += 1

result_cat = pd.DataFrame(result_cat).T
print('result_cat.shape = ', result_cat.shape)

# 将5次预测结果求平均值
result_cat['average'] = result_cat.mean(axis=1)

# 最终预测结果
result_cat['label'] = result_cat['average'].apply(lambda x:1 if x>0.5 else 0)

# 导出结果
result = pd.read_csv('提交示例.csv')
result['label'] = result_cat['label']

diff_catboost = evaluate(result_LightGBM, result_cat)

feature_importance_catboost = model_catboost.feature_importances_

plt.figure(figsize=(10,8), dpi=80)
plt.barh(col_names, feature_importance_catboost)
4.5 AdaBoost(0.96098)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

random_state = 1234
model_tree = DecisionTreeClassifier(max_depth=5, random_state=random_state)
model_adaboost = AdaBoostClassifier(base_estimator=model_tree, n_estimators=200, 

result_adaboost = SKFold(X_train, Y_train, X_test, model_adaboost) 

# 评估
diff_skold_adaboost = evaluate(result_LightGBM, result_adaboost)

# 特征重要性
feature_importance_adaboost = model_adaboost.feature_importances_

plt.figure(figsize=(10,8), dpi=80)
plt.rc('font', size = 18)
plt.barh(col_names, feature_importance_adaboost)
plt.title('Feature importances computed by AdaBoost')
4.6 集成模型(0.95971)


skf = StratifiedKFold(n_splits=5)  

categorical_features_index = np.where(X_train.dtypes != float)[0]
print('类别型特征: ', X_train.columns[categorical_features_index])
cat_features = list(map(lambda x:int(x), categorical_features_index))
random_state = 1234

fold = 1
for train_idx, val_idx in skf.split(X_train, Y_train):
    train_x = X_train.loc[train_idx]
    train_y = Y_train.loc[train_idx]
    val_x = X_train.loc[val_idx]
    val_y = Y_train.loc[val_idx]
    d_train = xgb.DMatrix(train_x, train_y)
    d_val = xgb.DMatrix(val_x, val_y)
    d_test = xgb.DMatrix(X_test)

    params_xgb = {
        'eta': 0.1,  #学习率
        'gamma': 0.1, #后剪枝参数,取值在[0, 1],越大越保守
        'seed': 1234,
        'alpha': 1,  #L1正则项的惩罚系数
        'eval_metric': 'auc'

    num_round = 500
    early_stopping_rounds = 100

    model_lightGBM = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',
        learning_rate=0.1, n_estimators=200, num_leaves=10, silent=True,
    model_lightGBM.fit(X_train, Y_train)

    model_forest = RandomForestClassifier(max_depth=13, n_estimators=400, random_state=1234)
    model_forest.fit(X_train, Y_train)

    model_tree = DecisionTreeClassifier(max_depth=5, random_state=random_state)
    model_adaboost = AdaBoostClassifier(base_estimator=model_tree, n_estimators=200, 
    model_adaboost.fit(X_train, Y_train)

    model_xgb = xgb.train(params_xgb, d_train, num_round)

    model_catboost = cb.CatBoostClassifier(eval_metric='AUC', cat_features=categorical_features_index, 
        depth=6, iterations=400, learning_rate=0.5, verbose=False)
    model_catboost.fit(train_x, train_y, eval_set=(val_x, val_y), plot=False)

    print('Fold: %d finished training. '%fold)
    fold += 1

pred_lightGBM = model_lightGBM.predict(test_data)
# pred_lightGBM = list(map(lambda x: 1 if x>0.5 else 0, pred_lightGBM))  # use when calling LightGBM's native interface
# Use the random forest trained in this section (model_forest), not the
# earlier `forest` model from the random-forest section.
pred_forest = model_forest.predict(X_test)
pred_adaboost = model_adaboost.predict(X_test)
# NOTE(review): model_xgb comes from xgb.train (native interface), so its
# predict() presumably returns continuous scores while the other models
# return class labels — confirm that averaging this mix is intended.
pred_xgb = model_xgb.predict(d_test)
pred_catboost = model_catboost.predict(X_test)
pred_all = pd.DataFrame({'lightGBM': pred_lightGBM,
                            'RandomForest': pred_forest,
                            'AdaBoost': pred_adaboost,
                            'XGBoost': pred_xgb,
                            'CatBoost': pred_catboost})

pred_all['Average'] = pred_all.mean(axis=1)

# 最终预测结果
pred_all['label'] = pred_all['Average'].apply(lambda x:1 if x>0.5 else 0)

# 导出结果
result = pd.read_csv('提交示例.csv')
result['label'] = pred_all['label']

diff_ensemble = evaluate(result_LightGBM, result)

4.7 Stacking(0.96577)


Stacking本质是一种层级结构,第一层有n个基学习器,每个基学习器进行k折交叉训练,把每一折的验证集(validation set)的预测结果输出并拼接在一起,把这n个模型的训练集预测结果作为新的训练集,将这n个模型的测试集预测结果拼接在一起作为新的测试集。


[1] stacking模型融合
[2] Kaggle上分技巧——单模K折交叉验证训练+多模型融合

model_tree = DecisionTreeClassifier(max_depth=5, random_state=random_state)

clfs = [lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',
        learning_rate=0.1, n_estimators=200, num_leaves=10, silent=True,
        RandomForestClassifier(max_depth=13, n_estimators=400, random_state=1234),
        AdaBoostClassifier(base_estimator=model_tree, n_estimators=200, 
        cb.CatBoostClassifier(eval_metric='AUC', cat_features=categorical_features_index, 
        depth=6, iterations=400, learning_rate=0.5, verbose=False)]

data_train = np.zeros((X_train.shape[0], len(clfs)))
data_test = np.zeros((X_test.shape[0], len(clfs)))

# 5折stacking
n_splits = 5
skf = StratifiedKFold(n_splits)

# 第一层,训练各个个体学习器
for i, clf in enumerate(clfs):
    # 依次训练各个模型
    d_test = np.zeros((X_test.shape[0], n_splits)) #存放个体学习器在测试集上的预测输出
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
        train_x = X_train.loc[train_idx]
        train_y = Y_train.loc[train_idx]
        val_x = X_train.loc[val_idx]
        val_y = Y_train.loc[val_idx]
        clf.fit(train_x, train_y)
        pred_train = clf.predict(train_x)
        pred_val = clf.predict(val_x)
        data_train[val_idx, i] = pred_val
        d_test[:, fold] = clf.predict(X_test)

        auc_train = roc_auc_score(train_y, pred_train)
        auc_val = roc_auc_score(val_y, pred_val)
        f_score_train = f1_score(train_y, pred_train)
        f_score_val = f1_score(val_y, pred_val)
        print('Classifier:%d, Fold: %d, AUC_train: %.4f, AUC_val: %.4f, F1-score_train: %.4f, F1-score_val: %.4f'%(i+1,
            fold+1, auc_train, auc_val, f_score_train, f_score_val))

    data_test[:, i] = d_test.mean(axis=1)

data_train = pd.DataFrame(data_train)
data_test = pd.DataFrame(data_test)

# 第二层改用高级点的模型,并进行5折交叉训练
# model_forest = RandomForestClassifier(max_depth=5, random_state=1234)
model_2 = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',
        learning_rate=0.3, n_estimators=200, num_leaves=10, silent=True,
result_stack = SKFold(data_train, Y_train, data_test, model_2)
result_stack.to_csv('result_stack.csv', index=False)

diff_stack = evaluate(result_LightGBM, result_stack)
4.8 归一化数据,pytorch神经网络



  1. Linear 输出维度为1 + sigmoid + BCELoss。
  2. Linear 输出维度为1 + BCEWithLogitsLoss。不需要加sigmoid或softmax函数,BCEWithLogitsLoss自带sigmoid作为激活函数。
  3. Linear 输出维度为2 + 交叉熵(CrossEntropyLoss)。输出tensor的维度0对应第一个label(即0),维度1对应第二个label(即1)。注意使用交叉熵时,真实标签不能是onehot格式,必须为1维tensor,预测标签必须大于或等于2维,预测标签的每一个维度对应一个标签

pytorch 中使用神经网络进行多分类时,网络的输出 prediction 是每个类别的得分(形状与 one hot 编码相同),但计算交叉熵损失函数时,loss = criterion(prediction, target) 的输入 target 不能是 one hot 格式,直接用类别下标表示即可(下标从 0 开始,例如 3 对应 one hot 中的 0 0 0 1)。
所以,自己构建数据集,返回的 target 不需要是 one hot 格式。

[1] Pytorch学习笔记(5)——交叉熵报错RuntimeError: 1D target tensor expected, multi-target not supported
[2] PyTorch二分类时BCELoss,CrossEntropyLoss,Sigmoid等的选择和使用
[3] Pytorch实现二分类器
[4] RuntimeError: multi-target not supported at

from sklearn.preprocessing import MinMaxScaler

# Normalisation: fit the scaler on the training features only and reuse the
# learned min/max for the test features, so both splits share one scale.
scaler = MinMaxScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)
Y_train2 = Y_train.to_numpy()
print('X_train.shape = ', X_train.shape)
print('X_train2.shape = ', X_train2.shape)
print('Y_train2.shape = ', Y_train2.shape)
def Convert(x):
    """Convert two-column class scores into hard 0/1 labels.

    Args:
        x: 2-D array-like of shape (n, 2); column 0 is compared against
            column 1 row by row.

    Returns:
        1-D float array of length n holding 0.0 where column 0 is strictly
        greater than column 1, and 1.0 otherwise.
    """
    x = np.asarray(x)
    if x.shape[0] == 0:
        # Nothing to label; mirror the empty result of a zero-iteration loop.
        return np.zeros((0,))

    # Default every row to class 1, then mark the rows where class 0 wins.
    y = np.ones((x.shape[0],))
    y[x[:, 0] > x[:, 1]] = 0

    return y
import torch
import torch.nn as nn
import torch.nn.functional as F

class NET(nn.Module):
    """Fully connected classifier built from three Linear layers.

    NOTE(review): the original layer list was truncated in this file. The
    activation and dropout placement below (after each of the first two
    Linear layers) is a reconstruction from the constructor arguments —
    confirm against the original notebook.

    Args:
        input_dim: Number of input features.
        hidden: Width of both hidden layers.
        out_dim: Number of output units (``predict`` compares units 0 and 1).
        activation: One of 'relu', 'tanh', 'sigmoid', 'LeakyReLU', 'ELU', 'GELU'.
        dropout: Dropout probability applied after each hidden activation.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, input_dim:int, hidden:int, out_dim:int, activation='relu', dropout=0.2):
        super(NET, self).__init__()
        self.input_dim = input_dim
        self.hidden = hidden
        self.out_dim = out_dim
        self.activation = activation
        self.Dropout = dropout

        # Activation selection: map the configured name to its module class.
        activations = {
            'relu': torch.nn.ReLU,
            'tanh': torch.nn.Tanh,
            'sigmoid': torch.nn.Sigmoid,
            'LeakyReLU': torch.nn.LeakyReLU,
            'ELU': torch.nn.ELU,
            'GELU': torch.nn.GELU,
        }
        if self.activation not in activations:
            raise ValueError('Unsupported activation: %r' % (self.activation,))
        make_act = activations[self.activation]

        self.model = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden),
            make_act(),
            nn.Dropout(self.Dropout),
            nn.Linear(self.hidden, self.hidden),
            make_act(),
            nn.Dropout(self.Dropout),
            nn.Linear(self.hidden, self.out_dim),
        )

    def forward(self, x):
        """Return the raw (pre-softmax) outputs of the network for ``x``."""
        out = self.model(x)

        return out

    def predict(self, x):
        """Return hard 0/1 labels for a batch of feature rows.

        Args:
            x: 2-D ndarray (or tensor) of shape (n, input_dim).

        Returns:
            1-D numpy array of length n: 0 where the softmax score of unit 0
            is strictly greater than that of unit 1, otherwise 1.
        """
        x = torch.as_tensor(x).to(torch.float32)

        # Predict deterministically: disable dropout and gradient tracking,
        # then restore whichever mode the module was in.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                probs = F.softmax(self.model(x), dim=-1)
        finally:
            self.train(was_training)

        ans = []
        for t in probs:
            if t[0] > t[1]:
                ans.append(0)
            else:
                ans.append(1)

        return np.array(ans)
import time
from torch.utils.data  import DataLoader,  TensorDataset
class NN_classifier():
    """Stratified k-fold trainer and ensembler for a PyTorch classifier.

    The same network instance is trained further on every fold (it is not
    re-initialised between folds); after each fold it predicts the test set,
    and the per-fold predictions are averaged and thresholded at 0.5.

    Args:
        model: The network to train (an ``nn.Module``).
        crit: Loss function called as ``crit(output, target.long())``.
        l_rate: Learning rate for the Adam optimizer.
        batch_size: Mini-batch size for training and validation loaders.
        max_epochs: Number of epochs per fold.
        n_splits: Number of folds (k in k-fold).
        verbose: If true, print losses every 100 epochs.
    """

    def __init__(self, model, crit, l_rate, batch_size, max_epochs, n_splits=5, verbose=True):
        super(NN_classifier, self).__init__()
        self.model = model # Neural network model, should be a nn.Module()
        self.l_rate = l_rate
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        self.verbose = verbose
        self.n_splits = n_splits  # the value of k in k-fold validation
        self.crit = crit  # loss function
        self.device = 'cpu'

    def _run_epoch(self, model, loader, optimizer=None):
        """Run one pass over ``loader`` and return the mean batch loss.

        With an ``optimizer`` the pass trains the model (backward pass and
        parameter update per batch); without one it only evaluates.
        """
        training = optimizer is not None
        model.train(training)
        loss_all = []
        with torch.set_grad_enabled(training):
            for x, y in loader:
                x = x.to(self.device)
                y = y.to(self.device)

                loss = self.crit(model(x), y.long())
                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                loss_all.append(loss.item())
        return np.mean(np.array(loss_all))

    def fit(self, X_train, Y_train, X_test):
        """Train with stratified k-fold CV and return the ensembled test labels.

        Args:
            X_train: Training features as a 2-D tensor.
            Y_train: Training labels as a 1-D tensor.
            X_test: Test features as a 2-D tensor.

        Returns:
            The frame read from '提交示例.csv' with its 'label' column replaced
            by the ensembled 0/1 prediction.
        """
        skf = StratifiedKFold(n_splits=self.n_splits)
        model = self.model
        pred_test = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, Y_train), start=1):
            train_x = X_train[train_idx, :]
            train_y = Y_train[train_idx]
            val_x = X_train[val_idx, :]
            val_y = Y_train[val_idx]
            train_dataloader = DataLoader(dataset=TensorDataset(train_x, train_y),
                                          batch_size=self.batch_size, shuffle=True)
            validation_dataloader = DataLoader(dataset=TensorDataset(val_x, val_y),
                                               batch_size=self.batch_size, shuffle=False)
            optimizer = torch.optim.Adam(model.parameters(), lr=self.l_rate)
            # Step-wise learning-rate decay: x0.9 every 50 epochs.
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.9)

            for epoch in range(self.max_epochs):
                #------------- Training -----------------
                start_time = time.time()
                train_loss = self._run_epoch(model, train_dataloader, optimizer)
                scheduler.step()
                cost_time = time.time() - start_time  # training time only

                #------------- Validation -----------------
                validation_loss = self._run_epoch(model, validation_dataloader)
                if self.verbose and (epoch+1) % 100 ==0:
                    print('Fold:{:d}, Epoch:{:d}, train_loss: {:.4f}, validation_loss: {:.4f}, cost_time: {:.2f}s'
                    .format(fold, epoch+1, train_loss, validation_loss, cost_time))

            #------------- Prediction -----------------
            model.eval()
            with torch.no_grad():
                # Keep this fold's test prediction for the final ensemble.
                pred_test.append(Convert(model(X_test).detach().numpy()))

                pred_train = Convert(model(train_x).detach().numpy())
                pred_val = Convert(model(val_x).detach().numpy())

            auc_train = roc_auc_score(train_y, pred_train)
            auc_val = roc_auc_score(val_y, pred_val)
            f_score_train = f1_score(train_y, pred_train)
            f_score_val = f1_score(val_y, pred_val)
            print('Fold: %d, AUC_train: %.4f, AUC_val: %.4f, F1-score_train: %.4f, F1-score_val: %.4f'%(fold, 
                auc_train, auc_val, f_score_train, f_score_val))

        # One column per fold, one row per test sample.
        pred_test = pd.DataFrame(pred_test).T
        print('pred_test.shape = ', pred_test.shape)
        # Average the per-fold predictions.
        pred_test['average'] = pred_test.mean(axis=1)

        pred_test['label'] = pred_test['average'].apply(lambda x:1 if x>0.5 else 0)

        ## Build the submission frame from the sample submission file.
        result = pd.read_csv('提交示例.csv')
        result['label'] = pred_test['label']

        return result
# k折交叉验证不断训练同一个模型,集成不同fold(即不同时刻)的模型的预测结果
hidden = 64
activation = 'relu'
# activation = 'tanh'
# crit = nn.MSELoss()
crit = nn.CrossEntropyLoss()
batch_size = 512*2
max_epochs = 500
l_rate = 1e-3
dropout = 0.1
n_splits = 5

# Convert to tensor
X_train_tensor = torch.from_numpy(X_train2).to(torch.float32)
X_test_tensor = torch.from_numpy(X_test2).to(torch.float32)
Y_train_tensor = torch.from_numpy(Y_train2).to(torch.float32)

model_NN = NET(X_train2.shape[1], hidden, out_dim=2, activation=activation)
classifier_NN = NN_classifier(model_NN, crit=crit, batch_size=batch_size, l_rate=l_rate, 
                    max_epochs=max_epochs, n_splits=n_splits)

result_SKFold_NN = NN_classifier.fit(classifier_NN,X_train_tensor, Y_train_tensor, X_test_tensor,)
c = result_LightGBM['label'] - result_SKFold_NN['label']

count = 0
for i in c:
    if i != 0:
        count += 1

print('与LightGBM预测不同的样本数: ', count)
Fold:1, Epoch:200, train_loss: 0.2785, validation_loss: 0.2505, cost_time: 0.04s
Fold:1, Epoch:300, train_loss: 0.2332, validation_loss: 0.2257, cost_time: 0.04s
Fold:1, Epoch:400, train_loss: 0.2098, validation_loss: 0.2130, cost_time: 0.04s
Fold:1, Epoch:500, train_loss: 0.2006, validation_loss: 0.2066, cost_time: 0.04s
Fold: 1, AUC_train: 0.9363, AUC_val: 0.9133, F1-score_train: 0.9254, F1-score_val: 0.8956
Fold:2, Epoch:100, train_loss: 0.1813, validation_loss: 0.1555, cost_time: 0.04s
Fold:2, Epoch:200, train_loss: 0.1671, validation_loss: 0.1449, cost_time: 0.04s
Fold:2, Epoch:300, train_loss: 0.1540, validation_loss: 0.1444, cost_time: 0.04s
Fold:2, Epoch:400, train_loss: 0.1513, validation_loss: 0.1384, cost_time: 0.04s
Fold:2, Epoch:500, train_loss: 0.1426, validation_loss: 0.1379, cost_time: 0.04s
Fold: 2, AUC_train: 0.9496, AUC_val: 0.9395, F1-score_train: 0.9401, F1-score_val: 0.9269
Fold:3, Epoch:100, train_loss: 0.1356, validation_loss: 0.1419, cost_time: 0.04s
Fold:3, Epoch:200, train_loss: 0.1219, validation_loss: 0.1384, cost_time: 0.04s
Fold:3, Epoch:300, train_loss: 0.1206, validation_loss: 0.1357, cost_time: 0.04s
Fold:3, Epoch:400, train_loss: 0.1152, validation_loss: 0.1400, cost_time: 0.04s
Fold:3, Epoch:500, train_loss: 0.1113, validation_loss: 0.1395, cost_time: 0.04s
Fold: 3, AUC_train: 0.9625, AUC_val: 0.9550, F1-score_train: 0.9535, F1-score_val: 0.9434
Fold:4, Epoch:100, train_loss: 0.1075, validation_loss: 0.1185, cost_time: 0.04s
Fold:4, Epoch:200, train_loss: 0.1155, validation_loss: 0.1250, cost_time: 0.04s
Fold:4, Epoch:300, train_loss: 0.1081, validation_loss: 0.1238, cost_time: 0.04s
Fold:4, Epoch:400, train_loss: 0.1056, validation_loss: 0.1283, cost_time: 0.04s
Fold:4, Epoch:500, train_loss: 0.0957, validation_loss: 0.1289, cost_time: 0.04s
Fold: 4, AUC_train: 0.9702, AUC_val: 0.9518, F1-score_train: 0.9629, F1-score_val: 0.9386
Fold:5, Epoch:100, train_loss: 0.1064, validation_loss: 0.0951, cost_time: 0.04s
Fold:5, Epoch:200, train_loss: 0.0983, validation_loss: 0.0978, cost_time: 0.04s
Fold:5, Epoch:300, train_loss: 0.1028, validation_loss: 0.1055, cost_time: 0.05s
Fold:5, Epoch:400, train_loss: 0.0954, validation_loss: 0.1065, cost_time: 0.04s
Fold:5, Epoch:500, train_loss: 0.0935, validation_loss: 0.1073, cost_time: 0.04s
Fold: 5, AUC_train: 0.9709, AUC_val: 0.9547, F1-score_train: 0.9647, F1-score_val: 0.9455
pred_test.shape =  (1000, 5)
与LightGBM预测不同的样本数:  423
0     -1
2     -1
4     -1
8      1
16    -1
985   -1
987   -1
994   -1
995   -1
999   -1
Name: label, Length: 423, dtype: int64
k折交叉验证不断训练同一个模型,虽然模型最终的表现结果还可以(F1-score_val上去了),但集成各fold(各时期)的模型的结果表现依然糟糕,与baseline —— lightGBM相差甚远,都不用提交就知道分数会很低(0.63左右)了。



  • 集成时被早期表现较差的模型所拖累;
  • 模型本身对表格数据拟合能力不够;
  • 模型过拟合;
# 不用k折交叉验证,一个模型用到底
hidden = 64
activation = 'tanh'
# activation = 'tanh'
crit = nn.CrossEntropyLoss()
batch_size = 128
max_epochs = 2000
l_rate = 5e-3
dropout = 0.1
n_splits = 5

model_NN2 = NET(X_train2.shape[1], hidden, out_dim=2, activation=activation)
classifier_NN2 = NN(model_NN2, crit=crit, batch_size=batch_size, l_rate=l_rate, 

# result_SKFold_NN = SKFold(pd.DataFrame(X_train2), pd.DataFrame(Y_train2),
#                     pd.DataFrame(X_test2), classifier_NN2, n_splits=5)

classifier_NN2.fit(X_train2, Y_train2)
result_NN = classifier_NN2.predict(X_test2)

# c = result_LightGBM['label'] - result_SKFold_NN['label']
c = result_LightGBM['label'] - result_NN

count = 0
for i in c:
    if i != 0:
        count += 1

print('与LightGBM预测不同的样本数: ', count)
4.9 SVM


[1] Python3《机器学习实战》学习笔记(八):支持向量机原理篇之手撕线性SVM

from sklearn.svm import SVC

model_SVM = SVC(C=10)  #C越大,对误分类的惩罚越大。

result_SKFold_SVM= SKFold(pd.DataFrame(X_train2), pd.DataFrame(Y_train2),
                    pd.DataFrame(X_test2), model_SVM, n_splits=5)

diff_SVM = evaluate(result_LightGBM, result_SKFold_SVM)
4.10 sklearn神经网络

from sklearn.neural_network import MLPClassifier

model_MLP = MLPClassifier(hidden_layer_sizes=128, activation='relu')

result_SKFold_MLP = SKFold(pd.DataFrame(X_train2), pd.DataFrame(Y_train2),
                    pd.DataFrame(X_test2), model_MLP, n_splits=5)

c = result_LightGBM['label'] - result_SKFold_MLP['label']

count = 0
for i in c:
    if i != 0:
        count += 1

print('与LightGBM预测不同的样本数: ', count)

5. 总结思考






[1] Datawhale_如何打一个数据挖掘比赛V2.1
[2] 讯飞官方参考解析
[3] Kaggle上分技巧——单模K折交叉验证训练+多模型融合

