赞
踩
可以进行多分类预测的模型有逻辑回归、决策树、神经网络、随机森林、XGBoost 等。经过对比,效果排名靠前的依次是 XGBoost、随机森林、决策树。
通过调用 Python 相关包,对 XGBoost 分类模型进行参数调整,使模型效果更好。
# Package import
from xgboost.sklearn import XGBClassifier

# Call the XGBClassifier constructor. The author presents the values in the
# parentheses as the default parameter values; any of them can be tuned.
# NOTE(review): `silent`, `nthread` and `seed` look like legacy keyword names;
# confirm against the installed xgboost version before copying this call.
XGBClassifier(
    base_score=0.5, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=1, gamma=0,
    learning_rate=0.1, max_delta_step=0, max_depth=8,
    min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
    nthread=None, num_class=5, objective='multi:softprob',
    random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
    seed=None, silent=None, subsample=1, verbosity=1)
import os

import pandas as pd
# Needed to export (persist) the trained model.
# NOTE(review): `sklearn.externals.joblib` was removed in scikit-learn 0.23;
# on newer versions use `import joblib` instead.
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

def get_cust_age_stage(birth_year):
    """Map each birth year to its generation label.

    Args:
        birth_year: Iterable of birth years (e.g. a pandas Series or a list).
            Each entry must be convertible with ``int()`` unless it is
            missing (None/NaN).

    Returns:
        A list of labels, one per input entry, in input order:
        "未知" for a year of 0 or a missing value, "60前" for years before
        1960, then "60后" / "70后" / "80后" / "90后" per decade, and "00后"
        for 2000 and later.
    """
    # (exclusive upper bound, label), checked in ascending order.
    decade_labels = (
        (1960, "60前"),
        (1970, "60后"),
        (1980, "70后"),
        (1990, "80后"),
        (2000, "90后"),
    )

    age_stage = []
    # Iterate over the values themselves: indexing a Series with
    # ``birth_year[i]`` is a label lookup and fails when the index is not
    # 0..n-1 (for example after filtering rows).
    for raw_year in birth_year:
        if pd.isna(raw_year):
            age_stage.append("未知")
            continue
        year = int(raw_year)
        if year == 0:
            age_stage.append("未知")
            continue
        label = next(
            (name for bound, name in decade_labels if year < bound),
            "00后",
        )
        age_stage.append(label)
    return age_stage
def get_top5_onehot(data):
    """One-hot encode the five most frequent values of column 'c'.

    Args:
        data: DataFrame containing a column named 'c'.

    Returns:
        A new DataFrame: ``data`` joined with one indicator column per
        top-5 value of 'c' (fewer if 'c' has fewer distinct values). The
        input frame is not modified.
    """
    # ``head(5)`` is always positional; slicing the counts with ``[:5]`` can
    # be interpreted as a label slice when the values of 'c' are floats.
    top5_names = data['c'].value_counts().head(5).index.tolist()
    # Encode every value, then keep only the top-5 indicator columns.
    one_hot = pd.get_dummies(data['c'])
    return data.join(one_hot[top5_names])
def get_quantile_20_values(input_data):
    """Return the distinct cut points that split the data into 20 quantiles.

    Args:
        input_data: Object exposing a pandas-style ``quantile`` method
            (e.g. a numeric Series).

    Returns:
        A list of the quantile values at 0, 0.05, ..., 1.0 with duplicates
        removed, keeping first-occurrence (ascending quantile) order.
    """
    quantile_points = [step / 20.0 for step in range(21)]
    # One vectorised call instead of filling a DataFrame row by row.
    cut_points = input_data.quantile(quantile_points).tolist()
    # dict keys preserve insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(cut_points))
def get_quantile_interregional(s_unique):
    """Build numbered intervals from consecutive de-duplicated cut points.

    Args:
        s_unique: List of distinct cut points in ascending order.

    Returns:
        A list of ``[level, lower, upper]`` entries with levels starting at
        1, one per pair of neighbouring cut points. When fewer than 20
        intervals result, a closing entry ``[level, last, last]`` is appended
        for the final cut point. With fewer than two cut points the result
        is an empty list.
    """
    interregional = [
        [level, lower, upper]
        for level, (lower, upper) in enumerate(
            zip(s_unique, s_unique[1:]), start=1)
    ]
    # Add a degenerate closing interval so the largest value has a level of
    # its own whenever duplicates reduced the number of intervals below 20.
    if interregional and len(interregional) < 20:
        last_point = s_unique[-1]
        interregional.append([len(interregional) + 1, last_point, last_point])
    return interregional
def get_current_level(item_data, interregional):
    """Return the level of the interval that a single value falls into.

    Args:
        item_data: The value to classify.
        interregional: List of ``[level, lower, upper]`` entries, scanned in
            order. Intervals are treated as ``lower <= value < upper``.

    Returns:
        The level of the first matching interval. A degenerate entry
        (``lower == upper``) matches any value that reaches it. A value equal
        to the upper bound of the last interval gets that interval's level.
        Returns 0 when nothing matches.
    """
    for level, lower, upper in interregional:
        if lower == upper or lower <= item_data < upper:
            return level
    # Without a degenerate closing interval (exactly 20 regular intervals)
    # the maximum value would match nothing, so close the last interval on
    # the right.
    if interregional and item_data == interregional[-1][2]:
        return interregional[-1][0]
    return 0
def get_division_level(input_data):
    """Assign every value its quantile-based level.

    Args:
        input_data: Iterable of values that also exposes the ``quantile``
            method required by ``get_quantile_20_values``.

    Returns:
        A list with one level per input value, in input order, as produced
        by ``get_current_level``.
    """
    # De-duplicated cut points of the 20-quantile split.
    s_unique = get_quantile_20_values(input_data)
    # Intervals in the form [level, lower bound, upper bound].
    interregional = get_quantile_interregional(s_unique)
    return [get_current_level(item, interregional) for item in input_data]
def pre_processing(data):
    """Preprocess the raw data: derive features, fill gaps, encode and bin.

    Args:
        data: DataFrame with at least the columns '出生年份', '本月时长',
            '本月次数', 'a', 'b', 'c', 'd', 'e' and 'f'.

    Returns:
        A new DataFrame with the derived columns '年龄', '本月平均时长' and
        'g', missing values filled, the top-5 one-hot columns of 'c', and a
        '<name>_TILE20' level column for each of 'd', 'e' and 'f'.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    data = data.copy()

    # 1. Derived variables
    # Age group
    data['年龄'] = get_cust_age_stage(data['出生年份'])
    # Average duration this month. A zero count is masked to NaN first so
    # the result is a missing value instead of inf.
    counts = data['本月次数']
    data['本月平均时长'] = data['本月时长'].div(counts.where(counts != 0))
    data['g'] = data['a'] - data['b']

    # 2. Fill missing values
    # Columns whose gaps are filled with the number 0.
    data = data.fillna(value=dict.fromkeys(['a', 'b', 'g', 'k'], 0))
    # Columns m/z are filled with a string placeholder instead.
    data = data.fillna({'m': '未知', 'z': '未知'})
    # One-hot encode column 'c'.
    data = get_top5_onehot(data)

    # 3. Binning into quantile levels
    for col_name in ['d', 'e', 'f']:
        data[col_name + "_TILE20"] = get_division_level(data[col_name])
    return data
def get_model_columns(input_data, del_col_names=('a', 'b', 'c')):
    """Return the column names to use for modelling, as a list.

    Args:
        input_data: Object with a ``columns`` attribute (e.g. a DataFrame).
        del_col_names: Column names to leave out. Defaults to 'a', 'b', 'c'.

    Returns:
        The names in ``input_data.columns`` that are not excluded, in their
        original order.
    """
    excluded = set(del_col_names)
    return [name for name in input_data.columns if name not in excluded]
def importance_features_top(model_str, model, x_train, top_n=10):
    """Print the model's most important features, highest first.

    Args:
        model_str: Name of the model. Currently unused; kept for callers.
        model: Fitted model exposing ``feature_importances_``.
        x_train: Training frame whose ``columns`` name the features, in the
            same order as ``model.feature_importances_``.
        top_n: Number of features to print. Defaults to 10.
    """
    print("打印XGBoost重要指标")
    # Column 'a' holds the feature name, column 'b' its importance.
    importance_col = pd.DataFrame(
        list(zip(x_train.columns, model.feature_importances_)),
        columns=['a', 'b'])
    importance_col_desc = importance_col.sort_values(by='b', ascending=False)
    print(importance_col_desc.iloc[:top_n, :])
def print_precison_recall_f1(y_true, y_pre):
    """Print the classification report and macro precision/recall/F1.

    Args:
        y_true: True labels.
        y_pre: Predicted labels.
    """
    print("打印精准率、召回率和F1值")
    print(classification_report(y_true, y_pre))
    # Macro-averaged scores, rounded to two decimals.
    f1 = round(f1_score(y_true, y_pre, average='macro'), 2)
    p = round(precision_score(y_true, y_pre, average='macro'), 2)
    r = round(recall_score(y_true, y_pre, average='macro'), 2)
    print(f"Precision: {p}, Recall: {r}, F1: {f1} ")
def xgboost_model(x_train, y_train):
    """Train an XGBoost classifier, save it to disk and return it.

    Args:
        x_train: Training features; its ``columns`` are used when printing
            feature importances.
        y_train: Training labels.

    Returns:
        The fitted ``XGBClassifier``. The model is also written to
        './model/XGBoost_model_v1.0'.
    """
    xgboost_clf = XGBClassifier(min_child_weight=6, max_depth=15,
                                objective='multi:softmax', num_class=5)
    print("-" * 60)
    print("xgboost模型:", xgboost_clf)
    xgboost_clf.fit(x_train, y_train)
    # Print the feature importances.
    importance_features_top('xgboost', xgboost_clf, x_train)
    # Save the model; create the target directory first so the dump does
    # not fail when './model' does not exist yet.
    model_path = './model/XGBoost_model_v1.0'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(xgboost_clf, model_path)
    return xgboost_clf

filename = "./文件对应路径.xlsx"
data = pd.read_excel(filename)
# Preprocess: fill missing data, add derived variables, bin into levels and
# one-hot encode the top values.
data_processed = pre_processing(data)
# Drop the columns excluded for business reasons and keep the modelling ones.
# The column list must come from the processed frame (the original passed an
# undefined name here).
model_col_names = get_model_columns(data_processed)
model_data = data_processed[model_col_names]
# Split into inputs (features) and output (label).
data_y = model_data['label']
data_x = model_data.drop(['label'], axis=1)
# Split into training and test sets; a fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.3, random_state=1)
# Train the model.
xgboost_clf = xgboost_model(x_train, y_train)
# Predict.
pre_y_test = xgboost_clf.predict(x_test)
# Print the test-set results: precision, recall and f1-score.
print("-" * 30, "测试集", "-" * 30)
print_precison_recall_f1(y_test, pre_y_test)

# 1. Split into a training part and a held-out part; a fixed seed keeps the
#    split reproducible.
x_train, x_test_valid, y_train, y_test_valid = train_test_split(
    data_x, data_y, test_size=0.4, random_state=1)
# 2. Split the held-out part into a test set and a validation set.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_test_valid, y_test_valid, test_size=0.5, random_state=1)
# Report the multi-class error on both sets after every boosting round.
# NOTE(review): recent xgboost releases expect `eval_metric` on the
# estimator rather than in `fit()` — confirm against the installed version.
eval_set = [(x_train, y_train), (x_test, y_test)]
xgboost_clf.fit(x_train, y_train, eval_metric="merror", eval_set=eval_set,
                verbose=True)
在调参的过程中,对这句话有了深刻的理解,"数据和特征决定了机器学习的上限,而模型和算法只是逼近这个上限而已"。当你调参效果都提升不大的时候,可以考虑再重新对数据和特征进行研究和处理,效果会有所提高。
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。