赞
踩
import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline SEED = 222 np.random.seed(SEED) df = pd.read_csv('input.csv') #切分训练集和测试集 from sklearn.model_selection import train_test_split from sklearn.metrics import roc_auc_score def get_train_test(test_size = 0.95): y = 1 * (df.cand_pty_affiliation == "REP") X = df.drop(["cand_pty_affiliation"], axis = 1) X = pd.get_dummies(X, sparse = True) #对样本的特征进行独热编码“one-hot encoding” X.drop(X.columns[X.std() == 0], axis = 1, inplace = True) #去掉标准差=0 即该特征所有样本都一样的列 return train_test_split(X,y,test_size = test_size, random_state = SEED) xtrain, xtest, ytrain, ytest = get_train_test() print("\nExample data:") df.head()
- cand_pty_affiliation:我们要预测的指标,共和党或者民主党
- entity_tp:个人还是组织
- classification:领域
- rpt_tp:贡献的大小
- cycle:捐赠在哪年
- transaction_amt:捐献金额
# Look at the ratio of positive to negative examples in the raw data; here
# the two classes correspond to the Democratic and Republican parties.
party_share = df.cand_pty_affiliation.value_counts(normalize=True)
party_share.plot(kind="bar", title="Share of No. donations")
plt.show()
import pydotplus  # Toolkit for building graphs from Graphviz dot data
from IPython.display import Image  # Displays image data inline in Jupyter
from sklearn.metrics import roc_auc_score
# Decision-tree model and the exporter that emits the tree in dot format.
from sklearn.tree import DecisionTreeClassifier, export_graphviz


def print_graph(clf, feature_names):
    """Render a fitted decision tree as an inline PNG image.

    Args:
        clf: A fitted decision tree accepted by ``export_graphviz``.
        feature_names: Names of the feature columns, in column order. Any
            iterable (for example a pandas ``Index``) is accepted; it is
            converted to a plain list before being handed to scikit-learn.

    Returns:
        An ``IPython.display.Image`` wrapping the PNG rendering of the tree.
    """
    # export_graphviz documents list inputs for both name arguments, so
    # normalise them instead of relying on Index/dict happening to work.
    names = None if feature_names is None else list(feature_names)
    dot_data = export_graphviz(
        clf,
        label='root',
        proportion=True,
        impurity=False,
        out_file=None,
        feature_names=names,
        class_names=["D", "R"],  # class 0 -> "D", class 1 -> "R"
        filled=True,
        rounded=True)
    # Parse the dot-format text into a graph object that can be rendered.
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())


# Build a depth-1 decision tree and fit it on the training split.
t1 = DecisionTreeClassifier(max_depth=1, random_state=SEED)
t1.fit(xtrain, ytrain)
# Column 1 of predict_proba is the probability of the positive class
# (label 1, i.e. Republican).
p = t1.predict_proba(xtest)[:, 1]

print("Decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
print_graph(t1, xtrain.columns)
# The depth-1 tree ends up predicting Democrat for every sample; identical
# predictions are of no use, so next we adjust the pre-pruning parameter.

# Raise the tree depth to 3 and keep every other parameter unchanged.
t2 = DecisionTreeClassifier(max_depth=3, random_state=SEED)
t2.fit(xtrain, ytrain)
p = t2.predict_proba(xtest)[:, 1]

print("Decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
print_graph(t2, xtrain.columns)
# 47.3% of the samples fall into the left-most leaf and another 35.9% into
# what is essentially the right-most one, so the model already looks
# overfitted. Let's change strategy: remove the single most influential
# feature and look again.

drop = ['transaction_amt']  # The most influential feature: donation amount
# Name the axis by keyword; a positional axis is rejected by pandas >= 2.0.
xtrain_slim = xtrain.drop(columns=drop)
xtest_slim = xtest.drop(columns=drop)

t3 = DecisionTreeClassifier(max_depth=3, random_state=SEED)
t3.fit(xtrain_slim, ytrain)
p = t3.predict_proba(xtest_slim)[:, 1]

print("Decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
print_graph(t3, xtrain_slim.columns)
# Average the positive-class probabilities of the two depth-3 trees
# (with and without the dropped feature) and score the blended prediction.
p1 = t2.predict_proba(xtest)[:, 1]
p2 = t3.predict_proba(xtest_slim)[:, 1]
p = np.stack([p1, p2]).mean(axis=0)

averaged_auc = roc_auc_score(ytest, p)
print("Average of decision tree ROC-AUC score: %.3f" % averaged_auc)
整了个平均还真比原来高了! 这么一说,应该是选择不同的特征会产生不同的结果,然后用不同的结果再进行组合得到了一个升华!那我们多选几组不就是随机森林了嘛!
from sklearn.ensemble import RandomForestClassifier

# n_estimators: how many decision trees the forest contains.
# max_features: how many features are considered when choosing a split.
forest_params = dict(n_estimators=10, max_features=3, random_state=SEED)
rf = RandomForestClassifier(**forest_params)

rf.fit(xtrain, ytrain)
p = rf.predict_proba(xtest)[:, 1]

forest_auc = roc_auc_score(ytest, p)
print("Average of decision tree ROC-AUC score: %.3f" % forest_auc)
# Throw the whole scikit-learn toolbox at the problem.
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline


def get_models():
    """Build the collection of base learners.

    Returns:
        A dict mapping a display name to an unfitted scikit-learn classifier.
    """
    nb = GaussianNB()
    svc = SVC(C=100, probability=True)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
    gb = GradientBoostingClassifier(
        n_estimators=10, max_features=3, random_state=SEED)
    rf = RandomForestClassifier(
        n_estimators=10, max_features=3, random_state=SEED)

    models = {
        'svm': svc,
        'knn': knn,
        'naive bayes': nb,
        'mlp-nn': nn,
        'random forest': rf,
        'gbm': gb,
        'logistic': lr,
    }
    return models


def train_predict(model_list):
    """Fit every model on the training split and collect test predictions.

    Uses the module-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``.

    Args:
        model_list: Dict mapping a model name to an estimator that provides
            ``fit`` and ``predict_proba``.

    Returns:
        A DataFrame with one row per test sample and one column per model
        (named after the model), holding the predicted probability of the
        positive class.
    """
    # Zero matrix: one row per test sample, one column per model.
    P = np.zeros((ytest.shape[0], len(model_list)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    # Iterate over the models that were passed in (not a global): fit each
    # one, store its positive-class probabilities in the matching column of
    # P, and remember its name so it can become that column's label.
    for i, (name, m) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(xtrain, ytrain)
        P.iloc[:, i] = m.predict_proba(xtest)[:, 1]
        cols.append(name)
        print("done")

    P.columns = cols
    print("Done.\n")
    return P


def score_models(P, y):
    """Print the ROC-AUC score of every prediction column in *P*.

    Args:
        P: DataFrame of predicted probabilities, one column per model.
        y: True labels the predictions are scored against.
    """
    print("Scoring models.")
    for m in P.columns:
        score = roc_auc_score(y, P.loc[:, m])
        print("%-26s: %.3f" % (m, score))
    print("Done.\n")


# Build the dict of models.
models = get_models()
# Fit each model in turn and gather its predictions into the matrix P.
P = train_predict(models)
# Score every model's predictions against the true test labels with ROC-AUC.
score_models(P, ytest)
# Import the correlation-matrix visualisation helper.
from mlens.visualization import corrmat

# Plot the pairwise correlation between the models' predicted probabilities.
corrmat(P.corr(), inflate=False)
plt.show()
# Many of the models' predictions are highly correlated!
# Simple ensemble: the row-wise mean of every model's predicted probability.
ensemble_pred = P.mean(axis=1)
ensemble_auc = roc_auc_score(ytest, ensemble_pred)
print("Ensemble ROC-AUC score: %.3f" % ensemble_auc)
from sklearn.metrics import roc_curve


def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Draw ROC curves for each base learner and for the ensemble.

    Args:
        ytest: True labels passed to ``roc_curve``.
        P_base_learners: 2-D array indexed as ``[:, i]``; column ``i`` holds
            the scores of base learner ``i``.
        P_ensemble: Scores of the ensemble.
        labels: Legend labels, one per base-learner column.
        ens_label: Legend label for the ensemble curve.
    """
    n_learners = P_base_learners.shape[1]

    plt.figure(figsize=(10, 8))
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], 'k--')

    # One colour per curve: slot 0 is the ensemble, slots 1.. the learners.
    palette = [
        plt.cm.rainbow(pos) for pos in np.linspace(0, 1.0, n_learners + 1)
    ]

    # Curves of the individual models.
    for idx, colour in enumerate(palette[1:]):
        scores = P_base_learners[:, idx]
        fpr, tpr, _ = roc_curve(ytest, scores)
        plt.plot(fpr, tpr, label=labels[idx], c=colour)

    # Curve of the ensemble.
    fpr, tpr, _ = roc_curve(ytest, P_ensemble)
    plt.plot(fpr, tpr, label=ens_label, c=palette[0])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()


plot_roc_curve(ytest, P.values, P.mean(axis=1), list(P.columns), "ensemble")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。