from sklearn.linear_model import LogisticRegression as LR
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
data = load_breast_cancer()
X = data.data
y = data.target
data.data.shape
The output shows the data has shape (569, 30): 569 samples with 30 features.
# L1- and L2-regularized logistic regression; C is the inverse of the regularization strength
lrl1 = LR(penalty='l1', solver='liblinear', C=0.5, max_iter=1000)
lrl2 = LR(penalty='l2', solver='liblinear', C=0.5, max_iter=1000)
lrl1.fit(X, y)
lrl1.coef_
lrl2.fit(X, y)
lrl2.coef_
The built-in attribute coef_ exposes the fitted weight for each feature.
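To see the practical difference between the two penalties, here is a minimal sketch using the two models fitted above: it counts how many coefficients each penalty leaves non-zero. L1 typically zeroes out many features, while L2 only shrinks them.

# Count non-zero weights per penalty: L1 produces sparse solutions,
# L2 keeps (almost) all 30 features with shrunken weights.
print("L1 non-zero coefficients:", (lrl1.coef_ != 0).sum())
print("L2 non-zero coefficients:", (lrl2.coef_ != 0).sum())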
# Training-set accuracies for the L1 and L2 penalties
l1 = []
l2 = []
# Test-set accuracies for the L1 and L2 penalties
l1test = []
l2test = []
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=30)
# Fit a model at each C in the range and record the accuracies
for i in np.linspace(0.05, 1, 19):
    lrl1 = LR(penalty='l1', solver='liblinear', C=i, max_iter=1000)
    lrl2 = LR(penalty='l2', solver='liblinear', C=i, max_iter=1000)
    lrl1.fit(Xtrain, Ytrain)
    l1.append(accuracy_score(lrl1.predict(Xtrain), Ytrain))
    l1test.append(accuracy_score(lrl1.predict(Xtest), Ytest))
    lrl2.fit(Xtrain, Ytrain)
    l2.append(accuracy_score(lrl2.predict(Xtrain), Ytrain))
    l2test.append(accuracy_score(lrl2.predict(Xtest), Ytest))
graph = [l1, l2, l1test, l2test]
color = ["green", "black", "lightgreen", "gray"]
label = ['l1', 'l2', 'l1test', 'l2test']
plt.figure()
# Draw all four curves with their colors and labels
for i in range(len(graph)):
    plt.plot(np.linspace(0.05, 1, 19), graph[i], color[i], label=label[i])
plt.legend(loc=4)
plt.savefig(r"C:\Users\86377\Desktop\1.png")
plt.show()
The resulting plot:
At least on this breast cancer dataset, the two regularization schemes give very similar results. As C grows, the regularization becomes weaker, and performance on both the training and test sets rises, until around C = 0.8: training accuracy keeps climbing, but accuracy on unseen data starts to fall. That is the onset of overfitting, so setting C to about 0.8 is a reasonable choice. In practice, L2 regularization is the usual default; if the model underperforms, try L1 instead.
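Based on that reading of the curves, a minimal sketch that refits the model at the chosen C = 0.8 (on the same split as above) and reports test accuracy:

# Refit at the chosen C = 0.8 and check test-set accuracy
final = LR(penalty='l2', solver='liblinear', C=0.8, max_iter=1000)
final.fit(Xtrain, Ytrain)
print("test accuracy at C=0.8:", accuracy_score(final.predict(Xtest), Ytest))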
# LR_ is the base estimator for embedded feature selection
# (its exact parameters are not shown in the original; the values here are assumed)
LR_ = LR(solver='liblinear', C=0.9, random_state=420)
# norm_order=1 ranks features by the L1 norm of their coefficients
X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(X, y)
X_embedded.shape
The selected feature matrix has shape (569, 9).
fullx = []
fsx = []
threshold = np.linspace(0, abs(LR_.fit(X, y).coef_).max(), 20)
k = 0
for i in threshold:
    X_embedded = SelectFromModel(LR_, threshold=i).fit_transform(X, y)
    fullx.append(cross_val_score(LR_, X, y, cv=10).mean())
    fsx.append(cross_val_score(LR_, X_embedded, y, cv=10).mean())
    print(threshold[k], X_embedded.shape[1])
    k += 1
plt.plot(threshold, fullx, label='fullx')
plt.plot(threshold, fsx, label='fsx')
plt.xticks(threshold)
plt.legend()
plt.savefig(r"C:\Users\86377\Desktop\2.png")
plt.show()
The resulting plot:
This approach turns out to be fairly ineffective, as the learning curve shows: as threshold grows, more and more features are dropped and the model gets steadily worse; keeping the model at its best requires retaining at least 17 features. In fact, a finer-grained learning curve shows that keeping the model at least as good as before the reduction requires 25 features. For practical purposes that is a useless reduction: needing 30 indicators to diagnose a condition versus needing 25 makes little difference to a doctor.
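To check how many features survive a given threshold without running a full cross-validation, the selector's get_support() mask can be inspected directly; a minimal sketch (the threshold value here is illustrative, not from the original):

# Inspect how many features survive an illustrative threshold
selector = SelectFromModel(LR_, threshold=0.1).fit(X, y)
print("features kept:", selector.get_support().sum())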
fullx = []
fsx = []
C = np.arange(0.01, 10.01, 0.5)
for i in C:
    LR_ = LR(solver='liblinear', C=i, random_state=420)
    fullx.append(cross_val_score(LR_, X, y, cv=10).mean())
    X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(X, y)
    fsx.append(cross_val_score(LR_, X_embedded, y, cv=10).mean())
print(max(fsx), C[fsx.index(max(fsx))])
plt.figure(figsize=(20, 5))
plt.plot(C, fullx, label='fullx')
plt.plot(C, fsx, label='fsx')
plt.xticks(C)
plt.legend()
plt.savefig(r"C:\Users\86377\Desktop\3.png")
plt.show()
From this we can take the C value where the selected-feature model performs best, then search a narrower range around it for the optimal parameter.
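A minimal sketch of that refinement, reusing the C and fsx from the coarse search above; the bracket width and step size are assumptions, not values from the original:

# Refine the search around the best coarse C
best_C = C[fsx.index(max(fsx))]
C_fine = np.arange(max(0.01, best_C - 0.5), best_C + 0.5, 0.05)
fsx_fine = []
for i in C_fine:
    LR_ = LR(solver='liblinear', C=i, random_state=420)
    X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(X, y)
    fsx_fine.append(cross_val_score(LR_, X_embedded, y, cv=10).mean())
print(max(fsx_fine), C_fine[fsx_fine.index(max(fsx_fine))])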
l2 = []
l2test = []
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)
for i in range(1, 201, 10):
    lrl2 = LR(penalty='l2', solver='liblinear', max_iter=i)
    lrl2.fit(Xtrain, Ytrain)
    l2.append(accuracy_score(lrl2.predict(Xtrain), Ytrain))
    l2test.append(accuracy_score(lrl2.predict(Xtest), Ytest))
graph = [l2, l2test]
color = ["black", "gray"]
label = ["L2", "L2test"]
plt.figure(figsize=(20, 5))
for i in range(len(graph)):
    plt.plot(np.arange(1, 201, 10), graph[i], color[i], label=label[i])
plt.legend(loc=4)
plt.xticks(np.arange(1, 201, 10))
plt.savefig(r"C:\Users\86377\Desktop\4.png")
plt.show()
Varying the max_iter parameter and plotting training- and test-set accuracy gives the figure below.
As the allowed number of iterations grows, accuracy improves, and the early gains come much faster than the later ones.
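To see how many iterations the solver actually used before converging, rather than the cap we set, LogisticRegression exposes the n_iter_ attribute; a minimal sketch:

# n_iter_ reports the iterations actually run; once it is below
# max_iter, raising the cap further changes nothing.
lrl2 = LR(penalty='l2', solver='liblinear', max_iter=1000).fit(Xtrain, Ytrain)
print("iterations actually used:", lrl2.n_iter_)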
from sklearn.datasets import load_iris
iris = load_iris()
# Fit once with each multi-class strategy
for multi_class in ['multinomial', 'ovr']:
    clf = LR(solver='sag', max_iter=100, random_state=42, multi_class=multi_class).fit(iris.data, iris.target)
    print("training score : %.3f (%s)" % (clf.score(iris.data, iris.target), multi_class))
The output:
training score : 0.987 (multinomial)
training score : 0.960 (ovr)
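The two strategies also produce different class probabilities: 'multinomial' applies a softmax over all classes jointly, while 'ovr' normalizes the per-class one-vs-rest scores. A minimal sketch comparing them on the first iris sample:

# Compare predicted probabilities for one sample under each strategy
for multi_class in ['multinomial', 'ovr']:
    clf = LR(solver='sag', max_iter=100, random_state=42,
             multi_class=multi_class).fit(iris.data, iris.target)
    print(multi_class, clf.predict_proba(iris.data[:1]))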