Like the KNN model seen earlier, a model that makes no assumption about the form of f and can in principle learn an arbitrary function is called non-parametric.
A model that instead learns a fixed set of parameters is called parametric; the parametrization restricts the set of possible f, which makes learning comparatively easier. (A sketch contrasting the two follows.)
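As a rough illustration of the contrast (a minimal sketch with made-up data and illustrative names, not code from these notes): a parametric model like logistic regression compresses the training set into a fixed-size weight vector, while non-parametric KNN keeps the whole training set and defers all work to prediction time.

import numpy as np

rng = np.random.default_rng(0)
X_tr = rng.normal(size=(100, 2))                  # toy training features
y_tr = (X_tr.sum(axis=1) > 0).astype(float)       # toy labels

# Parametric: prediction depends only on a fixed-size parameter vector.
theta = np.array([1.0, 1.0])                      # pretend these were learned
def predict_parametric(x):
    return float(x @ theta > 0)

# Non-parametric: prediction consults the stored training set itself.
def predict_knn(x, k=5):
    d = np.linalg.norm(X_tr - x, axis=1)          # distance to every training point
    return float(y_tr[np.argsort(d)[:k]].mean() > 0.5)  # majority vote among k nearest

x_new = np.array([0.3, 0.4])
print(predict_parametric(x_new), predict_knn(x_new))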
Logistic function: σ(z) = 1 / (1 + e^(-z)), which squashes any real-valued score into (0, 1) so the output can be read as a probability.
Likelihood function: L(θ) = ∏_i p_i^{y_i} (1 − p_i)^{1 − y_i}, where p_i = σ(x_i · θ) is the predicted probability that sample i is positive.
Log-likelihood function: taking the logarithm turns the product into a sum, ℓ(θ) = Σ_i [y_i log p_i + (1 − y_i) log(1 − p_i)]; maximizing ℓ is equivalent to minimizing the cross-entropy loss used in the code below.
For multi-class classification, the softmax function is used in place of the logistic function, as sketched below.
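A minimal NumPy sketch of softmax (illustrative only, not part of the notebook below); subtracting the maximum score before exponentiating is the standard trick to avoid overflow:

import numpy as np

def softmax(z):
    # Shift by the max for numerical stability; the result is unchanged
    # because softmax is invariant to adding a constant to all scores.
    z = z - np.max(z, axis=-1, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=-1, keepdims=True)

scores = np.array([2.0, 1.0, 0.1])
print(softmax(scores))   # roughly [0.659 0.242 0.099], sums to 1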
Key point:
As the classification threshold varies, the curve traced by the true positive rate (TPR) against the false positive rate (FPR) is called the ROC curve; a worked example follows below.
The area under the ROC curve is called the AUC.
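To make TPR and FPR concrete, a small worked example on made-up scores (toy data, assumed purely for illustration); sweeping the threshold over all score values traces out the full ROC curve:

import numpy as np

y_true = np.array([1, 1, 0, 1, 0, 0])               # toy labels
scores = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])   # toy predicted scores

thresh = 0.5
y_hat = (scores >= thresh).astype(int)              # predictions at this threshold
tp = np.sum((y_hat == 1) & (y_true == 1))           # true positives
fp = np.sum((y_hat == 1) & (y_true == 0))           # false positives
tpr = tp / np.sum(y_true == 1)                      # TPR = TP / all positives
fpr = fp / np.sum(y_true == 0)                      # FPR = FP / all negatives
print(tpr, fpr)                                     # ≈ 0.667, 0.333 at threshold 0.5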
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

#%%
# Load the dataset from file: the first two columns are the features,
# the third column is the binary label.
lines = np.loadtxt('./data/lr_dataset.csv', delimiter=',', dtype=float)
x_total = lines[:, 0:2]
y_total = lines[:, 2]
print('Dataset size:', len(x_total))

#%%
# Scatter plot of the two classes.
pos_index = np.where(y_total == 1)
neg_index = np.where(y_total == 0)
plt.scatter(x_total[pos_index, 0], x_total[pos_index, 1], marker='o', color='coral', s=10)
plt.scatter(x_total[neg_index, 0], x_total[neg_index, 1], marker='x', color='blue', s=10)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()

#%%
# Shuffle, then split into 70% training / 30% test.
np.random.seed(0)
ratio = 0.7
split = int(len(x_total) * ratio)
idx = np.random.permutation(len(x_total))
x_total = x_total[idx]
y_total = y_total[idx]
x_train, y_train = x_total[:split], y_total[:split]
x_test, y_test = x_total[split:], y_total[split:]

#%%
def acc(y_true, y_pred):
    # Fraction of predictions that match the labels.
    return np.mean(y_true == y_pred)

def auc(y_true, y_pred):
    # Sort samples by predicted score, descending.
    idx = np.argsort(y_pred)[::-1]
    y_true = y_true[idx]
    y_pred = y_pred[idx]
    tp = np.cumsum(y_true)        # cumulative true positives at each cut-off
    fp = np.cumsum(1 - y_true)    # cumulative false positives at each cut-off
    tpr = tp / tp[-1]             # true positive rate
    fpr = fp / fp[-1]             # false positive rate
    # Prepend 0 so the ROC curve starts at the origin.
    tpr = np.concatenate([[0], tpr])
    fpr = np.concatenate([[0], fpr])
    # Integrate the ROC curve with a right Riemann sum.
    s = 0.0
    for i in range(1, len(fpr)):
        s += (fpr[i] - fpr[i - 1]) * tpr[i]
    return s

#%%
def logistic(z):
    return 1 / (1 + np.exp(-z))

def GD(num_steps, learning_rate, l2_coef):
    # Gradient descent on the L2-regularized cross-entropy loss.
    theta = np.random.normal(size=(X.shape[1],))
    train_losses = []
    test_losses = []
    train_acc = []
    test_acc = []
    train_auc = []
    test_auc = []
    for i in range(num_steps):
        pred = logistic(X @ theta)
        # Gradient of the negative log-likelihood plus the L2 penalty.
        grad = -X.T @ (y_train - pred) + l2_coef * theta
        theta -= learning_rate * grad
        train_loss = - y_train.T @ np.log(pred) \
                     - (1 - y_train).T @ np.log(1 - pred) \
                     + l2_coef * np.linalg.norm(theta) ** 2 / 2
        train_losses.append(train_loss / len(X))
        test_pred = logistic(X_test @ theta)
        test_loss = - y_test.T @ np.log(test_pred) \
                    - (1 - y_test).T @ np.log(1 - test_pred)
        test_losses.append(test_loss / len(X_test))
        # Record the metrics, using 0.5 as the classification threshold.
        train_acc.append(acc(y_train, pred >= 0.5))
        test_acc.append(acc(y_test, test_pred >= 0.5))
        train_auc.append(auc(y_train, pred))
        test_auc.append(auc(y_test, test_pred))
    return theta, train_losses, test_losses, \
        train_acc, test_acc, train_auc, test_auc

#%%
# Number of gradient-descent steps, learning rate, and L2 coefficient.
num_steps = 250
learning_rate = 0.002
l2_coef = 1.0
np.random.seed(0)

# Append a column of ones so the bias term is absorbed into theta.
X = np.concatenate([x_train, np.ones((x_train.shape[0], 1))], axis=1)
X_test = np.concatenate([x_test, np.ones((x_test.shape[0], 1))], axis=1)

theta, train_losses, test_losses, train_acc, test_acc, \
    train_auc, test_auc = GD(num_steps, learning_rate, l2_coef)

# Accuracy on the test set.
y_pred = np.where(logistic(X_test @ theta) >= 0.5, 1, 0)
final_acc = acc(y_test, y_pred)
print('Test accuracy:', final_acc)
print('Regression coefficients:', theta)

plt.figure(figsize=(13, 9))
xticks = np.arange(num_steps) + 1

#%%
# Training and test loss curves.
plt.subplot(221)
plt.plot(xticks, train_losses, color='blue', label='train loss')
plt.plot(xticks, test_losses, color='red', ls='--', label='test loss')
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

#%%
# Training and test accuracy curves.
plt.subplot(222)
plt.plot(xticks, train_acc, color='blue', label='train accuracy')
plt.plot(xticks, test_acc, color='red', ls='--', label='test accuracy')
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
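As an optional sanity check (not part of the original notebook, and assuming scikit-learn is installed), the hand-rolled auc above can be compared against sklearn.metrics.roc_auc_score; the two should agree up to how ties in the scores are handled.

# Optional cross-check of the manual AUC against scikit-learn.
from sklearn.metrics import roc_auc_score

test_pred = logistic(X_test @ theta)
print('manual AUC :', auc(y_test, test_pred))
print('sklearn AUC:', roc_auc_score(y_test, test_pred))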