Previous lecture: Machine Learning from Scratch 01 - Linear Regression
Although logistic regression has "regression" in its name, it is actually a classification model, most often used for binary classification. You can think of it as: logistic regression = linear regression + a sigmoid activation. The "regression" in the name comes from the fact that the sigmoid outputs a continuous probability value; but since that probability is then thresholded into a class label, logistic regression is really a classifier.
For more on the theory, see https://zhuanlan.zhihu.com/p/74874291, which covers it in great detail; we won't repeat it here.
In the previous lecture the linear regression model was y = wx + b; in logistic regression this becomes sigmoid(wx + b), so logistic regression can be read as sigmoid(linear regression). A sigmoid output greater than 0.5 is classified as the positive class, otherwise as the negative class.
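Written out explicitly (standard definitions, added here for reference): sigmoid(z) = 1 / (1 + exp(-z)), and the model outputs the probability p = sigmoid(wx + b). Since sigmoid(z) > 0.5 exactly when z > 0, predicting the positive class when p > 0.5 is the same as checking wx + b > 0, so the decision boundary is precisely the line wx + b = 0.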
Randomly generate the data and assign each sample a class label.
import numpy as np

num_inputs = 2       # feature dimension
num_examples = 1000  # number of samples
true_w = [4, -2.4]   # ground-truth weights w = [4, -2.4]^T
true_b = 3.2         # ground-truth bias b = 3.2
features = np.random.normal(scale=5, size=(num_examples, num_inputs))
true_y = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b  # value of wx + b for each sample

labels = (true_y > 0).astype(int)  # label 1 where wx + b > 0, label 0 otherwise
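As a quick sanity check (my addition, reusing the variables above): the features are zero-mean and the small bias b only shifts the boundary slightly off the origin, so the two classes should be roughly balanced:

print(np.bincount(labels))  # two counts, each somewhere around 500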
Plot the data together with the true decision boundary.
from IPython import display
from matplotlib import pyplot as plt

def use_svg_display():
    # render figures as vector graphics
    display.set_matplotlib_formats('svg')

def set_figsize(figsize=(5, 4)):
    use_svg_display()
    # set the figure size
    plt.rcParams['figure.figsize'] = figsize

set_figsize()
colors = ['g', 'r', 'b']
Label_Com = ['0', '1', 'line']
x1_0, x2_0, x1_1, x2_1 = [], [], [], []
for i in range(len(labels)):
    if labels[i] == 0:
        x1_0.append(features[i, 0])
        x2_0.append(features[i, 1])
    else:
        x1_1.append(features[i, 0])
        x2_1.append(features[i, 1])
# the true boundary w0*x1 + w1*x2 + b = 0, rearranged to x2 = -(w0*x1 + b) / w1
line_x1 = np.arange(-10, 10, 0.1)
line_x2 = (true_w[0] * line_x1 + true_b) / -true_w[1]
plt.scatter(x1_0, x2_0, c=colors[0], s=5, marker='8', linewidth=0)
plt.scatter(x1_1, x2_1, c=colors[1], s=5, marker='8', linewidth=0)
plt.scatter(line_x1, line_x2, c=colors[2], s=3, marker='8', linewidth=0)
plt.legend(labels=Label_Com, loc='upper right')
plt.show()
Split the data into mini-batches.
batch_size = 100  # batch size
indices = np.array(range(num_examples))
np.random.shuffle(indices)  # shuffle the sample order
X = np.array([features.take(indices[i:i+batch_size], 0) for i in range(0, num_examples, batch_size)])
Y = np.array([labels.take(indices[i:i+batch_size], 0) for i in range(0, num_examples, batch_size)])

print(X.shape, Y.shape)
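With num_examples = 1000 and batch_size = 100 this yields 10 batches, so the print shows (10, 100, 2) (10, 100).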
class LogisticRegression:
    def __init__(self, num_inputs):
        # initialize the parameters
        self.w = np.random.normal(scale=1, size=(num_inputs, 1))
        self.b = np.random.normal(scale=0.01, size=(1))

    def __str__(self):
        # print the parameters
        return 'w:' + str(self.w) + '\nb:' + str(self.b)

    def forward(self, input):
        # forward pass: wx + b
        return np.dot(input, self.w) + self.b

    def sigmoid(self, input):
        return 1.0 / (1 + np.exp(-input))

    def cross_entropy(self, p, label):
        # loss function: binary cross-entropy
        eps = 1e-10
        p = np.clip(p, eps, 1 - eps)  # keep p away from 0 and 1 so neither log() can blow up
        return -np.dot(label.T, np.log(p)) - np.dot((1 - label).T, np.log(1 - p))

    def accuracy(self, p, label):
        # classification accuracy on the batch
        pre = (p > 0.5).astype(int).reshape(label.shape)
        return np.sum(pre == label) / len(label)

    def sgd(self, lr, input, p, label):
        # mini-batch gradient descent (the backward pass)
        batch_size = len(label)
        off = p - label  # differentiating the loss gives (p(x) - y) * x_i, so compute p(x) - y first
        for i in range(len(self.w)):
            error = np.dot(input[:, i].T, off) / batch_size  # gradient for w[i]: batch mean of (p(x) - y) * x_i
            error = error.reshape(1,)
            self.w[i] -= lr * error
        self.b -= lr * sum(off) / batch_size  # gradient for b: batch mean of (p(x) - y)

    def train(self, lr, inputs, labels, epoch):
        for i in range(epoch):
            loss, acc = [], []
            for input, label in zip(inputs, labels):
                output = self.forward(input)  # wx + b
                p = self.sigmoid(output)      # sigmoid(wx + b)
                label = label.reshape(output.shape)
                loss.append(self.cross_entropy(p, label))
                self.sgd(lr, input, p, label)
                acc.append(self.accuracy(p, label))
            print('epoch %d, loss %f, accuracy %f' % (i + 1, np.mean(loss), np.mean(acc)))
model = LogisticRegression(num_inputs)
print(model)
model.train(1, X, Y, 40)
print(model)
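For reference, here is where off = p - label in sgd comes from. Differentiating the cross-entropy loss through the sigmoid (a standard derivation, not spelled out in the original post) gives, for each weight and the bias:

dL/dw_i = sum over the batch of (p(x) - y) * x_i
dL/db   = sum over the batch of (p(x) - y)

Dividing by batch_size averages the gradient over the batch, which is exactly what the loop in sgd computes. The same update can also be written without the per-weight loop, e.g. self.w -= lr * np.dot(input.T, off) / batch_size.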
The training log:
The learned decision boundary:
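A minimal sketch (not from the original post) that re-creates this plot, reusing model and the x1_0/x2_0/x1_1/x2_1 lists from above:

# learned boundary: w0*x1 + w1*x2 + b = 0, using the trained parameters
learned_x1 = np.arange(-10, 10, 0.1)
learned_x2 = (model.w[0, 0] * learned_x1 + model.b[0]) / -model.w[1, 0]
plt.scatter(x1_0, x2_0, c='g', s=5, marker='8', linewidth=0)
plt.scatter(x1_1, x2_1, c='r', s=5, marker='8', linewidth=0)
plt.scatter(learned_x1, learned_x2, c='b', s=3, marker='8', linewidth=0)
plt.legend(labels=['0', '1', 'learned line'], loc='upper right')
plt.show()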