Reposted from 麦子学院 (Maizi Academy).
1 """ 2 network.py 3 ~~~~~~~~~~ 4 5 A module to implement the stochastic gradient descent learning 6 algorithm for a feedforward neural network. Gradients are calculated 7 using backpropagation. Note that I have focused on making the code 8 simple, easily readable, and easily modifiable. It is not optimized, 9 and omits many desirable features. 10 """ 11 12 #### Libraries 13 # Standard library 14 import random 15 16 # Third-party libraries 17 import numpy as np 18 19 class Network(object): 20 21 def __init__(self, sizes): 22 """The list ``sizes`` contains the number of neurons in the 23 respective layers of the network. For example, if the list 24 was [2, 3, 1] then it would be a three-layer network, with the 25 first layer containing 2 neurons, the second layer 3 neurons, 26 and the third layer 1 neuron. The biases and weights for the 27 network are initialized randomly, using a Gaussian 28 distribution with mean 0, and variance 1. Note that the first 29 layer is assumed to be an input layer, and by convention we 30 won't set any biases for those neurons, since biases are only 31 ever used in computing the outputs from later layers.""" 32 self.num_layers = len(sizes) 33 self.sizes = sizes 34 self.biases = [np.random.randn(y, 1) for y in sizes[1:]] 35 self.weights = [np.random.randn(y, x) 36 for x, y in zip(sizes[:-1], sizes[1:])] 37 38 def feedforward(self, a): 39 """Return the output of the network if ``a`` is input.""" 40 for b, w in zip(self.biases, self.weights): 41 a = sigmoid(np.dot(w, a)+b) 42 return a 43 44 def SGD(self, training_data, epochs, mini_batch_size, eta, 45 test_data=None): 46 """Train the neural network using mini-batch stochastic 47 gradient descent. The ``training_data`` is a list of tuples 48 ``(x, y)`` representing the training inputs and the desired 49 outputs. The other non-optional parameters are 50 self-explanatory. If ``test_data`` is provided then the 51 network will be evaluated against the test data after each 52 epoch, and partial progress printed out. This is useful for 53 tracking progress, but slows things down substantially.""" 54 if test_data: n_test = len(test_data) 55 n = len(training_data) 56 for j in range(epochs): 57 random.shuffle(training_data) 58 mini_batches = [ 59 training_data[k:k+mini_batch_size] 60 for k in range(0, n, mini_batch_size)] 61 for mini_batch in mini_batches: 62 self.update_mini_batch(mini_batch, eta) 63 if test_data: 64 print ("Epoch {0}: {1} / {2}".format( 65 j, self.evaluate(test_data), n_test)) 66 else: 67 print ("Epoch {0} complete".format(j)) 68 69 def update_mini_batch(self, mini_batch, eta): 70 """Update the network's weights and biases by applying 71 gradient descent using backpropagation to a single mini batch. 72 The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta`` 73 is the learning rate.""" 74 nabla_b = [np.zeros(b.shape) for b in self.biases] 75 nabla_w = [np.zeros(w.shape) for w in self.weights] 76 #一个一个的进行训练 跟吴恩达的Mini-Batch 不一样 77 for x, y in mini_batch: 78 delta_nabla_b, delta_nabla_w = self.backprop(x, y) 79 nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] 80 nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)] 81 self.weights = [w-(eta/len(mini_batch))*nw 82 for w, nw in zip(self.weights, nabla_w)] 83 self.biases = [b-(eta/len(mini_batch))*nb 84 for b, nb in zip(self.biases, nabla_b)] 85 86 def backprop(self, x, y): 87 """Return a tuple ``(nabla_b, nabla_w)`` representing the 88 gradient for the cost function C_x. 
``nabla_b`` and 89 ``nabla_w`` are layer-by-layer lists of numpy arrays, similar 90 to ``self.biases`` and ``self.weights``.""" 91 nabla_b = [np.zeros(b.shape) for b in self.biases] 92 nabla_w = [np.zeros(w.shape) for w in self.weights] 93 # feedforward 94 activation = x 95 activations = [x] # list to store all the activations, layer by layer 96 zs = [] # list to store all the z vectors, layer by layer 97 for b, w in zip(self.biases, self.weights): 98 z = np.dot(w, activation)+b 99 zs.append(z) 100 activation = sigmoid(z) 101 activations.append(activation) 102 # backward pass 103 delta = self.cost_derivative(activations[-1], y) * \ 104 sigmoid_prime(zs[-1]) 105 nabla_b[-1] = delta 106 nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 107 # Note that the variable l in the loop below is used a little 108 # differently to the notation in Chapter 2 of the book. Here, 109 # l = 1 means the last layer of neurons, l = 2 is the 110 # second-last layer, and so on. It's a renumbering of the 111 # scheme in the book, used here to take advantage of the fact 112 # that Python can use negative indices in lists. 113 for l in range(2, self.num_layers): 114 z = zs[-l] 115 sp = sigmoid_prime(z) 116 delta = np.dot(self.weights[-l+1].transpose(), delta) * sp 117 nabla_b[-l] = delta 118 nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) 119 return (nabla_b, nabla_w) 120 121 def evaluate(self, test_data): 122 """Return the number of test inputs for which the neural 123 network outputs the correct result. Note that the neural 124 network's output is assumed to be the index of whichever 125 neuron in the final layer has the highest activation.""" 126 test_results = [(np.argmax(self.feedforward(x)), y) 127 for (x, y) in test_data] 128 return sum(int(x == y) for (x, y) in test_results) 129 130 def cost_derivative(self, output_activations, y): 131 """Return the vector of partial derivatives \partial C_x / 132 \partial a for the output activations.""" 133 return (output_activations-y) 134 135 #### Miscellaneous functions 136 def sigmoid(z): 137 """The sigmoid function.""" 138 return 1.0/(1.0+np.exp(-z)) 139 140 def sigmoid_prime(z): 141 """Derivative of the sigmoid function.""" 142 return sigmoid(z)*(1-sigmoid(z))
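Before moving on, here is a minimal usage sketch of the Network class above. The original tutorial trains it on MNIST, which is not included here, so this sketch uses made-up data and a hypothetical labeling rule purely to show the data format SGD expects: each training example is a tuple (x, y) of numpy column vectors, while each test example pairs an input column vector with an integer class label. The layer sizes and hyper-parameters are illustrative, not recommendations.

import numpy as np
from network import Network   # assumes the code above is saved as network.py

rng = np.random.default_rng(0)

def make_example():
    """One toy example: a 4-dimensional input column vector and a 2-class
    one-hot target (hypothetical rule: class 1 if the inputs sum to > 2)."""
    x = rng.random((4, 1))
    label = int(x.sum() > 2.0)
    y = np.zeros((2, 1))
    y[label] = 1.0
    return x, y, label

# training_data: list of (x, y) column-vector pairs; test_data: (x, integer label)
training_data = []
test_data = []
for _ in range(500):
    x, y, label = make_example()
    training_data.append((x, y))
for _ in range(100):
    x, y, label = make_example()
    test_data.append((x, label))

net = Network([4, 8, 2])   # 4 inputs, 8 hidden neurons, 2 outputs
net.SGD(training_data, epochs=10, mini_batch_size=10, eta=3.0, test_data=test_data)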
This algorithm achieves higher accuracy than the neural network I had written before, but I ran into errors while testing it, I could not make sense of some of the comments, and the code does not connect very well with the theory. I made improvements on top of it to make the algorithm more extensible, and I tested the modified code myself; it works very well.
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 18 15:27:24 2018

@author: markli
"""

import numpy as np;
import random;

def tanh(x):
    return np.tanh(x);

def tanh_derivative(x):
    return 1.0 - np.tanh(x)*np.tanh(x);

def logistic(x):
    return 1/(1 + np.exp(-x));

def logistic_derivative(x):
    return logistic(x)*(1-logistic(x));

def ReLU(x,a=1):
    # np.maximum instead of the built-in max, so the function works
    # element-wise on numpy arrays as well as on scalars.
    return np.maximum(0, a * x);

def ReLU_derivative(x,a=1):
    return np.where(x < 0, 0, a);

class NeuralNetwork:
    '''
    Z = W * x + b
    A = sigmoid(Z)
    Z: net input
    x: sample matrix, n * m (n features, m samples)
    b: bias
    W: weights
    A: activation (net output)
    '''
    def __init__(self,layers,active_function=[logistic],active_function_der=[logistic_derivative],learn_rate=0.9):
        """
        Initialize the neural network.
        layers holds the number of neurons in each layer; its length is the number of layers.
        active_function specifies one activation function per layer; if its length is 1,
        the same activation function is used for every layer.
        active_function_der holds the derivatives of the activation functions.
        learn_rate is the learning rate.
        """
        self.weights = [np.random.randn(x,y) for x,y in zip(layers[1:],layers[:-1])];
        self.biases = [np.random.randn(x,1) for x in layers[1:]];
        self.size = len(layers);
        self.rate = learn_rate;
        self.sigmoids = [];
        self.sigmoids_der = [];
        for i in range(len(layers)-1):
            if(len(active_function) == self.size-1):
                self.sigmoids = active_function;
            else:
                self.sigmoids.append(active_function[0]);
            if(len(active_function_der) == self.size-1):
                self.sigmoids_der = active_function_der;
            else:
                self.sigmoids_der.append(active_function_der[0]);

    def fit(self,TrainData,epochs=1000,mini_batch_size=32):
        """
        Learn the network parameters with the backpropagation algorithm.
        TrainData is a list of (X,Y) pairs.
        X: input feature matrix, m*n (n features, m samples).
        Y: target values, t*m (t class labels, m samples).
        epochs: number of iterations.
        mini_batch_size: size of one mini-batch; set mini_batch_size = 1 to disable mini-batching.
        """
        n = len(TrainData);
        for i in range(epochs):
            random.shuffle(TrainData)
            mini_batches = [
                TrainData[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)];
            for mini_batch in mini_batches:
                self.BP(mini_batch, self.rate);

    def predict(self, x):
        """Forward propagation."""
        i = 0;
        for b, w in zip(self.biases, self.weights):
            x = self.sigmoids[i](np.dot(w, x)+b);
            i = i + 1;
        return x

    def BP(self,mini_batch,rate):
        """
        BP (backpropagation) update for one mini-batch.
        """
        size = len(mini_batch);

        nabla_b = [np.zeros(b.shape) for b in self.biases];   # accumulated change of b over the batch
        nabla_w = [np.zeros(w.shape) for w in self.weights];  # accumulated change of w over the batch
        # train one example at a time
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y);
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)];  # accumulate the change of b
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)];  # accumulate the change of w
        self.weights = [w-(rate/size)*nw
                        for w, nw in zip(self.weights, nabla_w)];
        self.biases = [b-(rate/size)*nb
                       for b, nb in zip(self.biases, nabla_b)];

    def backprop(self, x, y):
        """
        x is a 1-D row vector.
        y is a 1-D row vector.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases];
        nabla_w = [np.zeros(w.shape) for w in self.weights];
        # feedforward
        activation = np.atleast_2d(x).reshape((len(x),1));  # convert to a column vector
        activations = [activation];  # stores a for every layer
        zs = [];  # stores z for every layer
        i = 0;
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b;
            zs.append(z);
            activation = self.sigmoids[i](z);
            activations.append(activation);
            i = i + 1;
        # backward pass
        y = np.atleast_2d(y).reshape((len(y),1));  # convert y to a column vector
        # delta: partial derivative of the cost with respect to z
        delta = self.cost_der(activations[-1], y) * \
                self.sigmoids_der[-1](zs[-1]);
        nabla_b[-1] = delta;
        nabla_w[-1] = np.dot(delta, np.transpose(activations[-2]));
        # walk backwards through the layers, starting from the second-to-last
        for l in range(2, self.size):
            z = zs[-l];                         # z of the current layer
            sp = self.sigmoids_der[-l](z);      # derivative with respect to z
            delta = np.multiply(np.dot(np.transpose(self.weights[-l+1]), delta), sp);  # error of the current layer
            nabla_b[-l] = delta;
            nabla_w[-l] = np.dot(delta, np.transpose(activations[-l-1]));
        return (nabla_b, nabla_w)

    """
    Cost functions:
    cost_der is the derivative of the squared-error cost with respect to a.
    cost_cross_entropy_der is the derivative of the cross-entropy cost with respect to a.
    """
    def cost_der(self,a,y):
        return a - y;

    def cost_cross_entropy_der(self,a,y):
        return (a-y)/(a * (1-a));
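The main flexibility the rewrite adds is a separate activation function per layer. The sketch below is my own toy example, not part of the original post; it assumes the class above is saved as network_mark.py (the same module name the test script below imports) and trains a tiny network on XOR with tanh in the hidden layer and the logistic function at the output. Whether it separates XOR cleanly depends on the random initialization; the point is how the per-layer activation interface is used and what shapes fit() expects.

import numpy as np
from network_mark import (NeuralNetwork, tanh, tanh_derivative,
                          logistic, logistic_derivative)

# XOR toy data: inputs are 1-D row vectors, targets are 1-D vectors of
# length 1, matching the shapes backprop expects.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
Y = np.array([[0], [1], [1], [0]], dtype=float)
data = [(x, y) for x, y in zip(X, Y)]

# One activation (and derivative) per non-input layer:
# tanh for the hidden layer, logistic for the output layer.
nn = NeuralNetwork([2, 8, 1],
                   active_function=[tanh, logistic],
                   active_function_der=[tanh_derivative, logistic_derivative],
                   learn_rate=0.5)
nn.fit(data, epochs=2000, mini_batch_size=4)

for x, _ in data:
    print(x, "->", nn.predict(x.reshape(-1, 1)).item())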
The above is the source code of the BP neural network. Below is a digit recognition program used to verify that the code works correctly.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer
from network_mark import NeuralNetwork
# sklearn.cross_validation was removed in newer scikit-learn releases;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split


digits = load_digits();
X = digits.data;
y = digits.target;
X -= X.min();  # normalize the values to bring them into the range 0-1
X /= X.max();

nn = NeuralNetwork([64,100,10]);
X_train, X_test, y_train, y_test = train_test_split(X, y);
labels_train = LabelBinarizer().fit_transform(y_train);
labels_test = LabelBinarizer().fit_transform(y_test);


# X_train.shape (1347,64)
# y_train.shape (1347,)
# labels_train.shape (1347,10)
# labels_test.shape (450,10)

print("start fitting");
Data = [(x,y) for x,y in zip(X_train,labels_train)];
#print(Data);
nn.fit(Data,epochs=500,mini_batch_size=32);
result = nn.predict(X_test.T);
predictions = [np.argmax(result[:,y]) for y in range(result.shape[1])];

print(predictions);
#for i in range(result.shape[1]):
#    y = result[:,i];
#    predictions.append(np.argmax(y));
##print(np.atleast_2d(predictions).shape);
print(confusion_matrix(y_test,predictions));
print(classification_report(y_test,predictions));
Finally, the test results: the accuracy is quite respectable.