赞
踩
在本部分的练习中,您将使用一个变量实现线性回归,以预测食品卡车的利润。假设你是一家餐馆的首席执行官,正在考虑不同的城市开设一个新的分店。该连锁店已经在各个城市拥有卡车,而且你有来自城市的利润和人口数据。
您希望使用这些数据来帮助您选择将哪个城市扩展到下一个城市。
ex1data1.txt 是本次作业数据,我们打印出来如图:
线性回归的代价函数:
其中1/2m的1/2只是方便求导,求梯度(平方的导数是乘以2, 刚好和1/2约了),网上也有些公式是没有1/2的, 都是正确的, 不影响后面的分析。
权重优化公式
红线部分为梯度公式, 也就是代价函数的导数。
α 是学习率, 之所以之前1/2可以约去, 是因为1/2只是个常数, 后面可以用α 来兜底, 来控制梯度的大小。
下面用python实现本周编程作业:
ex1必做题:
warmUpExercise.py:
from numpy import *;
def warmUpExercise():
# % ============= YOUR CODE HERE ==============
# % Instructions: Return the 5x5 identity matrix
# % In octave, we return values by defining which variables
# % represent the return values (at the top of the file)
# % and then set them accordingly.
A = mat(random.rand(5, 5));
print(A)
if __name__ == '__main__':
warmUpExercise()
computeCost.py:
from numpy import *; import numpy as np; import math; import pandas as pd import matplotlib.pyplot as plt def computeCost(X, y, theta): #COMPUTECOST Compute cost for linear regression # J = COMPUTECOST(X, y, theta) computes the cost of using theta as the # parameter for linear regression to fit the data points in X and y # # Initialize some useful values m = len(y); # number of training examples # You need to return the following variables correctly J = 0; # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost of a particular choice of theta # You should set J to the cost. J = np.sum( np.power((X.dot(theta.T)-y),2) )/(2 * m) # ========================================================================= return J if __name__ == '__main__': path = 'ex1data1.txt' data = pd.read_csv(path, header=None) X = np.matrix(data[0]) y = np.matrix(data[1]) m = X.shape[1] X = X.reshape(m,1) y = y.reshape(m,1) theta = np.zeros((1,1)) print('X.shape',X.shape) print('theta.shape',theta.shape) print('y.shape',y.shape) J = computeCost(X, y, theta) print('J',J)
输出:
X.shape (97, 1)
theta.shape (1, 1)
y.shape (97, 1)
J 32.072733877455676
gradientDescent.py:
from numpy import *; from computeCost import computeCost import numpy as np; import math; import pandas as pd import matplotlib.pyplot as plt from plotData import plotData def gradientDescent(X, y, theta, alpha, num_iters): #GRADIENTDESCENT Performs gradient descent to learn theta # theta = GRADIENTDESCENT(X, y, theta, alpha, num_iters) updates theta by # taking num_iters gradient steps with learning rate alpha # Initialize some useful values m = X.shape[0] # number of training examples J_history = np.zeros(num_iters); for iter in range(num_iters): # ====================== YOUR CODE HERE ====================== # theta. # # Hint: While debugging, it can be useful to print out the values # of the cost function (computeCost) and gradient here. # theta = theta - alpha/m * (( X.dot(theta.T) - y ).T.dot(X)) J = computeCost(X, y, theta) J_history[iter] = J return theta, J_history if __name__ == '__main__': path = 'ex1data1.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] X = data.iloc[:,0:cols-1] y = data.iloc[:,cols-1:cols] X = np.matrix(X.values) y = np.matrix(y.values) theta = np.zeros((1,1)) alpha = 0.001 epoch = 20 theta, J_history = gradientDescent(X, y, theta, alpha, epoch) print('theta',theta) print('J_history',J_history) plotData(np.arange(0,epoch,1), J_history)
输出:
theta [[0.65565076]]
J_history [27.97858553 24.52386652 21.60870997 19.1488463 17.07316731 15.32167052
13.84372475 12.59660645 11.54426468 10.656279 9.90698007 9.2747076
8.74118426 8.29098728 7.91110265 7.59054888 7.32005961 7.0918157
6.89921922 6.73670271]
选择题:
featureNormalize.py:
from numpy import *; import numpy as np; import math; import pandas as pd import matplotlib.pyplot as plt def featureNormalize(X): #FEATURENORMALIZE Normalizes the features in X # FEATURENORMALIZE(X) returns a normalized version of X where # the mean value of each feature is 0 and the standard deviation # is 1. This is often a good preprocessing step to do when # working with learning algorithms. # You need to set these values correctly # ====================== YOUR CODE HERE ====================== # Instructions: First, for each feature dimension, compute the mean # of the feature and subtract it from the dataset, # storing the mean value in mu. Next, compute the # standard deviation of each feature and divide # each feature by it's standard deviation, storing # the standard deviation in sigma. # # Note that X is a matrix where each column is a # feature and each row is an example. You need # to perform the normalization separately for # each feature. # # Hint: You might find the 'mean' and 'std' functions useful. # mu = X.mean() sigma = X.std() X_norm = (X - mu)/sigma return X_norm, mu, sigma if __name__ == '__main__': path = 'ex1data2.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] # 列数 X = data.iloc[:,0:cols-1] # 取前cols-1列,即输入向量 y = data.iloc[:,cols-1:cols] # 取最后一列,即目标向量 X = np.matrix(X.values) y = np.matrix(y.values) theta = np.zeros((1,2)) X_norm, mu, sigma = featureNormalize(X) # print('X_norm',X_norm) print('mu',mu) print('sigma',sigma)
输出:
mu 1001.9255319148937
sigma 1143.0528202028345
computeCostMulti.py:
from numpy import *; import numpy as np; import math; import pandas as pd import matplotlib.pyplot as plt path = 'ex1data2.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] # 列数 X = data.iloc[:,0:cols-1] # 取前cols-1列,即输入向量 y = data.iloc[:,cols-1:cols] # 取最后一列,即目标向量 X = np.matrix(X.values) y = np.matrix(y.values) theta = np.matrix([0,0]) def computeCostMulti(X, y, theta): #COMPUTECOSTMULTI Compute cost for linear regression with multiple variables # J = COMPUTECOSTMULTI(X, y, theta) computes the cost of using theta as the # parameter for linear regression to fit the data points in X and y # Initialize some useful values m = len(y); # number of training examples # You need to return the following variables correctly J = 0; # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost of a particular choice of theta # You should set J to the cost. J = np.sum( np.power((X*theta.T-y),2) )/(2 * m) # # ========================================================================= return J if __name__ == '__main__': print(X.shape) print(theta.shape) print(y.shape) J = computeCostMulti(X, y, theta) print(J)
输出:
(47, 2)
(1, 2)
(47, 1)
65591548106.45744
normalEqn.py:
from numpy import *; import numpy as np; import math; import pandas as pd import matplotlib.pyplot as plt def normalEqn(X, y): #NORMALEQN Computes the closed-form solution to linear regression # NORMALEQN(X,y) computes the closed-form solution to linear # regression using the normal equations. # ====================== YOUR CODE HERE ====================== # Instructions: Complete the code to compute the closed form solution # to linear regression and put the result in theta. # # ---------------------- Sample Solution ---------------------- theta = np.linalg.inv(X.T * X) * X.T * y # ------------------------------------------------------------- # ============================================================ return theta if __name__ == '__main__': path = 'ex1data2.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] # 列数 X = data.iloc[:,0:cols-1] # 取前cols-1列,即输入向量 y = data.iloc[:,cols-1:cols] # 取最后一列,即目标向量 X = np.matrix(X.values) y = np.matrix(y.values) theta = np.matrix([0,0]) theta = normalEqn(X, y) print('theta',theta)
输出:
theta [[ 140.86108621]
[16978.19105903]]
在这部分的练习中,你将建立一个逻辑回归模型来预测一个学生是否能进入大学。假设你是一所大学的行政管理人员,你想根据两门考试的结果,来决定每个申请人是否被录取。你有以前申请人的历史数据,可以将其用作逻辑回归训练集。对于每一个训练样本,你有申请人两次测评的分数以及录取的结果。为了完成这个预测任务,我们准备构建一个可以基于两次测试评分来评估录取可能性的分类模型。
下图是ex2data1.txt数据模型图:
逻辑回归的假设函数:
其中的 g代表逻辑回归的S形函数(Sigmoid function):
大概长这样:
y值在0到1之间,大于等于0.5归一类,小于0.5归一类。
逻辑回归的代价函数:
和线性回归的代价函数略有不同, 看似复杂,其实原理差不多。
下面用python实现本周编程作业:
sigmoid.py:
from numpy import *; import numpy as np; import math; import pandas as pd import matplotlib.pyplot as plt def sigmoid(z): #SIGMOID Compute sigmoid function # g = SIGMOID(z) computes the sigmoid of z. # You need to return the following variables correctly # g = zeros(size(z)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the sigmoid of each value of z (z can be a matrix, # vector or scalar). g = 1 / ( 1 + np.exp(-z)) # ============================================================= return g if __name__ == '__main__': x1 = np.arange(-10, 10, 0.1) g = sigmoid(x1) plt.plot(x1, sigmoid(x1), c='r') plt.show()
输出:
costFunction.py:
from numpy import *; import numpy as np; import math; import pandas as pd from sigmoid import sigmoid import matplotlib.pyplot as plt def costFunction(theta, X, y): #COSTFUNCTION Compute cost and gradient for logistic regression # J = COSTFUNCTION(theta, X, y) computes the cost of using theta as the # parameter for logistic regression and the gradient of the cost # w.r.t. to the parameters. # Initialize some useful values m = len(y); # number of training examples # You need to return the following variables correctly # J = 0; # grad = zeros(size(theta)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost of a particular choice of theta. # You should set J to the cost. # Compute the partial derivatives and set grad to the partial # derivatives of the cost w.r.t. each parameter in theta # # Note: grad should have the same dimensions as theta # h = sigmoid(X.dot(theta.T)) J = 1/m * np.sum(-np.multiply(y,np.log(h)) - np.multiply((1-y),np.log(1-h))) grad = (h -y).T * X/m # ============================================================= return J, grad if __name__ == '__main__': path = 'ex2data1.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] # 列数 X = data.iloc[:,0:cols-1] # 取前cols-1列,即输入向量 y = data.iloc[:,cols-1:cols] # 取最后一列,即目标向量 X = np.matrix(X.values) y = np.matrix(y.values) theta = np.matrix([0,0]) # print('X.shape',X.shape) # print('theta.shape',theta.shape) # print('y.shape',y.shape) J, grad = costFunction(theta, X, y) print('J',J) print('grad',grad)
输出:
J 0.6931471805599453
grad [[-12.00921659 -11.26284221]]
costFunctionReg.py:
from numpy import *; import numpy as np; import math; import pandas as pd import costFunction as cf import sigmoid as s import matplotlib.pyplot as plt def costFunctionReg(theta, X, y, lamb_da): #COSTFUNCTIONREG Compute cost and gradient for logistic regression with regularization # J = COSTFUNCTIONREG(theta, X, y, lambda) computes the cost of using # theta as the parameter for regularized logistic regression and the # gradient of the cost w.r.t. to the parameters. # Initialize some useful values m = len(y); # number of training examples # You need to return the following variables correctly # J = 0; # grad = zeros(size(theta)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost of a particular choice of theta. # You should set J to the cost. # Compute the partial derivatives and set grad to the partial # derivatives of the cost w.r.t. each parameter in theta J, grad= cf.costFunction(theta, X, y) theta = theta[: 1] J += lamb_da/2*m*(theta * theta.T) grad += lamb_da/m*theta # ============================================================= return J, grad if __name__ == '__main__': path = 'ex2data2.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] # 列数 X = data.iloc[:,0:cols-1] # 取前cols-1列,即输入向量 y = data.iloc[:,cols-1:cols] # 取最后一列,即目标向量 X = np.matrix(X.values) y = np.matrix(y.values) theta = np.matrix([0,0]) # print('X.shape',X.shape) # print('theta.shape',theta.shape) # print('y.shape',y.shape) J, grad = costFunctionReg(theta, X, y, lamb_da=1) print('J',J) print('grad',grad)
输出:
J [[0.69314718]]
grad [[1.87880932e-02 7.77711864e-05]]
predict.py:
from numpy import *; import numpy as np; import math; import pandas as pd import sigmoid as s import costFunction as cf import matplotlib.pyplot as plt import scipy.optimize as opt path = 'ex2data2.txt' data = pd.read_csv(path, header=None) cols = data.shape[1] # 列数 X = data.iloc[:,0:cols-1] # 取前cols-1列,即输入向量 y = data.iloc[:,cols-1:cols] # 取最后一列,即目标向量 X = np.matrix(X.values) y = np.matrix(y.values) theta = np.matrix([0.20623159, 0.20147149]) def predict(theta, X): #PREDICT Predict whether the label is 0 or 1 using learned logistic #regression parameters theta # p = PREDICT(theta, X) computes the predictions for X using a # threshold at 0.5 (i.e., if sigmoid(theta'*x) >= 0.5, predict 1) m = X.shape[0]; # Number of training examples # You need to return the following variables correctly # p = zeros(m, 1); # ====================== YOUR CODE HERE ====================== # Instructions: Complete the following code to make predictions using # your learned logistic regression parameters. # You should set p to a vector of 0's and 1's # h = s.sigmoid(X*theta.T) # p = np.matrix(data.iloc[:,cols-1:cols].values) # for i in range(m): # if h[i][0]>=0.5: # p[i][0] = 1 # else: # p[i][0] = 0 p = [1 if x >= 0.5 else 0 for x in h] # ========================================================================= return p if __name__ == '__main__': print(X.shape) print(theta.shape) print(y.shape) p = predict(theta, X) print(p)
输出:
(118, 2)
(1, 2)
(118, 1)
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
我们将扩展我们在练习2中写的logistic回归的实现,并将其应用于一对多的分类(不止两个类别)。
下面用plot_100_image方法根据ex3data1.mat随机画100个数字:
def plot_100_image(X):
sample_idx = np.random.choice(np.arange(X.shape[0]), 100) # 随机选100个样本
sample_images = X[sample_idx, :] # (100,400)
fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
for row in range(10):
for column in range(10):
ax_array[row, column].matshow(sample_images[10 * row + column].reshape((20, 20)),
cmap='gray_r')
plt.xticks([])
plt.yticks([])
plt.show()
正则化的logistic回归的代价函数:
红色部分就是正则化部分, 前面就是之前提到的逻辑回归代价函数公式。
下面用python实现本周编程作业:
lrCostFunction.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat def lrCostFunction(theta, X, y, lamb_da): # LRCOSTFUNCTION Compute cost and gradient for logistic regression with # regularization # J = LRCOSTFUNCTION(theta, X, y, lambda) computes the cost of using # theta as the parameter for regularized logistic regression and the # gradient of the cost w.r.t. to the parameters. # Initialize some useful values m = len(y); # number of training examples # You need to return the following variables correctly # J = 0; # grad = zeros(size(theta)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost of a particular choice of theta. # You should set J to the cost. # Compute the partial derivatives and set grad to the partial # derivatives of the cost w.r.t. each parameter in theta # # Hint: The computation of the cost function and gradients can be # efficiently vectorized. For example, consider the computation # # sigmoid(X * theta) # # Each row of the resulting matrix will contain the value of the # prediction for that example. You can make use of this to vectorize # the cost function and gradient computations. # # Hint: When computing the gradient of the regularized cost function, # there're many possible vectorized solutions, but one solution # looks like: # grad = (unregularized gradient for logistic regression) # temp = theta; # temp(1) = 0; # because we don't add anything for j = 0 # grad = grad + YOUR_CODE_HERE (using the temp variable) J = compute_J(theta, X, y, lamb_da, m) grad = compute_grad(theta, X, y, lamb_da, m) # ============================================================= return J, grad def compute_J(theta, X, y, lamb_da, m): y = y.reshape(y.size,1) theta = theta.reshape(1,theta.size) h = sigmoid(X.dot(theta.T)) # theta = theta[:,1:] theta[0] = 1 J = 1 / m * np.sum(-np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))) + lamb_da / (2 * m) * ( theta.dot(theta.T)) # J = 1 / m * np.sum(-np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))) return J def compute_grad(theta, X, y, lamb_da, m): y = y.reshape(y.size,1) theta = theta.reshape(1,theta.size) h = sigmoid(X.dot(theta.T)) # theta = theta[:,1:] theta[0] = 1 grad = 1 / m * X.T.dot(h - y) + (lamb_da / m) * theta.T # grad = 1 / m * X.T.dot(h - y) return grad def sigmoid(z): g = 1 / (1 + np.exp(-z)); return g def load_data(path): data = loadmat(path) X = data['X'] y = data['y'] return X, y if __name__ == '__main__': X, y = load_data('ex3data1.mat') theta = np.zeros((1, X.shape[1])) J, grad = lrCostFunction(theta, X, y, lamb_da=1) print('J.shape',J.shape) print('J',J) print('grad.shape',grad.shape)
输出:
J.shape (1, 1)
J [[160.43425758]]
grad.shape (400, 1)
oneVsAll.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat from scipy.optimize import minimize from lrCostFunction import compute_J, compute_grad, sigmoid,load_data def oneVsAll(X, y, num_labels, lamb_da): # ONEVSALL trains multiple logistic regression classifiers and returns all # the classifiers in a matrix all_theta, where the i-th row of all_theta # corresponds to the classifier for label i # [all_theta] = ONEVSALL(X, y, num_labels, lambda) trains num_labels # logistic regression classifiers and returns each of these classifiers # in a matrix all_theta, where the i-th row of all_theta corresponds # to the classifier for label i # Some useful variables m, n = X.shape # You need to return the following variables correctly all_theta = np.zeros((num_labels, n)) # Add ones to the X data matrix # X = [ones(m, 1) X]; # ====================== YOUR CODE HERE ====================== # Instructions: You should complete the following code to train num_labels # logistic regression classifiers with regularization # parameter lambda. # # Hint: theta(:) will return a column vector. # # Hint: You can use y == c to obtain a vector of 1's and 0's that tell you # whether the ground truth is true/false for this class. # # Note: For this assignment, we recommend using fmincg to optimize the cost # function. It is okay to use a for-loop (for c = 1:num_labels) to # loop over the different classes. # # fmincg works similarly to fminunc, but is more efficient when we # are dealing with large number of parameters. # # Example Code for fmincg: # # # Set Initial theta # initial_theta = zeros(n + 1, 1); # # # Set options for fminunc # options = optimset('GradObj', 'on', 'MaxIter', 50); # # # Run fmincg to obtain the optimal theta # # This function will return theta and the cost # [theta] = ... # fmincg (@(t)(lrCostFunction(t, X, (y == c), lambda)), ... # initial_theta, options); for i in range(1, num_labels + 1): theta = np.zeros((1, X.shape[1])) y_i = np.array([1 if label == i else 0 for label in y]) ret = minimize(fun=compute_J, x0=theta, args=(X, y_i, lamb_da, m), method='TNC', jac=compute_grad, options={'disp': True}) all_theta[i - 1, :] = ret.x # ========================================================================= return all_theta if __name__ == '__main__': X, y = load_data('ex3data1.mat') all_theta = oneVsAll(X, y, num_labels=10, lamb_da=1) print(all_theta.shape)
输出:
(10, 400)
predict.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat from scipy.optimize import minimize from lrCostFunction import compute_J, compute_grad, sigmoid,load_data from oneVsAll import oneVsAll def predict(Theta1, Theta2, X): #PREDICT Predict the label of an input given a trained neural network # p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the # trained weights of a neural network (Theta1, Theta2) # Useful values m = X.shape[0] # You need to return the following variables correctly # p = zeros(size(X, 1), 1); # ====================== YOUR CODE HERE ====================== # Instructions: Complete the following code to make predictions using # your learned neural network. You should set p to a # vector containing labels between 1 to num_labels. # # Hint: The max function might come in useful. In particular, the max # function can also return the index of the max element, for more # information see 'help max'. If your examples are in rows, then, you # can use max(A, [], 2) to obtain the max for each row. # Add ones to the X data matrix -jin a2 = sigmoid(X.dot(Theta1.T)) a2 = np.insert(a2, 0, values=np.ones(a2.shape[0]), axis=1) a3 = sigmoid(a2.dot(Theta2.T)) p = np.argmax(a3, axis=1) + 1 p = p.reshape(p.size,1) # ========================================================================= return p def load_weight(path): data = loadmat(path) return data['Theta1'], data['Theta2'] if __name__ == '__main__': theta1, theta2 = load_weight('ex3weights.mat') X, y = load_data('ex3data1.mat') X = np.insert(X, 0, values=np.ones(X.shape[0]), axis=1) p = predict(theta1, theta2, X) accuracy = np.mean(p == y) print ('accuracy = {0}%'.format(accuracy * 100))
输出:
accuracy = 97.52%
predictOneVsAll.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat from scipy.optimize import minimize from lrCostFunction import compute_J, compute_grad, sigmoid,load_data from oneVsAll import oneVsAll def predictOneVsAll(all_theta, X): #PREDICT Predict the label for a trained one-vs-all classifier. The labels #are in the range 1..K, where K = size(all_theta, 1). # p = PREDICTONEVSALL(all_theta, X) will return a vector of predictions # for each example in the matrix X. Note that X contains the examples in # rows. all_theta is a matrix where the i-th row is a trained logistic # regression theta vector for the i-th class. You should set p to a vector # of values from 1..K (e.g., p = [1; 3; 1; 2] predicts classes 1, 3, 1, 2 # for 4 examples) m = X.shape[0] num_labels = all_theta.shape[0] # You need to return the following variables correctly # p = zeros(size(X, 1), 1); # Add ones to the X data matrix # X = [ones(m, 1) X]; # ====================== YOUR CODE HERE ====================== # Instructions: Complete the following code to make predictions using # your learned logistic regression parameters (one-vs-all). # You should set p to a vector of predictions (from 1 to # num_labels). # # Hint: This code can be done all vectorized using the max function. # In particular, the max function can also return the index of the # max element, for more information see 'help max'. If your examples # are in rows, then, you can use max(A, [], 2) to obtain the max # for each row. p = sigmoid(X.dot(all_theta.T)) p = np.argmax(p, axis=1) p = p + 1 # ========================================================================= return p if __name__ == '__main__': X, y = load_data('ex3data1.mat') all_theta = oneVsAll( X, y, num_labels=10, lamb_da=1) y_pred = predictOneVsAll(all_theta,X) y_pred = y_pred.reshape(y_pred.size,1) accuracy = np.mean(y_pred == y) print ('accuracy = {0}%'.format(accuracy * 100))
输出:
accuracy = 97.39999999999999%
这周被称作“绝望第五周”,需要实现反向传播算法来学习神经网络的参数。依旧是上次预测手写数数字的例子。
正向传递过程,其中 g 为sigmoid激活函数:
反向传播过程:
先算误差:
再算梯度:
最后网络的总梯度为:
这里的反向传播有点绕,我另一篇文章有详细手写推导过程:
【用高中的知识点徒手推导逻辑回归中的反向梯队】
下面用python实现本周编程作业:
sigmoidGradient.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat def sigmoidGradient(z): # SIGMOIDGRADIENT returns the gradient of the sigmoid function # evaluated at z # g = SIGMOIDGRADIENT(z) computes the gradient of the sigmoid function # evaluated at z. This should work regardless if z is a matrix or a # vector. In particular, if z is a vector or matrix, you should return # the gradient for each element. # g = zeros(size(z)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the gradient of the sigmoid function evaluated at # each value of z (z can be a matrix, vector or scalar). h = sigmoid(z) g = np.multiply(h, (1 - h)) # ============================================================= return g def sigmoid(z): g = 1 / (1 + np.exp(-z)); return g def load_data(path): data = loadmat(path) X = data['X'] y = data['y'] return X, y def load_weight(path): data = loadmat(path) return data['Theta1'], data['Theta2'] if __name__ == '__main__': X, y = load_data('ex4data1.mat') theta = np.zeros((1, X.shape[1])) z = np.dot(X, theta.T) g = sigmoidGradient(z) print('g', g)
输出:
g [[0.25]
[0.25]
[0.25]
...
[0.25]
[0.25]
[0.25]]
nnCostFunction.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat from sigmoidGradient import sigmoid, load_data, load_weight, sigmoidGradient def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lamb_da): # NNCOSTFUNCTION Implements the neural network cost function for a two layer # neural network which performs classification # [J grad] = NNCOSTFUNCTON(nn_params, hidden_layer_size, num_labels, ... # X, y, lambda) computes the cost and gradient of the neural network. The # parameters for the neural network are "unrolled" into the vector # nn_params and need to be converted back into the weight matrices. # # The returned parameter grad should be a "unrolled" vector of the # partial derivatives of the neural network. # # Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices # for our 2 layer neural network # Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ... # hidden_layer_size, (input_layer_size + 1)); # # Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ... # num_labels, (hidden_layer_size + 1)); # Theta1 = np.zeros(hidden_layer_size,input_layer_size+1) # Theta2 = np.zeros(num_labels,hidden_layer_size+1) # Setup some useful variables m = X.shape[0] # You need to return the following variables correctly J = 0; # Theta1_grad = zeros(size(Theta1)); # Theta2_grad = zeros(size(Theta2)); # ====================== YOUR CODE HERE ====================== # Instructions: You should complete the code by working through the # following parts. # # Part 1: Feedforward the neural network and return the cost in the # variable J. After implementing Part 1, you can verify that your # cost function computation is correct by verifying the cost # computed in ex4.m # # Part 2: Implement the backpropagation algorithm to compute the gradients # Theta1_grad and Theta2_grad. You should return the partial derivatives of # the cost function with respect to Theta1 and Theta2 in Theta1_grad and # Theta2_grad, respectively. After implementing Part 2, you can check # that your implementation is correct by running checkNNGradients # # Note: The vector y passed into the function is a vector of labels # containing values from 1..K. You need to map this vector into a # binary vector of 1's and 0's to be used with the neural network # cost function. # # Hint: We recommend implementing backpropagation using a for-loop # over the training examples if you are implementing it for the # first time. # # Part 3: Implement regularization with the cost function and gradients. # # Hint: You can implement this around the code for # backpropagation. That is, you can compute the gradients for # the regularization separately and then add them to Theta1_grad # and Theta2_grad from Part 2. # Theta1, Theta2 = load_weight('ex4weights.mat') # Part 1 X = np.insert(X, 0, values=np.ones(X.shape[0]), axis=1) a2 = sigmoid(X.dot(Theta1.T)) a2 = np.insert(a2, 0, values=np.ones(a2.shape[0]), axis=1) a3 = sigmoid(a2.dot(Theta2.T)) cy = np.zeros((m, num_labels)) for i in range(m): cy[i, y[i] - 1] = 1 Theta1 = Theta1[:, 1:] Theta2 = Theta2[:, 1:] J = compute_J(Theta1, Theta2, a3, m, cy, lamb_da) grad = compute_grad(a2,a3, Theta2, cy, m) # ========================================================================= return J, grad def compute_J(Theta1, Theta2, a3, m, cy, lamb_da): J = 1 / m * np.sum(-np.multiply(cy, np.log(a3)) - np.multiply((1 - cy), np.log(1 - a3))) th1 = 0 th2 = 0 for i in range(Theta1.shape[0]): t_tmp = Theta1[i, :] th1 += np.sum(t_tmp.dot(t_tmp.T)) for i in range(Theta2.shape[0]): t_tmp = Theta2[i, :] th2 += np.sum(t_tmp.dot(t_tmp.T)) J += lamb_da / (2 * m) * (th1 + th2) return J def compute_grad(a2,a3, Theta2, cy, m): delta3 = a3 - cy # (5000,10) delta2 = np.multiply(delta3.dot(Theta2), sigmoidGradient(a2[:,1:])) # (5000,25) Theta1_grad = delta2.T.dot(X) # (25,401) Theta2_grad = delta3.T.dot(a2) # (10,26) t1_t2 = np.r_[Theta1_grad.flatten(), Theta2_grad.flatten()] # np.r_是按列连接两个矩阵,就是把两矩阵上下相加,要求列数相等。 grad = (1 / m) * t1_t2 # (10285,) return grad if __name__ == '__main__': X, y = load_data('ex4data1.mat') J, grad = nnCostFunction(nn_params=1, input_layer_size=10, hidden_layer_size=25, num_labels=10, X=X, y=y, lamb_da=1) print('J', J) print('grad.shape', grad.shape)
输出:
J 0.38376985909092365
grad.shape (10260,)
在前半部分的练习中,你将实现正则化线性回归,以预测水库中的水位变化,从而预测大坝流出的水量。在下半部分中,您将通过一些调试学习算法的诊断,并检查偏差 v.s. 方差的影响。
根据ex5data1.mat打印出这次数据的模样:
线性回归正则化代价函数:
和逻辑回归正则化代价函数一样, 前面都是正儿八经的线性回归代价函数 , 红色部分就是正则化函数。
线性回归正则化梯度:
下面用python实现本周编程作业:
learningCurve.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from linearRegCostFunction import compute_grad, compute_J, linearRegCostFunction def learningCurve(X, y, Xval, yval, lamb_da): #LEARNINGCURVE Generates the train and cross validation set errors needed #to plot a learning curve # [error_train, error_val] = ... # LEARNINGCURVE(X, y, Xval, yval, lambda) returns the train and # cross validation set errors for a learning curve. In particular, # it returns two vectors of the same length - error_train and # error_val. Then, error_train(i) contains the training error for # i examples (and similarly for error_val(i)). # # In this function, you will compute the train and test errors for # dataset sizes from 1 up to m. In practice, when working with larger # datasets, you might want to do this in larger intervals. # # Number of training examples m = X.shape[0] # You need to return these values correctly # error_train = np.zeros((m, 1)) # error_val = np.zeros((m, 1)) error_train, error_val = [], [] # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to return training errors in # error_train and the cross validation errors in error_val. # i.e., error_train(i) and # error_val(i) should give you the errors # obtained after training on i examples. # # Note: You should evaluate the training error on the first i training # examples (i.e., X(1:i, :) and y(1:i)). # # For the cross-validation error, you should instead evaluate on # the _entire_ cross validation set (Xval and yval). # # Note: If you are using your cost function (linearRegCostFunction) # to compute the training and cross validation error, you should # call the function with the lambda argument set to 0. # Do note that you will still need to use lambda when running # the training to obtain the theta parameters. # # Hint: You can loop over the examples with the following: # # for i = 1:m # # Compute train/cross validation errors using training examples # # X(1:i, :) and y(1:i), storing the result in # # error_train(i) and error_val(i) # .... # # end # # ---------------------- Sample Solution ---------------------- for i in range(1, m+1): theta = trainLinearReg(X[:i], y[:i], lamb_da) error_train.append(compute_J(theta, X[:i], y[:i], lamb_da, m)) error_val.append(compute_J(theta, Xval, yval, lamb_da, m)) # ------------------------------------------------------------- # ========================================================================= return error_train, error_val def trainLinearReg(X, y, l): theta = np.zeros(X.shape[1]) m = len(X) res = opt.minimize(fun=compute_J, x0=theta, args=(X, y, l, m), method='TNC', jac=compute_grad) return res.x if __name__ == '__main__': data = loadmat('ex5data1.mat') X, y = data['X'], data['y'] Xval, yval = data['Xval'], data['yval'] X = np.insert(X, 0, 1, axis=1) Xval = np.insert(Xval, 0, 1, axis=1) theta = np.zeros((1, X.shape[1])) error_train, error_val = learningCurve(X, y, Xval, yval,0) print('error_train', error_train) print('error_val', error_val)
输出:
error_train [array([[0.18980339]]), array([[0.24715886]]), array([[49.43667484]]), array([[105.97979737]]), array([[106.30855924]]), array([[106.49601542]]), array([[115.51234849]]), array([[115.79709682]]), array([[116.379967]]), array([[116.960197]]), array([[119.38441281]]), array([[140.95412088]])]
error_val [array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]]), array([[287.20263926]])]
linearRegCostFunction.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat def linearRegCostFunction(X, y, theta, lamb_da): # LINEARREGCOSTFUNCTION Compute cost and gradient for regularized linear # regression with multiple variables # [J, grad] = LINEARREGCOSTFUNCTION(X, y, theta, lambda) computes the # cost of using theta as the parameter for linear regression to fit the # data points in X and y. Returns the cost in J and the gradient in grad # Initialize some useful values m = len(y); # number of training examples # You need to return the following variables correctly # J = 0; # grad = zeros(size(theta)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost and gradient of regularized linear # regression for a particular choice of theta. # # You should set J to the cost and grad to the gradient. # J = compute_J(theta, X, y, lamb_da, m) grad = compute_grad(theta, X, y, lamb_da, m) # ========================================================================= return J, grad def compute_J(theta, X, y, lamb_da, m): y = y.reshape(y.size, 1) theta = theta.reshape(1, theta.size) h = X.dot(theta.T) # theta = theta[:,1:] theta[0] = 0 J = (1 / (2 * m)) * np.sum(np.square(h - y)) + lamb_da / (2 * m) * (theta.dot(theta.T)) return J def compute_grad(theta, X, y, lamb_da, m): y = y.reshape(y.size, 1) theta = theta.reshape(1, theta.size) h = X.dot(theta.T) # theta = theta[:,1:] # grad = 1 / m * (h - y).T.dot(X) + np.concatenate((np.zeros((1, 1)), (lamb_da / m) * theta)) theta[0] = 0 grad = 1 / m * (h - y).T.dot(X) + (lamb_da / m) * theta return grad if __name__ == '__main__': data = loadmat('ex5data1.mat') X, y = data['X'], data['y'] X = np.insert(X, 0, 1, axis=1) theta = np.ones((1, X.shape[1])) J, grad = linearRegCostFunction(X, y, theta, lamb_da=1) print('X={},y={}'.format(X.shape, y.shape)) print('theta={}'.format(theta.shape)) print('J', J) print('grad.shape', grad.shape) print('grad', grad)
输出:
X=(12, 2),y=(12, 1)
theta=(1, 2)
J [[303.95152555]]
grad.shape (1, 2)
grad [[ -11.21758933 -245.65199649]]
polyFeatures.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from linearRegCostFunction import compute_grad,compute_J, linearRegCostFunction def polyFeatures(X, p): #POLYFEATURES Maps X (1D vector) into the p-th power # [X_poly] = POLYFEATURES(X, p) takes a data matrix X (size m x 1) and # maps each example into its polynomial features where # X_poly(i, :) = [X(i) X(i).^2 X(i).^3 ... X(i).^p]; # # You need to return the following variables correctly. # X_poly = [] X_poly = np.ones((len(X), p)) # ====================== YOUR CODE HERE ====================== # Instructions: Given a vector X, return a matrix X_poly where the p-th # column of X contains the values of X to the p-th power. # # for i in range(1, p): X_poly[:,i] = np.power(X, i).flatten() # ========================================================================= return X_poly if __name__ == '__main__': data = loadmat('ex5data1.mat') X, y = data['X'], data['y'] X_poly = polyFeatures(X, 6) print('X_poly',X_poly)
输出:
X_poly [[ 1.00000000e+00 -1.59367581e+01 2.53980260e+02 -4.04762197e+03 6.45059724e+04 -1.02801608e+06] [ 1.00000000e+00 -2.91529792e+01 8.49896197e+02 -2.47770062e+04 7.22323546e+05 -2.10578833e+07] [ 1.00000000e+00 3.61895486e+01 1.30968343e+03 4.73968522e+04 1.71527069e+06 6.20748719e+07] [ 1.00000000e+00 3.74921873e+01 1.40566411e+03 5.27014222e+04 1.97589159e+06 7.40804977e+07] [ 1.00000000e+00 -4.80588295e+01 2.30965109e+03 -1.10999128e+05 5.33448815e+06 -2.56369256e+08] [ 1.00000000e+00 -8.94145794e+00 7.99496701e+01 -7.14866612e+02 6.39194974e+03 -5.71533498e+04] [ 1.00000000e+00 1.53077929e+01 2.34328523e+02 3.58705250e+03 5.49098568e+04 8.40548715e+05] [ 1.00000000e+00 -3.47062658e+01 1.20452489e+03 -4.18045609e+04 1.45088020e+06 -5.03546340e+07] [ 1.00000000e+00 1.38915437e+00 1.92974986e+00 2.68072045e+00 3.72393452e+00 5.17311991e+00] [ 1.00000000e+00 -4.43837599e+01 1.96991814e+03 -8.74323736e+04 3.88057747e+06 -1.72234619e+08] [ 1.00000000e+00 7.01350208e+00 4.91892115e+01 3.44988637e+02 2.41957852e+03 1.69697190e+04] [ 1.00000000e+00 2.27627489e+01 5.18142738e+02 1.17943531e+04 2.68471897e+05 6.11115839e+06]]
validationCurve.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from linearRegCostFunction import compute_grad, compute_J, linearRegCostFunction def validationCurve(X, y, Xval, yval): # VALIDATIONCURVE Generate the train and validation errors needed to # plot a validation curve that we can use to select lambda # [lambda_vec, error_train, error_val] = ... # VALIDATIONCURVE(X, y, Xval, yval) returns the train # and validation errors (in error_train, error_val) # for different values of lambda. You are given the training set (X, # y) and validation set (Xval, yval). # # Selected values of lambda (you should not change this) lambda_vec = np.array([0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]).T # You need to return these variables correctly. # error_train = np.zeros((len(lambda_vec), 1)) # error_val = np.zeros((len(lambda_vec), 1)) error_train, error_val = [], [] # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to return training errors in # error_train and the validation errors in error_val. The # vector lambda_vec contains the different lambda parameters # to use for each calculation of the errors, i.e, # error_train(i), and error_val(i) should give # you the errors obtained after training with # lambda = lambda_vec(i) # # Note: You can loop over lambda_vec with the following: # # for i = 1:length(lambda_vec) # lambda = lambda_vec(i); # # Compute train / val errors when training linear # # regression with regularization parameter lambda # # You should store the result in error_train(i) # # and error_val(i) # .... # # end # # m = len(lambda_vec) for i in range(m): lamb_da = lambda_vec[i] theta = trainLinearReg(X, y, lamb_da, m) error_train.append(linearRegCostFunction(X, y, theta, 0)) error_val.append(linearRegCostFunction(Xval, yval, theta, 0)) # ========================================================================= return lambda_vec, error_train, error_val def trainLinearReg(X, y, l, m): theta = np.zeros(X.shape[1]) res = opt.minimize(fun=compute_J, x0=theta, args=(X, y, l, m), method='TNC', jac=compute_grad) return res.x if __name__ == '__main__': data = loadmat('ex5data1.mat') X, y = data['X'], data['y'] Xval, yval = data['Xval'], data['yval'] X = np.insert(X, 0, 1, axis=1) Xval = np.insert(Xval, 0, 1, axis=1) theta = np.zeros((1, X.shape[1])) lambda_vec, error_train, error_val = validationCurve(X, y, Xval, yval) print('error_train', error_train) print('error_val', error_val)
这部分,使用SVM做非线性分类。我们将使用高斯核函数。因为SVM在九十年代、数据量少的时代一直充当着主要分类函数, 所以本章有必要介绍一下。
为了用SVM找出一个非线性的决策边界,我们首先要实现高斯核函数。我可以把高斯核函数想象成一个相似度函数,用来测量一对样本的距离。
根据ex6data1.mat打印数据
根据ex6data2.mat打印数据
根据ex6data3.mat打印数据
高斯核函数:
这里公式十分复杂, 可以不用记住,sklearn自带的svm中的核函数,用即可。
下面用python实现本周编程作业:
dataset3Params.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from sklearn import svm def dataset3Params(X, y, Xval, yval): #DATASET3PARAMS returns your choice of C and sigma for Part 3 of the exercise #where you select the optimal (C, sigma) learning parameters to use for SVM #with RBF kernel # [C, sigma] = DATASET3PARAMS(X, y, Xval, yval) returns your choice of C and # sigma. You should complete this function to return the optimal C and # sigma based on a cross-validation set. # # You need to return the following variables correctly. C = 1; sigma = 0.3; # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to return the optimal C and sigma # learning parameters found using the cross validation set. # You can use svmPredict to predict the labels on the cross # validation set. For example, # predictions = svmPredict(model, Xval); # will return the predictions on the cross validation set. # # Note: You can compute the prediction error using # mean(double(predictions ~= yval)) # Cvalues = (0.01, 0.03, 0.1, 0.3, 1., 3., 10., 30.) sigmavalues = Cvalues best_pair, best_score = (0, 0), 0 for C in Cvalues: for sigma in sigmavalues: gamma = np.power(sigma,-2.)/2 model = svm.SVC(C=C,kernel='rbf',gamma=gamma) model.fit(X, y.flatten()) this_score = model.score(Xval, yval) if this_score > best_score: best_score = this_score best_pair = (C, sigma) print('best_pair={}, best_score={}'.format(best_pair, best_score)) C = best_pair[0] sigma = best_pair[1] # ========================================================================= return C, sigma if __name__ == '__main__': data = loadmat('ex6data3.mat') X, y = data['X'], data['y'] Xval, yval = data['Xval'], data['yval'] C, sigma= dataset3Params(X, y, Xval, yval) print('C', C) print('sigma', sigma)
输出:
best_pair=(1.0, 0.1), best_score=0.965
C 1.0
sigma 0.1
emailFeatures.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt import re import nltk, nltk.stem.porter from processEmail import processEmail def emailFeatures(word_indices): #EMAILFEATURES takes in a word_indices vector and produces a feature vector #from the word indices # x = EMAILFEATURES(word_indices) takes in a word_indices vector and # produces a feature vector from the word indices. # Total number of words in the dictionary n = 1899; # You need to return the following variables correctly. x = np.zeros((n, 1)) # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to return a feature vector for the # given email (word_indices). To help make it easier to # process the emails, we have have already pre-processed each # email and converted each word in the email into an index in # a fixed dictionary (of 1899 words). The variable # word_indices contains the list of indices of the words # which occur in one email. # # Concretely, if an email has the text: # # The quick brown fox jumped over the lazy dog. # # Then, the word_indices vector for this text might look # like: # # 60 100 33 44 10 53 60 58 5 # # where, we have mapped each word onto a number, for example: # # the -- 60 # quick -- 100 # ... # # (note: the above numbers are just an example and are not the # actual mappings). # # Your task is take one such word_indices vector and construct # a binary feature vector that indicates whether a particular # word occurs in the email. That is, x(i) = 1 when word i # is present in the email. Concretely, if the word 'the' (say, # index 60) appears in the email, then x(60) = 1. The feature # vector should look like: # # x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..]; # # for i in range(len(word_indices)): x[word_indices[i],0] = 1 # ========================================================================= return x if __name__ == '__main__': with open('emailSample1.txt', 'r') as f: email_contents = f.read() print(email_contents) word_indices = processEmail(email_contents) x = emailFeatures(word_indices) print('x', x)
输出:
x [[0.]
[0.]
[0.]
...
[0.]
[0.]
[0.]]
learningCurve.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt def learningCurve(x1, x2, sigma): #RBFKERNEL returns a radial basis function kernel between x1 and x2 # sim = gaussianKernel(x1, x2) returns a gaussian kernel between x1 and x2 # and returns the value in sim # Ensure that x1 and x2 are column vectors # x1 = x1(:); x2 = x2(:); # You need to return the following variables correctly. # sim = 0; # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to return the similarity between x1 # and x2 computed using a Gaussian kernel with bandwidth # sigma # # sim = np.exp( - np.sum(np.square(x1 - x2))/(2*np.square(sigma))) # ============================================================= return sim if __name__ == '__main__': data = loadmat('ex6data1.mat') x1, y1 = data['X'], data['y'] x1 = x1[:,1:] sim= learningCurve(x1,y1, 1) print('sim', sim)
输出:
sim 2.0846210945302472e-85
processEmail.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt import re import nltk, nltk.stem.porter def processEmail(email_contents): #PROCESSEMAIL preprocesses a the body of an email and #returns a list of word_indices # word_indices = PROCESSEMAIL(email_contents) preprocesses # the body of an email and returns a list of indices of the # words contained in the email. # # Load Vocabulary vocabList = getVocabList(); # Init return value word_indices = []; # ========================== Preprocess Email =========================== # Find the Headers ( \n\n and remove ) # Uncomment the following lines if you are working with raw emails with the # full headers # hdrstart = strfind(email_contents, ([char(10) char(10)])); # email_contents = email_contents(hdrstart(1):end); # Lower case email_contents = email_contents.lower() # Strip all HTML # Looks for any expression that starts with < and ends with > and replace # and does not have any < or > in the tag it with a space email_contents = re.sub('<[^<>]+>', ' ',email_contents) # Handle Numbers # Look for one or more characters between 0-9 email_contents = re.sub('[0-9]+', 'number',email_contents) # Handle URLS # Look for strings starting with http:// or https:// email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr',email_contents) # Handle Email Addresses # Look for strings with @ in the middle email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents) # Handle $ sign email_contents = re.sub('[$]+', 'dollar', email_contents) # ========================== Tokenize Email =========================== # Output the email to screen as well print('\n==== Processed Email ====\n\n'); # Process file l = 0; # I'll use the NLTK stemmer because it more accurately duplicates the # performance of the OCTAVE implementation in the assignment stemmer = nltk.stem.porter.PorterStemmer() tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email_contents) tokenlist = [] for str in tokens: # Remove any non alphanumeric characters str = re.sub('[^a-zA-Z0-9]', '', str) # Stem the word # (the porterStemmer sometimes has issues, so we use a try catch block) # stemmed = stemmer.stem(str) # Skip the word if it is too short if len(str) < 1: continue # tokenlist.append(stemmed) # Look up the word in the dictionary and add to word_indices if # found # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to add the index of str to # word_indices if it is in the vocabulary. At this point # of the code, you have a stemmed word from the email in # the variable str. You should look up str in the # vocabulary list (vocabList). If a match exists, you # should add the index of the word to the word_indices # vector. Concretely, if str = 'action', then you should # look up the vocabulary list to find where in vocabList # 'action' appears. For example, if vocabList{18} = # 'action', then, you should add 18 to the word_indices # vector (e.g., word_indices = [word_indices ; 18]; ). # # Note: vocabList{idx} returns a the word with index idx in the # vocabulary list. # # Note: You can use strcmp(str1, str2) to compare two strings (str1 and # str2). It will return 1 only if the two strings are equivalent. # for i in range(len(vocabList)): if vocabList[i,0] == str : word_indices.append(i) # ============================================================= # # Print to screen, ensuring that the output lines are not too long # if (l + len(str) + 1) > 78: # print('\n'); # l = 0; # print('#s ', str); # l = l + len(str) + 1; # Print footer print('\n\n=========================\n'); return word_indices def getVocabList(): #GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a #cell array of the words # vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt # and returns a cell array of the words in vocabList. ## Read the fixed vocabulary list df = pd.read_table('vocab.txt',names=['words']) vocab = df.values # Store all dictionary words in cell array vocab{} # n = 1899; # Total number of words in the dictionary # # # For ease of implementation, we use a struct to map the strings => integers # # In practice, you'll want to use some form of hashmap # vocabList = np.zeros(len(vocab)) # for i in range(1, n) : # # # Word Index (can ignore since it will be = i) # # fscanf(fid, '#d', 1); # # # Actual Word # # vocabList{i} = fscanf(fid, '#s', 1); # vocabList[i] = 1 return vocab if __name__ == '__main__': with open('emailSample1.txt', 'r') as f: email_contents = f.read() print(email_contents) word_indices= processEmail(email_contents) print('word_indices', word_indices)
输出:
word_indices [793, 1076, 882, 1698, 789, 1821, 1830, 882, 1170, 793, 1892, 1363, 237, 161, 687, 944, 1662, 1119, 1061, 1698, 1161, 1892, 1509, 798, 1181, 809, 180, 1698, 1895, 687, 960, 1476, 70, 529, 1698, 530]
在这个练习中,您将实现K-means算法并将其用于图像压缩。通过减少图像中出现的颜色的数量,只剩下那些在图像中最常见的颜色。
根据ex7data2.mat打印出本次作业的数据:
K-means算法:
1.先随机分配三个点(这里用三举例,也可以随意用大于1的正整数)
2.然后计算测试数据分布属于哪三个点(离哪三个点近)
3.重新计算三个簇中心,为这个簇里面所有点位置的平均值。
4.当有测试数据进来, 会计算归类到新的中心点(离哪三个新的中心点近)
PCA由两部分组成:
1.计算数据的方差矩阵
2.用SVD计算特征向量
在PCA之前,记得标准化数据。
然后计算方差矩阵,如果你的每条样本数据是以行的形式表示,那么计算公式如下:
接着就可以用SVD计算主成分
U包含了主成分,每一列就是我们数据要映射的向量,S为对角矩阵,为奇异值。
下面用python实现本周编程作业:
computeCentroids.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from findClosestCentroids import findClosestCentroids def computeCentroids(X, idx, K): #COMPUTECENTROIDS returns the new centroids by computing the means of the #data points assigned to each centroid. # centroids = COMPUTECENTROIDS(X, idx, K) returns the new centroids by # computing the means of the data points assigned to each centroid. It is # given a dataset X where each row is a single data point, a vector # idx of centroid assignments (i.e. each entry in range [1..K]) for each # example, and K, the number of centroids. You should return a matrix # centroids, where each row of centroids is the mean of the data points # assigned to it. # # Useful variables m, n = X.shape; # You need to return the following variables correctly. centroids = np.zeros((K, n)) # ====================== YOUR CODE HERE ====================== # Instructions: Go over every centroid and compute mean of all points that # belong to it. Concretely, the row vector centroids(i, :) # should contain the mean of the data points assigned to # centroid i. # # Note: You can use a for-loop over the centroids to compute this. # centroids = [] for i in range(K): temp = [] for j in range(len(idx)): if idx[j,0] ==i: temp.append(X[j]) centroids.append(np.array(temp).mean(axis=0)) # ============================================================= return np.array(centroids) if __name__ == '__main__': data = loadmat('ex7data2.mat') X = data['X'] centroids = np.array([[3, 3], [6, 2], [8, 5]]) idx = findClosestCentroids(X,centroids) centroids = computeCentroids(X, idx, K=3) print('centroids', centroids)
输出:
centroids [[2.42830111 3.15792418]
[5.81350331 2.63365645]
[7.11938687 3.6166844 ]]
findClosestCentroids.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt def findClosestCentroids(X, centroids): #FINDCLOSESTCENTROIDS computes the centroid memberships for every example # idx = FINDCLOSESTCENTROIDS (X, centroids) returns the closest centroids # in idx for a dataset X where each row is a single example. idx = m x 1 # vector of centroid assignments (i.e. each entry in range [1..K]) # # Set K K = centroids.shape[0] # You need to return the following variables correctly. idx = np.zeros((X.shape[0], 1)) # ====================== YOUR CODE HERE ====================== # Instructions: Go over every example, find its closest centroid, and store # the index inside idx at the appropriate location. # Concretely, idx(i) should contain the index of the centroid # closest to example i. Hence, it should be a value in the # range 1..K # # Note: You can use a for-loop over the examples to compute this. # m = X.shape[0] for i in range(1,m): tdis = X[i,:]-centroids[0,:] bestdis = np.sum(np.square(tdis)) idx[i]=0 for j in range(1,K): tdis = X[i,:]-centroids[j,:] dis = np.sum(np.square(tdis)) if(bestdis>dis): bestdis=dis idx[i]=j # ============================================================= return idx if __name__ == '__main__': data = loadmat('ex7data2.mat') X = data['X'] centroids = np.array([[3, 3], [6, 2], [8, 5]]) idx= findClosestCentroids(X,centroids) print('idx', idx[0:5])
输出:
idx [[0.]
[2.]
[1.]
[0.]
[0.]]
kMeansInitCentroids.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt def kMeansInitCentroids(X, K): # KMEANSINITCENTROIDS This function initializes K centroids that are to be # used in K-Means on the dataset X # centroids = KMEANSINITCENTROIDS(X, K) returns K initial centroids to be # used with the K-Means on the dataset X # # You should return this values correctly # centroids = zeros(K, size(X, 2)); # ====================== YOUR CODE HERE ====================== # Instructions: You should set centroids to randomly chosen examples from # the dataset X # m, n = X.shape idx = np.random.choice(m, K) centroids = X[idx] # ============================================================= return centroids if __name__ == '__main__': data = loadmat('ex7data2.mat') X = data['X'] centroids = kMeansInitCentroids(X, K=3) print('centroids', centroids)
输出:
centroids [[2.15653404 0.40358861]
[3.54010186 0.86446135]
[1.95538864 1.32156857]]
pca.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt def pca(X): #PCA Run principal component analysis on the dataset X # [U, S, X] = pca(X) computes eigenvectors of the covariance matrix of X # Returns the eigenvectors U, the eigenvalues (on diagonal) in S # # Useful values m, n = X.shape # You need to return the following variables correctly. U = np.zeros(n); S = np.zeros(n); # ====================== YOUR CODE HERE ====================== # Instructions: You should first compute the covariance matrix. Then, you # should use the "svd" function to compute the eigenvectors # and eigenvalues of the covariance matrix. # # Note: When computing the covariance matrix, remember to divide by m (the # number of examples). # sigma = (1 / m)*(X.T.dot(X)) U, S ,V = np.linalg.svd(sigma) # ========================================================================= return U, S if __name__ == '__main__': data = loadmat('ex7data1.mat') X = data['X'] U, S = pca(X) print('U', U) print('S', S)
输出:
U [[-0.6298592 -0.77670934]
[-0.77670934 0.6298592 ]]
S [42.94043399 0.37681126]
projectData.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from pca import pca def projectData(X, U, K): #PROJECTDATA Computes the reduced data representation when projecting only #on to the top k eigenvectors # Z = projectData(X, U, K) computes the projection of # the normalized inputs X into the reduced dimensional space spanned by # the first K columns of U. It returns the projected examples in Z. # # You need to return the following variables correctly. Z = np.zeros((X.shape[0], K)) # ====================== YOUR CODE HERE ====================== # Instructions: Compute the projection of the data using only the top K # eigenvectors in U (first K columns). # For the i-th example X(i,:), the projection on to the k-th # eigenvector is given as follows: # x = X(i, :)'; # projection_k = x' * U(:, k); # Z = X.dot(U[:,:K]) # ============================================================= return Z if __name__ == '__main__': data = loadmat('ex7data1.mat') X = data['X'] U, S = pca(X) Z = projectData(X, U, K=1) print('Z', Z)
输出:
Z [[-4.76226381]
[-7.39891873]
...
[-7.9499247 ]]
recoverData.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from pca import pca from projectData import projectData def recoverData(X, U, K): #RECOVERDATA Recovers an approximation of the original data when using the #projected data # X_rec = RECOVERDATA(Z, U, K) recovers an approximation the # original data that has been reduced to K dimensions. It returns the # approximate reconstruction in X_rec. # # You need to return the following variables correctly. X_rec = np.zeros((Z.shape[0], U.shape[0])) # ====================== YOUR CODE HERE ====================== # Instructions: Compute the approximation of the data by projecting back # onto the original space using the top K eigenvectors in U. # # For the i-th example Z(i,:), the (approximate) # recovered data for dimension j is given as follows: # v = Z(i, :)'; # recovered_j = v' * U(j, 1:K)'; # # Notice that U(j, 1:K) is a row vector. # X_rec = Z.dot(U[:,:K].T) # ============================================================= return X_rec if __name__ == '__main__': data = loadmat('ex7data1.mat') X = data['X'] U, S = pca(X) Z = projectData(X, U, K=6) X_rec = recoverData(Z, U, K=6) print('X_rec', X_rec)
输出:
X_rec [[3.38156267 3.38911268]
[4.52787538 5.8541781 ]
...
[5.11795499 6.08507386]]
这部分,您将实现一个异常检测算法来检测服务器计算机中的异常行为。他的特征是测量每个服务器的响应速度(mb/s)和延迟(ms)。当你的服务器运行时,你收集到了m=307的样本,是无标签的。你相信其中绝大多数样本是正常的,但还是有一小部分的样本是异常的。
我们将使用高斯分布模型来检测数据集中的异常样本。
根据ex8data1.mat 打印出本周作业数据:
下面用python实现本周编程作业:
cofiCostFunc.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, lamb_da): #COFICOSTFUNC Collaborative filtering cost function # [J, grad] = COFICOSTFUNC(params, Y, R, num_users, num_movies, ... # num_features, lambda) returns the cost and gradient for the # collaborative filtering problem. # # Unfold the U and W matrices from params # X = reshape(params(1:num_movies*num_features), num_movies, num_features) # Theta = reshape(params(num_movies*num_features+1:end),c num_users, num_features) # You need to return the following values correctly # J = 0; # X_grad = zeros(size(X)); # Theta_grad = zeros(size(Theta)); # ====================== YOUR CODE HERE ====================== # Instructions: Compute the cost function and gradient for collaborative # filtering. Concretely, you should first implement the cost # function (without regularization) and make sure it is # matches our costs. After that, you should implement the # gradient and use the checkCostFunction routine to check # that the gradient is correct. Finally, you should implement # regularization. # # Notes: X - num_movies x num_features matrix of movie features # Theta - num_users x num_features matrix of user features # Y - num_movies x num_users matrix of user ratings of movies # R - num_movies x num_users matrix, where R(i, j) = 1 if the # i-th movie was rated by the j-th user # # You should set the following variables correctly: # # X_grad - num_movies x num_features matrix, containing the # partial derivatives w.r.t. to each element of X # Theta_grad - num_users x num_features matrix, containing the # partial derivatives w.r.t. to each element of Theta # X, theta = deserialize(params, num_users, num_movies, num_features) part1 = np.sum(((X.dot(theta.T) - Y) ** 2) * R) / 2 part2 = lamb_da * np.sum(theta ** 2) / 2 part3 = lamb_da * np.sum(X ** 2) / 2 J = part1 + part2 + part3 X_grad = ((X.dot(theta.T)-Y)*R).dot(theta) + lamb_da*X Theta_grad = ((X.dot(theta.T)-Y)*R).T.dot(X) + lamb_da*theta # ============================================================= grad = serialize(X_grad, Theta_grad) return J, grad def serialize(X, theta): return np.concatenate((X.flatten(), theta.flatten()), axis=0) def deserialize(seq, nm, nu, nf): return seq[:nm*nf].reshape(nm, nf), seq[nm*nf:].reshape(nu, nf) if __name__ == '__main__': data1 = loadmat("ex8_movies.mat") Y = data1["Y"] # (1682,943) R = data1["R"] # (1682,943) data2 = loadmat("ex8_movieParams.mat") X = data2["X"] # (1682,10) theta = data2["Theta"] # (943,10) nu = data2["num_users"][0][0] # (1,1) 943 nm = data2["num_movies"][0][0] # (1,1) 1682 nf = data2["num_features"][0][0] # (1,1) 10 # 题目中计算数据不是全部数据,取nm=5,nu=4,nf=3,值为22.224603725685675 nu = 4 nm = 5 nf = 3 X = X[:nm, :nf] theta = theta[:nu, :nf] Y = Y[:nm, :nu] R = R[:nm, :nu] J, grad = cofiCostFunc(serialize(X, theta), Y, R, nm, nu, nf, lamb_da=0) print('J', J) print('grad', grad)
输出:
J 22.224603725685675
grad [ -2.52899165 7.57570308 -1.89979026 -0.56819597 3.35265031
-0.52339845 -0.83240713 4.91163297 -0.76677878 -0.38358278
2.26333698 -0.35334048 -0.80378006 4.74271842 -0.74040871
-10.5680202 4.62776019 -7.16004443 -3.05099006 1.16441367
-3.47410789 0. 0. 0. 0.
0. 0. ]
estimateGaussian.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt def estimateGaussian(X): #ESTIMATEGAUSSIAN This function estimates the parameters of a #Gaussian distribution using the data in X # [mu sigma2] = estimateGaussian(X), # The input X is the dataset with each n-dimensional data point in one row # The output is an n-dimensional vector mu, the mean of the data set # and the variances sigma^2, an n x 1 vector # # Useful variables m, n = X.shape # You should return these values correctly mu = np.zeros((n, 1)) sigma2 = np.zeros((n, 1)) # ====================== YOUR CODE HERE ====================== # Instructions: Compute the mean of the data and the variances # In particular, mu(i) should contain the mean of # the data for the i-th feature and sigma2(i) # should contain variance of the i-th feature. # mu = np.sum(X,axis=0) / m sigma2 = ((X-mu).T .dot(X-mu)) / m # ============================================================= return mu,sigma2 if __name__ == '__main__': data = loadmat('ex8data1.mat') X = data['X'] mu,sigma2 = estimateGaussian(X) print('mu', mu) print('sigma2', sigma2)
输出:
mu [14.11222578 14.99771051]
sigma2 [[ 1.83263141 -0.22712233]
[-0.22712233 1.70974533]]
gaussian.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from estimateGaussian import estimateGaussian def gaussian(X, mu, sigma2): if np.ndim(sigma2)==1: sigma2=np.diag(sigma2) # Extract a diagonal or construct a diagonal array norm=1/(np.power(2*np.pi,X.shape[1]/2)*np.power(np.linalg.det(sigma2),0.5)) p=np.exp(-1/2*(X-mu).dot(np.linalg.inv(sigma2)).dot((X-mu).T)) return norm*np.diag(p) if __name__ == '__main__': data = loadmat('ex8data1.mat') X = data['X'] mu, sigma2 = estimateGaussian(X) ypred = gaussian(X,mu,sigma2) print('ypred ', ypred )
输出:
ypred [6.35952245e-02 4.71175698e-02 7.24075654e-02 4.68671935e-02
6.14847658e-02 4.13471164e-02
...
8.12920163e-21]
selectThreshold.py:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt from gaussian import gaussian from estimateGaussian import estimateGaussian def selectThreshold(yval, pval): #SELECTTHRESHOLD Find the best threshold (epsilon) to use for selecting #outliers # [bestEpsilon bestF1] = SELECTTHRESHOLD(yval, pval) finds the best # threshold to use for selecting outliers based on the results from a # validation set (pval) and the ground truth (yval). # bestEpsilon = 0; bestF1 = 0; F1 = 0; # stepsize = (np.max(pval) - np.min(pval)) / 1000 epsilons=np.linspace(np.min(pval),np.max(pval),1000) for epsilon in epsilons: # ====================== YOUR CODE HERE ====================== # Instructions: Compute the F1 score of choosing epsilon as the # threshold and place the value in F1. The code at the # end of the loop will compare the F1 score for this # choice of epsilon and set it to be the best epsilon if # it is better than the current choice of epsilon. # # Note: You can use predictions = (pval < epsilon) to get a binary vector # of 0's and 1's of the outlier predictions pval_tmp=pval<epsilon # True or False vector Tp=np.sum(pval_tmp&yval) # correctly classified Fp=np.sum(pval_tmp&(yval^1)) # incorrectly classified Fn=np.sum((pval_tmp^1)&yval) # incorrectly classified prec=Tp/(Tp+Fp) if Tp+Fp else 0 rec=Tp/(Tp+Fn) if Tp+Fn else 0 F1=2*prec*rec/(prec+rec) if prec+rec else 0 # ============================================================= if F1 > bestF1: bestF1 = F1 bestEpsilon = epsilon return bestEpsilon, bestF1 if __name__ == '__main__': data = loadmat('ex8data2.mat') X = data['X'] yval = data['yval'] mu,sigma2 = estimateGaussian(X) ypred = gaussian(X, mu, sigma2) bestEpsilon, bestF1 = selectThreshold(yval, ypred) print('bestEpsilon', bestEpsilon) print('bestF1', bestF1)
输出:
bestEpsilon 3.0794215337381673e-15
bestF1 0.18180163785259326
总结:
作业第一到第五需要熟练掌握,最后可以用张纸上推导一下,记忆更深。作业第六到第八需要大概的理解即可。
Referrence:
https://blog.csdn.net/Cowry5/article/details/83302646
https://blog.csdn.net/buchidanhuang/article/details/83958947
https://blog.csdn.net/chosen1hyj/article/details/93176560
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。