import numpy as np

def countCostFunc(X, y, theta, lamb):
    # X is the training set, y the training labels, theta the weights, lamb the regularization constant.
    # Note: X and theta are already augmented here -- X has an extra column of ones and theta has theta0 --
    # so the bias term can be handled by the same vectorized computation as the other weights.
    m, n = np.shape(X)               # X has m rows and n columns
    h = np.dot(X, theta)             # matrix product: hypothesis h = X . theta
    regtheta = np.copy(theta)        # deep copy, since plain assignment in Python only copies the reference
    regtheta[0, 0] = 0               # theta0 is not regularized
    J = (np.dot((h - y).transpose(), h - y) + lamb * np.dot(regtheta.transpose(), regtheta)) / float(2 * m)
    grad = (1 / float(m)) * (np.dot(X.transpose(), h - y) + lamb * regtheta)  # dJ/dtheta for every theta, stacked into a column vector
    return J, grad

# theta already contains theta0, but X has not been augmented with the column of ones yet;
# use the trained model to predict.
def predict(X, theta):
    X = np.array(X)
    m, n = np.shape(X)
    X = np.hstack([np.ones((m, 1)), X])  # prepend a column of ones to X
    h = np.dot(X, theta)
    return h
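Written out, the cost and gradient that countCostFunc computes are the usual regularized least-squares expressions (theta0, the first entry, is excluded from the penalty):

J(\theta) = \frac{1}{2m}\left[(X\theta - y)^\top (X\theta - y) + \lambda \sum_{j \ge 1} \theta_j^2\right]

\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\left[X^\top (X\theta - y)\right]_j + \frac{\lambda}{m}\,\theta_j \quad (j \ge 1), \qquad \frac{\partial J}{\partial \theta_0} = \frac{1}{m}\left[X^\top (X\theta - y)\right]_0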
The main script (shown in full further below) uses the following imports and toy data set:

#coding=utf-8
import numpy as np
import grandDescent
import costFunction
import loadData
import predict
import write2File
import rmse
import os
import random
import matplotlib.pyplot as plt
from pylab import plot, show
from scipy import stats
def handle_data_self():
    X_train = np.array([0,1,2,3,4,5,6,7,8,9]).reshape(10, 1)
    y_train = np.array([0,2,3,4,5,6,7,4,9,10]).reshape(10, 1)
    X_cross = np.copy(X_train)
    y_cross = np.copy(y_train)
    X_test = np.array([0,1,2,3,4,5,6,7,8,9,10,11,23,1]).reshape(14, 1)
    plt.scatter(X_train, y_train, color="Red")
    # plt.show()
    return X_train, y_train, X_cross, y_cross, X_test
#coding=utf-8
import numpy as np               # third-party library
import grandDescent              # project modules
import costFunction
import loadData
import predict
import write2File
import rmse
import os                        # standard library
import random                    # standard library
import matplotlib.pyplot as plt  # plotting library


def regression_simple(X, y, X1, y1, option):
    alpha = float(option["alpha"])
    maxCycles = int(option["maxCycle"])
    lamb = float(option["lamb"])
    save = bool(option["saveRecord"])
    add = bool(option["add"])
    optGoal = option["optGoal"]
    methods = option["method"]
    thetaPath = option["thetaWritePath"] if "thetaWritePath" in option else None
    m, n = np.shape(X)
    J_train = None
    theta = None
    if methods == "stocGradDescent":
        J_train, theta = grandDescent.stocGradDescent(np.copy(X), y, maxCycles, alpha, lamb)
    elif methods == "grandDescent":
        J_train, theta = grandDescent.grandDescent(np.copy(X), y, maxCycles, alpha, lamb)
    J_cross, _ = costFunction.countCostFunc(np.hstack([np.ones((X1.shape[0], 1)), X1]), y1, theta, lamb)
    rmseResult = rmse.countrmse(np.copy(X1), y1, theta)
    # the rest just records the results to disk
    if save and thetaPath:
        if not add and os.path.exists(thetaPath):
            os.remove(thetaPath)
        file_object = None
        if not add:
            file_object = open(thetaPath, 'w')
        elif os.path.exists(thetaPath):
            file_object = open(thetaPath, 'a')
        else:
            file_object = open(thetaPath, 'w')
        file_object.write("J_traincost=>" + str(J_train) + ",J_crosscost=>" + str(J_cross) + ",alpha=>" + str(alpha) + ",lamb=>" + str(lamb) + ", Cycles=>" + str(maxCycles) + ", rmse=>" + str(rmseResult) + "\n" + ",theta==>" + str(theta.transpose()) + "\n")
        file_object.close()
    J = None
    if optGoal == "J_train":
        J = J_train
    elif optGoal == "J_cross":
        J = J_cross
    elif optGoal == "rmse":
        J = rmseResult
    else:
        print("invalid optGoal!")
    return J, theta


def handle_data_self():
    X_train = np.array([0,1,2,3,4,5,6,7,8,9]).reshape(10, 1)
    y_train = np.array([0,2,3,4,5,6,7,4,9,10]).reshape(10, 1)
    X_cross = np.copy(X_train)
    y_cross = np.copy(y_train)
    X_test = np.array([0,1,2,3,4,5,6,7,8,9,10,11,23,1]).reshape(14, 1)
    plt.scatter(X_train, y_train, color="Red")
    plt.show()
    return X_train, y_train, X_cross, y_cross, X_test


if __name__ == "__main__":
    print("loading data...")
    X_train, y_train, X_cross, y_cross, X_test = handle_data_self()
    print("data loaded")
    print("start training...")
    action = "regression_grandDescent"
    J = None
    theta = None
    if action == "regression_grandDescent":
        option = {"maxCycle": 400, "alpha": 0.05, "lamb": 0.001, "saveRecord": 1,
                  "thetaWritePath": "./thetaSave.txt", "add": 1, "optGoal": "J_train",
                  "method": "grandDescent"}
        J, theta = regression_simple(X_train.copy(), y_train.copy(), X_cross.copy(), y_cross.copy(), option)
        plt.plot(X_train, predict.predict(X_train, theta), color="Green")
        plt.show()

    print("training finished")

    if action == "regression_grandDescent" or action == "regression_grandDescentWithBestAlphaAndLamb" or action == "regression_stocGradDescent":
        y_pre = predict.predict(X_test, theta)
        write2File.savePrediction(y_pre, path="sample_submission.csv")
        print("prediction finished, results saved to sample_submission.csv")
Here, X_train, y_train, X_cross, y_cross, X_test = handle_data_self() returns, in order, the training set features, the training labels, the cross-validation features, the cross-validation labels, and the test set features.
Training set X holds the feature values of the training samples. Note that the bias column x0 = 1 has not been added at this point; it is appended later inside the other modules. The reason for adding a column of ones is to make the derivative computation uniform, so the intercept theta0 is handled by the same vectorized formula as the other weights, as illustrated in the sketch below.
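A minimal illustration of that augmentation (the variable names and values here are just for the example):

import numpy as np

X = np.array([[3.0], [5.0]])          # two samples, one feature each
theta = np.array([[1.0], [2.0]])      # theta0 (intercept) and theta1

X_aug = np.hstack([np.ones((X.shape[0], 1)), X])   # prepend the x0 = 1 column
h = np.dot(X_aug, theta)              # h = theta0 * 1 + theta1 * x, for every sample
print(h)                              # [[ 7.]
                                      #  [11.]]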
grandDescent.py file:
#coding=utf-8

import numpy as np
import operator
import copy
import math
import costFunction


def grandDescent(X, y, maxCycles, alpha, lamb):
    m, n = np.shape(X)
    theta = np.zeros((n + 1, 1))                  # n feature weights plus theta0
    X = np.hstack([np.ones((m, 1)), X])           # prepend the x0 = 1 column
    J = 0
    for i in range(maxCycles):
        J, grad = costFunction.countCostFunc(np.copy(X), y, np.copy(theta), lamb)
        # print("J at iteration", i + 1, "==>", J)
        theta = theta - alpha * grad
    return J, theta
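The main script also calls grandDescent.stocGradDescent when method is "stocGradDescent", but that function is not listed in the article. A minimal sketch of what it might look like, assuming the same (J, theta) return convention and an update on one randomly chosen sample per step (my reconstruction, not the author's code):

#coding=utf-8
import random
import numpy as np
import costFunction


def stocGradDescent(X, y, maxCycles, alpha, lamb):
    m, n = np.shape(X)
    theta = np.zeros((n + 1, 1))
    X = np.hstack([np.ones((m, 1)), X])              # prepend the x0 = 1 column
    J = 0
    for i in range(maxCycles):
        k = random.randint(0, m - 1)                 # pick one training sample at random
        xk = X[k:k + 1, :]                           # shape (1, n + 1)
        yk = y[k:k + 1, :]
        _, grad = costFunction.countCostFunc(xk, yk, np.copy(theta), lamb)
        theta = theta - alpha * grad                 # step using the single-sample gradient
        J, _ = costFunction.countCostFunc(np.copy(X), y, np.copy(theta), lamb)  # full-data cost, for reporting
    return J, theta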


costFunction.py file:
#coding=utf-8

import numpy as np
import operator
import copy
import math


def countCostFunc(X, y, theta, lamb):
    m, n = np.shape(X)
    h = np.dot(X, theta)
    regtheta = np.copy(theta)
    regtheta[0, 0] = 0
    J = (np.dot((h - y).transpose(), h - y) + lamb * np.dot(regtheta.transpose(), regtheta)) / float(2 * m)
    grad = (1 / float(m)) * (np.dot(X.transpose(), h - y) + lamb * regtheta)
    return J, grad


rmse.py file:
#coding=utf-8

import numpy as np
import operator
import copy
import math
import costFunction


def countrmse(X, y, theta):
    m, n = np.shape(X)
    # prepend the x0 = 1 column
    X = np.hstack([np.ones((m, 1)), X])
    h = np.dot(X, theta)
    # sum of squared prediction errors
    sumDiffY = np.dot((h - y).transpose(), h - y)
    J = sumDiffY / float(m)
    # print(np.shape(J))
    return math.sqrt(float(J))
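In other words, countrmse evaluates the root-mean-square error of the predictions:

\mathrm{RMSE} = \sqrt{\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2}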

predict.py file:

#coding=utf-8
import numpy as np
import costFunction


# theta already contains theta0, but X has not been augmented with the column of ones yet
def predict(X, theta):
    X = np.array(X)
    m, n = np.shape(X)
    X = np.hstack([np.ones((m, 1)), X])   # prepend the x0 = 1 column
    h = np.dot(X, theta)
    return h
write2File.py file:
def savePrediction(y_pre, path):
    file_object = open(path, 'w')
    file_object.write("Id,reference\n")
    for i in range(len(y_pre)):
        file_object.write(str(i) + "," + str(y_pre[i][0]) + "\n")
    file_object.close()
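For example, calling savePrediction on a small prediction column (the values below are made up) produces a CSV like this:

import numpy as np
import write2File

y_pre = np.array([[2.1], [3.9], [7.0]])        # hypothetical predictions, shape (m, 1)
write2File.savePrediction(y_pre, path="sample_submission.csv")
# sample_submission.csv now contains:
# Id,reference
# 0,2.1
# 1,3.9
# 2,7.0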
Candidate hyperparameter values (learning rates alpha, iteration count, and regularization constants lamb):

alphas = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
maxCycles = 200
lambs = [100, 10, 1, 0.1, 0.01, 0.001]
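The listing for the regression_grandDescentWithBestAlphaAndLamb branch is not included in the article. A minimal sketch of how these candidates could be searched with regression_simple, selecting by cross-validation RMSE (a hypothetical helper under those assumptions, not the author's code):

def search_best_alpha_and_lamb(X_train, y_train, X_cross, y_cross, alphas, lambs, maxCycles):
    best = None                                      # (rmse, alpha, lamb, theta)
    for alpha in alphas:
        for lamb in lambs:
            option = {"maxCycle": maxCycles, "alpha": alpha, "lamb": lamb,
                      "saveRecord": 0, "add": 1, "optGoal": "rmse",
                      "method": "grandDescent"}
            score, theta = regression_simple(X_train.copy(), y_train.copy(),
                                             X_cross.copy(), y_cross.copy(), option)
            if best is None or score < best[0]:
                best = (score, alpha, lamb, theta)
    return best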