赞
踩
我们采用波士顿房价预测数据集进行回归任务分析。数据集分为训练集和测试集,训练集可用于训练回归模型,测试集需要进行预测。
要求:可自行实现 linear regression,或使用现成的线性回归函数;优化方法尝试使用 Gradient Descent、SGD 以及 ADAM;比较不同 learning rate 的结果(例如损失函数曲线图);比较 regularization 的结果;比较 feature scaling 的结果。代码如下(code as follows):
1、机器学习(LinearRegression)
2、深度学习(待开始)
"""Baseline: plain linear regression on the raw features.

Author: cold
Date: 2021-04-01
Version: 1.0
Info: baseline
"""
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the training data (the original note says "455" -- presumably the
# number of rows; confirm against the CSV).
dataset = read_csv('train_dataset.csv').values

# Columns 0..12 are used as features, column 13 as the regression target.
X = dataset[:, 0:13]
Y = dataset[:, 13]

# Hold out 30% of the rows for validation.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# Create the linear-regression model and fit it on the training split.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Mean squared error on the held-out split and on the training split.
error_test = mean_squared_error(y_test, lr.predict(x_test))
error_train = mean_squared_error(y_train, lr.predict(x_train))
print("训练集误差为:{},测试集误差为:{}".format(error_train, error_test))

# Predict on the test file; columns 1..13 are taken as the features
# (column 0 is skipped -- presumably an ID column; verify against the CSV).
testset = read_csv('test_dataset.csv').values
y_pred = lr.predict(testset[:, 1:14])

# One row per prediction, labelled id_1, id_2, ...
ID = ["id_" + str(row + 1) for row in range(len(y_pred))]
res = pd.DataFrame({'ID': ID, 'value': y_pred})
res.to_csv('res.csv', index=False)
print("res.csv 已生成")
"""Baseline with feature standardization.

Author: cold
Date: 2021-04-04
Version: 2.0
Info: baselineStd
"""
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the training data (the original note says "455" -- presumably the
# number of rows; confirm against the CSV).
dataset = read_csv('train_dataset.csv').values

# Columns 0..12 are used as features, column 13 as the regression target.
X = dataset[:, 0:13]
Y = dataset[:, 13]

# Standardize the features. The fitted scaler is kept so that exactly the
# same mean/variance can be applied to the test file below.
# NOTE(review): the scaler is fitted before the train/validation split, so
# the validation error is slightly optimistic.
stand = StandardScaler()
X_std = stand.fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X_std, Y, test_size=0.3)

# Create the linear-regression model and fit it on the training split.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Predictions for both splits.
y_test_pred = lr.predict(x_test)
y_train_pred = lr.predict(x_train)

# Mean squared error on the held-out split and on the training split.
error_test = mean_squared_error(y_test, y_test_pred)
error_train = mean_squared_error(y_train, y_train_pred)
print("训练集误差为:{},测试集误差为:{}".format(error_train, error_test))

# Predict on the test file; columns 1..13 are taken as the features
# (column 0 is skipped -- presumably an ID column; verify against the CSV).
testset = read_csv('test_dataset.csv').values
x_pred = testset[:, 1:14]
# The model was trained on standardized inputs, so the test features must go
# through the SAME fitted scaler (transform, not fit_transform) before
# predicting. Feeding raw features here yields meaningless predictions.
x_pred_std = stand.transform(x_pred)
y_pred = lr.predict(x_pred_std)

# One row per prediction, labelled id_1, id_2, ...
ID = ["id_" + str(i + 1) for i in range(len(y_pred))]
res = pd.DataFrame()
res['ID'] = ID
res['value'] = y_pred
res.to_csv('res.csv', index=False)
print("res.csv 已生成")
"""Baseline with standardization and univariate feature selection.

Author: cold
Date: 2021-04-04
Version: 3.0
Info: baselineSelFeatures
"""
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from matplotlib import pyplot as plt


def ToBeStdAndSel(X, Y, k):
    """Standardize X and keep the k best features according to f_regression.

    Args:
        X: feature matrix.
        Y: target values.
        k: number of features to keep.

    Returns:
        dict with keys:
            'best_index': boolean support mask of the selected columns.
            'X_best': the standardized matrix restricted to those columns
                (equivalent to X_std[:, best_index]).
            'scaler': the StandardScaler fitted on X, so callers can apply
                the same scaling to new data.
    """
    stand = StandardScaler()
    X_std = stand.fit_transform(X)
    best = SelectKBest(f_regression, k=k)
    # The selector must be fitted before get_support() can be called.
    X_best = best.fit_transform(X_std, Y)
    best_index = best.get_support()
    return {'best_index': best_index, 'X_best': X_best, 'scaler': stand}


def ToBeStd(X, scaler=None):
    """Standardize X.

    Args:
        X: feature matrix.
        scaler: optional already-fitted StandardScaler. When given, X is
            transformed with its statistics; when omitted, a new scaler is
            fitted on X itself (the original behaviour).

    Returns:
        The standardized matrix.
    """
    if scaler is None:
        return StandardScaler().fit_transform(X)
    return scaler.transform(X)


def ToSaveCsv(y_pred):
    """Write the predictions to res.csv with IDs id_1, id_2, ..."""
    ID = ["id_" + str(i + 1) for i in range(len(y_pred))]
    res = pd.DataFrame()
    res['ID'] = ID
    res['value'] = y_pred
    res.to_csv('res.csv', index=False)
    print("res.csv 已生成")


def TryToPredict(testset):
    """Predict targets for the raw test-set array.

    Relies on the module-level ``scaler``, ``best_index`` and ``lr`` created
    below. Columns 1..13 of ``testset`` are taken as the features (column 0
    is skipped -- presumably an ID column; verify against the CSV).
    """
    x_pred = testset[:, 1:14]
    # Reuse the scaler fitted on the training data: fitting a fresh scaler on
    # the test set would scale it with different statistics than the model
    # was trained on.
    x_pred_best = ToBeStd(x_pred, scaler)[:, best_index]
    y_pred = lr.predict(x_pred_best)
    return y_pred


# Load the training data (the original note says "455" -- presumably the
# number of rows; confirm against the CSV).
dataset = read_csv('train_dataset.csv').values

# Pipeline: X --> X_std --> X_best, then train/validation split.
X = dataset[:, 0:13]
Y = dataset[:, 13]
BEST = ToBeStdAndSel(X, Y, 6)
X_best = BEST['X_best']
best_index = BEST['best_index']
scaler = BEST['scaler']
x_train, x_test, y_train, y_test = train_test_split(X_best, Y, test_size=0.3)

# Create the linear-regression model and fit it on the training split.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Predictions for both splits.
y_test_pred = lr.predict(x_test)
y_train_pred = lr.predict(x_train)

# Mean squared error on the held-out split and on the training split.
error_test = mean_squared_error(y_test, y_test_pred)
error_train = mean_squared_error(y_train, y_train_pred)
print("训练集误差为:{},测试集误差为:{}".format(error_train, error_test))

# Visual comparison of predicted and true values on the held-out split.
plt.plot(y_test_pred, 'r-', label='predict_value')
plt.plot(y_test, 'b-', label='true_value')
plt.legend()
plt.show()

# Predict on the test file and save the result.
testset = read_csv('test_dataset.csv').values
y_pred = TryToPredict(testset)
ToSaveCsv(y_pred)
"""Scatter-plot every feature against the target.

Author: cold
Date: 2021-04-04
Version: 1.0
Info: Features show
"""
from pandas import read_csv
import matplotlib.pyplot as plt
import math

# Load the training data once; both the values and the column names come
# from this single read (the original parsed the CSV twice).
frame = read_csv('train_dataset.csv')
dataset = frame.values
X = dataset[:, 0:13]
Y = dataset[:, 13]

# Feature engineering: collect the column names. The last column is treated
# as the target, so it gets no subplot of its own.
features = list(frame.keys())
nums = len(features) - 1

# Lay the subplots out on a grid that is 3 columns wide.
columns = 3
rows = math.ceil(nums / columns)

plt.figure(figsize=(12, 10))
for i in range(nums):
    plt.subplot(rows, columns, i + 1)
    plt.plot(X[:, i], Y, 'b+')
    plt.title(features[i])
# Extra vertical spacing so the subplot titles do not overlap.
plt.subplots_adjust(hspace=1.5)
plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。