from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2, so this needs an older version
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the Boston housing dataset into a DataFrame and append the price as 'target'
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['target'] = boston.target
The dataset contains 506 samples and 13 features related to the house price:
name | description |
---|---|
CRIM | per-capita crime rate by town |
ZN | proportion of residential land zoned for large lots |
INDUS | proportion of non-retail business acres per town |
CHAS | Charles River dummy variable (1 if the tract bounds the river, 0 otherwise) |
NOX | nitric oxide concentration |
RM | average number of rooms per dwelling |
AGE | proportion of owner-occupied units built before 1940 |
DIS | weighted distance to five Boston employment centres |
RAD | index of accessibility to radial highways |
TAX | full-value property-tax rate per $10,000 |
PTRATIO | pupil-teacher ratio by town |
B | proportion of Black residents by town |
LSTAT | percentage of the population classed as lower-status |
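A quick sanity check on those numbers, using the df built above (506 rows, 13 features plus the appended target column):

print(df.shape)  # (506, 14)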
2. Preview the data

# Preview the first few rows of the DataFrame
df.head()
1. Plot a scatter plot of each feature against the house price (a sketch follows below).
2. Based on the scatter plots, roughly select three features: CRIM, RM, LSTAT.
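The plotting code for step 1 is not in the original post; a minimal sketch, assuming the df and boston objects built above:

# Scatter each feature against the price to eyeball correlations
fig, axes = plt.subplots(4, 4, figsize=(16, 12))
for ax, col in zip(axes.ravel(), boston.feature_names):
    ax.scatter(df[col], df['target'], s=5)
    ax.set_xlabel(col)
    ax.set_ylabel('price')
plt.tight_layout()
plt.show()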
# Select the three chosen features and the label column
features = df[['RM', 'CRIM', 'LSTAT']]
target = df['target']
We use a multivariate linear regression model, where the independent variables are the dataset's feature_names dimensions (13 in total) and the dependent variable is the target dimension (the house price).
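In symbols (a standard formulation, stated here for clarity rather than taken from the original), the model learns one coefficient per feature plus an intercept:

\hat{y} = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \cdots + \beta_{13} x_{13}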
Set the label field and split the dataset: 80% for training, 20% for testing.
# Sequential 80/20 split (no shuffling)
split_num = int(len(features) * 0.8)
X_train = features[:split_num]
Y_train = target[:split_num]
X_test = features[split_num:]
Y_test = target[split_num:]
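Note that this split simply takes the first 80% of rows without shuffling; since the samples are not randomly ordered, a shuffled split (which the second script below uses via train_test_split) is usually safer. A minimal alternative, assuming the same features and target:

from sklearn.model_selection import train_test_split
# Shuffled 80/20 split; random_state pins the shuffle for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2, random_state=0)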
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, Y_train)  # fit the regression model before reading its coefficients
print(model.coef_, model.intercept_)
preds = model.predict(X_test)
def mae_value(y_true, y_pred):
    # Mean absolute error
    n = len(y_true)
    mae = sum(np.abs(y_true - y_pred)) / n
    return mae

def mse_value(y_true, y_pred):
    # Mean squared error
    n = len(y_true)
    mse = sum(np.square(y_true - y_pred)) / n
    return mse
mae = mae_value(Y_test.values, preds)
mse = mse_value(Y_test.values, preds)
print("MAE:", mae)
print("MSE:", mse)
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import numpy as np

def plot_learning_curve(plt, estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    # Plot training and cross-validation scores against training-set size
    plt.title(title)
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shaded bands show one standard deviation around the mean scores
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r", label="Training scores")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plt.figure(figsize=(10, 6))
plot_learning_curve(plt, model, "Learn Curve for LinearRegression", features, target, ylim=None, cv=cv)
plt.show()
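Reading the curve: if the training score stays well above the cross-validation score, the model is overfitting; if both converge to a low score, it is underfitting. The polynomial-degree comparison below probes exactly this trade-off.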
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve
def split_data():
    # Load the raw Boston data and do a shuffled 80/20 split
    boston = load_boston()
    x = boston.data
    y = boston.target
    print(boston.feature_names)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    return (x, y, x_train, x_test, y_train, y_test)
def mae_value(y_true, y_pred):
    # Mean absolute error
    n = len(y_true)
    mae = sum(np.abs(y_true - y_pred)) / n
    return mae

def mse_value(y_true, y_pred):
    # Mean squared error
    n = len(y_true)
    mse = sum(np.square(y_true - y_pred)) / n
    return mse
def polynomial_regression(degree=1):
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    # Enable data normalization in the model (the normalize parameter was
    # removed in scikit-learn 1.2, so this needs an older version, as does load_boston)
    linear_regression_model = LinearRegression(normalize=True)
    model = Pipeline([("polynomial_features", polynomial_features),
                      ("linear_regression", linear_regression_model)])
    return model
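# Illustrative aside (not part of the original script): with degree=2 and
# include_bias=False, PolynomialFeatures expands two inputs a, b into
# a, b, a^2, a*b, b^2, e.g.
#   PolynomialFeatures(degree=2, include_bias=False).fit_transform([[2.0, 3.0]])
#   -> [[2. 3. 4. 6. 9.]]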
def train_model(x_train, x_test, y_train, y_test, degrees):
    res = []
    for degree in degrees:
        model = polynomial_regression(degree)
        model.fit(x_train, y_train)
        # score() returns R^2 on the given data
        train_score = model.score(x_train, y_train)
        test_score = model.score(x_test, y_test)
        res.append({"model": model, "degree": degree, "train_score": train_score, "test_score": test_score})
        preds = model.predict(x_test)
        mae = mae_value(y_test, preds)
        mse = mse_value(y_test, preds)
        print("degree:", degree, " MAE:", mae, " MSE:", mse)
    for r in res:
        print("degree: {}; train score: {}; test_score: {}".format(r["degree"], r["train_score"], r["test_score"]))
    return res
def plot_learning_curve(plt, estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    # Plot training and cross-validation scores against training-set size
    plt.title(title)
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r", label="Training scores")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
degrees = [1,2,3]
x, y, x_train, x_test, y_train, y_test = split_data()
res = train_model(x_train, x_test, y_train, y_test, degrees)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plt.figure(figsize=(10, 6))
for index, data in enumerate(res):
    plot_learning_curve(plt, data["model"], "degree %d" % data["degree"], x, y, cv=cv)
plt.show()
To sum up this exercise in analyzing and predicting the Boston housing data with a multivariate regression model (13 dimensions in all): the early fit was unsatisfactory due to insufficient training. After tuning the model and using all of the features, the degree-1 polynomial reached a training score (R²) of 72% and a test score of 76%, while the degree-2 polynomial reached a training score of 92% and a test score of 89%, with MAE = 2.36 and MSE = 8.67. Overall, quadratic polynomial regression gives the better result.