赞
踩
端到端机器学习导航:
【机器学习】python借助pandas加载并显示csv数据文件,并绘制直方图
【机器学习】python使用matplotlib进行二维数据绘图并保存为png图片
【机器学习】python借助pandas及scikit-learn使用三种方法分割训练集及测试集
【机器学习】python借助pandas及matplotlib将输入数据可视化,并计算相关性
【机器学习】python机器学习借助scikit-learn进行数据预处理工作:缺失值填补,文本处理(一)
【机器学习】python机器学习scikit-learn和pandas进行Pipeline处理工作:归一化和标准化及自定义转换器(二)
【机器学习】python机器学习使用scikit-learn评估模型:基于普通抽样及分层抽样的k折交叉验证做模型选择
【机器学习】python机器学习使用scikit-learn对模型进行微调:使用GridSearchCV及RandomizedSearchCV
【机器学习】python机器学习使用scikit-learn对模型进行评估:使用t分布及z分布评估模型误差的95%置信空间
【机器学习】python机器学习使用scikit-learn对模型进行微调:RandomizedSearchCV的分布参数设置
【机器学习】python机器学习使用scikit-learn对模型进行微调:按特征贡献大小保留最重要k个特征的transform
【机器学习】python机器学习使用scikit-learn对模型进行微调:使用RandomizedSearchCV对pipeline进行参数选择
数据准备:以房屋地理位置、人群收入等,预测房价
import os

import pandas as pd

# Default directory that is expected to contain housing.csv.
HOUSING_PATH = os.path.join("datasets", "housing")


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from a directory into a DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``. Defaults to
            ``HOUSING_PATH``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
# Load the housing dataset from the default path.
housing=load_housing_data()
# Keep a separate copy; the helper column "income_cat" is added to this copy
# rather than to `housing` itself.
housing2=housing.copy()
import numpy as np
原始数据做分层抽样,切分出测试集与训练集:
from sklearn.model_selection import StratifiedShuffleSplit

# Bucket median income into five categories and store them in a helper column
# on the copy; the column is used only as the stratification key below.
housing2["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so rows are selected with .iloc.
# (.loc only gives the same rows while the frame keeps its default 0..n-1
# index.) Rows are taken from `housing`, which has no "income_cat" column.
for train_index, test_index in split.split(housing2, housing2["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Separate the features from the prediction target for training.
housing_train = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"]
预处理:填补缺失值,将部分文字字段转化为独热向量,并做标准化处理:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# The text column is handled by the one-hot encoder; every remaining column
# of the training features goes through the numeric pipeline.
cat_attribs = ["ocean_proximity"]
housing_num = housing_train.drop(columns="ocean_proximity")
num_attribs = housing_num.columns.tolist()

# Numeric columns: fill missing values with the column median, then
# standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Route each group of columns to its own transformer.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing_train)
# Fit a linear regression model on the prepared training features.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting are chained.
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)
引入线性回归模型,用10折交叉验证检视其在训练集上的均方根误差(RMSE,由负均方误差得分开方得到);这里的10折采用普通划分,并没有按分层抽样来划分每一折的训练集与验证集:
# Cross-validation yields not only an estimate of model performance but also
# a measure of how precise that estimate is (its standard deviation).
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE values; flip the sign and take the square
# root to get one RMSE per fold.
tree_rmse_scores = np.sqrt(np.negative(scores))
def display_scores(scores):
    """Print a collection of scores with their mean and standard deviation.

    Args:
        scores: Array-like of numeric scores. It is converted with
            ``np.asarray`` so plain lists and tuples are accepted as well
            as NumPy arrays.
    """
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
# Print the per-fold RMSE values of the linear regression cross-validation
# together with their mean and standard deviation.
display_scores(tree_rmse_scores)
输出结果:
Scores: [72229.03469752 65318.2240289 67706.39604745 69368.53738998
66767.61061621 73003.75273869 70522.24414582 69440.77896541
66930.32945876 70756.31946074]
Mean: 69204.32275494763
Standard deviation: 2372.070791055922
下面采用分层抽样的方式做k折交叉验证:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric feature columns: everything except the text column, the prediction
# target and the stratification helper column.
# Leaving "median_house_value" among the features lets the model read the
# answer directly (label leakage), which yields a meaningless near-zero RMSE.
# "income_cat" exists only on housing2, not on the frames that are actually
# transformed. errors="ignore" keeps this working whether or not the helper
# column has already been added to housing2.
housing_num = housing2.drop(
    columns=["ocean_proximity", "median_house_value", "income_cat"],
    errors="ignore",
)

# Numeric columns: fill missing values with the column median, then
# standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# All remaining column names are the numeric attributes.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Fix the one-hot category list from the full dataset. Otherwise a split that
# happens to miss a category would produce a matrix with a different number
# of columns after preprocessing.
categories = housing2['ocean_proximity'].unique()
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(categories=[categories]), cat_attribs),
])
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedShuffleSplit

# Prepare for stratified sampling: bucket median income by threshold bins
# into category numbers and store them in a new column.
# Compared with simple random sampling, stratification can give a smaller
# estimation error; the more the strata differ, the larger the gain.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_labels = [1, 2, 3, 4, 5]
housing2["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bins,
    labels=income_labels,
)
def display_scores(scores):
    """Print a collection of scores with their mean and standard deviation.

    Args:
        scores: Array-like of numeric scores. It is converted with
            ``np.asarray`` so plain lists and tuples are accepted as well
            as NumPy arrays.
    """
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
# Ten stratified shuffle splits, each holding out 10% of the rows for
# evaluation. This is repeated stratified hold-out, not a partition of the
# data into ten disjoint folds.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
from sklearn.base import clone

cross_score_SKF = []
lin_reg = LinearRegression()
for train_index, test_index in split.split(housing2, housing2["income_cat"]):
    # split() yields positional row indices, so rows are selected with .iloc.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

    # Fit the preprocessing and the model on the training part only.
    housing_labels = strat_train_set["median_house_value"]
    housing_prepared = full_pipeline.fit_transform(strat_train_set)
    lin_reg.fit(housing_prepared, housing_labels)

    # The held-out part reuses the already-fitted preprocessing, hence
    # transform() rather than fit_transform().
    thousing_labels = strat_test_set["median_house_value"]
    thousing_prepared = full_pipeline.transform(strat_test_set)
    thousing_predictions = lin_reg.predict(thousing_prepared)

    # Record the RMSE of this split.
    lin_mse = mean_squared_error(thousing_labels, thousing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    cross_score_SKF.append(lin_rmse)

# Summarize the RMSE values collected over all splits.
display_scores(np.array(cross_score_SKF))
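输出结果为(注意:下面的RMSE只有约1e-10,这并不说明模型如此精确,而是因为上面的num_attribs里包含了目标列median_house_value,发生了标签泄漏;把目标列从特征中去掉之后,RMSE预计会回到与前面普通交叉验证相近的量级):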
Scores: [2.08293359e-10 1.26122971e-10 5.91227154e-10 1.15558193e-10
7.51679294e-10 1.42969263e-10 6.15771604e-10 7.53810961e-10
1.08330749e-10 2.12682576e-10]
Mean: 3.6264461229287684e-10
Standard deviation: 2.639928513333572e-10
最后是模型持久化:
# Save the fitted model to disk. `lin_reg` is the estimator trained above;
# the name `my_model` used here previously is never defined in this script.
import joblib
joblib.dump(lin_reg, "my_model.pkl")
# Load the model back.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that come from a trusted source.
my_model_loaded = joblib.load("my_model.pkl")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。