# 赞
# 踩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the run.
warnings.filterwarnings(action='ignore')

# Load the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
# --- 1. Inspect the data -----------------------------------------------------
# Outside a notebook a bare expression displays nothing, so the inspection
# results are printed explicitly.
print(train.shape)  # (rows, columns) of the training table
print(test.shape)   # (rows, columns) of the test table
train.info()        # per column: name, non-null count, dtype (object = string)
# Summary statistics of the non-object columns: mean, std (standard
# deviation), 25% (value at the 25th percentile), ...
print(train.describe())
# NaN count per column, largest first.
print(train.isnull().sum().sort_values(ascending=False))
# Share of NaN per column, largest first, for both tables.
print(train.isnull().sum().sort_values(ascending=False) / train.shape[0])
print(test.isnull().sum().sort_values(ascending=False) / test.shape[0])

# --- 2. Clean the data -------------------------------------------------------
# Columns the author singled out as having a high NaN share; removed from
# both tables so they keep the same feature set.
sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
train.drop(columns=sparse_columns, inplace=True)
test.drop(columns=sparse_columns, inplace=True)

# Partition the remaining columns by dtype: object columns are treated as
# categorical, everything else as numeric.
number_columns = [col for col in train.columns if train[col].dtype != 'object']
category_columns = [col for col in train.columns if train[col].dtype == 'object']

# --- 3. Distribution of every numeric column ---------------------------------
# The grid height is derived from the column count instead of being fixed,
# so the plot still works if the set of columns changes.
plots_per_row = 3
n_rows = max(1, int(np.ceil(len(number_columns) / plots_per_row)))
fig, axes = plt.subplots(nrows=n_rows, ncols=plots_per_row, figsize=(20, 18))
axes = axes.flatten()
for ax, col in zip(axes, number_columns):
    # histplot replaces the deprecated distplot; kde + density scaling keeps
    # the same histogram-with-curve look.
    sns.histplot(train[col], kde=True, stat="density", ax=ax)
plt.tight_layout()

# YearBuilt vs SalePrice (scatter plot).
plt.figure(figsize=(16, 8))
plt.title("YearBuilt vs SalePrice")
# x / y must be passed by keyword: current seaborn accepts only `data`
# positionally.
sns.scatterplot(x='YearBuilt', y='SalePrice', data=train)
plt.show()

# 1stFlrSF (first-floor area) vs SalePrice (scatter plot).
plt.figure(figsize=(16, 8))
sns.scatterplot(x='1stFlrSF', y='SalePrice', data=train)
plt.show()

# --- 4. SalePrice against every categorical column ---------------------------
n_rows = max(1, int(np.ceil(len(category_columns) / plots_per_row)))
fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(25, 20))
axes = axes.flatten()
for ax, col in zip(axes, category_columns):
    sns.stripplot(x=col, y='SalePrice', data=train, ax=ax)
plt.tight_layout()
plt.show()
# 1. Columns of `train` holding at least one NaN, split into numeric and
#    categorical names.
train_nan_num = [name for name in number_columns if train[name].isnull().sum() > 0]
train_nan_cat = [name for name in category_columns if train[name].isnull().sum() > 0]

# 2. The same bookkeeping for `test`.
# `test` has no SalePrice column (it is the label), so it is taken out of the
# numeric list before `test` is scanned.
number_columns.remove('SalePrice')
test_nan_num = [name for name in number_columns if test[name].isnull().sum() > 0]
test_nan_cat = [name for name in category_columns if test[name].isnull().sum() > 0]
# Build copies with every NaN-containing row removed (`train` / `test`
# themselves are not modified) and print the shapes that remain.
train_one, test_one = (frame.dropna(axis=0) for frame in (train, test))
for remaining in (train_one, test_one):
    print(remaining.shape)
# Fill the missing values recorded in the *_nan_num / *_nan_cat lists:
# numeric columns get the column median, categorical columns get the
# literal string 'None'.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[col]`: that form is a chained in-place
# update, which does not modify the parent frame when pandas Copy-on-Write
# is active.

# train
for col in train_nan_num:
    train[col] = train[col].fillna(train[col].median())  # median imputation
for col in train_nan_cat:
    train[col] = train[col].fillna('None')

# test
for col in test_nan_num:
    test[col] = test[col].fillna(test[col].median())  # median imputation
for col in test_nan_cat:
    test[col] = test[col].fillna('None')
# --- 1. Encode the categorical columns ---------------------------------------
# Label encoding maps each distinct category to an integer,
# e.g. A, B, C, D, E -> 0, 1, 2, 3, 4.
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
for col in category_columns:
    # Fit on the categories of BOTH tables. Fitting separately on train and
    # test can give the same category different integer codes in each table,
    # which makes the test features meaningless to a model fitted on train.
    LE.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = LE.transform(train[col])
    test[col] = LE.transform(test[col])

# --- 2. Build the feature matrix and the label vector ------------------------
# Id is an identifier rather than a feature, and SalePrice is the label.
X = train.drop(columns=['Id', 'SalePrice']).values
y = train['SalePrice'].values

# --- 3. Hold-out split -------------------------------------------------------
from sklearn.model_selection import train_test_split

# 30% of the rows are held out for validation; rows are shuffled, and the
# seed is fixed so the split (and every score computed from it) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
# 1. Linear regression baseline on the hold-out split.
from sklearn.linear_model import LinearRegression
# mean_absolute_error is kept in scope for any later code that still
# references it; the score below uses mean_squared_error.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Metric: root mean squared error between log(actual) and log(predicted).
# MSE is the mean of the squared differences between prediction and truth;
# the smaller it is, the better the model fits the data. (The earlier code
# took the square root of the mean ABSOLUTE error, which is not an RMSE.)

LR = LinearRegression()      # model
LR.fit(X_train, y_train)     # train
y_pred = LR.predict(X_test)  # predict

# A linear model can output zero or negative prices, for which the log is
# undefined; flooring at 1 keeps the score finite while still penalising
# such predictions heavily.
y_pred_floor = np.maximum(y_pred, 1.0)
rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_floor)))
print(f'Root Mean Squared Error : {rmse}')
# 10-fold cross-validation of the linear regression model.
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)  # 10 folds
rmse_scores = []         # one score per fold
for train_indices, test_indices in kf.split(X):  # index arrays for each fold
    X_train, X_test = X[train_indices], X[test_indices]  # fold features
    y_train, y_test = y[train_indices], y[test_indices]  # fold labels

    # `normalize=True` is no longer accepted by LinearRegression in current
    # scikit-learn, so the estimator is built with its defaults.
    LR = LinearRegression()
    LR.fit(X_train, y_train)     # train
    y_pred = LR.predict(X_test)  # predict

    # Score: RMSE on the log scale. Non-positive predictions are floored at 1
    # so the log is defined; taking abs() instead would make a large negative
    # prediction look like a plausible price.
    y_pred_floor = np.maximum(y_pred, 1.0)
    rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_floor)))
    rmse_scores.append(rmse)  # collect this fold's result

print("rmse scores : ", rmse_scores)
print(f'average rmse score : {np.mean(rmse_scores)}')
# 2. Random forest (regression), 10-fold cross-validation.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    RFR = RandomForestRegressor()  # base model with default settings
    RFR.fit(X_train, y_train)      # train
    y_pred = RFR.predict(X_test)   # predict

    # Score: RMSE on the log scale, so the number really is an RMSE and is
    # comparable across models (the earlier code stored a raw-scale mean
    # absolute error under the name "rmse"). Predictions are floored at 1 to
    # keep the log defined.
    y_pred_floor = np.maximum(y_pred, 1.0)
    rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_floor)))
    rmse_scores.append(rmse)

print("rmse scores : ", rmse_scores)
print(f'average rmse scores : {np.mean(rmse_scores)}')
# 3. LightGBM (regression), 10-fold cross-validation.
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    LGBR = lgb.LGBMRegressor()     # base model with default settings
    LGBR.fit(X_train, y_train)     # train
    y_pred = LGBR.predict(X_test)  # predict

    # Score: RMSE on the log scale, so the number really is an RMSE and is
    # comparable across models (the earlier code stored a raw-scale mean
    # absolute error under the name "rmse"). Predictions are floored at 1 to
    # keep the log defined.
    y_pred_floor = np.maximum(y_pred, 1.0)
    rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_floor)))
    rmse_scores.append(rmse)

print("rmse scores : ", rmse_scores)
print(f'average rmse scores : {np.mean(rmse_scores)}')
# 4. XGBoost (regression), 10-fold cross-validation.
import xgboost as xgb
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    XGBR = xgb.XGBRegressor()      # base model with default settings
    XGBR.fit(X_train, y_train)     # train
    y_pred = XGBR.predict(X_test)  # predict

    # Score: RMSE on the log scale, so the number really is an RMSE and is
    # comparable across models (the earlier code stored a raw-scale mean
    # absolute error under the name "rmse"). Predictions are floored at 1 to
    # keep the log defined.
    y_pred_floor = np.maximum(y_pred, 1.0)
    rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_floor)))
    rmse_scores.append(rmse)

print("rmse scores : ", rmse_scores)
print(f'average rmse scores : {np.mean(rmse_scores)}')
# 1. Use the LightGBM regressor: refit it on every training row, then
#    predict SalePrice for the test table (Id is dropped, as it was for X).
LGBR.fit(X, y)
test_features = test.drop('Id', axis=1).values
test_pred = LGBR.predict(test_features)

# Write the predictions as a single-column CSV (header row, no index column).
result_df = pd.DataFrame({'SalePrice': test_pred})
result_df.to_csv('LGBR_base_model.csv', index=None, header=True)

# Line plot of the predictions: x is the row position, y the predicted price.
result_df['SalePrice'].plot(figsize=(16, 8))
# LightGBM through the native training API, with explicit parameters and
# early stopping.
# NOTE(review): X_train / X_test / y_train / y_test are read from module
# scope, i.e. whatever split was assigned last before this point — confirm
# that this is the intended hold-out set.
train_data = lgb.Dataset(X_train, label=y_train)                       # training set
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)   # validation set

# Training parameters.
params = {
    'objective': 'regression',  # task
    'metric': 'rmse',           # evaluation metric
    'learning_rate': 0.1,       # learning rate
    'max_depth': 15,            # maximum tree depth
    'num_leaves': 20,           # leaves per tree
}

# Train for at most 300 rounds and stop once the validation RMSE has not
# improved for 30 consecutive rounds. Early stopping is requested through
# the callback because `lgb.train` no longer accepts an
# `early_stopping_rounds=` keyword in current LightGBM releases.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=300,
    valid_sets=[test_data],
    valid_names=['test'],
    callbacks=[lgb.early_stopping(stopping_rounds=30)],
)

# Validation RMSE at the best iteration.
score = model.best_score['test']['rmse']

# Predict the test table (without Id) and export the predictions.
test_pred = model.predict(test.drop('Id', axis=1).values)
result_df2 = pd.DataFrame(columns=['SalePrice'])
result_df2['SalePrice'] = test_pred
result_df2.to_csv('LGBR_model2.csv', index=None, header=True)
result_df2['SalePrice'].plot(figsize=(16, 8))
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。