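# Like (button label carried over from the source web page)
# Dislike (button label carried over from the source web page)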
# House-price exploration and OLS regression on house_train.csv.
#
# Pipeline: load the data, look at the price distribution, drop outliers for
# three features, drop rows with missing values, plot correlations, then fit
# an OLS model on a training split.
#
# copyright by Interstellar-Ark-AI on 2020_5_23

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sci
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Seed for the train/test split so that repeated runs fit the same model.
RANDOM_STATE = 42


def _drop_price_outliers(frame, feature, feature_above, price_below):
    """Plot ``feature`` against price, drop outlying rows in place, plot again.

    A row is treated as noise when ``frame[feature] > feature_above`` and
    ``frame['price'] < price_below``.

    Args:
        frame: DataFrame holding ``feature`` and ``'price'``; modified in place.
        feature: Name of the column plotted on the x axis.
        feature_above: Rows must exceed this value of ``feature`` to be dropped.
        price_below: Rows must also have a price below this value to be dropped.

    Returns:
        The index labels of the rows that were removed.
    """
    sns.jointplot(x=feature, y='price', data=frame)  # before cleaning
    plt.show()

    is_outlier = (frame[feature] > feature_above) & (frame['price'] < price_below)
    dropped = frame[is_outlier].index
    frame.drop(index=dropped, inplace=True)

    sns.jointplot(x=feature, y='price', data=frame)  # after cleaning
    plt.show()
    return dropped


# --- Load the data set and inspect its shape, types and distribution --------
data = pd.read_csv("house_train.csv")

# Bare `data.head()` / `data.shape` expressions show nothing when this runs
# as a script, so print them explicitly.
print(data.head())
print(data.shape)
print(data['price'].describe())  # summary statistics of the target column

# Distribution of the price (histogram with a density curve).
sns.distplot(data['price'])
plt.show()


# --- Cleaning: remove outliers of area / PM2.5 / crime rate versus price ----
# `index_del` ends up holding the rows dropped by the last rule.
index_del = _drop_price_outliers(data, 'area', 100, 700)
index_del = _drop_price_outliers(data, 'pm25', 80, 400)
index_del = _drop_price_outliers(data, 'crime_rate', 7.5, 500)


# --- Cleaning: find columns with missing values and drop affected rows ------
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = null_mask.mean().sort_values(ascending=False)  # share of nulls per column
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# 'distirct' is the column name as used throughout this script (presumably the
# CSV header spelling), so it must not be "corrected" here. It is cast to str
# so it is handled as a categorical value rather than a number.
# NOTE: astype(str) also turns missing values into the string 'nan'.
data['distirct'] = data['distirct'].astype(str)

print(missing_data.head(13))  # columns with the most missing values

data1 = data.drop(['id'], axis=1)            # the id column carries no signal
data2 = data1.dropna(subset=['crime_rate'])  # drop rows without a crime rate
data3 = data2.dropna(subset=['green_rate'])  # drop rows without a green rate

print(data3.isnull().sum().max())  # largest per-column null count left


# --- Correlation heat map of the numeric columns ----------------------------
plt.rcParams['figure.figsize'] = (15, 10)

# Restrict to numeric columns explicitly: data3 contains the string column
# 'distirct', and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corrmatrix = data3.select_dtypes(include='number').corr()

sns.heatmap(corrmatrix, square=True, vmax=1, vmin=-1, center=0.0, cmap='coolwarm')
plt.show()


# --- Pearson correlation of the columns most correlated with price ----------
k = 10  # number of columns kept; 'price' itself is one of them

cols = corrmatrix.nlargest(k, 'price')['price'].index
cm = np.corrcoef(data3[cols].values.T)

sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cmap='RdPu',
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()


# --- Model 1: OLS regression -------------------------------------------------
feature_data = data3.drop(['price'], axis=1)
target_data = data3['price']  # prediction target

# Hold-out split: 70% of the rows for fitting, 30% kept aside for testing.
X_train, X_test, y_train, y_test = train_test_split(
    feature_data,
    target_data,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# The formula API needs features and target in a single frame.
df_train = pd.concat([X_train, y_train], axis=1)

print(data['distirct'].describe())

# C(...) marks floor, oriented and distirct as categorical predictors.
lr_model = ols(
    "price~area+C(floor)+C(oriented)+crime_rate+pm25+C(distirct)",
    data=df_train,
).fit()

# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.