赞
踩
随着房地产市场的发展,房价越来越高。为了找出影响房价增长的因素,下面从数据角度出发,分析一下左右房价的因素。
数据介绍
import math
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.outliers_influence import summary_table
from pylab import mpl
import copy
注:若没有statsmodels模块,请到Terminal用如下命令安装
sudo pip3 install statsmodels
# Terminal在jupyter首页NEW-Other-Terminal
# Load the second-hand housing data set and configure matplotlib for Chinese text.
data_source = '../data/housedata.csv'  # source data file
df = pd.read_csv(data_source, encoding='UTF8')  # raw second-hand housing records
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# Work on a copy: a plain `df_ana = df` would alias the raw frame, so every
# later in-place column edit on df_ana would silently rewrite df as well.
df_ana = df.copy()
df_ana.head()  # preview a few rows
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-OaeDIcdE-1629077234143)(Matplotlib基础课程.assets/Matplotlib_09_1.png)]
len(df_ana) # number of samples
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Vx99izwd-1629077234145)(Matplotlib基础课程.assets/Matplotlib_09_2.png)]
# Divide the price column by 10000 (the axis labels used later express it in
# units of ten thousand yuan per square metre), then preview the result.
df_ana['price'] = df_ana['price'].div(10000)
df_ana.head()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UMvnRrbI-1629077234146)(Matplotlib基础课程.assets/Matplotlib_09_3.png)]
# Distinct values of the CATE column, in ascending order.
list_roma = sorted(set(df_ana['CATE']))
list_roma
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UbpIYrsH-1629077234148)(Matplotlib基础课程.assets/Matplotlib_09_4.png)]
# Chinese district names. They are paired positionally with the sorted codes
# in list_roma, so they must be listed in that same order.
list_chinese = ['朝阳','东城','丰台','海淀','石景山','西城']
if len(list_roma) != len(list_chinese):
    # A length mismatch would pair codes with the wrong names (or drop some),
    # so fail loudly instead of building a misleading mapping.
    raise ValueError(
        'expected %d district codes in CATE, found %d'
        % (len(list_chinese), len(list_roma))
    )
dict_map = dict(zip(list_roma, list_chinese))  # CATE code -> Chinese name
dict_map
用汉字替换拼音:
# Replace every romanised district code in CATE with its Chinese name.
# regex=False requests literal matching explicitly: the codes are plain text
# and the default of `regex` is not the same across pandas versions.
for roma, chinese in dict_map.items():
    df_ana['CATE'] = df_ana['CATE'].str.replace(roma, chinese, regex=False)
df_ana.head()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-MttjmoX6-1629077234150)(Matplotlib基础课程.assets/Matplotlib_09_6.png)]
# Order the rows by the floor column (row-wise, ascending: the defaults).
df_ana = df_ana.sort_values('floor')
df_ana.head()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-neHNSOuM-1629077234150)(Matplotlib基础课程.assets/Matplotlib_09_7.png)]
# Histogram of the price column.
plt.hist(df_ana["price"],color='lightblue')
plt.xlabel('单位面积房价(万元/平方米)')
plt.ylabel('频度')
# Mean of the price column.
df_price_avg = df_ana['price'].mean()
df_price_avg
# Median of the price column.
df_price_middle = df_ana['price'].median()
df_price_middle
# Single row with the highest price (sort descending, keep the first row).
df_price_max = df_ana.sort_values(by='price',ascending=False).head(1)
df_price_max
# Single row with the lowest price (sort ascending, keep the first row).
df_price_min = df_ana.sort_values(by='price',ascending=True).head(1)
df_price_min
# Box plot of price grouped by the CATE (district) column.
df_ana.boxplot(column=["price"],by=['CATE'])
plt.ylabel("单位面积房价(万元/平方米)")
plt.xlabel("城区")
# Box plots of price grouped by the subway, school and bedrooms columns.
# NOTE(review): the `labels` keyword is forwarded to the plotting backend;
# confirm it actually changes the tick labels on the pandas version in use.
df_ana.boxplot(column=['price'],by=['subway'],labels=['非地铁房','地铁房'])
df_ana.boxplot(column=['price'],by=['school'],labels=['非学区房','学区房'])
df_ana.boxplot(column=['price'],by=['bedrooms'])
考察客厅数量对房价的影响:
# Box plot of price grouped by the halls column.
df_ana.boxplot(column=['price'],by=['halls'])
考察楼层高低对房价的影响:
# Box plot of price grouped by the floor column.
df_ana.boxplot(column=['price'],by=['floor'])
# Start a new figure so the scatter is not drawn onto the box plot's axes.
plt.figure()
# Scatter of price against area. The format string carries only the marker
# ('o'); a colour letter there would clash with the explicit color keyword.
plt.plot(df_ana['AREA'], df_ana['price'], 'o', color='lightblue')
plt.xlabel('面积(平方米)')
plt.ylabel('单位面积房价(万元/平方米)')
plt.show()
对客厅数做因子化处理,将其转换为二分变量(有客厅/无客厅),使模型结果更易于解读。
def fun(x):
    """Map a hall count to a binary "has at least one hall" indicator.

    Args:
        x: A single value taken from the ``halls`` column.

    Returns:
        int: ``1`` when ``x`` is a non-zero integer, otherwise ``0``.
        Non-integer values (strings, ``None``, floats, ...) yield ``0``.
    """
    # np.integer covers NumPy scalars such as np.int64; they are not
    # instances of the built-in int and would otherwise always map to 0.
    if isinstance(x, (int, np.integer)):
        return 0 if x == 0 else 1
    return 0
# Keep a second reference to the frame, then derive the have_halls column by
# passing every halls value through fun.
style_halls = df_ana
df_ana['have_halls'] = df_ana['halls'].apply(fun)
# Columns used as explanatory variables when the design matrix is built.
col_n = [
    'CATE',
    'bedrooms',
    'AREA',
    'floor',
    'subway',
    'school',
    'have_halls',
]
将变量参数数据化
# Response variable: the price column.
y = df_ana.price
# Explanatory variables, restricted to the columns named in col_n.
x = pd.DataFrame(df_ana, columns=col_n)
# One-hot encode the two categorical columns. dtype=int keeps the indicators
# numeric: pandas >= 2.0 otherwise emits bool columns, and mixing those with
# the numeric columns yields an object-dtype matrix that statsmodels rejects.
x_dum_cate = pd.get_dummies(x['CATE'], dtype=int)
x_dum_floor = pd.get_dummies(x['floor'], dtype=int)
# Swap the raw categorical columns for their indicators. The column order is
# unchanged: remaining columns, then CATE indicators, then floor indicators.
x = pd.concat(
    [x.drop(columns=['CATE', 'floor']), x_dum_cate, x_dum_floor],
    axis=1,
)
# NOTE(review): every category level is kept alongside the intercept, so the
# matrix is rank-deficient. Code further down relies on this exact column
# layout, so the encoding is deliberately left as is.
X = sm.add_constant(x)  # prepend the intercept column
查看数据:
# Preview the first rows of the design matrix X.
X.head()
线性回归模型(因变量:单位面积房价):
# Ordinary least squares with y as the response and X as the design matrix.
model = sm.OLS(y, X)
result = model.fit()
result.params  # estimated coefficients
result.summary()

# Fitted values and raw residuals of the model above.
y_hat = result.predict(X)
residuals = y - y_hat

fig = plt.figure()

# Panel 1: residuals against fitted values.
fg1 = fig.add_subplot(221)
fg1.set_title('Residuals VS Fitted')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.scatter(y_hat, residuals)

# Panel 2: normal Q-Q plot of the standardized residuals.
residuals_n = (residuals - np.mean(residuals)) / np.std(residuals)
sorted_ = np.sort(residuals_n)
# Mid-point plotting positions (i + 0.5) / n stay strictly inside (0, 1).
# Starting at 0 would map the smallest residual to norm.ppf(0) = -inf and
# drop it from the plot.
yvals = (np.arange(len(sorted_)) + 0.5) / float(len(sorted_))
x_label = stats.norm.ppf(yvals)
fg2 = fig.add_subplot(222)
fg2.set_title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
plt.scatter(x_label, sorted_)

# Panel 3: scale-location plot.
residuals_sq = np.sqrt(abs(residuals_n))
fg3 = fig.add_subplot(223)
fg3.set_title('Scale-Location')
plt.xlabel('Fitted Values')
plt.ylabel('sqrt(Standardized residuals)')
plt.scatter(y_hat, residuals_sq)

# Panel 4: Cook's distance per observation, used to spot influential points.
# It is taken from the fitted model's influence measures; cooks_distance
# returns (distances, p-values) and only the distances are plotted.
cooks_d = np.asarray(result.get_influence().cooks_distance[0])
fg4 = fig.add_subplot(224)
fg4.set_title("Cook's distance")
plt.xlabel('Obs number')
plt.ylabel("Cook's distance")
plt.plot(cooks_d)
# Take the natural log of every price to obtain y_log.
y_log = df_ana['price'].apply(math.log)
# Regression of the log price on the same design matrix X.
model = sm.OLS(y_log, X)
result = model.fit()
result.params
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uXK94rkO-1629077234162)(Matplotlib基础课程.assets/Matplotlib_09_27.png)]
考察模型参数:
# Summary of the model currently held in `result`.
result.summary()

# Extend a deep copy of X with one "<district>_学区" column per district,
# each being the product of that district's indicator and the school column.
X2 = X.copy(deep=True)
for district in ('丰台', '朝阳', '东城', '海淀', '石景山', '西城'):
    X2[district + '_学区'] = X2[district] * X2['school']

# Regression of the log price on the design matrix with the interaction terms.
model = sm.OLS(y_log, X2)
result = model.fit()
residus = result.resid
result.params
# Diagnostic plots for the model currently held in `result`.
# Fitted values and residuals are read from `result` itself, so the panels
# describe this fit rather than values left over from an earlier model, and a
# fresh figure is created instead of drawing onto a previously used one.
y_hat = result.fittedvalues
residuals = result.resid
fig = plt.figure()

# Panel 1: residuals against fitted values.
fg = fig.add_subplot(221)
fg.set_title('Residuals VS Fitted')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.scatter(y_hat, residuals)

# Panel 2: normal Q-Q plot of the standardized residuals.
residuals_n = (residuals - np.mean(residuals)) / np.std(residuals)
sorted_ = np.sort(residuals_n)
# Mid-point plotting positions (i + 0.5) / n stay strictly inside (0, 1).
# Starting at 0 would map the smallest residual to norm.ppf(0) = -inf and
# drop it from the plot.
yvals = (np.arange(len(sorted_)) + 0.5) / float(len(sorted_))
x_label = stats.norm.ppf(yvals)
fg = fig.add_subplot(222)
fg.set_title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
plt.scatter(x_label, sorted_)

# Panel 3: scale-location plot.
residuals_sq = np.sqrt(abs(residuals_n))
fg3 = fig.add_subplot(223)
fg3.set_title('Scale-Location')
plt.xlabel('Fitted Values')
plt.ylabel('sqrt(Standardized residuals)')
plt.scatter(y_hat, residuals_sq)

# Panel 4: Cook's distance per observation, used to spot influential points.
# It is taken from the fitted model's influence measures; cooks_distance
# returns (distances, p-values) and only the distances are plotted.
cooks_d = np.asarray(result.get_influence().cooks_distance[0])
fg4 = fig.add_subplot(224)
fg4.set_title("Cook's distance")
plt.xlabel('Obs number')
plt.ylabel("Cook's distance")
plt.plot(cooks_d)
假设需要在西城区买一套临近地铁的学区房,面积85平米,大概需要多少钱?
# Feature vector for the hypothetical listing. Its entries must follow the
# column order of the design matrix the current model was fitted on.
house_new = [1,2,85,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1]
# predict() is given a single row (2-D input) and returns a length-1 array.
# Exponentiate it and pull out a plain float before formatting: applying
# '%.2f' to an array relies on an implicit array-to-scalar conversion that
# NumPy has deprecated.
log_price = np.asarray(result.predict([house_new]))
price_new = float(np.exp(log_price[0]))
# Unit price.
price_new
print('房价预测值为:%.2f'%price_new,'万元/平方米')
print('房价预测值为:%.2f'%(85*price_new),'万元')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。