赞
踩
fuelType:燃料类型
rating:评级
renterTripsTaken:租车者出行次数
reviewCount:评论数量
location.city:位置.城市
location.country:位置.国家/地区
location.latitude:位置.纬度
location.longitude:位置.经度
location.state:位置.州
owner.id:所有者id
rate.daily:每日费率
vehicle.make:车辆品牌(制造商)
vehicle.model:车辆型号
vehicle.type:车辆类型
vehicle.year:车辆.年份
import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import plotly as py import plotly_express as px %matplotlib inline import warnings warnings.filterwarnings('ignore') #导入数据 df = pd.read_csv('C:/CarRentalData.csv',engine='python') df.head() #数据集大小 df.shape #(5851, 15)
# Count listings per fuel type. rename_axis()/reset_index(name=...) name both
# columns explicitly, so the result is ['fuelType', 'count'] no matter how the
# installed pandas labels the output of value_counts(). (Renaming
# 'fuelType' -> 'count' afterwards yields two 'count' columns on pandas >= 2.0.)
df_fuelType = (
    df['fuelType']
    .value_counts()
    .rename_axis('fuelType')
    .reset_index(name='count')
)

# Donut chart of the fuel-type shares. `color` must point at the column whose
# values are the keys of color_discrete_map; without it the map is not applied.
fig = px.pie(
    df_fuelType,
    values='count',
    names='fuelType',
    color='fuelType',
    title='Fuel Type',
    hole=.3,
    color_discrete_map={
        'ELECTRIC': 'lightcyan',
        'HYBRID': 'cyan',
        'GASOLINE': 'royalblue',
        'DIESEL': 'darkblue',
    },
)
fig.show()
# Summary statistics and distribution of the rating column.
print("Rating Statistics:")
print(df['rating'].describe())
# Recorded output of the original run:
#   count    5350.000000
#   mean        4.920325
#   std         0.182425
#   min         1.000000
#   25%         4.900000
#   50%         5.000000
#   75%         5.000000
#   max         5.000000
#   Name: rating, dtype: float64

fig = px.histogram(
    data_frame=df,
    x='rating',
    title='Histogram of Rental Car Rating',
)
fig.show()
# Summary statistics and distribution of the renterTripsTaken column.
print("Renter Trips Taken Statistics:")
print(df['renterTripsTaken'].describe())
# Recorded output of the original run:
#   count    5851.000000
#   mean       33.477354
#   std        41.898954
#   min         0.000000
#   25%         5.000000
#   50%        18.000000
#   75%        46.000000
#   max       395.000000
#   Name: renterTripsTaken, dtype: float64

fig = px.histogram(
    data_frame=df,
    x='renterTripsTaken',
    title='Histogram of Renter Trips Taken',
)
fig.show()
# Summary statistics and distribution of the reviewCount column.
print("Review Count Statistics:")
print(df['reviewCount'].describe())
# Recorded output of the original run:
#   count    5851.000000
#   mean       28.454794
#   std        35.136113
#   min         0.000000
#   25%         4.000000
#   50%        16.000000
#   75%        39.000000
#   max       321.000000
#   Name: reviewCount, dtype: float64

fig = px.histogram(
    data_frame=df,
    x='reviewCount',
    title='Histogram of Review Count',
)
fig.show()
import plotly.graph_objects as go


def get_average_lat_long(city, ltype):
    """Return the mean latitude or longitude of the listings in ``city``.

    Args:
        city: Value compared against the ``location.city`` column of ``df``.
        ltype: ``0`` selects the mean latitude; any other value selects the
            mean longitude.

    Returns:
        The mean of the selected coordinate column over the matching rows.
    """
    choices = df[df['location.city'] == city]
    column = 'location.latitude' if ltype == 0 else 'location.longitude'
    return choices[column].mean()


# One row per city with its listing count, most frequent city first.
# rename_axis()/reset_index(name=...) give the columns ['city', 'count'] on
# every pandas version.
df_location = (
    df['location.city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='count')
)

# Mean coordinates per city, computed in one grouped pass instead of
# re-filtering the whole frame twice for every city.
city_centres = df.groupby('location.city')[
    ['location.latitude', 'location.longitude']
].mean()
df_location['latitude'] = df_location['city'].map(city_centres['location.latitude'])
df_location['longitude'] = df_location['city'].map(city_centres['location.longitude'])
df_location['text'] = df_location['city'] + '<br>Car Rentals ' + (df_location['count']).astype(str)

# Inclusive rank bands over the rows of df_location (row 0 is the city with
# the most listings). Each band gets its own colour and legend entry.
limits = [(0,20),(21,50),(51,150),(151,200),(201,1000),(1001,1500)]
colors = ["royalblue","orange","lightgrey","lightseagreen","red","crimson"]
scale = 0.5

fig = go.Figure()

for (first, last), colour in zip(limits, colors):
    # The bands are inclusive on both ends, so slice up to last + 1; a plain
    # [first:last] slice would silently drop rows 20, 50, 150, 200 and 1000.
    df_sub = df_location.iloc[first:last + 1]
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['count']/scale,
            color = colour,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(first, last)))

fig.update_layout(
    title_text = 'Car Rentals by City',
    showlegend = True,
    geo = dict(
        scope = 'usa',
        landcolor = 'rgb(217, 217, 217)',
    )
)
fig.show()
# Listings per state. Both output columns are named explicitly so the frame is
# ['state', 'count'] on every pandas version (the former rename map produced
# two 'count' columns and no 'state' column on pandas >= 2.0).
df_state = (
    df['location.state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)

fig = go.Figure(data=go.Choropleth(
    locations=df_state['state'],          # state codes to colour
    z = df_state['count'].astype(float),  # value that drives the colour scale
    locationmode = 'USA-states',          # how entries in `locations` are matched
    colorscale = 'Reds',
    colorbar_title = "Number of Cars Rented",
))

fig.update_layout(
    title_text = 'Car Rentals by State',
    geo_scope='usa',
)
fig.show()
# Number of listings per owner. Naming the columns explicitly guarantees that
# 'number of rental cars' holds the counts; with the former rename map it
# ended up holding the owner ids on pandas >= 2.0, so describe() summarised
# ids instead of counts.
df_owner = (
    df['owner.id']
    .value_counts()
    .rename_axis('owner_id')
    .reset_index(name='number of rental cars')
)

print('Total Number of Unique Rental Cars per Owner Statistics:')
print(df_owner['number of rental cars'].describe())
# Recorded output of the original run:
#   count    3093.000000
#   mean        1.891691
#   std         2.789205
#   min         1.000000
#   25%         1.000000
#   50%         1.000000
#   75%         2.000000
#   max        49.000000
#   Name: number of rental cars, dtype: float64

fig = px.histogram(df_owner, x = 'number of rental cars', title='Total Number of Unique Rental Cars per Owner')
fig.show()
# Summary statistics and distribution of the rate.daily column.
print('Daily Rate of Car Rental Statistics:')
print(df['rate.daily'].describe())
# Recorded output of the original run:
#   count    5851.000000
#   mean       93.691506
#   std        96.080920
#   min        20.000000
#   25%        45.000000
#   50%        69.000000
#   75%       110.000000
#   max      1500.000000
#   Name: rate.daily, dtype: float64

fig = px.histogram(
    data_frame=df,
    x='rate.daily',
    title='Daily Rate of Car Rental',
)
fig.show()
# Listings per (make, model) pair; reset_index(name=...) names the size
# column directly instead of renaming the default column 0 afterwards.
df_make_model = (
    df.groupby(['vehicle.make', 'vehicle.model'])
    .size()
    .reset_index(name='count')
)

# Unify the two spellings of the same make before totalling per make.
df_make_model.replace('Mercedes-benz', 'Mercedes-Benz', inplace=True)

# Total listings per make, broadcast back onto every (make, model) row.
# A grouped transform does this in one pass; the previous apply() re-filtered
# the whole frame once per row.
df_make_model['make_count'] = (
    df_make_model.groupby('vehicle.make')['count'].transform('sum')
)
df_make_model.sort_values(by = 'make_count', ascending=False, inplace=True)

# Stacked bar per make (one segment per model), restricted to makes with more
# than 45 listings in total.
fig = px.bar(df_make_model[df_make_model['make_count'] >45], x = 'vehicle.make', y='count', color = 'vehicle.model', title='Make and Model of Top 25 Most Rented Cars')
fig.update_layout(showlegend = False)
fig.show()
# Listings per vehicle type. Naming both columns explicitly keeps the frame
# as ['vehicle.type', 'count'] on every pandas version (the former rename map
# turned the label column into a second 'count' column on pandas >= 2.0).
df_vehicleType = (
    df['vehicle.type']
    .value_counts()
    .rename_axis('vehicle.type')
    .reset_index(name='count')
)

# Pie chart of the vehicle-type shares.
fig = px.pie(df_vehicleType, values = 'count', names='vehicle.type', title = 'Vehicle Type of Rented Cars')
fig.show()
# Summary statistics and distribution of the vehicle.year column.
print('Vehicle Year Statistics:')
print(df['vehicle.year'].describe())
# Recorded output of the original run:
#   count    5851.000000
#   mean     2015.340113
#   std         4.050813
#   min      1955.000000
#   25%      2014.000000
#   50%      2016.000000
#   75%      2018.000000
#   max      2020.000000
#   Name: vehicle.year, dtype: float64

fig = px.histogram(
    data_frame=df,
    x='vehicle.year',
    title='Year of Vehicle',
)
fig.show()
# Correlation heatmap of the numeric columns. The frame still contains
# object (string) columns at this point, and DataFrame.corr() raises on them
# in pandas >= 2.0, so the numeric columns are selected explicitly.
plt.figure(figsize=(14,7))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
# Count missing values per column.
df.isnull().sum()
# Recorded output of the original run:
#   fuelType              75
#   rating               501
#   renterTripsTaken       0
#   reviewCount            0
#   location.city          0
#   location.country       0
#   location.latitude      0
#   location.longitude     0
#   location.state         0
#   owner.id               0
#   rate.daily             0
#   vehicle.make           0
#   vehicle.model          0
#   vehicle.type           0
#   vehicle.year           0
#   dtype: int64

# Column dtypes, non-null counts and memory usage.
df.info()
# Recorded output of the original run:
#   <class 'pandas.core.frame.DataFrame'>
#   RangeIndex: 5851 entries, 0 to 5850
#   Data columns (total 15 columns):
#    #   Column              Non-Null Count  Dtype
#   ---  ------              --------------  -----
#    0   fuelType            5776 non-null   object
#    1   rating              5350 non-null   float64
#    2   renterTripsTaken    5851 non-null   int64
#    3   reviewCount         5851 non-null   int64
#    4   location.city       5851 non-null   object
#    5   location.country    5851 non-null   object
#    6   location.latitude   5851 non-null   float64
#    7   location.longitude  5851 non-null   float64
#    8   location.state      5851 non-null   object
#    9   owner.id            5851 non-null   int64
#    10  rate.daily          5851 non-null   int64
#    11  vehicle.make        5851 non-null   object
#    12  vehicle.model       5851 non-null   object
#    13  vehicle.type        5851 non-null   object
#    14  vehicle.year        5851 non-null   int64
#   dtypes: float64(3), int64(5), object(7)
#   memory usage: 685.8+ KB
# Distinct fuel types, including the missing marker.
df['fuelType'].unique()
# Recorded output of the original run:
#   array(['ELECTRIC', 'HYBRID', 'GASOLINE', nan, 'DIESEL'], dtype=object)

# Frequency of each fuel type.
df['fuelType'].value_counts().reset_index()
# Recorded output of the original run:
#         index  fuelType
#   0  GASOLINE      4810
#   1  ELECTRIC       622
#   2    HYBRID       274
#   3    DIESEL        70

# Fill the missing fuel types with the most frequent value seen above.
df['fuelType'] = df['fuelType'].fillna(value='GASOLINE')
# Most frequent rating. Series.mode() returns a Series (several entries when
# values tie), so the first entry is taken explicitly rather than passing the
# whole Series to int(). The value is kept as a float: truncating with int()
# would corrupt a fractional mode such as 4.9.
mode = df['rating'].mode().iloc[0]
mode
# The original run displayed 5 here.

# Fill the missing ratings with that most frequent value.
df['rating'] = df['rating'].fillna(mode)
# Re-count missing values per column after the fills.
df.isna().sum()
# Recorded output of the original run:
#   fuelType              0
#   rating                0
#   renterTripsTaken      0
#   reviewCount           0
#   location.city         0
#   location.country      0
#   location.latitude     0
#   location.longitude    0
#   location.state        0
#   owner.id              0
#   rate.daily            0
#   vehicle.make          0
#   vehicle.model         0
#   vehicle.type          0
#   vehicle.year          0
#   dtype: int64
可以发现缺失值已经填补完毕,没有缺失值了
# Distinct fuel types after the fill.
df['fuelType'].unique()
# Recorded output of the original run:
#   array(['ELECTRIC', 'HYBRID', 'GASOLINE', 'DIESEL'], dtype=object)

# Encode each fuel type as a float code.
fuelType_mapping = dict(ELECTRIC=0.0, HYBRID=1.0, GASOLINE=2.0, DIESEL=3.0)
df['fuelType'] = df['fuelType'].map(fuelType_mapping)

# Distinct state abbreviations present in the data.
df['location.state'].unique()
# Recorded output of the original run:
#   array(['WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
#          'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
#          'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
#          'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
#          'AK', 'HI'], dtype=object)
# State abbreviations in the order listed by the original hand-written
# mapping. Each state is assigned its position in this list as a float code.
# Deriving the codes with enumerate() guarantees they are unique; the
# hand-typed literal gave both 'VA' and 'MI' the code 21.0, which merged the
# two states.
_STATE_ORDER = [
    'WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
    'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
    'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
    'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
    'AK', 'HI',
]
locationState_mapping = {
    state: float(code) for code, state in enumerate(_STATE_ORDER)
}
# Replace each state abbreviation with its numeric code.
df['location.state'] = df['location.state'].map(locationState_mapping)

# Distinct vehicle types present in the data.
df['vehicle.type'].unique()
# Recorded output of the original run:
#   array(['suv', 'car', 'truck', 'minivan', 'van'], dtype=object)

# Encode each vehicle type as a float code.
vehicleType_mapping = dict(suv=0.0, car=1.0, truck=2.0, minivan=3.0, van=4.0)
df['vehicle.type'] = df['vehicle.type'].map(vehicleType_mapping)
其他列的值比较繁多,所以不做考虑,删掉它们就可以了
# Look at the distinct makes and models before discarding them.
df['vehicle.make'].unique()
df['vehicle.model'].unique()

# Drop the text columns that are not encoded.
df = df.drop(
    columns=['location.city', 'location.country', 'vehicle.make', 'vehicle.model'],
)
检查数据
# Preview the first rows of the frame after encoding and column removal.
df.head()
标准化与切分数据集
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target: the encoded vehicle type. pop() also removes the column from df, so
# the remaining columns are the features.
y = df.pop('vehicle.type')
X = df

# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting on the full data lets validation rows influence training.
# A fixed random_state makes the split reproducible between runs.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    X, y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)

# Kept for code that still refers to the fully scaled feature matrix; it is
# produced with the training-set statistics.
X_scaled = scaler.transform(X)
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, StandardScaler

# The target holds category codes for the vehicle type, so a classifier is the
# matching estimator; a regressor would treat the codes as ordered quantities.
# random_state keeps the importances reproducible between runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# Feature importances, one value per feature column.
features = X.columns
feature_imps = rfc.feature_importances_

# Bar chart of the importances.
fig = px.bar(x=features, y=feature_imps)
fig.update_layout({'title':{'text':"Feature Importance", 'x':0.5}})
fig.show()
# Multi-class linear model.
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.svm import LinearSVC

# random_state makes the fit reproducible between runs.
linear_svm = LinearSVC(random_state=42).fit(X_train,y_train)
print("模型train得分:{:.3f}".format(linear_svm.score(X_train,y_train)))
print("模型test得分:{:.3f}".format(linear_svm.score(X_valid,y_valid)))
# Recorded output of the original run (before the split and model were
# seeded):
#   模型train得分:0.624
#   模型test得分:0.643

# Prediction on the validation split.
pred = linear_svm.predict(X_valid)

# Per-class precision/recall/F1: the appropriate view for a categorical target.
# zero_division=0 reports 0 for classes that are never predicted.
print(classification_report(y_valid, pred, zero_division=0))

# NOTE(review): the metrics below are regression metrics computed on category
# codes. The codes have no numeric meaning, so these figures are kept only
# for compatibility with the original output and should not be used to judge
# the classifier.
linear_svm_mse = mean_squared_error(y_valid, pred)
linear_svm_rmse = np.sqrt(linear_svm_mse)
linear_svm_r2 = r2_score(y_valid, pred)
print(linear_svm_mse,linear_svm_rmse,linear_svm_r2)
# Recorded output of the original run:
#   0.5399863294600137 0.7348376211517845 -0.11486343099011109
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。