当前位置:   article > 正文

机器学习 CarRentalData数据集分析和预测_机器学习数据集分析

机器学习数据集分析

介绍数据集

fuelType:燃料类型
rating:评分
renterTripsTaken:租客出行次数
reviewCount:评论数
location.city:所在城市
location.country:所在国家/地区
location.latitude:纬度
location.longitude:经度
location.state:所在州
owner.id:车主 id
rate.daily:日租金
vehicle.make:车辆品牌(制造商)
vehicle.model:车辆型号
vehicle.type:车辆类型
vehicle.year:车辆年份

实战演练

数据集展示
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly_express as px
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Load the data
df = pd.read_csv('C:/CarRentalData.csv',engine='python')
df.head()

# Dataset size as (rows, columns)
df.shape 
# (5851, 15)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17

在这里插入图片描述

数据的可视化
fuelType
# Count listings per fuel type. rename_axis() + reset_index(name=...) yields
# the columns ['fuelType', 'count'] on every pandas version; renaming the
# columns produced by a bare reset_index() breaks on pandas >= 2.0, where it
# leaves two columns both called 'count'.
df_fuelType = (
    df['fuelType']
    .value_counts()
    .rename_axis('fuelType')
    .reset_index(name='count')
)

# color= must name the column whose values are looked up in
# color_discrete_map; without it the mapping is not applied to the slices.
fig = px.pie(df_fuelType, values='count', names='fuelType', title='Fuel Type', hole=.3,
             color='fuelType',
             color_discrete_map={'ELECTRIC':'lightcyan',
                                 'HYBRID':'cyan',
                                 'GASOLINE':'royalblue',
                                 'DIESEL':'darkblue'})

fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

在这里插入图片描述

rating
# Summary statistics of the rating column.
print("Rating Statistics:")
rating_summary = df['rating'].describe()
print(rating_summary)

'''
Rating Statistics:
count    5350.000000
mean        4.920325
std         0.182425
min         1.000000
25%         4.900000
50%         5.000000
75%         5.000000
max         5.000000
Name: rating, dtype: float64
'''

# Distribution of ratings over all listings.
fig = px.histogram(
    data_frame=df,
    x='rating',
    title='Histogram of Rental Car Rating',
)
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

在这里插入图片描述

renterTripsTaken
# Summary statistics of the renterTripsTaken column.
print("Renter Trips Taken Statistics:")
trips_summary = df['renterTripsTaken'].describe()
print(trips_summary)

'''
Renter Trips Taken Statistics:
count    5851.000000
mean       33.477354
std        41.898954
min         0.000000
25%         5.000000
50%        18.000000
75%        46.000000
max       395.000000
Name: renterTripsTaken, dtype: float64
'''

# Distribution of trips taken per listing.
fig = px.histogram(
    data_frame=df,
    x='renterTripsTaken',
    title='Histogram of Renter Trips Taken',
)
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

在这里插入图片描述

reviewCount
# Summary statistics of the reviewCount column.
print("Review Count Statistics:")
review_summary = df['reviewCount'].describe()
print(review_summary)

'''
Review Count Statistics:
count    5851.000000
mean       28.454794
std        35.136113
min         0.000000
25%         4.000000
50%        16.000000
75%        39.000000
max       321.000000
Name: reviewCount, dtype: float64
'''

# Distribution of review counts per listing.
fig = px.histogram(
    data_frame=df,
    x='reviewCount',
    title='Histogram of Review Count',
)
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

在这里插入图片描述

Car Rentals by City
import plotly.graph_objects as go

def get_average_lat_long(city, ltype):
    """Return a mean coordinate of the rows in the global ``df`` for one city.

    Rows are selected where ``location.city`` equals ``city``. When ``ltype``
    is 0 the mean of ``location.latitude`` is returned; for any other value
    the mean of ``location.longitude`` is returned.
    """
    column = 'location.latitude' if ltype == 0 else 'location.longitude'
    return df.loc[df['location.city'] == city, column].mean()
        
# Number of listings per city, most frequent first. rename_axis() +
# reset_index(name=...) yields the columns ['city', 'count'] on every pandas
# version (a bare reset_index() changed its column labels in pandas 2.0).
df_location = (
    df['location.city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='count')
)

# Mean coordinates of every city, computed in a single grouped pass instead
# of re-filtering the whole frame twice per city.
city_means = df.groupby('location.city')[['location.latitude', 'location.longitude']].mean()
df_location['latitude'] = df_location['city'].map(city_means['location.latitude'])
df_location['longitude'] = df_location['city'].map(city_means['location.longitude'])

df_location['text'] = df_location['city'] + '<br>Car Rentals ' + (df_location['count']).astype(str)

# Row-position ranges into the count-sorted frame, used as half-open
# [start, stop) slices. Each range starts where the previous one stops so
# that no city is skipped between two groups.
limits = [(0,20),(20,50),(50,150),(150,200),(200,1000),(1000,1500)]
colors = ["royalblue","orange","lightgrey","lightseagreen","red","crimson"]
scale = 0.5
fig = go.Figure()

# One trace (and one legend entry) per rank range.
for lim, color in zip(limits, colors):
    df_sub = df_location[lim[0]:lim[1]]
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['count']/scale,
            color = color,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1])))

fig.update_layout(
        title_text = 'Car Rentals by City',
        showlegend = True,
        geo = dict(
            scope = 'usa',
            landcolor = 'rgb(217, 217, 217)',
        )
    )

fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49

在这里插入图片描述

Car Rentals by State
# Number of listings per state. rename_axis() + reset_index(name=...) yields
# the columns ['state', 'count'] on every pandas version (a bare
# reset_index() changed its column labels in pandas 2.0).
df_state = (
    df['location.state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)

fig = go.Figure(data=go.Choropleth(
    locations=df_state['state'], # spatial coordinates (state codes)
    z = df_state['count'].astype(float), # data to be colour-coded
    locationmode = 'USA-states', # set of locations the entries in `locations` are matched against
    colorscale = 'Reds',
    colorbar_title = "Number of Cars Rented",
))

fig.update_layout(
    title_text = 'Car Rentals by State',
    geo_scope='usa', 
)

fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17

在这里插入图片描述

owner.id
# Number of listings per owner. rename_axis() + reset_index(name=...) yields
# the columns ['owner_id', 'number of rental cars'] on every pandas version
# (a bare reset_index() changed its column labels in pandas 2.0).
df_owner = (
    df['owner.id']
    .value_counts()
    .rename_axis('owner_id')
    .reset_index(name='number of rental cars')
)

print('Total Number of Unique Rental Cars per Owner Statistics:')
print(df_owner['number of rental cars'].describe())

'''
Total Number of Unique Rental Cars per Owner Statistics:
count    3093.000000
mean        1.891691
std         2.789205
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        49.000000
Name: number of rental cars, dtype: float64
'''

# Distribution of fleet size across owners.
fig = px.histogram(df_owner, x = 'number of rental cars', title='Total Number of Unique Rental Cars per Owner')
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

在这里插入图片描述

rate.daily
# Summary statistics of the rate.daily column.
print('Daily Rate of Car Rental Statistics:')
daily_rate_summary = df['rate.daily'].describe()
print(daily_rate_summary)

'''
Daily Rate of Car Rental Statistics:
count    5851.000000
mean       93.691506
std        96.080920
min        20.000000
25%        45.000000
50%        69.000000
75%       110.000000
max      1500.000000
Name: rate.daily, dtype: float64
'''

# Distribution of daily rates.
fig = px.histogram(
    data_frame=df,
    x='rate.daily',
    title='Daily Rate of Car Rental',
)
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

在这里插入图片描述

Make and Model of Top 25 Most Rented Cars
# Normalise the inconsistently cased make *before* grouping so that both
# spellings fall into the same (make, model) groups; replacing it after the
# groupby leaves duplicate rows for the same make/model pair. assign() works
# on a copy, so df itself is not modified.
df_make_model = (
    df.assign(**{'vehicle.make': df['vehicle.make'].replace('Mercedes-benz', 'Mercedes-Benz')})
    .groupby(['vehicle.make', 'vehicle.model'])
    .size()
    .reset_index(name='count')
)

# Total listings per make, broadcast onto every model row of that make
# (a grouped transform instead of re-filtering the frame once per row).
df_make_model['make_count'] = df_make_model.groupby('vehicle.make')['count'].transform('sum')
df_make_model.sort_values(by = 'make_count', ascending=False, inplace=True)

# Only makes with more than 45 listings are plotted.
# NOTE(review): the title says "Top 25" but the filter is a count threshold - confirm they agree.
fig = px.bar(df_make_model[df_make_model['make_count'] >45], x = 'vehicle.make', y='count', color = 'vehicle.model', title='Make and Model of Top 25 Most Rented Cars')
fig.update_layout(showlegend = False)
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

在这里插入图片描述

Vehicle Type of Rented Cars
# Number of listings per vehicle type. rename_axis() + reset_index(name=...)
# yields the columns ['vehicle.type', 'count'] on every pandas version (a
# bare reset_index() changed its column labels in pandas 2.0).
df_vehicleType = (
    df['vehicle.type']
    .value_counts()
    .rename_axis('vehicle.type')
    .reset_index(name='count')
)

fig = px.pie(df_vehicleType, values = 'count', names='vehicle.type', title = 'Vehicle Type of Rented Cars')
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5

在这里插入图片描述

Year of Vehicle
# Summary statistics of the vehicle.year column.
print('Vehicle Year Statistics:')
year_summary = df['vehicle.year'].describe()
print(year_summary)

'''
Vehicle Year Statistics:
count    5851.000000
mean     2015.340113
std         4.050813
min      1955.000000
25%      2014.000000
50%      2016.000000
75%      2018.000000
max      2020.000000
Name: vehicle.year, dtype: float64
'''

# Distribution of vehicle model years.
fig = px.histogram(
    data_frame=df,
    x='vehicle.year',
    title='Year of Vehicle',
)
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18

在这里插入图片描述

热力图
# Correlation heatmap of the numeric columns. The object (string) columns are
# dropped explicitly: on pandas >= 2.0 DataFrame.corr() no longer skips them
# silently and raises instead.
plt.figure(figsize=(14,7))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
  • 1
  • 2

在这里插入图片描述

数据预处理
# Count the missing values in every column
df.isna().sum()

'''
fuelType               75
rating                501
renterTripsTaken        0
reviewCount             0
location.city           0
location.country        0
location.latitude       0
location.longitude      0
location.state          0
owner.id                0
rate.daily              0
vehicle.make            0
vehicle.model           0
vehicle.type            0
vehicle.year            0
dtype: int64
'''
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
# Per-column dtype and non-null count, plus the frame's memory usage
df.info()

'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5851 entries, 0 to 5850
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fuelType            5776 non-null   object 
 1   rating              5350 non-null   float64
 2   renterTripsTaken    5851 non-null   int64  
 3   reviewCount         5851 non-null   int64  
 4   location.city       5851 non-null   object 
 5   location.country    5851 non-null   object 
 6   location.latitude   5851 non-null   float64
 7   location.longitude  5851 non-null   float64
 8   location.state      5851 non-null   object 
 9   owner.id            5851 non-null   int64  
 10  rate.daily          5851 non-null   int64  
 11  vehicle.make        5851 non-null   object 
 12  vehicle.model       5851 non-null   object 
 13  vehicle.type        5851 non-null   object 
 14  vehicle.year        5851 non-null   int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 685.8+ KB
'''

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
# Distinct fuel types, including NaN for the missing entries
df['fuelType'].unique()

'''
array(['ELECTRIC', 'HYBRID', 'GASOLINE', nan, 'DIESEL'], dtype=object)
'''

# Frequency of each fuel type
df['fuelType'].value_counts().reset_index()
'''
	index	fuelType
0	GASOLINE	4810
1	ELECTRIC	622
2	HYBRID	274
3	DIESEL	70
'''
# Fill the missing values with 'GASOLINE', the most frequent value in the counts above
df['fuelType'] = df['fuelType'].fillna('GASOLINE',axis=0)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
# Impute missing ratings with the most frequent rating.
# Series.mode() returns a Series (several values when there is a tie), so take
# its first entry. Calling int() on the Series is deprecated, fails when there
# is more than one mode, and would truncate a non-integer mode such as 4.9.
mode = df['rating'].mode().iloc[0]
mode
'''
5.0
'''
df['rating'] = df['rating'].fillna(mode)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
# Re-count the missing values in every column after imputation
df.isnull().sum()
'''
fuelType              0
rating                0
renterTripsTaken      0
reviewCount           0
location.city         0
location.country      0
location.latitude     0
location.longitude    0
location.state        0
owner.id              0
rate.daily            0
vehicle.make          0
vehicle.model         0
vehicle.type          0
vehicle.year          0
dtype: int64
'''
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19

可以发现缺失值已经填补完毕,没有缺失值了

df['fuelType'].unique()
'''
array(['ELECTRIC', 'HYBRID', 'GASOLINE', 'DIESEL'], dtype=object)
'''
# Encode each fuel type label as a float code
fuelType_mapping = dict(ELECTRIC=0.0, HYBRID=1.0, GASOLINE=2.0, DIESEL=3.0)
df['fuelType'] = df['fuelType'].map(fuelType_mapping)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
df['location.state'].unique()
'''
array(['WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
       'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
       'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
       'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
       'AK', 'HI'], dtype=object)
'''

# Give every state its own float code, in the order listed by unique() above.
# Deriving the codes with enumerate() guarantees they are distinct; the
# hand-numbered table assigned 21.0 to both 'VA' and 'MI', which made the two
# states indistinguishable after encoding.
state_order = [
    'WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
    'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
    'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
    'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
    'AK', 'HI',
]
locationState_mapping = {state: float(code) for code, state in enumerate(state_order)}
df['location.state'] = df['location.state'].map(locationState_mapping)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
df['vehicle.type'].unique()
'''
array(['suv', 'car', 'truck', 'minivan', 'van'], dtype=object)
'''
# Encode each vehicle type label as a float code (0.0 .. 4.0, in the order shown above)
vehicleType_mapping = {
    label: float(code)
    for code, label in enumerate(('suv', 'car', 'truck', 'minivan', 'van'))
}
df['vehicle.type'] = df['vehicle.type'].map(vehicleType_mapping)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

其他列的值比较繁多,所以不做考虑,删掉它们就可以了

# Inspect the distinct makes and models
df['vehicle.make'].unique()
df['vehicle.model'].unique()

# Remove the text columns that are not encoded as features
unused_columns = ['location.city', 'location.country', 'vehicle.make', 'vehicle.model']
df = df.drop(columns=unused_columns)
  • 1
  • 2
  • 3
  • 4

检查数据

# Preview the first rows of the frame after encoding and dropping columns
df.head()
  • 1

在这里插入图片描述

归一化与切分数据集

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target: the encoded vehicle type. pop() also removes the column from df, so
# X holds only the remaining feature columns.
y = df.pop('vehicle.type')
X = df

# Split first, then fit the scaler on the training rows only, so that the
# validation rows do not leak into the scaling statistics. A fixed
# random_state makes the split reproducible between runs.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)

# Scaled copy of the full feature matrix, kept under its original name for
# any later cell that refers to it.
X_scaled = scaler.transform(X)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
数据训练
import sklearn
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OrdinalEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

# vehicle.type is a set of nominal class codes, so the forest must be a
# classifier; a regressor would treat the codes as ordered quantities.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
# Feature importances, one value per column of X
features = X.columns
feature_imps = rfc.feature_importances_

# Bar chart of the feature importances
fig = px.bar(x=features, y=feature_imps)
fig.update_layout({'title':{'text':"Feature Importance", 'x':0.5}})
fig.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

在这里插入图片描述

# Multi-class linear model
from sklearn.svm import  LinearSVC
linear_svm = LinearSVC()
linear_svm.fit(X_train, y_train)

# score() is evaluated on the training split and then on the validation split
train_score = linear_svm.score(X_train, y_train)
valid_score = linear_svm.score(X_valid, y_valid)
print("模型train得分:{:.3f}".format(train_score))
print("模型test得分:{:.3f}".format(valid_score))
'''
模型train得分:0.624
模型test得分:0.643
'''
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
# Prediction
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pred = linear_svm.predict(X_valid)

# The targets are nominal class codes, so the distance between two codes has
# no meaning and MSE / RMSE / R^2 are not interpretable here. Report
# classification metrics instead.
linear_svm_accuracy = accuracy_score(y_valid, pred)
linear_svm_confusion = confusion_matrix(y_valid, pred)

print(linear_svm_accuracy)
print(linear_svm_confusion)
print(classification_report(y_valid, pred))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/671138
推荐阅读
相关标签
  

闽ICP备14008679号