# (Web-page artifacts from the scraped source: 赞 / 踩 vote buttons — not part of the code.)
# Load the raw 2008 and 2018 fuel-economy datasets.
import pandas as pd

df_08 = pd.read_csv('all_alpha_08.csv')
df_18 = pd.read_csv('all_alpha_18.csv')

# Peek at the first row of each dataset.
df_08.head(1)
df_18.head(1)

# Drop the columns that are not needed for the analysis.
df_08 = df_08.drop(columns=['Stnd', 'Underhood ID', 'FE Calc Appr', 'Unadj Cmb MPG'])
df_08.head(1)
df_18 = df_18.drop(columns=['Stnd', 'Stnd Description', 'Underhood ID', 'Comb CO2'])
df_18.head(1)

# Rename the 2008 "Sales Area" column so both years share the label "Cert Region".
df_08 = df_08.rename(columns={'Sales Area': 'Cert Region'})
df_08.head(1)


def _tidy(label):
    # Normalise a column label: strip, lowercase, spaces -> underscores.
    return label.strip().lower().replace(" ", "_")


# Apply the label normalisation to both datasets.
df_08 = df_08.rename(columns=_tidy)
df_08.head(1)
df_18 = df_18.rename(columns=_tidy)
df_18.head(1)

# Column labels should now be identical across the two years.
df_08.columns == df_18.columns
(df_08.columns == df_18.columns).all()

# Persist the cleaned frames for the next section.
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)
# Reload the cleaned datasets saved by the previous section.
import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

# Dimensions before filtering.
df_08.shape
df_18.shape

# Keep only rows certified for California.
df_08 = df_08[df_08['cert_region'] == "CA"]
df_18 = df_18[df_18['cert_region'] == "CA"]

# Sanity check: the only remaining certification region is CA.
df_08['cert_region'].unique()
df_18['cert_region'].unique()

# The region column is now constant, so drop it from both frames.
df_08 = df_08.drop(columns='cert_region')
df_18 = df_18.drop(columns='cert_region')
df_08.shape
df_18.shape

# Missing-value count per feature, each year.
df_08.isnull().sum()
df_18.isnull().sum()

# Discard any row that still contains a null.
df_08 = df_08.dropna()
df_18 = df_18.dropna()

# Both checks should print False: no nulls remain.
df_08.isnull().sum().any()
df_18.isnull().sum().any()

# Report, then remove, duplicate rows.
print(df_08.duplicated().sum())
print(df_18.duplicated().sum())
df_08 = df_08.drop_duplicates()
df_18 = df_18.drop_duplicates()

# Should both print 0 after deduplication.
print(df_08.duplicated().sum())
print(df_18.duplicated().sum())

# Save progress for the next section.
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)
# Rebuild data_08.csv / data_18.csv from the raw files
# (a repeat of the earlier column-cleaning steps).
import pandas as pd

df_08 = pd.read_csv('all_alpha_08.csv')
df_18 = pd.read_csv('all_alpha_18.csv')

# Columns to drop from each year (not needed for the analysis).
df_08 = df_08.drop(columns=['Stnd', 'Underhood ID', 'FE Calc Appr', 'Unadj Cmb MPG'])
df_08.head(1)
df_18 = df_18.drop(columns=['Stnd', 'Stnd Description', 'Underhood ID', 'Comb CO2'])
df_18.head(1)

# Align the 2008 region label with 2018's.
df_08 = df_08.rename(columns={'Sales Area': 'Cert Region'})
df_08.head(1)

# snake_case every column label in both frames.
for frame in (df_08, df_18):
    frame.rename(columns=lambda c: c.strip().lower().replace(" ", "_"), inplace=True)
    frame.head(1)

# Labels should now be identical across years.
df_08.columns == df_18.columns
(df_08.columns == df_18.columns).all()

# Save the cleaned frames for the next section.
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)
# Reload the datasets saved by the previous section.
import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

# Try to cast the 2008 air_pollution_score column to float with astype.
# This is expected to fail: hybrid rows carry two "/"-separated values in
# one string (e.g. "6/4"), which cannot be parsed as a single float.
# Wrapped in try/except so the demonstration no longer aborts the script.
try:
    df_08.air_pollution_score = df_08.air_pollution_score.astype(float)
except ValueError as err:
    print('astype(float) failed as expected:', err)

# Inspect row 582 — presumably one of the offending hybrid rows (verify).
df_08.iloc[582]
# --- Split each 2008 hybrid row into one row per fuel type ---

# First, grab all 2008 hybrids: rows whose fuel contains "/".
hb_08 = df_08[df_08['fuel'].str.contains('/')]
hb_08

# 2018 hybrids (used by the 2018 split further below).
hb_18 = df_18[df_18['fuel'].str.contains('/')]
hb_18

# Two copies of the 2008 hybrid rows.
df1 = hb_08.copy()  # will hold the value before the "/" (first fuel type)
df2 = hb_08.copy()  # will hold the value after the "/" (second fuel type)

# Each copy starts out looking like this.
df1

# Columns whose 2008 values are "/"-separated pairs.
split_columns = ['fuel', 'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg', 'greenhouse_gas_score']

# Keep the first half of each pair in df1 and the second half in df2.
# (Loop body re-indented: indentation was lost in the pasted source.)
for c in split_columns:
    df1[c] = df1[c].apply(lambda x: x.split("/")[0])
    df2[c] = df2[c].apply(lambda x: x.split("/")[1])

# df1: first-fuel-type rows (values before the "/").
df1
# df2: second-fuel-type rows (values after the "/").
df2

# Combine the two copies; DataFrame.append was removed in pandas 2.0,
# so use pd.concat instead.
new_rows = pd.concat([df1, df2])
# Each hybrid now has a separate row per fuel type.
new_rows

# Drop the original hybrid rows, then add the newly split rows.
df_08.drop(hb_08.index, inplace=True)
df_08 = pd.concat([df_08, new_rows], ignore_index=True)

# Confirm no "/"-bearing fuel rows remain in 2008.
df_08[df_08['fuel'].str.contains('/')]
df_08.shape
# --- Split the 2018 hybrid rows the same way ---
df1 = hb_18.copy()
df2 = hb_18.copy()

# Only fuel, city_mpg, hwy_mpg and cmb_mpg need splitting in 2018:
# air_pollution_score and greenhouse_gas_score are already single integers
# in the 2018 dataset.
split_columns = ['fuel', 'city_mpg', 'hwy_mpg', 'cmb_mpg']

# Apply the split to each column of each copy.
# (Loop body re-indented: indentation was lost in the pasted source.)
for c in split_columns:
    df1[c] = df1[c].apply(lambda x: x.split("/")[0])
    df2[c] = df2[c].apply(lambda x: x.split("/")[1])

# Combine the two copies; DataFrame.append was removed in pandas 2.0,
# so use pd.concat instead.
new_rows = pd.concat([df1, df2])

# Drop the original hybrid rows from the 2018 frame via hb_18's index,
# then add the newly split rows.
df_18.drop(hb_18.index, inplace=True)
df_18 = pd.concat([df_18, new_rows], ignore_index=True)

# Confirm no "/"-bearing fuel rows remain in 2018.
df_18[df_18['fuel'].str.contains('/')]
df_18.shape
# Expected shape after the hybrid split: (832, 13)
# (the bare tuple here was pasted notebook output — kept as a comment).

# Cast air_pollution_score to float in both years:
# 2008 is string -> float (now safe, the "/" rows are gone);
# 2018 is int -> float, to match dtypes across years.
df_08.air_pollution_score = df_08.air_pollution_score.astype(float)
df_18.air_pollution_score = df_18.air_pollution_score.astype(float)

# Save progress.
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)
# Convert city_mpg, hwy_mpg and cmb_mpg from string to float for both
# 2008 and 2018 (the surrounding prose lines were page text, now comments).

# Reload the saved datasets.
import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

# Cast the mpg columns to float in both frames.
# (Loop body re-indented: indentation was lost in the pasted source.)
mpg_columns = ['city_mpg', 'hwy_mpg', 'cmb_mpg']
for c in mpg_columns:
    df_18[c] = df_18[c].astype(float)
    df_08[c] = df_08[c].astype(float)

# Convert the 2008 greenhouse gas score from float back to int.
df_08['greenhouse_gas_score'] = df_08['greenhouse_gas_score'].astype(int)
# Use the space below to work through the remaining questions against the
# cleaned datasets clean_08.csv and clean_18.csv.

# Load datasets and plotting support.
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline  -- IPython magic; only valid inside a notebook, so it
# is commented out here (it was a SyntaxError in a plain .py file).

df_08 = pd.read_csv('clean_08.csv')
df_18 = pd.read_csv('clean_18.csv')

df_08.head(1)

# Fuel-type distribution for each year.
df_08.fuel.value_counts()
df_18.fuel.value_counts()
# --- Alternative-fuel vehicles ---

# Unique 2008 models running on CNG or ethanol.
alt_08 = df_08[df_08['fuel'].isin(["CNG", "ethanol"])].model.nunique()
alt_08
# Unique 2018 models running on ethanol or electricity.
alt_18 = df_18[df_18['fuel'].isin(["Ethanol", "Electricity"])].model.nunique()
alt_18

# Raw counts side by side.
plt.bar(["2008", "2018"], [alt_08, alt_18])
plt.title("Number of Unique Models Using Alternative Fuels")
plt.xlabel("Year")
plt.ylabel("Number of Unique Models");

# Total unique models per year.
total_08 = df_08.model.nunique()
total_18 = df_18.model.nunique()
total_08, total_18

# Share of models that use alternative fuels.
prop_08 = alt_08 / total_08
prop_18 = alt_18 / total_18
prop_08, prop_18

plt.bar(["2008", "2018"], [prop_08, prop_18])
plt.title("Proportion of Unique Models Using Alternative Fuels")
plt.xlabel("Year")
plt.ylabel("Proportion of Unique Models");
# Average combined mpg by vehicle class, for each year.
veh_08 = df_08.groupby('veh_class')['cmb_mpg'].mean()
veh_08
veh_18 = df_18.groupby('veh_class')['cmb_mpg'].mean()
veh_18

# Per-class improvement; classes present in only one year become NaN
# and are dropped so only classes existing in both years are plotted.
inc = (veh_18 - veh_08).dropna()
inc

plt.subplots(figsize=(8, 5))
plt.bar(inc.index, inc)
plt.title('Improvements in Fuel Economy from 2008 to 2018 by Vehicle Class')
plt.xlabel('Vehicle Class')
plt.ylabel('Increase in Average Combined MPG');
# --- SmartWay vehicles ---

# Distinct smartway labels in 2008.
df_08.smartway.unique()
# 2008 SmartWay subset and its summary statistics.
smart_08 = df_08[df_08['smartway'] == "yes"]
smart_08.describe()

# Distinct smartway labels in 2018 (includes an "Elite" tier).
df_18.smartway.unique()
smart_18 = df_18[df_18['smartway'].isin(["Yes", "Elite"])]
smart_18.describe()

# Vehicles with above-average combined mpg, each year.
top_08 = df_08[df_08['cmb_mpg'] > df_08['cmb_mpg'].mean()]
top_08.describe()
top_18 = df_18[df_18['cmb_mpg'] > df_18['cmb_mpg'].mean()]
top_18.describe()
# Create a combined dataset from clean_08.csv and clean_18.csv using a
# pandas merge (the surrounding prose lines were page text, now comments).
import pandas as pd

df_08 = pd.read_csv('clean_08.csv')
df_18 = pd.read_csv('clean_18.csv')

# Suffix every 2008 column with "_2008"; labels are truncated to their
# first 10 characters first, mirroring the original lesson's renaming.
df_08.rename(columns=lambda x: x[:10] + "_2008", inplace=True)
# View and verify the new names.
df_08.head()

# Inner-join the two years on vehicle model.
df_combined = df_08.merge(df_18, left_on='model_2008', right_on='model', how='inner')
# View and verify the merge result.
df_combined.head()

# Save the combined dataset.
df_combined.to_csv('combined_dataset.csv', index=False)
# Source-page footer: Copyright © 2003-2013 www.wpsshop.cn 版权所有 (all rights reserved).