当前位置:   article > 正文

python数据分析案例2_df1["city"] = df1["city"].apply(lambda x:pinyin.ge

df1["city"] = df1["city"].apply(lambda x:pinyin.get(x, format="strip", delim

# load datasets
import pandas as pd

df_08 = pd.read_csv('all_alpha_08.csv') 
df_18 = pd.read_csv('all_alpha_18.csv')

 

# view 2008 dataset
df_08.head(1)

# view 2018 dataset
df_18.head(1)

 

# drop columns from 2008 dataset
df_08.drop(['Stnd', 'Underhood ID', 'FE Calc Appr', 'Unadj Cmb MPG'], axis=1, inplace=True)

# confirm changes
df_08.head(1)

# drop columns from 2018 dataset
df_18.drop(['Stnd', 'Stnd Description', 'Underhood ID', 'Comb CO2'], axis=1, inplace=True)

# confirm changes
df_18.head(1)

# 将销售区域重命名为特定区域

df_08.rename(columns={'Sales Area': 'Cert Region'}, inplace=True)

# 确认更改
df_08.head(1)

 

 

# 在 2008 数据集中用下划线和小写标签代替空格
df_08.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)

# 确认更改
df_08.head(1)

 

# 在 2018 数据集中用下划线和小写标签代替空格
df_18.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)

# 确认更改
df_18.head(1)

# 确认 2008 和 2018 数据集的列标签相同
df_08.columns == df_18.columns

 

 

# 确定所有的列标签都相同,如下所示
(df_08.columns == df_18.columns).all()

 

# 保存新数据集,供下一段使用
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)

 

 

 

 

 

过滤、丢空、去重

# load datasets
import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

# view dimensions of dataset
df_08.shape

# view dimensions of dataset
df_18.shape

# filter datasets for rows following California standards
df_08 = df_08.query('cert_region == "CA"')
df_18 = df_18.query('cert_region == "CA"')

# confirm only certification region is California
df_08['cert_region'].unique()

# confirm only certification region is California
df_18['cert_region'].unique()

# drop certification region columns form both datasets
df_08.drop('cert_region', axis=1, inplace=True)
df_18.drop('cert_region', axis=1, inplace=True)

df_08.shape

df_18.shape

# view missing value count for each feature in 2008
df_08.isnull().sum()

# view missing value count for each feature in 2018
df_18.isnull().sum()

# drop rows with any null values in both datasets
df_08.dropna(inplace=True)
df_18.dropna(inplace=True)

# checks if any of columns in 2008 have null values - should print False
df_08.isnull().sum().any()

# checks if any of columns in 2018 have null values - should print False
df_18.isnull().sum().any()

 

# print number of duplicates in 2008 and 2018 datasets
print(df_08.duplicated().sum())
print(df_18.duplicated().sum())

 

 

# drop duplicates in both datasets
df_08.drop_duplicates(inplace=True)
df_18.drop_duplicates(inplace=True)

 

# print number of duplicates again to confirm dedupe - should both be 0
print(df_08.duplicated().sum())
print(df_18.duplicated().sum())

 

# save progress for the next section
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)

 

import pandas as pd

df_08 = pd.read_csv('all_alpha_08.csv') 
df_18 = pd.read_csv('all_alpha_18.csv')

 

# drop columns from 2008 dataset
df_08.drop(['Stnd', 'Underhood ID', 'FE Calc Appr', 'Unadj Cmb MPG'], axis=1, inplace=True)

# confirm changes
df_08.head(1)

# drop columns from 2018 dataset
df_18.drop(['Stnd', 'Stnd Description', 'Underhood ID', 'Comb CO2'], axis=1, inplace=True)

# confirm changes
df_18.head(1)

 

# rename Sales Area to Cert Region
df_08.rename(columns={'Sales Area': 'Cert Region'}, inplace=True)

# confirm changes
df_08.head(1)

# replace spaces with underscores and lowercase labels for 2008 dataset
df_08.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)

# confirm changes
df_08.head(1)

 

# replace spaces with underscores and lowercase labels for 2018 dataset
df_18.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)

# confirm changes
df_18.head(1)

 

# confirm column labels for 2008 and 2018 datasets are identical
df_08.columns == df_18.columns

# make sure they're all identical like this
(df_08.columns == df_18.columns).all()

 

# save new datasets for next section
df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)

 

修正数据类型

 


import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

 

# 尝试使用 Pandas 的 to_numeric 或 astype 功能将
# 2008 air_pollution_score 列转换成浮点——无法转换
df_08.air_pollution_score = df_08.air_pollution_score.astype(float)

结果是错误的  

df_08.iloc[582]

# 首先,获取 2008 年的所有混合动力
hb_08 = df_08[df_08['fuel'].str.contains('/')]
hb_08

# 2018年的混合动力
hb_18 = df_18[df_18['fuel'].str.contains('/')]
hb_18

# 创建 2008 混合动力数据框的两个副本
df1 = hb_08.copy()  # 每个混合动力车第一种燃料类型的数据
df2 = hb_08.copy()  # 每个混合动力车第二种燃料类型的数据

# 每个数据应该如下所示
df1

# 将被 "/" 分割的列
split_columns = ['fuel', 'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg', 'greenhouse_gas_score']

# 对每个数据框副本的每个列应用分割功能
for c in split_columns:
    df1[c] = df1[c].apply(lambda x: x.split("/")[0])
    df2[c] = df2[c].apply(lambda x: x.split("/")[1])

 

# 这个数据框包含混合动力车第一种燃料类型的信息
# 也就是 "/" 前面的值
df1

# 这个数据框包含混合动力车第二种燃料类型的信息
# 也就是 "/" 后面的值
df2

 

# 将数据框组合,添加到原始数据框中
new_rows = df1.append(df2)

# 现在每辆车的每种燃料类型都有单独的行
new_rows

 

 

# 丢弃原始混合动力行
df_08.drop(hb_08.index, inplace=True)

# 添加新分割的行
df_08 = df_08.append(new_rows, ignore_index=True)

 

# 检查含有 "/" 的所有原始混合动力行是否都已删除
df_08[df_08['fuel'].str.contains('/')]

df_08.shape

# 创建 2018 混合动力数据框 hb_18 的两个副本
df1 = hb_18.copy()
df2 = hb_18.copy()

分割 fuelcity_mpghwy_mpgcmb_mpg 的值

不需要分割 air_pollution_score 或 greenhouse_gas_score 的值,因为这些列在 2018 数据集中已经是整数。

 

# 待分割列的列表
split_columns =  ['fuel', 'city_mpg', 'hwy_mpg', 'cmb_mpg']

# 对每个数据框副本的每个列应用分割功能
for c in split_columns:
    df1[c] = df1[c].apply(lambda x: x.split("/")[0])
    df2[c] = df2[c].apply(lambda x: x.split("/")[1])

# 添加这两个数据框
new_rows = df1.append(df2)


# 从原始 2018 数据框中丢弃每个混合动力行
# 对 hb_18 的索引进行 Pandas 的丢弃功能,实现上述步骤
df_18.drop(hb_18.index, inplace=True)

# 将 new_rows 添加到 df_18
df_18 = df_18.append(new_rows, ignore_index=True)

 

# 检查是否已删除
df_18[df_18['fuel'].str.contains('/')]

 

df_18.shape

(832, 13)

# 对于 2008 空气污染列,将字符串转换成浮点
df_08.air_pollution_score = df_08.air_pollution_score.astype(float)

# 对于 2018 空气污染列,将整数转换成浮点
df_18.air_pollution_score = df_18.air_pollution_score.astype(float)

df_08.to_csv('data_08.csv', index=False)
df_18.to_csv('data_18.csv', index=False)

 

 

 

修正 city_mpghwy_mpgcmb_mpg 数据类型

20082018:将字符串转换成浮点

# 加载数据集
import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

# 将 mpg 列转换成浮点
mpg_columns = ['city_mpg', 'hwy_mpg', 'cmb_mpg']
for c in mpg_columns:
    df_18[c] = df_18[c].astype(float)
    df_08[c] = df_08[c].astype(float)

 

# 将浮点转换成整数
df_08['greenhouse_gas_score'] = df_08['greenhouse_gas_score'].astype(int)

得出结论

使用下列空间来处理数据集clean_08.csv 和clean_18.csv中的以下问题

# load datasets
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

# load datasets
df_08 = pd.read_csv('clean_08.csv')
df_18 = pd.read_csv('clean_18.csv')

df_08.head(1)

df_08.fuel.value_counts()

df_18.fuel.value_counts()

# how many unique models used alternative sources of fuel in 2008

alt_08 = df_08.query('fuel in ["CNG", "ethanol"]').model.nunique()
alt_08

# how many unique models used alternative sources of fuel in 2018

alt_18 = df_18.query('fuel in ["Ethanol", "Electricity"]').model.nunique()
alt_18

 

plt.bar(["2008", "2018"], [alt_08, alt_18])
plt.title("Number of Unique Models Using Alternative Fuels")
plt.xlabel("Year")
plt.ylabel("Number of Unique Models");

# total unique models each year
total_08 = df_08.model.nunique()
total_18 = df_18.model.nunique()
total_08, total_18

 

prop_08 = alt_08/total_08
prop_18 = alt_18/total_18
prop_08, prop_18

 

plt.bar(["2008", "2018"], [prop_08, prop_18])
plt.title("Proportion of Unique Models Using Alternative Fuels")
plt.xlabel("Year")
plt.ylabel("Proportion of Unique Models");

 

 

 

veh_08 = df_08.groupby('veh_class').cmb_mpg.mean()
veh_08

veh_18 = df_18.groupby('veh_class').cmb_mpg.mean()
veh_18

inc = veh_18 - veh_08
inc

# only plot the classes that exist in both years
inc.dropna(inplace=True)
plt.subplots(figsize=(8, 5))
plt.bar(inc.index, inc)
plt.title('Improvements in Fuel Economy from 2008 to 2018 by Vehicle Class')
plt.xlabel('Vehicle Class')
plt.ylabel('Increase in Average Combined MPG');

 

 

# smartway labels for 2008
df_08.smartway.unique()

# get all smartway vehicles in 2008
smart_08 = df_08.query('smartway == "yes"')

 

# explore smartway vehicles in 2008
smart_08.describe()

# smartway labels for 2018
df_18.smartway.unique()

# get all smartway vehicles in 2018
smart_18 = df_18.query('smartway in ["Yes", "Elite"]')

smart_18.describe()

 

top_08 = df_08.query('cmb_mpg > cmb_mpg.mean()')
top_08.describe()

top_18 = df_18.query('cmb_mpg > cmb_mpg.mean()')
top_18.describe()

 

 

合并数据集

用 Pandas Merges 创建 clean_08.csv 和 clean_18.csv 的合并数据集。

 

import pandas as pd

df_08 = pd.read_csv('clean_08.csv')
df_18 = pd.read_csv('clean_18.csv')

 

# 重命名 2008 列
df_08.rename(columns=lambda x: x[:10] + "_2008", inplace=True)

# 查看并检查名称
df_08.head()

 

 

# 合并数据集
df_combined = df_08.merge(df_18, left_on='model_2008', right_on='model', how='inner')

# 查看并检查合并结果
df_combined.head()

保存合并数据集

df_combined.to_csv('combined_dataset.csv', index=False)

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Gausst松鼠会/article/detail/540927
推荐阅读
相关标签
  

闽ICP备14008679号