赞
踩
目录
pandas做数据预处理是很方便的。pandas中的缺失值用NaN表示
- import pandas
-
# Load the diamonds dataset. The raw string keeps the Windows backslashes
# literal; without it "\c", "\s" and "\d" are invalid escape sequences.
dia = pandas.read_csv(r"E:\cluster\seaborn-data\diamonds.csv")
print(type(dia))  # the core pandas structure: DataFrame
print(dia.dtypes)  # string columns are reported as object
help(pandas.read_csv)  # help() prints the documentation itself and returns None
print(dia.head())  # first five rows by default
print("--------------------------------------------------------------")
print(dia.head(2))  # explicit number of rows
print("--------------------------------------------------------------")
print(dia.tail(4))  # last four rows
print("--------------------------------------------------------------")
print(dia.columns)  # column names
print(dia.shape)  # (a, b): a rows (samples) and b columns (features)
pandas按行取数据比较麻烦,不能直接通过dia[index]取行,需要通过索引器.loc[index]
- print(dia.loc[0])# 取第0行数据
- print(dia.loc[53939]) # 取最后一行数据(如果超过范围就会报错)
- # print(dia.loc[3:6])# 取3到6行
pandas 读取csv文件认为第一行就是列名,可以通过第一行的名字来访问某一列
- # 定位一列
- col = dia["carat"]
- print(col)
- #与上面等价
- #name = "carat"
- # print(dia[name])
- # 定位两列
- cols = ["carat", "color"]
- print(dia[cols])
- # print(dia[["carat","color"]])
- # print(dia[["carat","color"]])
- # 查找单位为g的数据
- #cols_name = dia.columns.tolist() 列名存为一个列表
- # print(cols_name)
- # for i in cols_name:
- # if i.endswith("g"): 单位为g
- # cols.append(i)
- # print(dia[cols])
- # 当两列维度相同时,结果为对应位置进行相应的运算
- xandy = dia["x"] * dia["y"]
- print(xandy.head(3))# 显示前3行数据
- # 对每个元素都/1000
- x_ = dia["x"]/1000
- print(dia.shape)
- # 加一列(注意:行数要对应)
- dia["x_"] = x_
- print(dia.shape)
- # 求某一列的最大值
- print(dia["x"].max())
- # 让某一列都除以最大值
- print((dia["y"]/dia["x"].max()).head(3))
- # 排序(默认从小到大)inplace=True说明改变原来的数据,而不是新建数据
- dia.sort_values("x", inplace=True)
- print(dia['x'].tail(3))
- print("---------------")
- # 降序排
- dia.sort_values("x", inplace=True, ascending=False)
- print(dia["x"].head(3))
- import numpy as np
- import pandas as pd
-
- titanic_survival = pd.read_csv("titanic.csv")
- titanic_survival.head()
survived:表示当前数据的一个label值(即标签值)后面有个分类任务
pclass:表示船内舱位的等级
sex:当前乘客的性别
age:当前乘客的年龄
sibsp:(siblings and spouses)当前乘客在船上的兄弟姐妹及配偶的数量
parch:(parents and children)当前乘客在船上的父母和子女总共多少
fare:船票价格
embarked:登船地点
- age = titanic_survival["age"]
- # print(age.loc[0:10])
- # 判断是否为缺失值
- age_is_null = pd.isnull(age)
- # 不是缺失值打印FALSE
- print(age_is_null)
- # 筛选出所有缺失值
- age_is_true = age[age_is_null]
- print(age_is_true)
- # 缺失值的个数
- age_is_true_sum = len(age_is_true)
- print(age_is_true_sum)
- # 当数据中有缺失值并且没有做任何处理时,会出现nan
- mean_age = sum(titanic_survival["age"])/len(titanic_survival["age"])
- print(mean_age)
nan
- # 处理:如果是缺失值就不参与计算
- good_ages = titanic_survival["age"][age_is_null == False]
- correct_mean_age = sum(good_ages)/len(good_ages)
- # 也可以通过titanic_survival["age"].mean()直接求均值(默认会跳过缺失值);实际处理中更常见的做法是用平均数或中位数/众数来填充缺失值,使之成为完整的样本
- print(correct_mean_age)
29.69911764705882
- # mean fares for each class(传统方法)
- passenger_classes = [1, 2, 3]
- fares_by_class = {}
- for this_class in passenger_classes:
- pclass_rows = titanic_survival[titanic_survival["pclass"] == this_class]# 取出对应pclass的数据
- pclass_fares = pclass_rows["fare"]# 定位到价格这一列
- fare_for_class = pclass_fares.mean() # 求平均值
- fares_by_class[this_class] = fare_for_class
- print(fares_by_class)
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
# A more concise pandas approach: pivot_table summarises one quantity
# against another.
#   index   - the column whose values define the groups
#   values  - the column(s) being aggregated
#   aggfunc - the aggregation; the string "mean" is used instead of np.mean,
#             which recent pandas releases warn about
passenger_survival = titanic_survival.pivot_table(
    index="pclass", values="survived", aggfunc="mean"
)
print(passenger_survival)  # survival rate per passenger class

# Mean age per passenger class; aggfunc defaults to the mean when omitted.
passenger_age = titanic_survival.pivot_table(index="pclass", values="age")
print(passenger_age)
survived pclass 1 0.629630 2 0.472826 3 0.242363 age pclass 1 38.233441 2 29.877630 3 25.140620
# Relate one key to two other quantities at once: the summed "fare" and
# "survived" columns per embarkation port. The string "sum" is used instead
# of np.sum, which recent pandas releases warn about.
port_stats = titanic_survival.pivot_table(
    index="embarked", values=["fare", "survived"], aggfunc="sum"
)
print(port_stats)
fare survived embarked C 10072.2962 93 Q 1022.2543 30 S 17439.3988 217
- # axis=1或axis="columns" 会丢掉所有含有空值的列;axis=0 会丢掉含有空值的样本(行),subset 用来指定只检查哪些列
- drop_na_columns = titanic_survival.dropna(axis=1)
- new_titanic_survival = titanic_survival.dropna(axis=0,subset=['age','sex'])
- print(new_titanic_survival)
survived pclass sex age sibsp parch fare embarked class \ 0 0 3 male 22.0 1 0 7.2500 S Third 1 1 1 female 38.0 1 0 71.2833 C First 2 1 3 female 26.0 0 0 7.9250 S Third 3 1 1 female 35.0 1 0 53.1000 S First 4 0 3 male 35.0 0 0 8.0500 S Third .. ... ... ... ... ... ... ... ... ... 885 0 3 female 39.0 0 5 29.1250 Q Third 886 0 2 male 27.0 0 0 13.0000 S Second 887 1 1 female 19.0 0 0 30.0000 S First 889 1 1 male 26.0 0 0 30.0000 C First 890 0 3 male 32.0 0 0 7.7500 Q Third who adult_male deck embark_town alive alone 0 man True NaN Southampton no False 1 woman False C Cherbourg yes False 2 woman False NaN Southampton yes True 3 woman False C Southampton yes False 4 man True NaN Southampton no True .. ... ... ... ... ... ... 885 woman False NaN Queenstown no False 886 man True NaN Southampton no True 887 woman False B Southampton yes True 889 man True C Cherbourg yes True 890 man True NaN Queenstown no True [714 rows x 15 columns]
- # 定位到具体的某一个数据
- row_index_83_age = titanic_survival.loc[83, "age"]# 第83个样本的年龄
- row_index_1000_pclass = titanic_survival.loc[766,"pclass"]
- print(row_index_83_age)
- print(row_index_1000_pclass)
28.0 1
- # .apply()自定义函数操作(做很多操作时)
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    With a default 0-based index this is the hundredth row.
    """
    return column.loc[99]
-
- hundredth_row = titanic_survival.apply(hundredth_row)
- #print(hundredth_row)
-
- # 将class换一种说法
def which_class(row):
    """Map the numeric ``pclass`` value of ``row`` to a readable label.

    Returns "Unknown" when the value is missing, the matching
    "First class" / "Second class" / "Third class" label for 1, 2 or 3,
    and None for any other value.
    """
    level = row["pclass"]
    if pd.isnull(level):
        return "Unknown"

    labels = (
        (1, "First class"),
        (2, "Second class"),
        (3, "Third class"),
    )
    for code, label in labels:
        if level == code:
            return label
    return None
- classes = titanic_survival.apply(which_class, axis=1)
- # print(classes)
-
- # 每一个属性的缺失值的数量
def not_null_count(column):
    """Return how many entries of ``column`` are missing.

    Note: despite the name, this counts the null values, not the
    non-null ones.
    """
    missing_mask = pd.isnull(column)
    # Each True in the mask is one missing entry, so the sum is the count.
    return int(missing_mask.sum())
-
- column_null_count = titanic_survival.apply(not_null_count)
- print(column_null_count)
- print("--------------")
- # 将连续值年龄变成离散的
def generate_age_label(row):
    """Turn the continuous ``age`` value of ``row`` into a discrete label.

    Returns "Unknown" when the age is missing, "minor" when it is below 18
    and "adult" otherwise.
    """
    years = row["age"]
    if pd.isnull(years):
        return "Unknown"
    return "minor" if years < 18 else "adult"
- age_labels = titanic_survival.apply(generate_age_label, axis=1)
- print(age_labels)
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 688 embark_town 2 alive 0 alone 0 dtype: int64 ------------------------- 0 adult 1 adult 2 adult 3 adult 4 adult ... 886 adult 887 adult 888 Unknown 889 adult 890 adult Length: 891, dtype: object
- # 显示各年龄段的人获救几率
- titanic_survival['age_labels'] = age_labels
- age_group_survival = titanic_survival.pivot_table(index="age_labels", values="survived")
- print(age_group_survival)
survived age_labels Unknown 0.293785 adult 0.381032 minor 0.539823
前面提的都是pandas中的DataFrame结构(由行和列组成),它由一系列Series组成
Series结构:DataFrame其中的一行或一列
- import pandas as pd
- survival = pd.read_csv("titanic.csv")
- series_fare = survival['fare']
- print(type(series_fare))
- print(series_fare[0:5])
- series_class = survival["class"]
- print(series_class[0:5])
<class 'pandas.core.series.Series'> 0 7.2500 1 71.2833 2 7.9250 3 53.1000 4 8.0500 Name: fare, dtype: float64 0 Third 1 First 2 Third 3 First 4 Third Name: class, dtype: object
- from pandas import Series
-
- fares = series_fare.values
- print(type(fares))
- # DataFrame里面是Series,Series里面又是ndarray,因此说明pandas是在numpy基础之上封装的
- class_ = series_class.values
- # print(class_)
- survival = Series(fares,index=class_)
- # 索引要选择能唯一确定这个样本的数据,这里只是举个例子(这个例子不是很好)
- print(survival)
- #survival[["First","Second"]]
- fiveten = survival[889:891]
- print(fiveten)
-
- original_index = survival.index.tolist()
- sorted_index = sorted(original_index)
- # cannot reindex from a duplicate axis,当reindex中有重复的值时,会报错
- sorted_by_index = survival.reindex(sorted_index)
- # 索引不能重复
<class 'numpy.ndarray'> Third 7.2500 First 71.2833 Third 7.9250 First 53.1000 Third 8.0500 ... Second 13.0000 First 30.0000 Third 23.4500 First 30.0000 Third 7.7500 Length: 891, dtype: float64 First 30.00 Third 7.75 dtype: float64
series也可以按照值或者键进行排序
- import numpy as np
- # 两个series维度一样,对应位置相加;维度不同,分别相加
- print(np.add(survival,survival)[0:5])
- np.max(survival)
Third 14.5000 First 142.5666 Third 15.8500 First 106.2000 Third 16.1000 dtype: float64 512.3292
以上是一些pandas的简单操作,还有其他的会在后面的案例里再说。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。