赞
踩
先导入需要的模块
import pandas as pd
import numpy as np
# Sample frame for the missing-value examples: three numeric columns with
# NaN gaps; row 1 is missing in every column.
data = pd.DataFrame(
    dict(
        A=[40, np.nan, 16, 22, 8, 44],
        B=[88, np.nan, 66, np.nan, 99, np.nan],
        C=[2, np.nan, 4, np.nan, np.nan, 6],
    )
)
data
A | B | C | |
---|---|---|---|
0 | 40.0 | 88.0 | 2.0 |
1 | NaN | NaN | NaN |
2 | 16.0 | 66.0 | 4.0 |
3 | 22.0 | NaN | NaN |
4 | 8.0 | 99.0 | NaN |
5 | 44.0 | NaN | 6.0 |
data.info()  # print a summary of data: index range, columns, non-null counts, dtypes, memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 5 non-null float64
1 B 3 non-null float64
2 C 3 non-null float64
dtypes: float64(3)
memory usage: 272.0 bytes
data.isnull().sum()  # another way to inspect data: count the missing values in each column
A 1
B 3
C 3
dtype: int64
官方文档
dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame by default, so `data` itself is left
# untouched and the following examples still start from the full frame
# (with inplace=True the rows would be removed from `data` permanently).
data.dropna(axis=0, how='any')
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
2 | 16 | 66.0 | 4.0 |
# Drop only the rows in which every value is missing.
# The result is returned as a new DataFrame; `data` is not modified, so the
# following examples keep working on the full frame.
data.dropna(axis=0, how='all')
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | NaN | NaN |
4 | 8 | 99.0 | NaN |
5 | 44 | NaN | 6.0 |
# Drop every column that contains at least one missing value.
# All three columns have gaps, so the result has no columns left; the call
# is therefore made without inplace=True to keep `data` usable afterwards.
data.dropna(axis=1, how='any')
0 |
---|
1 |
2 |
3 |
4 |
5 |
# Drop only the columns in which every value is missing.
# Returned as a new DataFrame; `data` is not modified.
data.dropna(axis=1, how='all')
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | NaN | NaN | NaN |
2 | 16 | 66.0 | 4.0 |
3 | 22 | NaN | NaN |
4 | 8 | 99.0 | NaN |
5 | 44 | NaN | 6.0 |
均值mean,众数mode,中值median
# Fill the missing entries of column C with the column mean.
# The assignment goes through a single .loc call with a boolean mask:
# `data.C[mask] = value` is chained assignment, which pandas warns about and
# which no longer updates the frame once copy-on-write is enabled.
# A copy is filled so that `data` keeps its gaps for the following examples.
filled = data.copy()
filled.loc[filled['C'].isnull(), 'C'] = filled['C'].mean()
filled
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | NaN | NaN | 4.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | NaN | 4.0 |
4 | 8 | 99.0 | 4.0 |
5 | 44 | NaN | 6.0 |
官方文档
fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None) # 使用指定的方法填充NA/NaN值
data.fillna(0)  # fill every missing value with 0
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | 0 | 0.0 | 0.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | 0.0 | 0.0 |
4 | 8 | 99.0 | 0.0 |
5 | 44 | 0.0 | 6.0 |
data.fillna(value={'A':11, 'B':22, 'C':66})  # fill with a dict: each column gets its own replacement value
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | 11 | 22.0 | 66.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | 22.0 | 66.0 |
4 | 8 | 99.0 | 66.0 |
5 | 44 | 22.0 | 6.0 |
# Forward fill ("pad"): each gap takes the previous non-missing value in its column.
# DataFrame.ffill() is used because the `method` argument of fillna() is deprecated.
data.ffill()
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | 40 | 88.0 | 2.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | 66.0 | 4.0 |
4 | 8 | 99.0 | 4.0 |
5 | 44 | 99.0 | 6.0 |
# Backward fill ("backfill"): each gap takes the next non-missing value in its column;
# a trailing gap with no later value stays NaN.
# DataFrame.bfill() is used because the `method` argument of fillna() is deprecated.
data.bfill()
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | 16 | 66.0 | 4.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | 99.0 | 6.0 |
4 | 8 | 99.0 | 6.0 |
5 | 44 | NaN | 6.0 |
data.fillna(100, limit=1)  # limit how many missing values are filled (here at most one per column)
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.0 |
1 | 100 | 100.0 | 100.0 |
2 | 16 | 66.0 | 4.0 |
3 | 22 | NaN | NaN |
4 | 8 | 99.0 | NaN |
5 | 44 | NaN | 6.0 |
data.A.fillna(999)  # fill only part of the data: just column A
0 40
1 999
2 16
3 22
4 8
5 44
Name: A, dtype: object
官方文档
interpolate(method="linear",axis=0,limit=None,inplace=False,limit_direction="forward",limit_area=None,downcast=None)
data.interpolate(method='linear')  # linear interpolation
A | B | C | |
---|---|---|---|
0 | 40 | 88.0 | 2.000000 |
1 | NaN | 77.0 | 3.000000 |
2 | 16 | 66.0 | 4.000000 |
3 | 22 | 82.5 | 4.666667 |
4 | 8 | 99.0 | 5.333333 |
5 | 44 | 99.0 | 6.000000 |
data['C'].interpolate(method='polynomial', order=2)  # interpolate column C with a second-order polynomial
0 2.000000
1 3.066667
2 4.000000
3 4.800000
4 5.466667
5 6.000000
Name: C, dtype: float64
# Sample frame for the duplicate-handling examples: row 4 repeats row 1.
data = pd.DataFrame(
    dict(
        A=[40, 66, 16, 22, 66],
        B=[4, 88, 5, 99, 88],
        C=[2, 33, 7, 20, 33],
    )
)
data
A | B | C | |
---|---|---|---|
0 | 40 | 4 | 2 |
1 | 66 | 88 | 33 |
2 | 16 | 5 | 7 |
3 | 22 | 99 | 20 |
4 | 66 | 88 | 33 |
data[data.duplicated()]  # show the duplicated rows
A | B | C | |
---|---|---|---|
4 | 66 | 88 | 33 |
data.drop_duplicates(inplace=True)  # drop the duplicated rows, modifying data in place
data
A | B | C | |
---|---|---|---|
0 | 40 | 4 | 2 |
1 | 66 | 88 | 33 |
2 | 16 | 5 | 7 |
3 | 22 | 99 | 20 |
# Reset the index
data.reset_index(drop=True, inplace=True)  # drop=True discards the old index
data
A | B | C | |
---|---|---|---|
0 | 40 | 4 | 2 |
1 | 66 | 88 | 33 |
2 | 16 | 5 | 7 |
3 | 22 | 99 | 20 |
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。