赞
踩
生成缺失数据
# Small demo frame: columns A and B each contain missing values, C is complete.
data = pd.DataFrame(
    {
        'A': [None, 2, 3, 4, None, 6],
        'B': [4, None, 7, 10, 15, 21],
        'C': [3, 6, 18, 7, 10, 13],
    }
)
data

# Row-wise deletion: drop every row that holds at least one missing value.
data_1 = data.copy().dropna(how='any', axis=0)
data_1

# Column-wise deletion: drop every column that holds at least one missing value.
data_1 = data.copy().dropna(how='any', axis=1)
data_1
# Mean imputation: replace each missing value with the mean of its column.
data_3 = data.copy()
data_3 = data_3.fillna(data_3.mean())

# Median imputation: replace each missing value with the median of its column.
data_3 = data.copy()
data_3 = data_3.fillna(data_3.median())

# Forward fill: propagate the previous valid value down each column.
# `fillna(method=...)` is deprecated in recent pandas releases; `ffill` is the
# supported spelling. A missing value in the first row has no predecessor and
# therefore stays missing.
data_3 = data.copy()
data_3 = data_3.ffill(axis=0)

# Backward fill: pull the next valid value up each column (`bfill` replaces
# the deprecated `fillna(method='backfill')`).
data_3 = data.copy()
data_3 = data_3.bfill(axis=0)
# scikit-learn imputation utilities
from sklearn.impute import SimpleImputer

# `strategy` selects the fill rule: 'mean', 'median', 'most_frequent' (mode)
# or 'constant'.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
data_3 = imp.fit_transform(data.copy())
data_3

from sklearn.impute import KNNImputer

imputer = KNNImputer(
    weights='distance',  # weight function used in prediction
    n_neighbors=2,  # number of neighbouring samples used for imputation
    missing_values=np.nan,
)
data_4 = imputer.fit_transform(data.copy())
data_4
miceforest 库可以实现基于随机森林的链式方程(MICE)多重插补,具有速度快、内存利用率高的特点,无需太多设置即可对缺失的分类数据和数值数据进行插补,并且提供了一系列可用的诊断图。
import miceforest as mf
from sklearn.datasets import load_iris

# Load the iris data set as a single frame (features plus target) and expose
# the target as a categorical `species` column.
iris = pd.concat(load_iris(as_frame=True, return_X_y=True), axis=1)
iris = iris.rename(columns={"target": "species"})
iris = iris.astype({"species": "category"})

# Introduce missing values into the data.
iris_amp = mf.ampute_data(iris, perc=0.25, random_state=1991)
iris_amp

# Initialise the imputation model.
# NOTE(review): the keyword names below follow the miceforest release this
# snippet was written for - confirm them against the installed version.
kernel = mf.ImputationKernel(
    iris_amp,
    datasets=5,  # build five imputed datasets
    save_all_iterations=True,
    random_state=666,
)

# Run two iterations of the MICE algorithm.
kernel.mice(2)

# Fetch imputed dataset 0 directly from the kernel.
completed_dataset = kernel.complete_data(dataset=0, inplace=False)
completed_dataset
from matplotlib import pyplot as plt

# Load the iris data set and keep only its first column.
iris = pd.concat(load_iris(as_frame=True, return_X_y=True), axis=1)
iris = iris.iloc[:, 0]
_ = plt.boxplot(iris)

# Create an outlier by overwriting the entry labelled 4 with 14.
iris_1 = iris.copy()
iris_1[4] = 14
_ = plt.boxplot(iris_1)
2. $3\sigma$ 原则(拉依达准则)
def three_sigma(data, n=3):
    """Return the entries of *data* that lie outside ``mean ± n * std``.

    Implements the three-sigma (Pauta) criterion for outlier detection.

    Args:
        data: A single DataFrame column (a ``pandas.Series``) of numeric
            values.
        n: Number of standard deviations that defines the accepted band.

    Returns:
        The entries of *data* strictly below ``mean - n * std`` or strictly
        above ``mean + n * std``, keeping their original index labels.
    """
    # Compute the statistics once instead of once per bound.
    center = data.mean()
    spread = n * data.std()
    # Strict inequalities: a value exactly on a bound is not an outlier.
    outside = (data < center - spread) | (data > center + spread)
    return data[outside]
# Outlier detection on the unmodified column.
print(three_sigma(iris, n=3))

# Create an outlier (entry labelled 4 set to 14) and run the detection again.
iris_1 = iris.copy()
iris_1[4] = 14
print(three_sigma(iris_1, n=3))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。