# 赞 / 踩 ("upvote" / "downvote") — web-page vote buttons captured by the
# scraper, not code; kept here as a comment so the file stays parseable.
# Feature-scaling demos on the iris data set.
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

iris = load_iris()

# Standardize: per-feature zero mean, unit variance.
std = StandardScaler()
c = std.fit_transform(iris.data)
c

# Normalize: scale each sample (row) to unit norm.
from sklearn.preprocessing import Normalizer
Normalizer().fit_transform(iris.data)

# Min-max scale each feature into the range [1, 2].
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler(feature_range=(1, 2)).fit_transform(iris.data)

# Binarize: values > 3 become 1, the rest 0.
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(iris.data)
# One-hot encode the iris target labels.
# NOTE(review): relies on `pd` (pandas), imported further down in this file.
pd.get_dummies(iris.target, sparse=True)
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder requires a 2-D input, so reshape to an (n_samples, 1) column.
c = OneHotEncoder(categories='auto').fit_transform(iris.target.reshape((-1, 1)))
c
# REPL output (was pasted as bare text, which is a SyntaxError in a .py file):
# <150x3 sparse matrix of type '<class 'numpy.float64'>'
#     with 150 stored elements in Compressed Sparse Row format>
c.toarray()
from sklearn.impute import SimpleImputer
# Mean-impute missing values (a no-op here: the encoded matrix has no NaNs).
d = SimpleImputer(strategy='mean').fit_transform(c)
d
# Missing-value handling and de-duplication on a DataFrame `a`
# (defined elsewhere — not visible in this chunk).
a.dropna(how='all', axis=0, inplace=True)   # drop rows that are entirely NaN
# `fillna(method='ffill')` is deprecated in modern pandas; `.ffill()` is the
# supported equivalent (forward-fill along each row here).
a.ffill(axis=1)
a.fillna(np.mean(a.iloc[:, 1]))             # fill NaNs with the mean of column 1
a.fillna(2)                                 # fill NaNs with a constant
b = a.duplicated(keep='first', subset=['nihao'])       # flag duplicates on column 'nihao'
b
c = a.drop_duplicates(keep='first', subset=['nihao'])  # drop those duplicates
c
from scipy import stats

# Normality check and outlier removal on column 'age' of DataFrame `a`
# (defined elsewhere — not visible in this chunk).
mean = a['age'].mean()
std = a['age'].std()
# Kolmogorov-Smirnov test against a normal distribution parameterized by the
# sample's own mean and std.
print(stats.kstest(a['age'], 'norm', (mean, std)))
# REPL output: KstestResult(statistic=0.19419..., pvalue=0.05821...)

# 3-sigma rule: keep only values within three standard deviations of the mean.
data = a[np.abs(a['age'] - mean) <= 3 * std]
a['age'].plot(kind='box')

# Lower quartile.
q1 = a["age"].quantile(0.25)
q1
# Upper quartile.
q3 = a["age"].quantile(0.75)
q3
iqr = q3 - q1
# Lower fence (Tukey's rule).
bottom = q1 - 1.5 * iqr
bottom
# Upper fence.
upper = q3 + 1.5 * iqr
upper
# Keep only the rows inside the IQR fences.
a[(a['age'] >= bottom) & (a['age'] <= upper)]
# Log transform of the data.
np.log(a)
from sklearn.preprocessing import FunctionTransformer
a = np.arange(0, 12).reshape(2, 6)
a
# REPL output (was pasted as bare text, which is a SyntaxError in a .py file):
# array([[ 0,  1,  2,  3,  4,  5],
#        [ 6,  7,  8,  9, 10, 11]])
# Wrap np.log as a transformer so it fits the sklearn fit/transform API.
m = FunctionTransformer(np.log).fit_transform(a)
m
# REPL output — note log(0) produces -inf in the first cell:
# array([[      -inf, 0.        , 0.69314718, 1.09861229, 1.38629436,
#         1.60943791],
#        [1.79175947, 1.94591015, 2.07944154, 2.19722458, 2.30258509,
#         2.39789527]])
# Filter-style feature selection: drop features whose variance is <= 2.
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=2)
b = selector.fit_transform(a)
b
# Pandas equivalent: drop constant (zero-std) columns from DataFrame X.
X = X.drop(X.columns[X.std() == 0], axis=1)
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Univariate feature selection: keep the k highest-scoring features.
SelectKBest(k=2).fit_transform(iris.data, iris.target)  # default score func is f_classif
c = SelectKBest(chi2, k=2)
d = c.fit_transform(iris.data, iris.target)
d
c.scores_
# REPL output (was pasted as bare text, which is a SyntaxError in a .py file):
# array([ 10.81782088,   3.7107283 , 116.31261309,  67.0483602 ])
c.pvalues_
# REPL output:
# array([4.47651499e-03, 1.56395980e-01, 5.53397228e-26, 2.75824965e-15])
# Feature indices sorted by chi2 score, best first.
m = np.argsort(c.scores_)[::-1]
m
e = pd.DataFrame(iris.data)
e
# Column labels of the two highest-scoring features (here: [2, 3]).
list(e.columns.values[m[0:2]])
# Self-contained version of the chi-squared SelectKBest demo above:
# pick the two iris features with the highest chi2 scores and report
# their column labels.
from sklearn.feature_selection import SelectKBest, chi2, f_classif
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()

selector = SelectKBest(chi2, k=2)
selector.fit_transform(iris.data, iris.target)
# Rank feature indices by chi2 score, best first.
order = np.argsort(selector.scores_)[::-1]
frame = pd.DataFrame(iris.data)
list(frame.columns.values[order[:2]])
from sklearn.decomposition import PCA


def pca():
    """
    Reduce the iris features with principal component analysis.

    Fits PCA on the module-level ``iris`` data, keeping enough components
    to explain 95% of the variance, then prints and returns the projection.

    :return: ndarray of shape (n_samples, n_kept_components)
    """
    # n_components semantics: an int keeps exactly that many components;
    # a float in (0, 1) keeps enough components to explain that fraction
    # of the variance (0.95 here).
    model = PCA(n_components=0.95)
    data = model.fit_transform(iris.data)
    print(data)
    # Bug fix: the docstring documents a return value, but the original
    # (whose body also lost its indentation in this file) returned None.
    return data
# Linear discriminant analysis: supervised projection of iris onto
# 2 discriminant components (uses the class labels, unlike PCA).
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda_projection = LDA(n_components=2).fit_transform(iris.data, iris.target)
lda_projection
# Scraper footer from the source web page, kept as a comment:
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。
# (Translation: all rights reserved.)