Feature engineering has three major parts:
Feature extraction: extracting new information from text, images, audio, and other unstructured data to serve as features. For example, from the title of a Taobao listing we might extract the product category, the product color, whether it is an influencer product, and so on.
Feature creation: combining existing features, or computing with them, to obtain new features. For example, if one feature column is speed and another is distance, we can divide the two columns to create a new feature: the time taken to cover that distance (see the sketch after this list).
Feature selection: from all available features, choosing the ones that are meaningful and helpful to the model, so that we do not have to feed every feature into the model for training.
Four kinds of methods can be used to select features: the filter, embedded, and wrapper methods, and dimensionality-reduction algorithms.
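A minimal sketch of feature creation under the speed/distance example above (the toy DataFrame and its column names are invented for illustration):
import pandas as pd
# Hypothetical toy data: one feature column for speed and one for distance
df = pd.DataFrame({"speed": [60, 80, 100], "distance": [120, 240, 350]})
# Dividing distance by speed creates a new feature: the time taken to cover the distance
df["time"] = df["distance"] / df["speed"]
df.head()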
import pandas as pd
data = pd.read_csv(r"F:\计算机学习资料\机器学习b站菜菜\【机器学习】菜菜的sklearn课堂(1-12全课)\03数据预处理和特征工程\digit_recognizor.csv")
data.head()
x = data.iloc[:,1:]  # feature matrix: every column except the first
y = data.iloc[:,0]   # target: the first column holds the digit labels
x.shape
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold() # instantiate; with no argument the default threshold is a variance of 0
x_var0 = selector.fit_transform(x) # obtain the feature matrix with the disqualified features removed
# x_var0 = VarianceThreshold().fit_transform(x)
x_var0.shape
pd.DataFrame(x_var0).head()
import numpy as np
# np.median(x.var().values) # x.var() returns a Series; its median is 1352.286703180131
x_fsvar = VarianceThreshold(np.median(x.var().values)).fit_transform(x)
x_fsvar.shape
# For binary (Bernoulli) features, variance is p*(1-p); this removes Boolean features where one value covers 80% or more of the samples
X_bvar = VarianceThreshold(.8 * (1 - .8)).fit_transform(x)
X_bvar.shape
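Why .8 * (1 - .8)? A Boolean feature taking value 1 with probability p has variance p*(1-p), so a threshold of 0.8*(1-0.8) = 0.16 filters out features dominated by a single value. A minimal sketch with invented toy columns:
import numpy as np
from sklearn.feature_selection import VarianceThreshold
col_rare = np.array([1]*95 + [0]*5)  # p = 0.95, variance = 0.95*0.05 = 0.0475
col_balanced = np.array([0, 1]*50)   # p = 0.5, variance = 0.5*0.5 = 0.25
X_demo = np.column_stack([col_rare, col_balanced])
# Only variances strictly above 0.16 survive, so just the balanced column remains
VarianceThreshold(.8 * (1 - .8)).fit_transform(X_demo).shape  # (100, 1)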
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
import numpy as np
x = data.iloc[:,1:] # feature matrix before variance filtering
y = data.iloc[:,0]
x_fsvar = VarianceThreshold(np.median(x.var().values)).fit_transform(x) # feature matrix after variance filtering
#======【TIME WARNING:35mins +】======#
cross_val_score(KNN(),x,y,cv=5).mean() # 0.96585697
# %%timeit is an IPython magic command: put it at the top of a cell to time the cell's code
# To measure the time, it runs the cell's code many times (typically 7 runs) and averages them,
# so a %%timeit run takes far longer than a single execution of the same code
#======【TIME WARNING:4 hours】======#
%%timeit
cross_val_score(KNN(),x,y,cv=5).mean()
#======【TIME WARNING:20 mins+】======#
cross_val_score(KNN(),x_fsvar,y,cv=5).mean() # 0.96599974
#======【TIME WARNING:2 hours】======#
%%timeit
cross_val_score(KNN(),x_fsvar,y,cv=5).mean()
cross_val_score(RFC(n_estimators=10,random_state=0),x,y,cv=5).mean() # 0.93800038, before variance filtering
%%timeit
cross_val_score(RFC(n_estimators=10,random_state=0),x,y,cv=5).mean() # about 11.5 s per run
cross_val_score(RFC(n_estimators=10,random_state=0),x_fsvar,y,cv=5).mean() # 0.93880981, after variance filtering
%%timeit
cross_val_score(RFC(n_estimators=10,random_state=0),x_fsvar,y,cv=5).mean() # about 11.1 s per run
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np
x = data.iloc[:,1:] # feature matrix before variance filtering
y = data.iloc[:,0]
x_fsvar = VarianceThreshold(np.median(x.var().values)).fit_transform(x) # feature matrix after variance filtering
# Suppose we keep 300 features: instantiate SelectKBest with the scoring statistic and the number of features to keep
x_fschi = SelectKBest(chi2,k=300).fit_transform(x_fsvar,y)
x_fschi.shape
# Evaluate the result with random-forest cross-validation
cross_val_score(RFC(n_estimators=10,random_state=10),x_fschi,y,cv=5).mean()
%matplotlib inline
import matplotlib.pyplot as plt
score = []
for i in range(350,200,-10):
    x_fschi = SelectKBest(chi2,k=i).fit_transform(x_fsvar,y)
    once = cross_val_score(RFC(n_estimators=10,random_state=0),x_fschi,y,cv=5).mean()
    score.append(once)
plt.plot(range(350,200,-10),score)
plt.show()
chivalue, pvalues_chi = chi2(x_fsvar,y)
chivalue # chi-squared statistic of each feature
pvalues_chi # p-value of each feature
# How large should k be? We want to drop every feature whose p-value exceeds a chosen significance level, e.g. 0.05 or 0.01:
k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()
#x_fschi = SelectKBest(chi2, k=the chosen k).fit_transform(x_fsvar, y)
#cross_val_score(RFC(n_estimators=10,random_state=0),x_fschi,y,cv=5).mean()
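Since k was just computed from the p-values, the commented template can be run by passing that variable directly; a minimal sketch (same estimator settings as above):
x_fschi = SelectKBest(chi2, k=k).fit_transform(x_fsvar, y)
x_fschi.shape
cross_val_score(RFC(n_estimators=10,random_state=0),x_fschi,y,cv=5).mean()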
from sklearn.feature_selection import f_classif
F, pvalues_f = f_classif(x_fsvar,y)
F
pvalues_f
# k = F.shape[0] - (pvalues_f > 0.05).sum()
# x_fsF = SelectKBest(f_classif, k=the chosen k).fit_transform(x_fsvar, y)
# cross_val_score(RFC(n_estimators=10,random_state=0),x_fsF,y,cv=5).mean()
from sklearn.feature_selection import mutual_info_classif as MIC
result = MIC(x_fsvar,y) # estimate the mutual information between each feature and the target
k = result.shape[0] - sum(result <= 0) # keep only features whose estimated mutual information is above 0
result
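Mutual information follows the same pattern as chi2 and the F-test; a minimal sketch that feeds the computed k into SelectKBest (the name x_fsmic is invented for illustration):
x_fsmic = SelectKBest(MIC, k=k).fit_transform(x_fsvar, y)
cross_val_score(RFC(n_estimators=10,random_state=0),x_fsmic,y,cv=5).mean()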
The embedded method lets the algorithm itself decide which features to use:
first train a machine-learning algorithm or model to obtain a weight coefficient for each feature, then select features by those coefficients from largest to smallest.
The cutoff applied to these weight coefficients is a hyperparameter we must choose ourselves.
After the selection is done, we still need to evaluate the model ourselves.
feature_selection.SelectFromModel
class sklearn.feature_selection.SelectFromModel(estimator, threshold=None, prefit=False, norm_order=1, max_features=None)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
RFC_ = RFC(n_estimators=10,random_state=0) # instantiate, then feed the estimator into SelectFromModel
x_embedded = SelectFromModel(RFC_,threshold=0.005).fit_transform(x,y)
# Here we only want to keep a small number of features:
# for data with 784 features, 0.005 is a very high threshold, since on average each feature
# gets only about 0.001 of the total feature_importances_
x_embedded.shape
# The dimensionality of the feature matrix is clearly reduced
# As before, we can also draw a learning curve to find the best threshold
import numpy as np
import matplotlib.pyplot as plt
RFC_.fit(x,y).feature_importances_
threshold = np.linspace(0,(RFC_.fit(x,y).feature_importances_).max(),20)
score = []
for i in threshold:
    x_embedded = SelectFromModel(RFC_,threshold=i).fit_transform(x,y)
    once = cross_val_score(RFC_,x_embedded,y,cv=5).mean()
    score.append(once)
plt.plot(threshold,score)
plt.show()
x_embedded = SelectFromModel(RFC_,threshold=0.00067).fit_transform(x,y) # threshold read off the learning curve above
x_embedded.shape
cross_val_score(RFC_,x_embedded,y,cv=5).mean()
# Refine the learning curve over a narrower threshold range
score2 = []
for i in np.linspace(0,0.002,20):
    x_embedded = SelectFromModel(RFC_,threshold=i).fit_transform(x,y)
    once = cross_val_score(RFC_,x_embedded,y,cv=5).mean()
    score2.append(once)
plt.figure(figsize=[20,5])
plt.plot(np.linspace(0,0.002,20),score2)
plt.xticks(np.linspace(0,0.002,20))
plt.show()
from sklearn.feature_selection import RFE
RFC_ = RFC(n_estimators =10,random_state=0)
selector = RFE(RFC_, n_features_to_select=340, step=50).fit(x, y) # drop 50 features per iteration until 340 remain
selector.support_.sum() # boolean mask of the selected features; its sum is the number kept
selector.ranking_ # ranking of all features; the selected ones are ranked 1
x_wrapper = selector.transform(x)
cross_val_score(RFC_,x_wrapper,y,cv=5).mean()
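As with the filter and embedded methods, the wrapper's n_features_to_select can be tuned with a learning curve; a minimal sketch following the same pattern as above (very slow, since RFE refits the forest repeatedly at every point):
score = []
for i in range(1, 751, 50):
    x_wrapper = RFE(RFC_, n_features_to_select=i, step=50).fit_transform(x, y)
    once = cross_val_score(RFC_, x_wrapper, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20,5])
plt.plot(range(1, 751, 50), score)
plt.xticks(range(1, 751, 50))
plt.show()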