当前位置:   article > 正文

Python进行特征提取_提取数据包中的特征 python

提取数据包中的特征 python
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Mon Aug 21 10:57:29 2017
  4. @author: 飘的心
  5. """
  6. #过滤式特征选择
  7. #根据方差进行选择,方差越小,代表该属性识别能力很差,可以剔除
  8. from sklearn.feature_selection import VarianceThreshold
  9. x=[[100,1,2,3],
  10. [100,4,5,6],
  11. [100,7,8,9],
  12. [101,11,12,13]]
  13. selector=VarianceThreshold(1) #方差阈值值,
  14. selector.fit(x)
  15. selector.variances_ #展现属性的方差
  16. selector.transform(x)#进行特征选择
  17. selector.get_support(True) #选择结果后,特征之前的索引
  18. selector.inverse_transform(selector.transform(x)) #将特征选择后的结果还原成原始数据
  19. #被剔除掉的数据,显示为0
  20. #单变量特征选择
  21. from sklearn.feature_selection import SelectKBest,f_classif
  22. x=[[1,2,3,4,5],
  23. [5,4,3,2,1],
  24. [3,3,3,3,3],
  25. [1,1,1,1,1]]
  26. y=[0,1,0,1]
  27. selector=SelectKBest(score_func=f_classif,k=3)#选择3个特征,指标使用的是方差分析F值
  28. selector.fit(x,y)
  29. selector.scores_ #每一个特征的得分
  30. selector.pvalues_
  31. selector.get_support(True) #如果为true,则返回被选出的特征下标,如果选择False,则
  32. #返回的是一个布尔值组成的数组,该数组只是那些特征被选择
  33. selector.transform(x)
  34. #包裹时特征选择
  35. from sklearn.feature_selection import RFE
  36. from sklearn.svm import LinearSVC #选择svm作为评定算法
  37. from sklearn.datasets import load_iris #加载数据集
  38. iris=load_iris()
  39. x=iris.data
  40. y=iris.target
  41. estimator=LinearSVC()
  42. selector=RFE(estimator=estimator,n_features_to_select=2) #选择2个特征
  43. selector.fit(x,y)
  44. selector.n_features_ #给出被选出的特征的数量
  45. selector.support_ #给出了被选择特征的mask
  46. selector.ranking_ #特征排名,被选出特征的排名为1
  47. #注意:特征提取对于预测性能的提升没有必然的联系,接下来进行比较;
  48. from sklearn.feature_selection import RFE
  49. from sklearn.svm import LinearSVC
  50. from sklearn import cross_validation
  51. from sklearn.datasets import load_iris
  52. #加载数据
  53. iris=load_iris()
  54. X=iris.data
  55. y=iris.target
  56. #特征提取
  57. estimator=LinearSVC()
  58. selector=RFE(estimator=estimator,n_features_to_select=2)
  59. X_t=selector.fit_transform(X,y)
  60. #切分测试集与验证集
  61. x_train,x_test,y_train,y_test=cross_validation.train_test_split(X,y,
  62. test_size=0.25,random_state=0,stratify=y)
  63. x_train_t,x_test_t,y_train_t,y_test_t=cross_validation.train_test_split(X_t,y,
  64. test_size=0.25,random_state=0,stratify=y)
  65. clf=LinearSVC()
  66. clf_t=LinearSVC()
  67. clf.fit(x_train,y_train)
  68. clf_t.fit(x_train_t,y_train_t)
  69. print('origin dataset test score:',clf.score(x_test,y_test))
  70. #origin dataset test score: 0.973684210526
  71. print('selected Dataset:test score:',clf_t.score(x_test_t,y_test_t))
  72. #selected Dataset:test score: 0.947368421053
  73. import numpy as np
  74. from sklearn.feature_selection import RFECV
  75. from sklearn.svm import LinearSVC
  76. from sklearn.datasets import load_iris
  77. iris=load_iris()
  78. x=iris.data
  79. y=iris.target
  80. estimator=LinearSVC()
  81. selector=RFECV(estimator=estimator,cv=3)
  82. selector.fit(x,y)
  83. selector.n_features_
  84. selector.support_
  85. selector.ranking_
  86. selector.grid_scores_
  87. #嵌入式特征选择
  88. import numpy as np
  89. from sklearn.feature_selection import SelectFromModel
  90. from sklearn.svm import LinearSVC
  91. from sklearn.datasets import load_digits
  92. digits=load_digits()
  93. x=digits.data
  94. y=digits.target
  95. estimator=LinearSVC(penalty='l1',dual=False)
  96. selector=SelectFromModel(estimator=estimator,threshold='mean')
  97. selector.fit(x,y)
  98. selector.transform(x)
  99. selector.threshold_
  100. selector.get_support(indices=True)
  101. #scikitlearn提供了Pipeline来讲多个学习器组成流水线,通常流水线的形式为:将数据标准化,
  102. #--》特征提取的学习器————》执行预测的学习器,除了最后一个学习器之后,
  103. #前面的所有学习器必须提供transform方法,该方法用于数据转化(如归一化、正则化、
  104. #以及特征提取
  105. #学习器流水线(pipeline)
  106. from sklearn.svm import LinearSVC
  107. from sklearn.datasets import load_digits
  108. from sklearn import cross_validation
  109. from sklearn.linear_model import LogisticRegression
  110. from sklearn.pipeline import Pipeline
  111. def test_Pipeline(data):
  112. x_train,x_test,y_train,y_test=data
  113. steps=[('linear_svm',LinearSVC(C=1,penalty='l1',dual=False)),
  114. ('logisticregression',LogisticRegression(C=1))]
  115. pipeline=Pipeline(steps)
  116. pipeline.fit(x_train,y_train)
  117. print('named steps',pipeline.named_steps)
  118. print('pipeline score',pipeline.score(x_test,y_test))
  119. if __name__=='__main__':
  120. data=load_digits()
  121. x=data.data
  122. y=data.target
  123. test_Pipeline(cross_validation.train_test_split(x,y,test_size=0.25,
  124. random_state=0,stratify=y))


本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号