当前位置:   article > 正文

【天池入门笔记】【算法入门】sklearn入门系列二:聚类算法与特征选择_伸缩聚类法是特征选择方法

伸缩聚类法是特征选择方法

聚类算法主要有三种:层次聚类(Agglomerative),划分聚类(KMeans),密度聚类(DBSCAN)

1、聚类

  1. #层次聚类
  2. from sklearn.cluster import AgglomerativeClustering
  3. import pandas as pd
  4. from sklearn.preprocessing import StandardScaler
  5. data = pd.read_csv('data.csv').fillna(0)
  6. label = data.label
  7. feature = data.drop('label',axis=1)
  8. feature = StandardScaler().fit_transform(feature)
  9. cluster = AgglomerativeClustering(n_clusters=2)
  10. cluster.fit(feature)
  11. pred = cluster.fit_predict(feature)
  12. from sklearn.metrics import accuracy_score
  13. print(accuracy_score(label,pred))
  14. #划分聚类(kmeans)
  15. cluster = KMeans(n_clusters=2,n_jobs=-1,init='k-means++')
  16. #密度聚类(DBSCAN)
  17. from sklearn.cluster import DBSCAN
  18. cluster = DBSCAN(n_jobs=-1,eps=0.01)
  19. pred = cluster.fit_predict(feature)

2、特征选择

  1. #采用pearsonr相关系数选特征
  2. import numpy as np
  3. import pandas as pd
  4. data.label.replace(-1,0,inplace = True)
  5. data = data.fillna(0)
  6. y = data.label
  7. x = data.drop('label',axis=1)
  8. from sklearn.model_selection import train_test_split
  9. X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=0) #10%的数据作为测试集
  10. from scipy.stats import pearsonr
  11. columns = X_train.columns
  12. #用pearsonr()逐列计算特征与标签的相关性
  13. feature_importance = [(column,pearsonr(X_train[column],y_train)[0]) for column in columns]
  14. #pearsonr()函数返回pearsonr值和p值,我们只需要pearsonr值,故取[0]
  15. feature_importance.sort(key = lambda x:x[1])
  16. #lambda函数是取其pearsonr值进行排序,丢弃column

 

  1. #采用xgboost检验一下特征选择效果
  2. import xgboost as xgb
  3. dtrain = xgb.DMatrix(X_train,label=y_train)
  4. dtest = xgb.DMatrix(X_test,label=y_test)
  5. params = {
  6. 'booster':'gbtree',
  7. 'objective':'rank:pairwise',
  8. 'eval_metric':'auc',
  9. 'gamma':0.1,
  10. 'min_child_weight':2,
  11. 'max_depth':5,
  12. 'lambda':10,
  13. 'subsample':0.7,
  14. 'colsample_bytree':0.7,
  15. 'eta':0.01,
  16. 'tree_method':'exact',
  17. 'seed':0,
  18. 'nthread':7
  19. }
  20. watchlist = [(dtrain,'train'),(dtest,'test')]
  21. model = xgb.train(params,dtrain,num_boost_round=100,evals=watchlist)
  22. #再看一下删除相关系数小的特征之后的结果
  23. #查看feature_importance,发现['merchant_max_distance']的pearsonr值较小
  24. delete_feature = ['merchant_max_distance']
  25. X_train = X_train[[i for i in columns if i not in delete_feature]]
  26. X_test = X_test[[i for i in columns if i not in delete_feature]]
  27. dtrain = xgb.DMatrix(X_train,label=y_train)
  28. dtest = xgb.DMatrix(X_test,label=y_test)
  29. watchlist = [(dtrain,'train'),(dtest,'test')]
  30. model = xgb.train(params,dtrain,num_boost_round=100,evals=watchlist)
  31. #运行之后比较出来的结果的auc值

 

  1. #使用模型进行特征选择
  2. #LogisticRegression
  3. from sklearn.metrics import roc_auc_score
  4. from sklearn.linear_model import LogisticRegression
  5. X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=0)
  6. lr = LogisticRegression(penalty='l2',random_state=0,n_jobs=-1).fit(X_train,y_train)
  7. pred = lr.predict_proba(X_test)[:,1]
  8. print(roc_auc_score(y_test,pred))
  9. #Lasso
  10. from sklearn.linear_model import RandomizedLasso
  11. from sklearn.datasets import load_boston
  12. boston = load_boston()
  13. X = boston['data']
  14. Y = boston['target']
  15. names = boston['feature_names']
  16. rlasso = RandomizedLasso(alpha=0.025).fit(X,Y)
  17. feature_importance = sorted(zip(names,rlasso.scores_))
  18. #RFE
  19. from sklearn.feature_selection import RFE
  20. X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=0)
  21. rf = RandomForestClassifier()
  22. rfe = RFE(rf).fit(X_train,y_train)
  23. feature_importance = sorted(zip(
  24. map(lambda x:round(x,4),rfe.ranking_),columns),reverse=True)

 

  1. #SVC
  2. cls = SVC(probability = True,kernel = 'rbf',C=0.1,max_iter=10)
  3. cls.fit(X_train,y_train)
  4. y_pred = cls.predict_proba(X_test)[:,1]
  5. metrics.roc_auc_score(y_test,y_pred)
  6. #MLPRegresson
  7. from sklearn.neural_network import MLPClassifier,MLPRegressor
  8. reg = MLPRegressor(hidden_layer_sizes = (10,10,10),learning_rate_init = 0.1)
  9. #DecisionTreeClassifier
  10. from sklearn.tree import DecisionTreeClassifier
  11. cls = DecisionTreeClassifier(max_depth=6,min_samples_split=10,
  12. min_samples_leaf=5,max_features=0.7)
  13. cls.fit(X_train,y_train)
  14. y_pred = cls.predict_proba(X_test)[:,1]
  15. metrics.roc_auc_score(y_test,y_pred)
  16. #RandomForestClassifier
  17. cls = RandomForestClassifier(max_depth=6,min_samples_split=10,
  18. min_samples_leaf=5,max_features=0.7)
  19. cls.fit(X_train,y_train)
  20. y_pred = cls.predict_proba(X_test)[:,1]
  21. metrics.roc_auc_score(y_test,y_pred)
  22. #ExtraTreesClassifier

 

本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号