赞
踩
from sklearn.feature_selection import VarianceThreshold

# Placeholder: replace `...` with the feature matrix that should be filtered.
data = ...

# Features whose variance is lower than the threshold are removed.
var_thresh = VarianceThreshold(threshold=0.1)
transformed_data = var_thresh.fit_transform(data)
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California housing data as arrays plus the feature names.
data = fetch_california_housing()
X = data["data"]
col_names = data["feature_names"]
y = data["target"]

df = pd.DataFrame(X, columns=col_names)

# Add a column that is a pure function of MedInc so the correlation matrix
# contains a pair of features that are derived from one another.
df.loc[:, "MedInc_Sqrt"] = df.MedInc.apply(np.sqrt)

# Pairwise correlation matrix of all columns (displayed when run in a notebook).
df.corr()
得出相关矩阵，如图 1 所示。
- from sklearn.feature_selection import chi2
- from sklearn.feature_selection import f_classif
- from sklearn.feature_selection import f_regression
- from sklearn.feature_selection import mutual_info_classif
- from sklearn.feature_selection import mutual_info_regression
- from sklearn.feature_selection import SelectKBest
- from sklearn.feature_selection import SelectPercentile
-
class UnivariateFeatureSelction:
    """Thin wrapper that picks a scikit-learn univariate selector from its arguments.

    An integer ``n_features`` builds a ``SelectKBest`` keeping that many
    features; a float ``n_features`` builds a ``SelectPercentile`` keeping
    that fraction of the features (``0.1`` means 10 percent).
    """

    def __init__(self, n_features, problem_type, scoring):
        """Create the underlying selector.

        Args:
            n_features: ``int`` for a feature count (``SelectKBest``) or
                ``float`` for a fraction of features (``SelectPercentile``).
            problem_type: ``"classification"`` selects the classification
                scoring functions; any other value selects the regression
                scoring functions.
            scoring: name of the scoring function. For classification:
                ``"f_classif"``, ``"chi2"``, ``"mutual_info_classif"``.
                Otherwise: ``"f_regression"``, ``"mutual_info_regression"``.

        Raises:
            ValueError: if ``scoring`` is not valid for the problem type.
            TypeError: if ``n_features`` is neither ``int`` nor ``float``.
        """
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif,
            }
        else:
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }

        # ValueError/TypeError are subclasses of Exception, so callers that
        # catch the previously raised bare Exception keep working.
        if scoring not in valid_scoring:
            raise ValueError("Invalid scoring function")

        if isinstance(n_features, int):
            self.selection = SelectKBest(
                valid_scoring[scoring],
                k=n_features,
            )
        elif isinstance(n_features, float):
            # round() instead of int(): 0.29 * 100 evaluates to
            # 28.999999999999996, which int() would truncate to 28.
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                percentile=round(n_features * 100),
            )
        else:
            raise TypeError("Invalid type of feature")

    def fit(self, X, y):
        """Fit the underlying selector on ``X`` and ``y`` and return its result."""
        return self.selection.fit(X, y)

    def transform(self, X):
        """Reduce ``X`` to the features chosen by the fitted selector."""
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit the underlying selector and return the reduced ``X`` in one step."""
        return self.selection.fit_transform(X, y)
使用该类非常简单。
# Example usage: a float n_features of 0.1 keeps 10 percent of the features,
# ranked with the f_regression scoring function.
ufs = UnivariateFeatureSelction(
    n_features=0.1,
    problem_type="regression",
    scoring="f_regression",
)
ufs.fit(X, y)
X_transformed = ufs.transform(X)
- import pandas as pd
- from sklearn import linear_model
- from sklearn import metrics
- from sklearn.datasets import make_classification
-
class GreedyFeatureSelection:
    """Greedy forward feature selection driven by a model score.

    Starting from no features, each round adds the single remaining feature
    that gives the highest score together with the features already chosen.
    Selection stops when a round makes the score worse than the previous
    round, or when no candidate is left.
    """

    def evaluate_score(self, X, y):
        """Fit a logistic regression on ``X``/``y`` and return its ROC AUC.

        NOTE: the model is scored on the same data it was fitted on, so the
        value is a training-set AUC, not a validation score.
        """
        model = linear_model.LogisticRegression()
        model.fit(X, y)
        predictions = model.predict_proba(X)[:, 1]
        return metrics.roc_auc_score(y, predictions)

    def _feature_selection(self, X, y):
        """Run the greedy search.

        Args:
            X: 2-D array indexable as ``X[:, list_of_column_indices]``.
            y: targets passed through to ``evaluate_score``.

        Returns:
            ``(best_scores, good_features)``: the score after each accepted
            round and the column indices in the order they were accepted.
        """
        good_features = []
        best_scores = []
        num_features = X.shape[1]

        while True:
            this_feature = None
            best_score = 0

            # Try every not-yet-selected column on top of the current set.
            for feature in range(num_features):
                if feature in good_features:
                    continue
                selected_features = good_features + [feature]
                score = self.evaluate_score(X[:, selected_features], y)
                if score > best_score:
                    this_feature = feature
                    best_score = score

            # No candidate left (or none scored above 0): everything chosen
            # so far is kept as-is.
            if this_feature is None:
                break

            good_features.append(this_feature)
            best_scores.append(best_score)

            # Only the round that made the score worse is discarded. The
            # previous unconditional [:-1] also dropped the last feature when
            # the loop ended because the candidates ran out.
            if len(best_scores) > 1 and best_scores[-1] < best_scores[-2]:
                good_features.pop()
                best_scores.pop()
                break

        return best_scores, good_features

    def __call__(self, X, y):
        """Select features and return ``(X restricted to them, scores)``."""
        scores, features = self._feature_selection(X, y)
        return X[:, features], scores
-
if __name__ == "__main__":
    # Synthetic classification data: 1000 samples with 100 features.
    X, y = make_classification(n_samples=1000, n_features=100)
    # Calling the selector returns the reduced matrix and the per-round scores.
    X_transformed, scores = GreedyFeatureSelection()(X, y)
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing

# Load the California housing data as arrays plus the feature names.
data = fetch_california_housing()
X = data["data"]
col_names = data["feature_names"]
y = data["target"]

# Recursive feature elimination around a linear regression, asked to end
# up with 3 features.
model = LinearRegression()
rfe = RFE(
    estimator=model,
    n_features_to_select=3,
)
rfe.fit(X, y)
X_transformed = rfe.transform(X)
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Load the diabetes data as arrays plus the feature names.
data = load_diabetes()
X = data["data"]
col_names = data["feature_names"]
y = data["target"]

# Fit a random forest; the fitted model exposes feature_importances_.
model = RandomForestRegressor()
model.fit(X, y)
随机森林（或任何模型）的特征重要性可按如下方式绘制。
# The snippet uses `np` and `plt`, so both are imported here to keep it
# runnable on its own. `model` and `col_names` come from the preceding snippet.
import matplotlib.pyplot as plt
import numpy as np

importances = model.feature_importances_
# Indices that sort the importances in ascending order, so the bars are
# drawn in that order along the y axis.
idxs = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(idxs)), importances[idxs], align='center')
# Label every bar with the name of the feature it represents.
plt.yticks(range(len(idxs)), [col_names[i] for i in idxs])
plt.xlabel('Random Forest Feature Importance')
plt.show()
结果如图 3 所示。
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Load the diabetes data as arrays plus the feature names.
data = load_diabetes()
X = data["data"]
col_names = data["feature_names"]
y = data["target"]

# Let SelectFromModel fit the forest and keep the features it selects.
model = RandomForestRegressor()
sfm = SelectFromModel(estimator=model)
X_transformed = sfm.fit_transform(X, y)

# get_support() yields one flag per input feature; print the names of the
# features that were kept. The loop variables are named so that they do not
# shadow the target array `y`, and the flag is tested directly instead of the
# invalid `y = True`.
support = sfm.get_support()
print([name for name, keep in zip(col_names, support) if keep])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。