# Naive grid search: evaluate every combination of three
# hyperparameters and keep the best-scoring one.
best_accuracy = 0
best_parameters = {"a": 0, "b": 0, "c": 0}
for a in range(1, 11):
    for b in range(1, 11):
        for c in range(1, 11):
            # MODEL is a placeholder for any classifier that
            # takes hyperparameters a, b and c
            model = MODEL(a, b, c)
            model.fit(training_data)
            preds = model.predict(validation_data)
            accuracy = metrics.accuracy_score(targets, preds)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_parameters["a"] = a
                best_parameters["b"] = b
                best_parameters["c"] = c
RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
# rf_grid_search.py
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

if __name__ == "__main__":
    df = pd.read_csv("./input/mobile_train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    classifier = ensemble.RandomForestClassifier(n_jobs=-1)
    param_grid = {
        "n_estimators": [100, 200, 250, 300, 400, 500],
        "max_depth": [1, 2, 5, 7, 11, 15],
        "criterion": ["gini", "entropy"]
    }

    model = model_selection.GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        scoring="accuracy",
        verbose=10,
        n_jobs=1,
        cv=5
    )

    model.fit(X, y)
    print(f"Best score: {model.best_score_}")
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.895, total=1.0s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.890, total=1.1s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.910, total=1.1s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.880, total=1.1s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.870, total=1.1s
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 3.7min finished
Best score: 0.889
Best parameters set:
	criterion: 'entropy'
	max_depth: 15
	n_estimators: 500
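The 360 fits reported in the log follow directly from the grid size; a quick sanity check:

# 6 n_estimators values x 6 max_depth values x 2 criteria = 72
# candidate combinations, each fitted once per fold with cv=5.
n_fits = 6 * 6 * 2 * 5
print(n_fits)  # 360, matching "Done 360 out of 360" above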
if __name__ == "__main__":
    classifier = ensemble.RandomForestClassifier(n_jobs=-1)
    param_grid = {
        "n_estimators": np.arange(100, 1500, 100),
        "max_depth": np.arange(1, 31),
        "criterion": ["gini", "entropy"]
    }
    model = model_selection.RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid,
        n_iter=20,
        scoring="accuracy",
        verbose=10,
        n_jobs=1,
        cv=5
    )
    model.fit(X, y)
    print(f"Best score: {model.best_score_}")
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")
We changed the parameter grid for random search, and the results seem to have improved slightly.
Best score: 0.8905
Best parameters set:
	criterion: entropy
	max_depth: 25
	n_estimators: 300
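Instead of fixed value lists, param_distributions also accepts anything with an .rvs() method, such as scipy.stats distributions, so the search can sample from a continuous range. A minimal sketch of such a space, assuming the same classifier and search setup as above:

from scipy import stats

# Integer distributions are sampled on each iteration;
# plain lists are still sampled uniformly.
param_distributions = {
    "n_estimators": stats.randint(100, 1500),
    "max_depth": stats.randint(1, 31),
    "criterion": ["gini", "entropy"],
}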
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn import pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def quadratic_weighted_kappa(y_true, y_pred):
    return metrics.cohen_kappa_score(y_true, y_pred, weights="quadratic")

if __name__ == '__main__':
    train = pd.read_csv('./input/train.csv')
    test = pd.read_csv('./input/test.csv')
    idx = test.id.values.astype(int)
    train = train.drop('id', axis=1)
    test = test.drop('id', axis=1)

    y = train.relevance.values
    traindata = list(train.apply(lambda x: '%s %s' % (x['text1'], x['text2']), axis=1))
    testdata = list(test.apply(lambda x: '%s %s' % (x['text1'], x['text2']), axis=1))

    tfv = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
        stop_words='english'
    )

    tfv.fit(traindata)
    X = tfv.transform(traindata)
    X_test = tfv.transform(testdata)

    svd = TruncatedSVD()
    scl = StandardScaler()
    svm_model = SVC()

    clf = pipeline.Pipeline([
        ('svd', svd),
        ('scl', scl),
        ('svm', svm_model)
    ])

    param_grid = {
        'svd__n_components': [200, 300],
        'svm__C': [10, 12]
    }

    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa,
        greater_is_better=True
    )

    model = model_selection.GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring=kappa_scorer,
        verbose=10,
        n_jobs=-1,
        refit=True,
        cv=5
    )

    model.fit(X, y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    best_model = model.best_estimator_
    best_model.fit(X, y)
    preds = best_model.predict(X_test)
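Note the double-underscore convention in param_grid: GridSearchCV routes 'svd__n_components' to the n_components parameter of the pipeline step named 'svd'. The same mechanism is available directly via set_params; a minimal sketch using the pipeline defined above:

# Set a step's parameter on the pipeline directly, using the
# <step_name>__<param_name> naming convention.
clf.set_params(svd__n_components=300, svm__C=10)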
# rf_gp_minimize.py
import numpy as np
import pandas as pd
from functools import partial
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from skopt import gp_minimize
from skopt import space

def optimize(params, param_names, x, y):
    # Convert the positional parameter list gp_minimize passes in
    # into a dict of named hyperparameters.
    params = dict(zip(param_names, params))
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []

    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]
        xtest = x[test_idx]
        ytest = y[test_idx]

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_accuracy = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_accuracy)

    # gp_minimize minimizes, so return the negative mean accuracy
    return -1 * np.mean(accuracies)

if __name__ == "__main__":
    df = pd.read_csv("./input/mobile_train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    param_space = [
        space.Integer(3, 15, name="max_depth"),
        space.Integer(100, 1500, name="n_estimators"),
        space.Categorical(["gini", "entropy"], name="criterion"),
        space.Real(0.01, 1, prior="uniform", name="max_features")
    ]

    param_names = [
        "max_depth",
        "n_estimators",
        "criterion",
        "max_features"
    ]

    optimization_function = partial(
        optimize,
        param_names=param_names,
        x=X,
        y=y
    )

    result = gp_minimize(
        optimization_function,
        dimensions=param_space,
        n_calls=15,
        n_random_starts=10,
        verbose=10
    )

    best_params = dict(
        zip(
            param_names,
            result.x
        )
    )
    print(best_params)
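skopt also ships a decorator that does the name-to-value mapping for you. A sketch, assuming the same param_space as above and swapping the explicit fold loop for cross_val_score as a simplification:

from skopt.utils import use_named_args

# The decorator converts the positional list gp_minimize passes
# in into keyword arguments named after the space dimensions.
@use_named_args(dimensions=param_space)
def optimize_named(**params):
    model = ensemble.RandomForestClassifier(**params)
    scores = model_selection.cross_val_score(
        model, X, y, cv=5, scoring="accuracy"
    )
    return -1 * np.mean(scores)

result = gp_minimize(optimize_named, dimensions=param_space, n_calls=15)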
This, too, produces a lot of output; the last part is shown below.
Iteration No: 14 started. Searching for the next optimal point.
Iteration No: 14 ended. Search finished for the next optimal point.
Time taken: 4.7793
Function value obtained: -0.9075
Current minimum: -0.9075
Iteration No: 15 started. Searching for the next optimal point.
Iteration No: 15 ended. Search finished for the next optimal point.
Time taken: 49.4186
Function value obtained: -0.9075
Current minimum: -0.9075
{'max_depth': 12, 'n_estimators': 100, 'criterion': 'entropy', 'max_features': 1.0}
from skopt.plots import plot_convergence
plot_convergence(result)
The convergence plot is shown in Figure 2.
import numpy as np
import pandas as pd
from functools import partial
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

def optimize(params, x, y):
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []

    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]
        xtest = x[test_idx]
        ytest = y[test_idx]

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_accuracy = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_accuracy)

    return -1 * np.mean(accuracies)

if __name__ == "__main__":
    df = pd.read_csv("./input/mobile_train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    param_space = {
        "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
        "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1500, 1)),
        "criterion": hp.choice("criterion", ["gini", "entropy"]),
        "max_features": hp.uniform("max_features", 0, 1)
    }

    optimization_function = partial(
        optimize,
        x=X,
        y=y
    )

    trials = Trials()
    hopt = fmin(
        fn=optimization_function,
        space=param_space,
        algo=tpe.suggest,
        max_evals=15,
        trials=trials
    )

    print(hopt)
❯ python rf_hyperopt.py
100%|██████████████████| 15/15 [04:38<00:00, 18.57s/trial, best loss: -0.9095000000000001]
{'criterion': 1, 'max_depth': 11.0, 'max_features': 0.821163568049807, 'n_estimators': 806.0}
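Note that fmin returns raw search-space coordinates: 'criterion': 1 is an index into the hp.choice list, and the quantized integers come back as floats. hyperopt provides space_eval to map these back to usable parameter values; a minimal sketch, assuming the param_space and hopt defined above:

from hyperopt import space_eval

# Map raw coordinates back to actual values: the choice index
# becomes "entropy" and scope.int re-applies the integer cast.
best_params = space_eval(param_space, hopt)
print(best_params)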