def list_add(a, b):
    """Add ``b`` to ``a`` element-wise, storing the sums back into ``a``.

    Args:
        a: Mutable sequence that is updated in place.
        b: Sequence with the same length as ``a``.

    Returns:
        None. The result is written into ``a``.

    Raises:
        AssertionError: If ``a`` and ``b`` differ in length. The check runs
            before any element is touched, so ``a`` is left unmodified.
    """
    assert len(a) == len(b)
    for i, value in enumerate(b):
        a[i] += value
def list_div(a, num):
    """Divide every element of ``a`` by ``num``, storing the results in ``a``.

    Args:
        a: Mutable sequence that is updated in place.
        num: Divisor applied to each element.

    Returns:
        The same object ``a`` that was passed in, after the update.
    """
    for i, value in enumerate(a):
        a[i] = value / num
    return a
# Module-level accumulators and settings read/updated by train_test().
acc_mean = 0  # running sum of the single check-subset score of each round
acc_mean_10test = 0  # running sum of the per-round averaged check scores
# One running total per column of df, excluding the last column
# (presumably the label column -- confirm against how x and y are built).
feature_mean = [0] * len(df.columns[:-1])
# Nominal round count (presumably the intended number of training rounds;
# confirm against the loop in train_test()).
n = 1000
max_acc = 0
index_of_max = 0  # highest single check-subset accuracy and its round index
min_acc = 1
index_of_min = 0  # lowest single check-subset accuracy and its round index
max_arg_acc = 0
index_of_max_acc2 = 0  # highest averaged check accuracy and its round index
# Number of trees in the forest.
trees = 10
# NOTE: two module-level train_test_split(...) calls used to follow here.
# They used random_state=i + 1 although no `i` is defined in the visible
# module scope, and every name they assigned is reassigned before being read,
# so they were removed.
def train_test():
    """Train and score a random forest over 50 different data splits.

    Each round ``i`` (seeded with ``random_state=i + 1``):

    1. splits ``x``/``y`` 60/40 into train and test parts, then splits the
       test part 25/75 into ``test2`` and ``check`` subsets;
    2. fits a ``RandomForestClassifier`` with ``trees`` estimators on the
       training part;
    3. scores it on the check subset, then on 10 further re-splits of the
       test part, and averages those 11 scores.

    Progress and a final summary are printed. The module-level accumulators
    (``acc_mean``, ``acc_mean_10test``, ``max_acc``, ``min_acc``,
    ``max_arg_acc`` and their ``index_of_*`` companions, plus
    ``feature_mean``) are reset on entry and hold this run's results on exit.

    Returns:
        None.
    """
    global acc_mean, acc_mean_10test, max_acc, max_arg_acc, min_acc
    global index_of_max, index_of_max_acc2, index_of_min

    n_rounds = 50

    # The running averages below divide by (i + 1), which is only correct
    # when the accumulators start from their initial values; reset them so a
    # repeated call does not fold an earlier run into the new one.
    acc_mean = 0
    acc_mean_10test = 0
    max_acc = 0
    min_acc = 1
    max_arg_acc = 0
    index_of_max = 0
    index_of_min = 0
    index_of_max_acc2 = 0
    feature_mean[:] = [0] * len(feature_mean)

    for i in range(n_rounds):
        print('第 %d 次 test' % (i + 1))
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, train_size=0.6, random_state=i + 1)
        x_test2, x_check, y_test2, y_check = train_test_split(
            x_test, y_test, train_size=0.25, random_state=i + 1)

        # n_estimators is the number of trees. Author's notes: 10 trees were
        # enough in testing, and random_state=594 gave the highest accuracy
        # on the validation set.
        forest = RandomForestClassifier(n_estimators=trees, random_state=100)
        forest.fit(x_train, y_train.astype('int'))

        # Score on the check subset of this round's split.
        test_score = forest.score(x_check, y_check.astype('int'))
        print("accuracy on the training subset:{:.3f}".format(
            forest.score(x_train, y_train.astype('int'))))
        print("accuracy on the test subset:{:.3f}".format(test_score))

        if max_acc <= test_score:
            max_acc = test_score
            index_of_max = i + 1
        if min_acc >= test_score:
            min_acc = test_score
            index_of_min = i + 1

        acc_mean += test_score
        print("i times avg accuracy on the check subset:{:.3f}".format(
            acc_mean / (i + 1)))

        # Average this model over the score above plus 10 re-splits of the
        # same test part (11 scores in total).
        acc_mean2 = test_score
        print('10 check score')
        for j in range(10):
            x_test2, x_check, y_test2, y_check = train_test_split(
                x_test, y_test, train_size=0.25, random_state=j + 1)
            acc_mean2 += forest.score(x_check, y_check.astype('int'))
            # NOTE(review): the printed value is the score on x_test2 while
            # the accumulated value is the score on x_check -- confirm this
            # is intended.
            print(forest.score(x_test2, y_test2.astype('int')), end=' ')
        print()
        print("avg accuracy on the 11 check subset:{:.3f}".format(
            acc_mean2 / 11))
        acc_mean2 = acc_mean2 / 11

        # Remember the round with the highest averaged accuracy.
        if max_arg_acc <= acc_mean2:
            max_arg_acc = acc_mean2
            index_of_max_acc2 = i + 1

        list_add(feature_mean, forest.feature_importances_)
        acc_mean_10test += acc_mean2
        print("all-avg accuracy on the 10 check subsets:{:.3f}".format(
            acc_mean_10test / (i + 1)))

    # Final summary. The mean is taken over the rounds that actually ran.
    print('Final :')
    print("avg accuracy on the check subset:{:.3f}".format(
        acc_mean / n_rounds))
    print('max_acc on the check subset: %f' % max_acc)
    print('index : %d' % index_of_max)
    print('min_acc on the check subset: %f' % min_acc)
    print('index : %d' % index_of_min)
    print('max_arg_acc on 11 check subsets: %f' % max_arg_acc)
    print('index : %d' % index_of_max_acc2)


train_test()
# train_test() is already invoked right after its definition, so it is not
# called again here; a second call would only repeat the whole experiment.

# Rebuild the split that produced the highest averaged check accuracy
# (index_of_max_acc2) and train the final forest on it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=index_of_max_acc2)
x_test2, x_check, y_test2, y_check = train_test_split(
    x_test, y_test, train_size=0.25, random_state=index_of_max_acc2)
forest = RandomForestClassifier(n_estimators=trees, random_state=100)
forest.fit(x_train, y_train.astype('int'))

# Average the accuracy over the check subset above plus 10 re-splits of the
# same test part (11 scores in total).
result_acc = forest.score(x_check, y_check.astype('int'))
for j in range(10):
    x_test2, x_check, y_test2, y_check = train_test_split(
        x_test, y_test, train_size=0.25, random_state=j + 1)
    # Score once and reuse the value for both the printout and the total.
    check_score = forest.score(x_check, y_check.astype('int'))
    print(check_score, end=' ')
    result_acc += check_score
print()
print("avg accuracy on the 11 check subsets:{:.3f}".format(result_acc / 11))

# Plot the feature importances of the final forest.
n_features = x.shape[1]
plt.barh(range(n_features), forest.feature_importances_, align='center')
plt.yticks(np.arange(n_features), names)
plt.title("random forest with %d trees:" % trees)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。