赞
踩
在Kaggle上有这样一个经典的题目:根据船上乘客的基本信息,预测其余乘客是否能够生还。话不多说,直接进入主题。
包含了源代码 + 训练集 + 测试集
def select_data():
    """Load the train/test sets and return model-ready feature frames.

    Keeps only the columns listed in ``selected_features``, fills missing
    values, and passes both feature frames through ``format_data``.

    Returns:
        A ``(train_x, train_y, test_x)`` tuple: the prepared training
        features, the ``Survived`` column of the training data, and the
        prepared test features.
    """
    selected_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    # NOTE(review): load_data is defined outside this block; presumably it
    # returns the training DataFrame by default and the test one for "test".
    train_data = load_data()
    test_data = load_data("test")

    # Take explicit copies so the imputation below writes to frames we own.
    # Filling a column of a column-subset with inplace=True is chained
    # assignment: it warns, and under pandas copy-on-write it changes nothing.
    train_x = train_data[selected_features].copy()
    train_y = train_data["Survived"]
    test_x = test_data[selected_features].copy()

    train_x["Age"] = train_x["Age"].fillna(train_x["Age"].mean())
    # 'S' is the most frequent value, so it is used as the fill value.
    train_x["Embarked"] = train_x["Embarked"].fillna('S')
    test_x["Age"] = test_x["Age"].fillna(test_x["Age"].mean())
    test_x["Fare"] = test_x["Fare"].fillna(test_x["Fare"].mean())

    train_x = format_data(train_x)
    test_x = format_data(test_x)

    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print() (which only added a stray "None" line).
    test_x.info()
    return train_x, train_y, test_x
def format_data(train_x):
    """Encode the ``Sex`` and ``Embarked`` columns as integer codes.

    ``Sex`` maps ``"male" -> 0`` and ``"female" -> 1``; ``Embarked`` maps
    ``"S" -> 0``, ``"C" -> 1`` and ``"Q" -> 2``. Any other value (including
    missing values) is left as it is.

    Each column is replaced as a whole instead of writing integers into a
    string column through masked ``.loc`` assignments, so a fully mapped
    column ends up with a numeric dtype rather than a mixed object dtype.

    Args:
        train_x: DataFrame containing ``Sex`` and ``Embarked`` columns.
            It is modified in place.

    Returns:
        The same ``train_x`` object, with both columns encoded.
    """
    # Numeric codes for gender.
    sex_codes = {"male": 0, "female": 1}
    # Numeric codes for the Embarked column.
    embarked_codes = {"S": 0, "C": 1, "Q": 2}

    for column, codes in (("Sex", sex_codes), ("Embarked", embarked_codes)):
        # Fall back to the original value so unknown entries pass through.
        train_x[column] = train_x[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )
    return train_x
def random_forest():
    """Grid-search a random forest and write test predictions to ./result.csv.

    Fits a ``GridSearchCV`` over ``n_estimators``, ``criterion`` and
    ``max_depth`` with 5-fold cross-validation on the data returned by
    ``select_data``, prints the best score and parameters, and writes one
    ``PassengerId,Survived`` row per prediction.
    """
    # Loaded again here only to recover the identifier column for the output.
    # NOTE(review): assumes these rows are in the same order as the test
    # features returned by select_data() -- confirm against load_data.
    test_data = load_data('test')
    x_train, y_train, x_test = select_data()

    model = RandomForestClassifier()
    paras = {
        'n_estimators': np.arange(10, 100, 10),
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(5, 50, 5),
    }
    gs = GridSearchCV(model, paras, cv=5, verbose=1, n_jobs=-1)
    gs.fit(x_train, y_train)
    y_pre = gs.predict(x_test)

    print('best score:', gs.best_score_)
    print('best parameters:', gs.best_params_)
    print((test_data))

    # The first column of the test frame is written under the PassengerId
    # header, as in the original positional lookup.
    passenger_ids = test_data.iloc[:, 0]
    with open('./result.csv', 'w', encoding="utf-8") as f:
        f.write("PassengerId,Survived" + "\n")
        f.writelines(
            f"{passenger_id},{prediction}\n"
            for passenger_id, prediction in zip(passenger_ids, y_pre)
        )
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。