赞
踩
使用 Python 的机器学习库 scikit-learn(sklearn)完成实验,开发环境为 Jupyter Notebook。
- # 导入数据集
- import pandas as pd
- datas_train = pd.read_csv('TTNKHP/train.csv') # 训练集数据
- # 查看数据前几条
- datas_train.head()
- # 收集特征值和目标值
- x = datas_train[['Pclass','Sex','Age','SibSp','Parch']]
- y = datas_train['Survived']
- # 填充缺失值
- x['Age'].fillna(x['Age'].mean(),inplace=True)
- # 转为字典类型
- x = x.to_dict(orient='records')
- # 数据集划分
- from sklearn.model_selection import train_test_split
- x_train,x_test,y_train,y_test = train_test_split(x,y)
- # 特征抽取
- from sklearn.feature_extraction import DictVectorizer
- transter = DictVectorizer()
- x_train= transter.fit_transform(x_train)
- x_test = transter.transform(x_test)
- # 使用网格搜索和交叉验证进行调参
- from sklearn.tree import DecisionTreeClassifier
- medicter = DecisionTreeClassifier(criterion='entropy')
- from sklearn.model_selection import GridSearchCV
- GSCV = GridSearchCV(medicter,param_grid={'max_depth':[1,2,3,4,5,6,7,8,9,10,11,12,13]},cv=4)
- GSCV.fit(x_train,y_train)
- # 输出信息
- GSCV.score(x_test,y_test)
- GSCV.best_params_
- #0.8340807174887892
- #{'max_depth': 4}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。