# 赞 (like)
# 踩 (dislike)
#1、导入模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# 2. Load the data and take a first look
data = pd.read_csv('data/train.csv')
data.head()

# 3. Data cleaning
# Inspect the number of rows and columns
data.shape
# Count the missing values in each column
data.isnull().sum()

# 4. Keep the useful features and drop the unimportant ones
data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
data.head()
# Remove every row that still contains a missing value
data = data.dropna()
data
# Encode the categorical variables (one-hot encoding)
data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
data_dummy.head()

# 5. Combine the remaining columns with the encoded variables.
# 'Survived' is deliberately listed first: the later split takes the target
# from column 0 and the features from every column after it.
data_conti = pd.DataFrame(
    data,
    columns=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
    index=data.index,
)
data = data_conti.join(data_dummy)
data.head()
# 6. Separate features from the target, then split into training and test sets
X = data.iloc[:, 1:]  # every column except the first
y = data.iloc[:, 0]   # the first column
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Standardize the continuous columns.
# The scaler is fitted on the training set only and then re-used for the test
# set, so both sets are scaled with the same mean/variance and no test-set
# statistics leak into preprocessing.
conti_cols = ['Age', 'SibSp', 'Parch', 'Fare']
stdsc = StandardScaler()
X_train_conti_std = stdsc.fit_transform(X_train[conti_cols])
X_test_conti_std = stdsc.transform(X_test[conti_cols])

# Convert the ndarrays back to DataFrames, keeping the original row index
X_train_conti_std = pd.DataFrame(
    data=X_train_conti_std, columns=conti_cols, index=X_train.index
)
X_test_conti_std = pd.DataFrame(
    data=X_test_conti_std, columns=conti_cols, index=X_test.index
)

# Put the standardized values into the feature matrices so the model is
# actually trained and evaluated on them. Copy first so the assignment does
# not write into a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[conti_cols] = X_train_conti_std
X_test[conti_cols] = X_test_conti_std

# Build the logistic regression model
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Apply the model to the test set and show the confusion matrix.
# The result is stored under a different name so the imported
# confusion_matrix function is not shadowed, and the arguments follow the
# (y_true, y_pred) order so rows are true labels and columns are predictions.
y_pred = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# Accuracy on the test set
print(classifier.score(X_test, y_test))
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。