赞
踩
# Show an explanatory screenshot inline (works only inside Jupyter/IPython).
from IPython.display import Image
# NOTE(review): absolute local Windows path — will not resolve on any other machine.
Image(filename=r'C:\Users\a\Desktop\暑假\Titantic\QQ截图20190827081938.png',width=800)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the training file and inspect summary statistics of the numeric columns.
data = pd.read_csv('titanic_train.csv')
data.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
可以看到数据中的Age列和Cabin列以及Embarked的列有缺失值,Cabin列缺失值数量太多,直接舍去,然后Ticket列对于实际的获救应该也没有什么关系
# Preview the first five rows to see all columns, including the non-numeric ones.
data.head(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
查阅当时的背景资料发现在当时泰坦尼克失事之后的逃生政策是妇女和儿童优先,下面查看一下妇女和儿童的逃生率,可以看到女性的平均获救人数是0.74远大于男性的
# Historical accounts say "women and children first" governed the evacuation;
# check the survival rate by sex (female ≈ 0.74 vs male ≈ 0.19).
data.groupby('Sex')[['Survived']].mean()
Survived | |
---|---|
Sex | |
female | 0.742038 |
male | 0.188908 |
查看获救人群的平均年龄 然而好像说明不了什么问题
# Mean age of survivors vs non-survivors — not very informative on its own.
data.groupby('Survived')[['Age']].mean()
Age | |
---|---|
Survived | |
0 | 30.626179 |
1 | 28.343690 |
查看女性人群的平均年龄 然而好像说明不了什么问题
# Mean age within the female subset, split by survival — again inconclusive.
data[data['Sex'] == 'female'].groupby('Survived')[['Age']].mean()
Age | |
---|---|
Survived | |
0 | 25.046875 |
1 | 28.847716 |
查阅资料发现当时儿童的定义为14岁以下,查看儿童的获救率 发现随着年龄的增长获救率会降低这也印证了妇女和儿童优先的逃生政策
# "Children" meant under 14 at the time. Sweep an age ceiling from 0 to 19 and
# print survival rate by sex for each subset: male survival falls as the cutoff
# rises, consistent with the "women and children first" policy — so age looks
# like an important feature.
for age_cap in range(20):
    juveniles = data[data['Age'] <= age_cap]
    print(juveniles.pivot_table(index='Sex', values='Survived'))
Empty DataFrame Columns: [] Index: [] Survived Sex female 1.0 male 0.8 Survived Sex female 0.600000 male 0.642857 Survived Sex female 0.583333 male 0.722222 Survived Sex female 0.705882 male 0.652174 Survived Sex female 0.761905 male 0.652174 Survived Sex female 0.739130 male 0.666667 Survived Sex female 0.750000 male 0.615385 Survived Sex female 0.730769 male 0.607143 Survived Sex female 0.633333 male 0.593750 Survived Sex female 0.612903 male 0.575758 Survived Sex female 0.593750 male 0.555556 Survived Sex female 0.593750 male 0.567568 Survived Sex female 0.617647 male 0.567568 Survived Sex female 0.631579 male 0.538462 Survived Sex female 0.651163 male 0.525000 Survived Sex female 0.673469 male 0.431373 Survived Sex female 0.690909 male 0.396552 Survived Sex female 0.676471 male 0.338028 Survived Sex female 0.706667 male 0.292135
在当时的社会等级制度严格 查看一下三个船舱等级对应的获救率 发现船舱等级不同获救率也会有很大的不同所以船舱等级也是一个重要特征
# Social class was rigid in 1912; survival varies strongly with cabin class,
# so Pclass is also an important feature.
data.groupby('Pclass')[['Survived']].mean()
Survived | |
---|---|
Pclass | |
1 | 0.629630 |
2 | 0.472826 |
3 | 0.242363 |
那么登船地点会不会影响获救率呢 看起来登船地点对应的获救率也有较大区别,可能不同的登船地点上到船上的位置不同,距离逃生地点的远近也不同
# Does port of embarkation matter? Rates differ noticeably — possibly because
# boarding port correlated with cabin position and distance to the lifeboats.
data.groupby('Embarked')[['Survived']].mean()
Survived | |
---|---|
Embarked | |
C | 0.553571 |
Q | 0.389610 |
S | 0.336957 |
那么家里的兄弟姐妹的数量会不会影响获救率呢 可以看到越多的兄弟姐妹获救率越低
# Survival rate by number of siblings/spouses aboard — more siblings, lower rate.
data.groupby('SibSp')[['Survived']].mean()
Survived | |
---|---|
SibSp | |
0 | 0.345395 |
1 | 0.535885 |
2 | 0.464286 |
3 | 0.250000 |
4 | 0.166667 |
5 | 0.000000 |
8 | 0.000000 |
那么家里老人和小孩的数量会不会影响获救率呢 可以看到总体来说老人和小孩的数量越多获救率也越大
# Survival rate by number of parents/children aboard — broadly higher with more.
data.groupby('Parch')[['Survived']].mean()
Survived | |
---|---|
Parch | |
0 | 0.343658 |
1 | 0.550847 |
2 | 0.500000 |
3 | 0.600000 |
4 | 0.000000 |
5 | 0.200000 |
6 | 0.000000 |
至此,我们认为重要特征为Pclass,Sex,Age,Embarked,SibSp,Parch
构造一个新的数据表
# Selected features: Pclass, Sex, Age, Embarked, SibSp, Parch, plus the
# Survived target and Fare. Build a new working frame from them.
# `.copy()` makes new_data an independent DataFrame rather than a view of
# `data`, so the fillna / label-encoding assignments in later cells neither
# raise SettingWithCopyWarning nor silently fail to write through.
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Survived', 'Fare']
new_data = data[columns].copy()
new_data.head()
Pclass | Sex | Age | SibSp | Parch | Embarked | Survived | Fare | |
---|---|---|---|---|---|---|---|---|
0 | 3 | male | 22.0 | 1 | 0 | S | 0 | 7.2500 |
1 | 1 | female | 38.0 | 1 | 0 | C | 1 | 71.2833 |
2 | 3 | female | 26.0 | 0 | 0 | S | 1 | 7.9250 |
3 | 1 | female | 35.0 | 1 | 0 | S | 1 | 53.1000 |
4 | 3 | male | 35.0 | 0 | 0 | S | 0 | 8.0500 |
查看缺失值
# Count missing values in each retained column (Age and Embarked have gaps).
new_data.isna().sum()
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Embarked 2
Survived 0
Fare 0
dtype: int64
填充缺失值,年龄填充为中位数,登船地点填充为众数
# Impute missing values: Age with the median (Embarked gets its mode in a later
# cell). Assign the filled column back instead of calling
# fillna(..., inplace=True) on a column selection — that is chained assignment,
# which raises SettingWithCopyWarning and may not modify new_data at all.
new_data['Age'] = new_data['Age'].fillna(new_data['Age'].median())
print(new_data['Age'].median())
print(new_data['Embarked'].mode())
# Re-check the summary statistics after imputation.
new_data.describe()
28.0
0 S
dtype: object
Pclass | Age | SibSp | Parch | Survived | Fare | |
---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 2.308642 | 29.361582 | 0.523008 | 0.381594 | 0.383838 | 32.204208 |
std | 0.836071 | 13.019697 | 1.102743 | 0.806057 | 0.486592 | 49.693429 |
min | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 2.000000 | 22.000000 | 0.000000 | 0.000000 | 0.000000 | 7.910400 |
50% | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 3.000000 | 35.000000 | 1.000000 | 0.000000 | 1.000000 | 31.000000 |
max | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 1.000000 | 512.329200 |
查看空值
# Re-count missing values: only Embarked should still have gaps.
new_data.isna().sum()
Pclass 0
Sex 0
Age 0
SibSp 0
Parch 0
Embarked 2
Survived 0
Fare 0
dtype: int64
将登船地点填充
# Fill the two missing embarkation ports with the mode, 'S'.
new_data.loc[new_data['Embarked'].isnull(), 'Embarked'] = 'S'
new_data.isnull().sum()
Pclass 0
Sex 0
Age 0
SibSp 0
Parch 0
Embarked 0
Survived 0
Fare 0
dtype: int64
将男性和女性的字符值转化为数值男0,女1
# Encode Sex as a number: male -> 0, female -> 1.
for code, label in enumerate(('male', 'female')):
    new_data.loc[new_data['Sex'] == label, 'Sex'] = code
将登船地点对应的字符值转化为数值 C:0,Q:1,S:2
# Encode the embarkation port as a number: C -> 0, Q -> 1, S -> 2.
for code, port in enumerate(('C', 'Q', 'S')):
    new_data.loc[new_data['Embarked'] == port, 'Embarked'] = code
new_data.head()
Pclass | Sex | Age | SibSp | Parch | Embarked | Survived | Fare | |
---|---|---|---|---|---|---|---|---|
0 | 3 | 0 | 22.0 | 1 | 0 | 2 | 0 | 7.2500 |
1 | 1 | 1 | 38.0 | 1 | 0 | 0 | 1 | 71.2833 |
2 | 3 | 1 | 26.0 | 0 | 0 | 2 | 1 | 7.9250 |
3 | 1 | 1 | 35.0 | 1 | 0 | 2 | 1 | 53.1000 |
4 | 3 | 0 | 35.0 | 0 | 0 | 2 | 0 | 8.0500 |
# Baseline model: linear regression used as a classifier. The continuous
# prediction is thresholded at 0.60 into 0/1 survival labels and accuracy is
# averaged over 5-fold cross-validation.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# KFold without shuffle is already deterministic; passing random_state while
# shuffle=False raises ValueError in modern scikit-learn, so it is omitted.
kf = KFold(n_splits=5)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
lin_reg = LinearRegression()
accuracies = []
for train_idx, test_idx in kf.split(new_data):
    lin_reg.fit(new_data.loc[train_idx, predictors], new_data.loc[train_idx, 'Survived'])
    pred = lin_reg.predict(new_data.loc[test_idx, predictors])
    # Threshold the regression output into class labels.
    pred[pred >= 0.60] = 1
    pred[pred < 0.60] = 0
    fold_acc = len(pred[pred == new_data.loc[test_idx, 'Survived']]) / len(test_idx)
    accuracies.append(fold_acc)
print(np.mean(accuracies))
0.8035653756826313
# Logistic regression (L2 penalty, C = 0.1), scored two ways that should agree:
# cross_val_score and a manual fold loop thresholding P(survived) at 0.50.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# random_state removed: it is invalid (ValueError in modern scikit-learn)
# when shuffle=False, and unshuffled KFold is deterministic anyway.
kf = KFold(n_splits=5)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
lr = LogisticRegression(C=0.1, solver='liblinear', penalty='l2')
# Method 1: built-in cross-validated accuracy.
lr.fit(new_data[predictors], new_data['Survived'])
print(cross_val_score(lr, new_data[predictors], new_data['Survived'], cv=kf).mean())
# Method 2: manual folds on the positive-class probability.
accuracies = []
for train_idx, test_idx in kf.split(new_data):
    lr.fit(new_data.loc[train_idx, predictors], new_data.loc[train_idx, 'Survived'])
    pos_prob = lr.predict_proba(new_data.loc[test_idx, predictors])[:, 1]
    pos_prob[pos_prob >= 0.50] = 1
    pos_prob[pos_prob < 0.50] = 0
    fold_acc = len(pos_prob[pos_prob == new_data.loc[test_idx, 'Survived']]) / len(test_idx)
    accuracies.append(fold_acc)
print(np.mean(accuracies))
0.7956939300734418
0.7956939300734418
# Decision tree with light regularisation (min samples per split/leaf) to curb
# overfitting, scored both by a manual fold loop and by cross_val_score.
from sklearn import tree
# Import KFold/cross_val_score here so this cell is self-contained instead of
# depending on imports executed in earlier cells.
from sklearn.model_selection import KFold, cross_val_score

dt = tree.DecisionTreeClassifier(min_samples_split=4, min_samples_leaf=4)
# random_state removed: invalid without shuffle=True in modern scikit-learn,
# and unshuffled KFold is deterministic anyway.
kf = KFold(n_splits=5)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
accuracies = []
for train_idx, test_idx in kf.split(new_data):
    dt.fit(new_data.loc[train_idx, predictors], new_data.loc[train_idx, 'Survived'])
    pred = dt.predict(new_data.loc[test_idx, predictors])
    accuracies.append(len(pred[pred == new_data.loc[test_idx, 'Survived']]) / len(test_idx))
print(np.mean(accuracies))
print(cross_val_score(dt, new_data[predictors], new_data['Survived'], cv=kf).mean())
0.804758018956751
0.8036344234511331
# Random forest: 80 trees with the same split/leaf regularisation as the single
# tree; evaluated both with cross_val_score and with a manual fold loop.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

alg = RandomForestClassifier(random_state=1, n_estimators=80,
                             min_samples_split=4, min_samples_leaf=4)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
# random_state removed from KFold: invalid without shuffle=True in modern
# scikit-learn, and unshuffled KFold is deterministic anyway.
kf = KFold(n_splits=5)
# Method 1: cross_val_score.
scores = cross_val_score(alg, new_data[predictors], new_data["Survived"], cv=kf)
print(scores.mean())
# Method 2: manual fold loop (should agree with method 1).
accuracies = []
for train_idx, test_idx in kf.split(new_data):
    alg.fit(new_data.loc[train_idx, predictors], new_data.loc[train_idx, 'Survived'])
    pred = alg.predict(new_data.loc[test_idx, predictors])
    accuracies.append(len(pred[pred == new_data.loc[test_idx, 'Survived']]) / len(test_idx))
print(np.mean(accuracies))
0.820475801895675
0.820475801895675
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。