This is one of the Getting Started prediction competitions on Kaggle and a fairly easy entry-level contest; my best submission landed somewhere around the top 8%. To review and consolidate it, I will split the write-up into three parts.
Competition page: Titanic: Machine Learning from Disaster
On April 15, 1912, during her maiden voyage, the RMS Titanic, widely considered "unsinkable", sank after colliding with an iceberg.
Unfortunately, there were not enough lifeboats for everyone on board, and 1502 of the 2224 passengers and crew died. While some element of luck was involved in surviving, it seems that some groups of people were more likely to survive than others.
In this challenge, you are asked to build a predictive model that answers the question "What sorts of people were more likely to survive?" using passenger data (name, age, sex, socio-economic class, etc.).
Task analysis: this is a binary classification problem; we need to build a model that predicts whether a passenger survived.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
seed =2020
Ways to load data: pd.read_csv(), pd.read_table()
For example, to read a TSV file with read_csv(): df = pd.read_csv(file_path, sep='\t')
When working with large files or limited memory, read the data in chunks:
df = pd.read_csv(file_path, chunksize=100)
for chunk in df:  # iterating over the reader yields one DataFrame chunk at a time
    print(chunk)
Other commonly used read_csv parameters are illustrated in the sketch below.
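As an illustration (not part of the original competition code), here are a few read_csv options that come up often; the column choices are just examples against the Titanic training file.

# Illustrative read_csv options (the selected columns are just examples)
df = pd.read_csv(
    'data_train.csv',
    usecols=['PassengerId', 'Survived'],   # read only selected columns
    dtype={'Survived': 'int8'},            # control column dtypes
    nrows=100,                             # read only the first 100 data rows
    na_values=['NA', ''],                  # extra strings treated as missing
    encoding='utf-8',                      # file encoding
)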
##Load the data (tip: avoid renaming features to non-ASCII text such as Chinese; it can show up garbled in plots)
train_df = pd.read_csv('data_train.csv')
test_df = pd.read_csv('data_test.csv')
##Data preview: look at the first 5 rows
train_df.head()
test_df.head()
train_df.info()
train_df.describe()
test_df.info()
test_df.describe()
### You can wrap these checks in a helper function to suit your own needs
def _data_info(data, categorical_features):
    print('number of examples = {}'.format(data.shape[0]))
    print('shape = {}'.format(data.shape))
    print('Features={}'.format(data.columns))
    print('\n-------- unique values of the categorical features --------')
    for i in categorical_features:
        if i in list(data.columns):
            print(i + ":", list(data[i].unique()))
    print('\n-------- missing values --------')
    missing = data.isnull().sum()
    missing = missing[missing > 0]
    print(missing)
    missing.sort_values(inplace=True)
    missing.plot.bar()
    plt.show()

def data_info(data_train, data_test, categorical_features):
    print('-------- training set overview --------')
    _data_info(data_train, categorical_features)
    print('\n\n-------- test set overview --------')
    _data_info(data_test, categorical_features)
Training set: 891 samples, 11 features + 1 label (Survived).
Test set: 418 samples, 11 features (no label).
Overview of the categorical features:
data_info(train_df,test_df,['Survived','Pclass','Sex','Cabin','Embarked','SibSp','Parch'])
Now that we know where the missing values are, we can fill them with some simple strategies.
#Concatenate the two sets so they are processed together; add a 'train' flag to tell training and test rows apart
train_df['train'] = 1
test_df['train'] = 0
data_df = pd.concat([train_df,test_df],sort=True).reset_index(drop=True)
## Drop the PassengerId feature
data_df.drop('PassengerId',inplace=True,axis=1)
## First, encode the non-numeric categorical features as integers
from sklearn import preprocessing
ler_sex = preprocessing.LabelEncoder()
ler_sex.fit(data_df['Sex'])
data_df['Sex'] = ler_sex.transform(data_df['Sex'])
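As a small check (not in the original), the integer codes assigned by the encoder can be read back from its classes_ attribute; LabelEncoder assigns codes in sorted label order, so here 'female' should map to 0 and 'male' to 1.

# Which original label each integer code stands for
# (sorted order: 'female' -> 0, 'male' -> 1)
print(dict(zip(ler_sex.classes_, ler_sex.transform(ler_sex.classes_))))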
Embarked is missing in only a couple of rows, so fill it with the mode:
data_df['Embarked'].fillna(data_df['Embarked'].mode()[0],inplace=True)
## After filling Embarked, encode it as integers as well
ler_Embarked = preprocessing.LabelEncoder()
ler_Embarked.fit(data_df['Embarked'])
data_df['Embarked'] = ler_Embarked.transform(data_df['Embarked'])
$\frac{177 + 86}{891 + 418} \approx 20\%$
Age has a missing rate of roughly 20%. If we fill it directly with a single statistic computed over the whole dataset, the result may not be very good.
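A quick way to verify this figure directly from the combined frame (a small sanity check added here, not in the original notebook):

# Age missing counts per split (train flag 1/0) and the overall missing rate
print(data_df.groupby('train')['Age'].apply(lambda s: s.isnull().sum()))
print('Age missing rate: {:.1%}'.format(data_df['Age'].isnull().mean()))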
Instead, try filling Age using statistics grouped by other features. The correlation analysis below shows that Pclass has the strongest (absolute) correlation with Age.
abs(data_df.corr()['Age']).sort_values(ascending=False)
Age 1.000000
Pclass 0.408106
SibSp 0.243699
Fare 0.178740
Parch 0.150917
Embarked 0.080195
Survived 0.077221
Sex 0.063645
train 0.018528
y = data_df['Age']
plt.figure(1)
plt.title('Distribution of Age')
sns.distplot(y, kde=True)
## Age distribution for each sex: the two distributions are roughly the same
plt.figure(2)
Age_Sex0 = data_df.loc[data_df['Sex']==0, 'Age']
Age_Sex1 = data_df.loc[data_df['Sex']==1, 'Age']
sns.distplot(Age_Sex0, kde=True)
sns.distplot(Age_Sex1, kde=True)
plt.title('Distribution of Age by Sex')
plt.legend(['Sex0', 'Sex1'])  # legend is added after the plots so the labels attach correctly
Age_p1 = data_df.loc[data_df['Pclass']==1,'Age']
Age_p2 = data_df.loc[data_df['Pclass']==2,'Age']
Age_p3 = data_df.loc[data_df['Pclass']==3,'Age']
sns.distplot(Age_p1,kde=True,color='b')
sns.distplot(Age_p2,kde=True,color='green')
sns.distplot(Age_p3,kde=True,color='grey')
plt.title('Distribution of Age in Pclass')
plt.legend(['p1','p2','p3'])
Age_Pclass = data_df.groupby('Pclass')['Age'].median()
for pclass in range(1, 4):
    print('Median age of Pclass {}: {}'.format(pclass, Age_Pclass[pclass]))
print('Median age of all passengers: {}'.format(data_df['Age'].median()))
# Fill missing Age values with the median of the corresponding Pclass group
data_df['Age'] = data_df.groupby(['Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))
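A small sanity check worth adding here: depending on the pandas version, groupby().apply() can return a group-keyed index that does not align back to data_df, so it is worth confirming the fill actually took effect.

# Confirm that no Age values are missing after the group-wise fill
assert data_df['Age'].isnull().sum() == 0, 'Age still has missing values'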
Only one sample is missing Fare, so it could be filled directly with a dataset-wide statistic.
However, this passenger has SibSp and Parch both equal to 0, i.e. a single ticket, and Pclass reflects the cabin class, so we can aggregate over these features instead.
#Look at the sample whose Fare is missing
data_df[data_df['Fare'].isnull()]
## Pclass has by far the strongest correlation with Fare
abs(data_df.corr()['Fare']).sort_values(ascending=False)
Fare 1.000000
Pclass 0.558629
Survived 0.257307
Embarked 0.238005
Parch 0.221539
Age 0.202512
Sex 0.185523
SibSp 0.160238
train 0.030831
## Group statistics for the relevant combination of features
print(data_df.groupby(['Pclass', 'Parch','SibSp','Embarked']).Fare.max()[3][0][0][0])#18.7875
print(data_df.groupby(['Pclass', 'Parch','SibSp','Embarked']).Fare.min()[3][0][0][0])#4.0125
print(data_df.groupby(['Pclass', 'Parch','SibSp','Embarked']).Fare.median()[3][0][0][0])#7.2292
print(data_df.groupby(['Pclass', 'Parch','SibSp','Embarked']).Fare.mean()[3][0][0][0])#7.923984210526318
## Fill with the group median
data_df['Fare'].fillna(data_df.groupby(['Pclass', 'Parch','SibSp','Embarked'])['Fare'].median()[3][0][0][0],inplace=True)
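For readability, the same group median can also be looked up through the MultiIndex of the grouped result rather than chaining [3][0][0][0]; a sketch of the equivalent lookup:

# Equivalent lookup of the (Pclass=3, Parch=0, SibSp=0, Embarked=0) group median
group_fare = data_df.groupby(['Pclass', 'Parch', 'SibSp', 'Embarked'])['Fare'].median()
print(group_fare.loc[(3, 0, 0, 0)])  # same value as used in the fill above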
Cabin has too many missing values; without a good way to impute it, the simplest option is to drop the column.
data_df.drop('Cabin',inplace=True,axis=1)
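After dropping Cabin, it is worth checking that the only remaining missing values are the Survived labels of the 418 test rows (a small check added here, since the test set has no label):

# Remaining missing values per column; only Survived (test rows) should be left
remaining = data_df.isnull().sum()
print(remaining[remaining > 0])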
#Recover the training set from data_df
train_data = data_df[data_df.train==1]
train_data['Survived'] = train_df['Survived']
train_data.drop('train',axis=1,inplace=True)
#Recover the test set from data_df
test_data = data_df[data_df.train==0]
test_data.drop(['Survived','train'],axis=1,inplace=True)
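A quick shape check (an addition here) to confirm the split recovers the original 891 training rows and 418 test rows:

# Expect 891 training rows and 418 test rows after splitting data_df back
print(train_data.shape, test_data.shape)
assert len(train_data) == 891 and len(test_data) == 418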
train_data.corr()
### Survived is most strongly (negatively) correlated with the encoded Sex
### Survived is also negatively correlated with Pclass
### Survived is positively correlated with Fare
train_data.corr()['Survived'].sort_values(ascending=False)
Survived 1.000000
Fare 0.257307
Parch 0.081629
SibSp -0.035322
Age -0.046230
Embarked -0.167675
Pclass -0.338481
Sex -0.543351
plt.figure( figsize=(10, 10))
plt.title('Train Set Correlation HeatMap ',y=1,size=16)
sns.heatmap(train_data.corr(),square = True, vmax=0.7,annot=True,cmap='Accent')
plt.bar(['Not Survived','Survived'],train_data['Survived'].value_counts().values)
plt.title('Train_Set_Survived')
test_data.corr()
plt.figure( figsize=(10, 10))
plt.title('Test Set Correlation HeatMap ',y=1,size=16)
sns.heatmap(test_data.corr(),square = True, vmax=0.7,annot=True,cmap='Accent')
The plots below show how survival is distributed over the continuous features Age and Fare, and how the training and test distributions of these features compare:
continue_features = ['Age', 'Fare']
survived = train_data['Survived'] == 1

fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))
plt.subplots_adjust(right=1.5)
for i, feature in enumerate(continue_features):
    sns.distplot(train_data[~survived][feature], label='Not Survived', hist=True, color='#e74c3c', ax=axs[0][i])
    sns.distplot(train_data[survived][feature], label='Survived', hist=True, color='#2ecc71', ax=axs[0][i])
    sns.distplot(train_data[feature], label='Training Set', hist=False, color='#e74c3c', ax=axs[1][i])
    sns.distplot(test_data[feature], label='Test Set', hist=False, color='#2ecc71', ax=axs[1][i])
    axs[0][i].set_xlabel('')
    axs[1][i].set_xlabel('')
    for j in range(2):
        axs[i][j].tick_params(axis='x', labelsize=20)
        axs[i][j].tick_params(axis='y', labelsize=20)
    axs[0][i].legend(loc='upper right', prop={'size': 20})
    axs[1][i].legend(loc='upper right', prop={'size': 20})
    axs[0][i].set_title('Distribution of Survival in {}'.format(feature), size=20, y=1.05)
axs[1][0].set_title('Distribution of {} Feature'.format('Age'), size=20, y=1.05)
axs[1][1].set_title('Distribution of {} Feature'.format('Fare'), size=20, y=1.05)
plt.show()
Categorical_features = ['Embarked', 'Parch', 'SibSp', 'Sex', 'Pclass']

fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(20, 20))
plt.subplots_adjust(right=1.5, top=1.25)
for i, feature in enumerate(Categorical_features, 1):
    plt.subplot(2, 3, i)
    sns.countplot(x=feature, hue='Survived', data=train_data)
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)
    plt.xlabel('{}'.format(feature), size=20, labelpad=15)
    plt.ylabel('Passenger Count', size=20, labelpad=15)
    plt.legend(['Not Survived', 'Survived'], loc='upper center')
    plt.title('Count of Survival in {} Feature'.format(feature), size=20, y=1.05)
plt.show()

fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(15, 15))
plt.subplots_adjust(right=1.5, top=1.25)
for i, feature in enumerate(Categorical_features, 1):
    plt.subplot(2, 3, i)
    sns.pointplot(x=feature, y='Survived', data=train_data)
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)
    plt.xlabel('{}'.format(feature), size=20, labelpad=15)
    plt.ylabel('Survival Rate', size=20, labelpad=15)
    plt.title('Rate of Survival in {} Feature'.format(feature), size=20, y=1.05)
plt.show()
train_data.to_csv('./train.csv',index=False)
test_data.to_csv('./test.csv',index=False)