Kaggle Link : https://www.kaggle.com/code/yoontaeklee/titanic-survival-rate-random-forest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Input data files are available only under the "../input/" directory
: Analyze each feature individually, check correlations between features, and use various visualization tools to gain insight
: Before building the model, engineer the features to improve its performance: one-hot encoding, splitting into classes, binning into intervals, processing text data, etc.
: Build a model using sklearn
: Train the model on the train set, then make predictions on the test set
: Judge whether the prediction performance reaches the desired level
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')
df_train.head()
df_train.describe()
df_test.describe()
for column in df_train.columns:
    msg = 'column: {:>11}\t Percent of NaN value: {:.2f}%'.format(
        column, 100 * (df_train[column].isnull().sum() / df_train[column].shape[0]))
    print(msg)
for column in df_test.columns:
    msg = 'column: {:>11}\t Percent of NaN value: {:.2f}%'.format(
        column, 100 * (df_test[column].isnull().sum() / df_test[column].shape[0]))
    print(msg)
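The same per-column NaN ratio can also be obtained in one expression; a minimal sketch, not part of the original loop-based approach:
# Sketch: the mean of the boolean null mask is the NaN ratio per column
print((df_train.isnull().mean() * 100).round(2))
print((df_test.isnull().mean() * 100).round(2))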
msno.matrix(df=df_train.iloc[:, :], figsize=(7,7), color=(0.01,0.3,0.6))
msno.bar(df=df_train.iloc[:,:], figsize=(7,7), color=(0.01,0.3,0.6))
f, ax = plt.subplots(1,2,figsize=(18,8))
df_train['Survived'].value_counts().plot.pie(explode=[0,0.1], autopct='%1.1f%%',
ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')
sns.countplot('Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()
df_train[['Pclass','Survived']].groupby(['Pclass'], as_index=True).count()
df_train[['Pclass','Survived']].groupby(['Pclass'],as_index=True).sum()
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True) \
    .style.background_gradient(cmap='summer_r')
df_train[['Pclass','Survived']].groupby(['Pclass'], as_index=True).mean() \
    .sort_values(by='Survived', ascending=False).plot.bar()
y_position = 1.0
f, ax = plt.subplots(1,2,figsize=(18,8))
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32','#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of Passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')
sns.countplot('Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()
⇒ So far, we have confirmed that Pclass is closely related to the survival rate,
so the Pclass feature will be used when building the model later.
f, ax = plt.subplots(1,2,figsize=(18,8))
df_train[['Sex','Survived']].groupby(['Sex'],as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
sns.countplot('Sex',hue='Survived',data=df_train,ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()
df_train[['Sex','Survived']].groupby(['Sex'], as_index=False).mean() \
    .sort_values(by='Survived', ascending=False)
sns.factorplot('Pclass','Survived',hue='Sex',data=df_train)
sns.factorplot(x='Sex',y='Survived',col='Pclass',data=df_train, saturation=.5,
size=6,aspect=1)
print('Oldest passenger : {:.1f} Years'.format(df_train['Age'].max()))
print('Youngest passenger : {:.1f} Years'.format(df_train['Age'].min()))
print('Mean age of passengers : {:.1f} Years'.format(df_train['Age'].mean()))
fig, ax = plt.subplots(1,1,figsize=(9,5))
sns.kdeplot(df_train[df_train['Survived'] == 1]['Age'],ax=ax)
sns.kdeplot(df_train[df_train['Survived'] == 0]['Age'],ax=ax)
plt.legend(['Survived == 1','Survived == 0'])
plt.show()
plt.figure(figsize=(8,6))
df_train['Age'][df_train['Pclass'] == 1].plot(kind='kde')
df_train['Age'][df_train['Pclass'] == 2].plot(kind='kde')
df_train['Age'][df_train['Pclass'] == 3].plot(kind='kde')
plt.xlabel('Age')
plt.title('Age Distribution within classes')
plt.legend(['1st Class','2nd Class','3rd Class'])
cumulative_survival_ratio = []
for i in range(1,80):
    cumulative_survival_ratio.append(df_train[df_train['Age'] < i]['Survived'].sum()
                                     / len(df_train[df_train['Age'] < i]['Survived']))
plt.figure(figsize=(7,7))
plt.plot(cumulative_survival_ratio)
plt.title('Survival rate change depending on range of Age',y=1.01)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()
f, ax = plt.subplots(1,1,figsize=(7,7))
df_train[['Embarked','Survived']].groupby(['Embarked'], as_index=True).mean() \
    .sort_values(by='Survived', ascending=False).plot.bar(ax=ax)
f, ax = plt.subplots(2,2,figsize=(20,15))
sns.countplot('Embarked',data=df_train,ax=ax[0,0])
ax[0,0].set_title('(1) No. Of Passengers Boarded')
sns.countplot('Embarked',hue='Sex',data=df_train,ax=ax[0,1])
ax[0,1].set_title('(2) Male-Female Split for Embarked')
sns.countplot('Embarked',hue='Survived',data=df_train,ax=ax[1,0])
ax[1,0].set_title('(3) Embarked vs Survived')
sns.countplot('Embarked',hue='Pclass',data=df_train,ax=ax[1,1])
ax[1,1].set_title('(4) Embarked vs Pclass')
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1
print('Max size of Family (train): ', df_train['FamilySize'].max())
print('Max size of Family (test): ', df_test['FamilySize'].max())
f, ax = plt.subplots(1,3,figsize=(30,10))
sns.countplot('FamilySize',data=df_train,ax=ax[0])
ax[0].set_title('(1) Number of Passengers Boarded',y=1.02)
sns.countplot('FamilySize',hue='Survived',data=df_train,ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilySize',y=1.02)
df_train[['FamilySize','Survived']].groupby(['FamilySize'], as_index=True).mean() \
    .sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on Family Size',y=1.02)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
fig, ax = plt.subplots(1,1,figsize=(8,8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness : {:.2f}'.format(
df_train['Fare'].skew()),ax=ax)
g=g.legend(loc='best')
df_test.loc[df_test.Fare.isnull(),'Fare'] = df_test['Fare'].mean()
df_train['Fare'] = df_train['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
df_test['Fare'] = df_test['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
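A hedged alternative to the two map calls above (not used in this notebook): np.log1p maps 0 to 0 by itself, so the explicit zero check becomes unnecessary.
# Sketch: np.log1p would replace the zero-guarded np.log above (do not run both transforms)
# df_train['Fare'] = df_train['Fare'].map(np.log1p)
# df_test['Fare'] = df_test['Fare'].map(np.log1p)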
fig, ax = plt.subplots(1,1,figsize=(8,8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness : {:.2f}'.format(
df_train['Fare'].skew()),ax=ax)
g=g.legend(loc='best')
3.1.1 Fill Null in Age using title
df_train['Initial'] = df_train.Name.str.extract('([A-Za-z]+)\.')
df_test['Initial'] = df_test.Name.str.extract('([A-Za-z]+)\.')
pd.crosstab(df_train['Initial'],df_train['Sex']).T.style.background_gradient(cmap='summer_r')
df_train['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady',
'Countess','Jonkheer','Col','Rev','Capt','Sir','Don','Dona'],[
'Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other',
'Mr','Mr','Mr','Mr'],inplace=True)
df_test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady',
'Countess','Jonkheer','Col','Rev','Capt','Sir','Don','Dona'],[
'Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other',
'Mr','Mr','Mr','Mr'],inplace=True)
df_train.groupby('Initial').mean()
df_train.groupby('Initial')['Survived'].mean().plot.bar()
df_train.groupby('Initial').mean()
df_train.loc[(df_train.Age.isnull())&(df_train.Initial=='Mr'),'Age'] = 33
df_train.loc[(df_train.Age.isnull())&(df_train.Initial=='Miss'),'Age'] = 22
df_train.loc[(df_train.Age.isnull())&(df_train.Initial=='Master'),'Age'] = 5
df_train.loc[(df_train.Age.isnull())&(df_train.Initial=='Mrs'),'Age'] = 36
df_train.loc[(df_train.Age.isnull())&(df_train.Initial=='Other'),'Age'] = 46
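The five assignments above hard-code the rounded mean Age of each Initial group; a minimal sketch that derives the same fill values from the grouped means and also covers df_test, whose Age nulls are left untouched above (initial_age_mean is a name introduced by this sketch, not from the original):
# Sketch: fill missing Age values with the mean Age of each Initial group, computed on the train set
initial_age_mean = df_train.groupby('Initial')['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(df_train['Initial'].map(initial_age_mean))
df_test['Age'] = df_test['Age'].fillna(df_test['Initial'].map(initial_age_mean))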
print('Embarked has ',sum(df_train['Embarked'].isnull()),'Null values')
# Embarked has 2 Null values
df_train['Embarked'].fillna('S',inplace=True)
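Hard-coding 'S' works because it is by far the most common port; a minimal sketch that derives the fill value instead:
# Sketch: fill with the most frequent embarkation port rather than a literal 'S'
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)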
# Using loc, DataFrame's indexing method
df_train['Age_cat'] = 0
df_train.loc[df_train['Age'] < 10, 'Age_cat'] = 0
df_train.loc[(10 <= df_train['Age']) & (df_train['Age'] < 20),'Age_cat'] = 1
df_train.loc[(20 <= df_train['Age']) & (df_train['Age'] < 30),'Age_cat'] = 2
df_train.loc[(30 <= df_train['Age']) & (df_train['Age'] < 40),'Age_cat'] = 3
df_train.loc[(40 <= df_train['Age']) & (df_train['Age'] < 50),'Age_cat'] = 4
df_train.loc[(50 <= df_train['Age']) & (df_train['Age'] < 60),'Age_cat'] = 5
df_train.loc[(60 <= df_train['Age']) & (df_train['Age'] < 70),'Age_cat'] = 6
df_train.loc[70 <= df_train['Age'],'Age_cat'] = 7
df_test['Age_cat'] = 0
df_test.loc[df_test['Age'] < 10, 'Age_cat'] = 0
df_test.loc[(10 <= df_test['Age']) & (df_test['Age'] < 20),'Age_cat'] = 1
df_test.loc[(20 <= df_test['Age']) & (df_test['Age'] < 30),'Age_cat'] = 2
df_test.loc[(30 <= df_test['Age']) & (df_test['Age'] < 40),'Age_cat'] = 3
df_test.loc[(40 <= df_test['Age']) & (df_test['Age'] < 50),'Age_cat'] = 4
df_test.loc[(50 <= df_test['Age']) & (df_test['Age'] < 60),'Age_cat'] = 5
df_test.loc[(60 <= df_test['Age']) & (df_test['Age'] < 70),'Age_cat'] = 6
df_test.loc[70 <= df_test['Age'],'Age_cat'] = 7
# Using apply
def category_age(x):
    if x < 10:
        return 0
    elif x < 20:
        return 1
    elif x < 30:
        return 2
    elif x < 40:
        return 3
    elif x < 50:
        return 4
    elif x < 60:
        return 5
    elif x < 70:
        return 6
    else:
        return 7
df_train['Age_cat_2'] = df_train['Age'].apply(category_age)
(df_train['Age_cat'] == df_train['Age_cat_2']).all()
# True
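Both binning approaches agree, as the check above confirms; a minimal sketch of a third option using pandas' own binning helper (bins and age_cat_cut are names introduced here, not from the original):
# Sketch: pd.cut reproduces the same right-open 10-year bins labelled 0..7
bins = [0, 10, 20, 30, 40, 50, 60, 70, np.inf]
age_cat_cut = pd.cut(df_train['Age'], bins=bins, labels=list(range(8)), right=False).astype(int)
print((age_cat_cut == df_train['Age_cat']).all())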
df_train.drop(['Age','Age_cat_2'], axis=1, inplace=True)
df_test.drop(['Age'], axis=1, inplace=True)
df_train['Initial'] = df_train['Initial'].map({'Master':0,'Miss':1,'Mr':2,
'Mrs':3,'Other':4})
df_test['Initial'] = df_test['Initial'].map({'Master':0,'Miss':1,'Mr':2,
                                             'Mrs':3,'Other':4})
df_train['Embarked'].unique()
# array(['S','C','Q'], dtype=object)
df_train['Embarked'].value_counts()
# S 646
# C 168
# Q 77
# Name: Embarked, dtype: int64
df_train['Embarked'] = df_train['Embarked'].map({'C':0,'Q':1,'S':2})
df_test['Embarked'] = df_test['Embarked'].map({'C':0,'Q':1,'S':2})
df_train['Embarked'].isnull().any()
# False
df_train['Sex'] = df_train['Sex'].map({'female':0,'male':1})
df_test['Sex'] = df_test['Sex'].map({'female':0,'male':1})
heatmap_data = df_train[['Survived','Pclass','Sex','Fare','Embarked','FamilySize',
'Initial','Age_cat']]
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(heatmap_data.astype(float).corr(),linewidths=0.1,vmax=1.0,square=True,
cmap=colormap,linecolor='white',annot=True,annot_kws={'size':16})
del heatmap_data
df_train = pd.get_dummies(df_train, columns=['Initial'], prefix='Initial')
df_test = pd.get_dummies(df_test, columns=['Initial'], prefix='Initial')
df_train = pd.get_dummies(df_train, columns=['Embarked'],prefix='Embarked')
df_test = pd.get_dummies(df_test, columns=['Embarked'],prefix='Embarked')
df_train.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin'],
axis=1,inplace=True)
df_test.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin'],
axis=1,inplace=True)
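pd.get_dummies is applied to train and test separately, so if either set lacked a category the dummy columns would no longer line up; a minimal sketch of a guard that forces the test columns to match the train feature columns (feature_cols is a name introduced here):
# Sketch: align df_test's columns to the training feature columns, filling any missing dummy with 0
feature_cols = [col for col in df_train.columns if col != 'Survived']
df_test = df_test.reindex(columns=feature_cols, fill_value=0)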
df_train.head()
#importing all required ML packages
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
X_train = df_train.drop('Survived',axis=1).values
target_label = df_train['Survived'].values
X_test = df_test.values
X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size=0.3,
random_state=2018)
model = RandomForestClassifier()
model.fit(X_tr, y_tr)
prediction = model.predict(X_vld)
print('Out of {} passengers, survival was predicted with {:.2f}% accuracy'.format(
    y_vld.shape[0], 100 * metrics.accuracy_score(prediction, y_vld)))
# Out of 268 passengers, survival was predicted with 81.72% accuracy
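The 81.72% figure comes from a single 70/30 split, so it depends on random_state; a minimal sketch of a less split-dependent estimate via cross-validation (cv_scores is a name introduced here):
# Sketch: 5-fold cross-validation on the full training data gives a more stable accuracy estimate
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestClassifier(random_state=2018), X_train, target_label, cv=5)
print('CV accuracy: {:.2f}% (+/- {:.2f}%)'.format(100 * cv_scores.mean(), 100 * cv_scores.std()))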
from pandas import Series
feature_importance = model.feature_importances_
Series_feat_imp = Series(feature_importance, index=df_test.columns)
plt.figure(figsize=(8,8))
Series_feat_imp.sort_values(ascending=True).plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()
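For completeness, a minimal sketch of writing a Kaggle submission, assuming the standard PassengerId/Survived format; PassengerId is re-read from the raw test CSV because it was dropped from df_test earlier:
# Sketch: predict on the processed test set and write a submission file
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('../input/titanic/test.csv')['PassengerId'],
    'Survived': model.predict(X_test)
})
submission.to_csv('submission.csv', index=False)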