# Survival overview: class share as a pie chart (left) and raw counts as bars (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
pie_ax, count_ax = ax

survived_counts = titanic['survived'].value_counts()
survived_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%')
pie_ax.set_title('Pie plot - survived')
pie_ax.set_ylabel('')  # hide the axis label that pandas would otherwise add

sns.countplot(data=titanic, x='survived', ax=count_ax)
count_ax.set_title('Count plot - survived')
# Passenger counts by sex (left) and the same counts split by survival (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

sns.countplot(x='sex', data=titanic, ax=ax[0])
ax[0].set_title('Count of Passengers of Sex')
ax[0].set_ylabel('')

# hue='survived' draws one bar per survival outcome within each sex.
sns.countplot(x='sex', hue='survived', data=titanic, ax=ax[1])
ax[1].set_title('Sex: Survived and Unsurvived')
ax[1].set_ylabel('')
즉, 남성의 생존 가능성이 더 낮다
# Contingency table of passenger class (rows) vs. survival (columns);
# margins=True appends the "All" row/column totals.
pd.crosstab(index=titanic['pclass'], columns=titanic['survived'], margins=True)
pclass | 0 | 1 | All |
---|---|---|---|
1 | 123 | 200 | 323 |
2 | 158 | 119 | 277 |
3 | 528 | 181 | 709 |
All | 809 | 500 | 1309 |
pclass = n등실
0 = 사망, 1 = 생존
1. 1등실의 생존 가능성이 매우 높다
2. 그런데 여성의 생존율도 높다
3. 3등실은 생존가능성이 낮다
그럼 1등실에 여성이 많이 타고 있었나?
# Age histograms, one panel per (passenger class, sex) combination.
grid = sns.FacetGrid(
    data=titanic,
    row='pclass',
    col='sex',
    height=4,
    aspect=2,
)
grid.map(plt.hist, 'age', bins=20, alpha=0.8)
grid.add_legend()
3등실에는 남성이 많았다 - 특히 20대 남성
# Interactive histogram of passenger ages across the whole data set.
px.histogram(data_frame=titanic, x='age')
애기들도 많았고 젊은 사람들도 많다
# Age histograms, one panel per (passenger class, survival outcome) combination.
# The new FacetGrid has to be bound to `grid`, the name the two calls below
# use; otherwise they would draw on whatever grid that name held before.
grid = sns.FacetGrid(titanic, col='survived', row='pclass', height=4, aspect=2)
grid.map(plt.hist, 'age', alpha=0.5, bins=20)
grid.add_legend()
등급이 높을수록 생존율이 높다
# Bucket ages into five labelled bands. include_lowest=True makes the first
# interval closed on the left so an age of exactly 0 falls into 'baby'.
age_bin_edges = [0, 7, 15, 30, 60, 100]
age_bin_labels = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_cat'] = pd.cut(
    titanic['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)
titanic.head()
# Mean survival (bar height) by passenger class, age band and sex, side by side.
plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
sns.barplot(data=titanic, x='pclass', y='survived')

plt.subplot(1, 3, 2)
sns.barplot(data=titanic, x='age_cat', y='survived')

plt.subplot(1, 3, 3)
sns.barplot(data=titanic, x='sex', y='survived')
어리고, 여성이고, 1등실일수록 생존하기 유리했을까?
1. 역시나 여성이 더 생존율이 높지만
2. 남성의 애기들도 생존율은 높다
# Extract the honorific from each passenger name, e.g. "Braund, Mr. Owen" -> "Mr".
# The pattern matches ", <word>[ <word>]." and the [2:-1] slice strips the
# leading ", " and the trailing ".". A raw string keeps the backslashes for
# the regex engine instead of relying on invalid string escapes.
title_pattern = re.compile(r'\,\s\w+(\s\w+)?\.')
title = [
    title_pattern.search(name).group()[2:-1]
    for name in titanic['name']
]
titanic['title'] = title
titanic.head()
# Count how often each extracted title occurs for each sex.
pd.crosstab(index=titanic['title'], columns=titanic['sex'])
# Collapse spelling variants into 'Miss' and group uncommon titles into
# 'Rare_f' / 'Rare_m', applying every substitution in a single pass.
# NOTE(review): 'Mme' (Madame) is mapped to 'Miss' exactly as before; it
# conventionally denotes a married woman, so confirm 'Mrs' was not intended.
Rare_f = ['Dona', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Dr', 'Master', 'Sir', 'Jonkheer']

title_mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Miss'}
title_mapping.update(dict.fromkeys(Rare_f, 'Rare_f'))
title_mapping.update(dict.fromkeys(Rare_m, 'Rare_m'))

titanic['title'] = titanic['title'].replace(title_mapping)
# Mean of 'survived' (i.e. survival rate) for each normalised title.
title_and_outcome = titanic[['title', 'survived']]
title_and_outcome.groupby(['title'], as_index=False).mean()
1. 귀족 남성이더라도 평민 여성보다 생존율이 낮다
2. 평민남성 생존율이 가~장 낮다
생존율은 평민 남성 < 귀족 남성 < 평민 여성 < 귀족 여성 순으로 높아진다.
# Print a summary of the DataFrame (columns, non-null counts, dtypes) to see
# which columns still contain missing values before modelling.
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 16 columns):
Column Non-Null Count Dtype
0 pclass 1309 non-null int64
1 survived 1309 non-null int64
2 name 1309 non-null object
3 sex 1309 non-null object
4 age 1046 non-null float64
5 sibsp 1309 non-null int64
6 parch 1309 non-null int64
7 ticket 1309 non-null object
8 fare 1308 non-null float64
9 cabin 295 non-null object
10 embarked 1307 non-null object
11 boat 486 non-null object
12 body 121 non-null float64
13 home.dest 745 non-null object
14 age_cat 1046 non-null category
15 title 1309 non-null object
dtypes: category(1), float64(3), int64(4), object(8)
memory usage: 155.0+ KB
성별이 숫자가 아니기 때문에 숫자로 변경
# Encode the textual 'sex' column as integer labels in a new 'gender' column
# so it can be used as a numeric model feature.
sex_values = titanic['sex']
le = LabelEncoder().fit(sex_values)
titanic['gender'] = le.transform(sex_values)
titanic.head()
# Keep only the rows where both 'age' and 'fare' are present; the model
# features below cannot contain missing values.
has_age_and_fare = titanic['age'].notnull() & titanic['fare'].notnull()
titanic = titanic[has_age_and_fare]
survived & gender
# Feature matrix and target vector for the survival classifier.
feature_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender']
x = titanic[feature_columns]
y = titanic['survived']

# NOTE(review): test_size=0.8 holds out 80% of the rows for testing and trains
# on the remaining 20% - confirm this is intended rather than 0.2.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.8,
    random_state=13,
)
특성은 'pclass','age','sibsp','parch','fare','gender'로 선정
# Fit a depth-limited decision tree on the training split and report its
# accuracy on the held-out split.
dt = DecisionTreeClassifier(random_state=13, max_depth=4).fit(x_train, y_train)
pred = dt.predict(x_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)
0.7655502392344498
# Hypothetical passenger in feature order: pclass, age, sibsp, parch, fare, gender.
dicaprio = np.array([[3, 18, 0, 0, 5, 1]])
# The model was fitted on a DataFrame, so pass the sample with the same
# column names; a bare ndarray makes scikit-learn warn about feature names.
dicaprio_features = pd.DataFrame(dicaprio, columns=x.columns)
# Column 1 of predict_proba is the probability of class 1 (survived).
print('Decaprio : ', dt.predict_proba(dicaprio_features)[0, 1])
3 : 3등석
18 : 나이
0 : 형제자매/배우자 수 (sibsp)
0 : 부모/자녀 수 (parch)
5 : 탑승 금액
1 : 남성
Decaprio : 0.22950819672131148
앗...생존 확률 22%...
# Hypothetical passenger in feature order: pclass, age, sibsp, parch, fare, gender.
winslet = np.array([[1, 16, 1, 1, 100, 0]])
# The model was fitted on a DataFrame, so pass the sample with the same
# column names; a bare ndarray makes scikit-learn warn about feature names.
winslet_features = pd.DataFrame(winslet, columns=x.columns)
# Column 1 of predict_proba is the probability of class 1 (survived).
print('winslet : ', dt.predict_proba(winslet_features)[0, 1])
winslet : 1.0
😮