import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Titanic passenger data
titanic_df = pd.read_csv("./data/Titanic_data.csv")

# Survival rate per sex: the mean of 'Survived' within each 'Sex' group
passengers_by_sex = titanic_df.groupby('Sex')
survived_by_sex = passengers_by_sex['Survived'].mean()
print(survived_by_sex)
Sex
female 0.742038
male 0.188908
Name: Survived, dtype: float64
# Bar chart of the per-sex survival rate
plt.figure(figsize=(6, 4))
sex_categories = survived_by_sex.index
rate_values = survived_by_sex.values
sns.barplot(x=sex_categories, y=rate_values)
plt.xlabel('Sex')
plt.ylabel('Survival Rate')
plt.title('Survival Rate by Sex')
plt.show()
# Outcome counts for each 'Embarked' value
embarked_outcomes = titanic_df.groupby('Embarked')['Survived']
survived_embarked = embarked_outcomes.sum()
# Deaths = number of rows in the group minus the survivors
# (assumes 'Survived' is coded 0/1 — TODO confirm against the data)
dead_embarked = embarked_outcomes.count() - survived_embarked
print(survived_embarked, '\n', dead_embarked)
Embarked
C 93
Q 30
S 217
Name: Survived, dtype: int64
Embarked
C 75
Q 47
S 427
Name: Survived, dtype: int64
# Stacked bar chart: survivors at the bottom, deaths stacked on top of them
plt.figure(figsize=(10, 6))
plt.bar(
    survived_embarked.index,
    survived_embarked.values,
    color='g',
    alpha=0.5,
    label='Survived',
)
plt.bar(
    dead_embarked.index,
    dead_embarked.values,
    bottom=survived_embarked.values,  # start each red bar where the green one ends
    color='r',
    alpha=0.5,
    label='Dead',
)
plt.xlabel('Embarked')
plt.ylabel('Number of passengers')
plt.title('Survivors by Embarked')
plt.legend()
plt.show()
# Survivor and death counts for each 'Pclass' value
class_outcomes = titanic_df.groupby('Pclass')['Survived']
survived_class = class_outcomes.sum()
# Deaths = number of rows in the group minus the survivors
# (assumes 'Survived' is coded 0/1 — TODO confirm against the data)
dead_class = class_outcomes.count() - survived_class

# Stacked bar chart: survivors at the bottom, deaths stacked on top of them
plt.figure(figsize=(10, 6))
plt.bar(survived_class.index, survived_class.values, color='g', alpha=0.5, label='Survived')
plt.bar(dead_class.index, dead_class.values, bottom=survived_class.values, color='r', alpha=0.5, label='Dead')
plt.title('Survivors by Class')
plt.xlabel('Class')
plt.ylabel('Number of passengers')
# The bars carry label= values; without legend() they are never rendered
plt.legend()
plt.show()
import numpy as np
from scipy.stats import chi2_contingency
# Example data: a 2x3 table of observed counts
observed_values = np.array([
    [10, 20, 30],
    [6, 15, 9],
])

# Run the chi-square test on the contingency table; the result unpacks into
# the statistic, the p-value, the degrees of freedom and the expected counts
test_result = chi2_contingency(observed_values)
chi2, p_value, dof, expected_values = test_result

# Report the results
print(f'카이제곱 통계량:{chi2}')
print(f'p-value:{p_value}')
print(f'자유도:{dof}')
print(f'기대값:{expected_values}')
카이제곱 통계량:3.3997252747252746
p-value:0.18270861966696167
자유도:2
기대값:[[10.66666667 23.33333333 26. ]
[ 5.33333333 11.66666667 13. ]]
예시
import seaborn as sns
import matplotlib.pyplot as plt
# Sample data: a 2x3 table of integers
data = [
    [5, 10, 15],
    [10, 15, 20],
]

# Draw the table as a seaborn heatmap; each cell is annotated with its value
# using the integer format 'd', and the colour bar is turned off
sns.heatmap(data, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Load the college data set
data = pd.read_csv('./data/college_data.csv')

# Bin 'top10perc' into four quantile-based buckets, labelled lowest to highest.
# NOTE: the column key 'admisson_level' (sic) is kept as spelled because the
# plotting code further down reads the column under that exact name.
level_names = ['very_low', 'low', 'high', 'very_high']
data['admisson_level'] = pd.qcut(data['top10perc'], q=4, labels=level_names)
private apps accept enroll top10perc top25perc f_undergrad \
0 Yes 1660 1232 721 23 52 2885
1 Yes 2186 1924 512 16 29 2683
2 Yes 1428 1097 336 22 50 1036
3 Yes 417 349 137 60 89 510
4 Yes 193 146 55 16 44 249
.. ... ... ... ... ... ... ...
772 No 2197 1515 543 4 26 3089
773 Yes 1959 1805 695 24 47 2849
774 Yes 2097 1915 695 34 61 2793
775 Yes 10705 2453 1317 95 99 5217
776 Yes 2989 1855 691 28 63 2988
p_undergrad outstate room_board books personal phd terminal \
0 537 7440 3300 450 2200 70 78
1 1227 12280 6450 750 1500 29 30
2 99 11250 3750 400 1165 53 66
3 63 12960 5450 450 875 92 97
4 869 7560 4120 800 1500 76 72
.. ... ... ... ... ... ... ...
772 2029 6797 3900 500 1200 60 60
773 1107 11520 4960 600 1250 73 75
774 166 6900 4200 617 781 67 75
775 83 19840 6510 630 2115 96 96
776 1726 4990 3560 500 1250 75 75
s_f_ratio perc_alumni expend grad_rate admisson_level
0 18.1 12 7041 60 low
1 12.2 16 10527 56 low
2 12.9 30 8735 54 low
3 7.7 37 19016 59 very_high
4 11.9 2 10922 15 low
.. ... ... ... ... ...
772 21.0 14 4469 40 very_low
773 13.3 31 9189 83 high
774 14.4 20 8323 49 high
775 5.8 49 40386 99 very_high
776 18.1 28 4509 99 high
[777 rows x 19 columns]
# Bar chart of the number of colleges in each admission level.
# The counts are computed once and reused for both the x positions and the
# bar heights. sort_index() orders the bars by the level itself rather than
# by descending frequency, which is value_counts()'s default order.
level_counts = data['admisson_level'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
plt.bar(level_counts.index, level_counts.values)
plt.title('Number of Colleges by Admission Level')
plt.xlabel('Admission Level')
plt.ylabel('Number of Colleges')
plt.show()