# Summary:
# Data Analyze
import pandas as pd
import numpy as np
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Progress bars (tqdm) for watching loops over the data
import tqdm.notebook as tqdm
# ML goal: predict whether or not each passenger was transported to another dimension.
# Load the training split; the path is relative to the current working directory.
train_data = pd.read_csv("./dataset/train.csv")
train_data

# Count missing values per column before imputation.
train_data.isna().sum()

# Fill every missing value with 0. This is applied to all columns, so in the
# categorical columns the literal 0 becomes an extra category of its own.
train_data = train_data.fillna(0)

# Confirm that no missing values remain.
train_data.isna().sum()

# Basic structure of the frame: column names, dtypes and (rows, columns).
print(train_data.columns)
print(train_data.dtypes)
print("행 열 :", train_data.shape)

# Cast the Transported column to integers.
train_data["Transported"] = train_data["Transported"].astype(int)
# Frequency table (value -> number of rows) for each categorical column.
# Counter walks every column once instead of doing a per-row `iloc` lookup,
# and dict() keeps each result a plain dict in first-seen key order.
from collections import Counter

PassengerId = dict(Counter(train_data["PassengerId"]))
HomePlanet = dict(Counter(train_data["HomePlanet"]))
CryoSleep = dict(Counter(train_data["CryoSleep"]))
Cabin = dict(Counter(train_data["Cabin"]))
Destination = dict(Counter(train_data["Destination"]))
VIP = dict(Counter(train_data["VIP"]))
Name = dict(Counter(train_data["Name"]))
# Print the frequency tables. The commented dict literals are the outputs
# that were recorded alongside the corresponding print calls.
print(PassengerId)

print(HomePlanet)
# {'Europa': 2131, 'Earth': 4602, 'Mars': 1759, 0: 201}
print(CryoSleep)
# {False: 5656, True: 3037}
print(Cabin)
print(Destination)
# {'TRAPPIST-1e': 5915, 'PSO J318.5-22': 796, '55 Cancri e': 1800, 0: 182}
print(VIP)
# {False: 8494, True: 199}
print(Name)
# Cast the VIP and CryoSleep flag columns to integers in one call.
train_data = train_data.astype({"VIP": "int", "CryoSleep": "int"})

# One-hot encode the Destination and HomePlanet columns.
des = pd.get_dummies(train_data["Destination"], prefix="Destination")
hpt = pd.get_dummies(train_data["HomePlanet"], prefix="HomePlanet")

# Remove the encoded source columns together with Name and PassengerId,
# then append the indicator columns on the right.
train_data = pd.concat(
    [
        train_data.drop(columns=["Destination", "HomePlanet", "Name", "PassengerId"]),
        des,
        hpt,
    ],
    axis="columns",
)
train_data
# Split each "deck/num/side" Cabin string into its three parts. A value that
# does not have exactly three "/"-separated fields (for example the 0 written
# by the earlier fillna) gets 0 for all three parts.
deck = []
num = []
side = []
# Iterate the column itself rather than calling `iloc` once per row.
for cabin in tqdm.tqdm(train_data["Cabin"]):
    parts = str(cabin).split("/")
    if len(parts) == 3:
        deck.append(parts[0])
        num.append(int(parts[1]))
        side.append(parts[2])
    else:
        deck.append(0)
        num.append(0)
        side.append(0)

train_data["deck"] = deck
train_data["num"] = num
train_data["side"] = side

# One-hot encode deck and side; num stays as a plain integer column.
de = pd.get_dummies(train_data["deck"], prefix="deck")
si = pd.get_dummies(train_data["side"], prefix="side")

# Drop the raw Cabin strings along with deck/side now that they are encoded.
# A leftover string column makes DataFrame.mean()/std()/corr() raise on
# pandas >= 2.0, where non-numeric columns are no longer skipped silently.
train_data = train_data.drop(["deck", "side", "Cabin"], axis=1)
train_data = pd.concat([train_data, de, si], axis=1)
train_data
# Histograms of the frame's columns, followed by summary statistics.
train_data.hist(figsize=(30, 20))
train_data.describe()

# Separate the label column from the feature columns.
target = train_data['Transported']
norm = train_data.drop(columns='Transported')

# z-score normalisation per column: (x - mean) / standard deviation.
column_mean = norm.mean()
column_std = norm.std()
train_data_normed = (norm - column_mean) / column_std
train_data_normed

# Re-attach the unscaled label to the normalised features, matching on the
# row index of both sides.
analysis = pd.merge(
    train_data_normed,
    train_data['Transported'],
    left_index=True,
    right_index=True,
)
# Check for linear relationships: annotated correlation heatmap of all columns.
plt.figure(figsize=(16, 16))
sns.heatmap(train_data.corr(), linewidths=.5, cmap='Blues', annot=True)

# Pair plots with seaborn, a handful of features at a time so each grid stays
# readable. Every group is coloured by the Transported label.
# NOTE(review): no 'deck_F' column appears in any group; confirm whether the
# data contains that deck and whether it was left out on purpose.
pairplot_groups = [
    ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall'],
    ['Spa', 'VRDeck', 'Destination_0', 'Destination_55 Cancri e',
     'Destination_TRAPPIST-1e', 'Destination_PSO J318.5-22'],
    ['HomePlanet_0', 'HomePlanet_Earth', 'HomePlanet_Europa',
     'HomePlanet_Mars', 'num'],
    ['deck_0', 'deck_A', 'deck_B', 'deck_C', 'deck_D', 'deck_E'],
    ['deck_G', 'deck_T', 'side_0', 'side_P', 'side_S'],
]
for feature_columns in pairplot_groups:
    sns.pairplot(analysis[feature_columns + ['Transported']], hue='Transported')
    plt.show()