Comprehensive data exploration with Python(by Pedro Marcelino)
✔️ 필요한 라이브러리 불러오기
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
# Ignore warning messages via the warnings library
import warnings
warnings.filterwarnings(action='ignore')
✔️ 데이터셋 가져오기
# Load the train and test datasets from their CSV files.
train_data = 'C:\\Users\\USER\\Desktop\\Data Analysis\\data\\train2.csv'
test_data = 'C:\\Users\\USER\\Desktop\\Data Analysis\\data\\test2.csv'
# Read both files in order: train first, then test.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_data, test_data))
โ๏ธ train set์ ์ปฌ๋ผ(๋ณ์) ํ์ธํ๊ธฐ
# Print the column (variable) names of the training set as an array.
print(df_train.columns.values)
✔️ 데이터 확인하기
# Preview the first rows of the training set.
df_train.head()
โ๏ธ train, test set์ ์์ฝ์ ๋ณด ํ์ธํ๊ธฐ
# Print the summary (info) of the training set, then of the test set.
df_train.info()
print('\n')  # blank lines separating the two summaries
df_test.info()
- 이 변수가 집을 구매할 때 필요한가?
- 그렇다면, 이 변수가 얼마나 중요한가?
- 이 변수가 다른 변수에 의해 이미 설명되어 있는가?
✔️ 통계 요약정보 확인하기
# Summary statistics of the SalePrice column.
df_train['SalePrice'].describe()
✔️ 히스토그램 그려보기
# Plot the distribution of SalePrice.
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot/displot — confirm against the installed seaborn version.
sns.distplot(df_train['SalePrice'])
Deviate from the normal distribution: 정규분포를 벗어남.
Have appreciable positive skewness: 양의 왜도를 가짐.
Show peakedness: 뾰족한 모양을 가짐.
✔️ scatter plot (GrLivArea, SalePrice)
# Scatter plot of SalePrice against GrLivArea.
var = 'GrLivArea'
# Two-column frame holding SalePrice and the chosen variable side by side.
data = df_train[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
✔️ scatter plot (TotalBsmtSF, SalePrice)
# Scatter plot of SalePrice against TotalBsmtSF.
var = 'TotalBsmtSF'
# Two-column frame holding SalePrice and the chosen variable side by side.
data = df_train[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
※ pd.concat: 데이터프레임 결합
pd.concat(df,
axis=0, # axis: 축 방향
keys=None, # 원본데이터 이름 지정
levels=None,
names=None)
✔️ box plot (OverallQual, SalePrice)
# Box plot of SalePrice for each OverallQual value.
var = 'OverallQual'
data = df_train[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
# Draw explicitly on the axes that were just created.
fig = sns.boxplot(data=data, x=var, y='SalePrice', ax=ax)
fig.axis(ymin=0, ymax=800000)  # fix the SalePrice axis range
✔️ box plot (YearBuilt, SalePrice)
# Box plot of SalePrice for each YearBuilt value.
var = 'YearBuilt'
data = df_train[['SalePrice', var]]
f, ax = plt.subplots(figsize=(16, 8))
# Draw explicitly on the axes that were just created.
fig = sns.boxplot(data=data, x=var, y='SalePrice', ax=ax)
fig.axis(ymin=0, ymax=800000)  # fix the SalePrice axis range
plt.xticks(rotation=90)  # rotate the x-axis tick labels by 90 degrees
✔️ correlation matrix
# Correlation matrix of the numeric columns, drawn as a heatmap.
# numeric_only=True is required on pandas >= 2.0: corr() no longer drops
# non-numeric columns implicitly and raises on string columns instead.
corrmat = df_train.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
✔️ SalePrice correlation matrix
# Heatmap of the k variables with the largest correlation to SalePrice.
k = 10  # number of variables shown in the heatmap
cols = corrmat.nlargest(k, 'SalePrice').index
# Correlation coefficients between the selected columns (one row per variable).
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,                # draw the colorbar
    annot=True,               # write the value inside each cell
    square=True,
    fmt='.2f',                # cell value format: two decimal places
    annot_kws={'size': 10},
    yticklabels=cols.values,  # label the y axis with the column names
    xticklabels=cols.values,
)
plt.show()
โป heatmap ๊ธฐ๋ณธ๋ฌธ๋ฒ (์ฐธ๊ณ ์๋ฃ)
heatmap(df, # ๋ฐ์ดํฐ
vmin=100, # ์ต์๊ฐ
vmax=700, # ์ต๋๊ฐ
cbar=True, # colorbar์ ์ ๋ฌด
center=400, # ์ค์๊ฐ
linewidths=0.5, # cell ์ฌ์ด์ ์ ์ ์ง์ด ๋ฃ์
annot=True, # ๊ฐ cell์ ๊ฐ ํ๊ธฐ ์ ๋ฌด
fmt="d", # cell์ ํ์๋ ๊ฐ์ ๋ฐ์ดํฐ ํ์
cmap='Blues') # heatmap์ ์๊น
✔️ correlation matrix
# Pairwise scatter plots to inspect the relationships between the selected variables.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `size` was renamed to `height` in seaborn 0.9; current releases reject `size`.
sns.pairplot(df_train[cols], height=2.5)
plt.show()
โ๏ธ missing data์ ๊ฐ์ ํ์ธํ๊ธฐ
# Count and fraction of missing values per column, most-missing first.
# isnull() marks each cell: True (1) = missing, False (0) = valid.
null_counts = df_train.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Total | Percent | |
---|---|---|
PoolQC | 1453 | 0.995205 |
MiscFeature | 1406 | 0.963014 |
Alley | 1369 | 0.937671 |
Fence | 1179 | 0.807534 |
FireplaceQu | 690 | 0.472603 |
LotFrontage | 259 | 0.177397 |
GarageYrBlt | 81 | 0.055479 |
GarageCond | 81 | 0.055479 |
GarageType | 81 | 0.055479 |
… | … | … |
PoolQC, MiscFeature, Alley,Fence, FireplaceQu, LotFrontage: ๊ฒฐ์ธก์น ๋งค์ฐ ๋ง๊ณ , ์ง์ ๊ตฌ๋งคํ ๋ ์ค์ํ ์์๋ ์๋ ๊ฒ์ผ๋ก ํ๋จ๋จ โ ํด๋น ๋ณ์ ์ ๊ฑฐ ๊ณ ๋ ค
GarageYrBlt, GarageCond, GarageType, GarageFinish, GarageQual: ๊ฒฐ์ธก์น ๊ฐ์๊ฐ ๊ฐ์. garage์ ์ค์ํ ๋ณ์ ์ค SalePrice์ ๊ฐ์ฅ correlation์ด ๋์ ๊ฒ์ 'GarageCars'์ด๋ฏ๋ก, ํด๋น ๋ณ์๋ค์ ์ ๊ฑฐํจ
BsmtFinType2, BsmtExposure, BsmtQual, BsmtCond, BsmtFinType1: ์์ ๊ฐ์ ๋ ผ๋ฆฌ๋ฅผ ์ ์ฉํ์ฌ, ํด๋น ๋ณ์๋ค์ ์ ๊ฑฐํจ
MasVnrArea, MasVnrType: ์ด๋ฏธ ๊ณ ๋ ค๋์์ธ YearBuild, OverallQual๊ณผ ๊ฐํ correlation์ ๊ฐ๊ณ ์์ผ๋ฏ๋ก, ํด๋น ๋ณ์๋ ์ ๊ฑฐํจ
Electrical: ๊ฒฐ์ธก์น 1๊ฐ ์กด์ฌํ๋ฏ๋ก, ๊ฒฐ์ธก์น๋ง ์ ๊ฑฐํจ
โ๏ธ dealing with missing data
# Drop every column that has more than one missing value.
# `axis` is passed by keyword: pandas 2.0 removed the positional form drop(labels, 1).
df_train = df_train.drop(missing_data[missing_data['Total'] > 1].index, axis=1)
# Drop the row(s) whose 'Electrical' value is missing.
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
# Largest per-column missing count that remains after the drops.
df_train.isnull().sum().max()
✔️ 데이터 표준화
# Standardize SalePrice and print the ten lowest and ten highest scaled values.
# Series[:, np.newaxis] is no longer supported by pandas, so the underlying
# NumPy array is indexed instead to build the 2-D column passed to the scaler.
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
# Sort once and take both tails from the same sorted array.
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
outlier๋ฅผ ํ๋จํ๊ธฐ ์ํด ๋ฐ์ดํฐ๋ฅผ ํ์คํํจ
โ๏ธ scatter plot (GrLibArea, SalePrice)
๊ทธ๋ํ์ ์ค๋ฅธ์ชฝ ์๋์ ์์นํ 2๊ฐ์ ์ ์ outlier๋ก ํ๋จํ๊ณ ์ ๊ฑฐํจ
๊ทธ๋ํ์ ์ค๋ฅธ์ชฝ ์์ ์์นํ 2๊ฐ์ ์ ์ trend๋ฅผ ๋ฐ๋ฅด๊ณ ์์ผ๋ฏ๋ก, ์ ๊ฑฐํ์ง ์์
โ๏ธ Deleting points
# Sort by GrLivArea in descending order and show the two rows with the
# largest GrLivArea values.
df_train.sort_values(by='GrLivArea', ascending=False)[:2]
# Remove the outlier rows, identified by Id (1299 and 524).
for outlier_id in (1299, 524):
    df_train = df_train.drop(df_train[df_train['Id'] == outlier_id].index)
outlier๋ก ํ๋จ๋๋ ์ 2๊ฐ๋ฅผ ์ ๊ฑฐํจ
โ๏ธ scatter plot (saleprice, TotalBsmtSF)
โ๏ธ normality (SalePrice)
# Histogram with a fitted normal curve, then a normal probability (Q-Q) plot.
sale_price = df_train['SalePrice']
sns.distplot(sale_price, fit=norm)
fig = plt.figure()  # new figure so the probability plot is drawn separately
res = stats.probplot(sale_price, plot=plt)
로그변환
# Replace SalePrice with its natural logarithm.
df_train['SalePrice'] = np.log(df_train['SalePrice'])
# Histogram and normal probability plot of the transformed column.
sale_price = df_train['SalePrice']
sns.distplot(sale_price, fit=norm)
fig = plt.figure()  # new figure so the probability plot is drawn separately
res = stats.probplot(sale_price, plot=plt)
✔️ normality (GrLivArea)
# Histogram with a fitted normal curve, then a normal probability plot.
living_area = df_train['GrLivArea']
sns.distplot(living_area, fit=norm)
fig = plt.figure()  # new figure so the probability plot is drawn separately
res = stats.probplot(living_area, plot=plt)
로그변환
# Replace GrLivArea with its natural logarithm.
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
# Histogram and normal probability plot of the transformed column.
living_area = df_train['GrLivArea']
sns.distplot(living_area, fit=norm)
fig = plt.figure()  # new figure so the probability plot is drawn separately
res = stats.probplot(living_area, plot=plt)
✔️ normality (TotalBsmtSF)
# Histogram with a fitted normal curve, then a normal probability plot.
bsmt_area = df_train['TotalBsmtSF']
sns.distplot(bsmt_area, fit=norm)
fig = plt.figure()  # new figure so the probability plot is drawn separately
res = stats.probplot(bsmt_area, plot=plt)
๋ค์์ ๊ด์ธก์น๊ฐ 0 ๊ฐ์ ๊ฐ์ง(basement๊ฐ ์๋ ์ง์ธ ๊ฒฝ์ฐ) โ ๋ก๊ทธ๋ณํ์ ํ ์ ์์
=> basement ์กด์ฌ ์ฌ๋ถ์ ๋ฐ๋ผ 0 ๋๋ 1 ๊ฐ์ ๊ฐ๋ ๋ณ์๋ฅผ ์์ฑํ์ฌ, 0์ด ์๋ ๊ด์ธก์น์ ๋ํด์๋ง ๋ก๊ทธ๋ณํ์ ์ค์ํจ
โ๏ธ์๋ก์ด ๋ณ์ ์์ฑํ๊ธฐ
# Create a new variable HasBsmt: 1 when TotalBsmtSF > 0, otherwise 0.
# (The original placeholder Series was overwritten immediately, so it is gone.)
has_bsmt = df_train['TotalBsmtSF'] > 0
df_train['HasBsmt'] = 0
df_train.loc[has_bsmt, 'HasBsmt'] = 1
# Log-transform only the rows flagged by HasBsmt.
# The column is cast to float first so the log values are not written into an
# integer column, and np.log is evaluated on the selected rows only, so
# log(0) is never computed for the rows that are left untouched.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype('float64')
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])
# Histogram and normal probability plot of the values that are still > 0.
sns.distplot(df_train[df_train['TotalBsmtSF'] > 0]['TotalBsmtSF'], fit=norm)
fig = plt.figure()
res = stats.probplot(df_train[df_train['TotalBsmtSF'] > 0]['TotalBsmtSF'], plot=plt)
✔️ scatter plot (SalePrice, GrLivArea)
# Scatter plot of SalePrice (y) against GrLivArea (x).
plt.scatter(x=df_train['GrLivArea'], y=df_train['SalePrice'])
✔️ scatter plot (SalePrice, TotalBsmtSF)
# Scatter plot of SalePrice against TotalBsmtSF, restricted to rows where
# TotalBsmtSF is greater than 0.
positive_bsmt = df_train['TotalBsmtSF'] > 0
plt.scatter(df_train.loc[positive_bsmt, 'TotalBsmtSF'], df_train.loc[positive_bsmt, 'SalePrice'])
✔️ categorical 변수를 dummy 변수로 변환하기
# Convert the categorical variables of df_train into dummy (indicator) variables.
df_train = pd.get_dummies(df_train)
House Prices - Advanced Regression Techniques
Comprehensive data exploration with Python(by Pedro Marcelino)
heatmap 기본 문법