Kaggle Link : https://www.kaggle.com/code/yoontaeklee/porto-seguro-s-safe-driver-prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer # missing-value imputation
from sklearn.preprocessing import PolynomialFeatures # interaction feature generation
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
# feature selection: drops features whose variance is below a threshold
from sklearn.feature_selection import SelectFromModel
# selects features using the importances of a model that exposes feature importances
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns',100)
df_train = pd.read_csv("../input/porto-seguros-safe-driver-prediction-dataset/train.csv")
df_test = pd.read_csv("../input/porto-seguros-safe-driver-prediction-dataset/test.csv")
df_train.head()
df_train.tail()
df_train.shape
# (595212, 59)
df_train.isnull().sum()
data = []
for f in df_train.columns:
    # assign each column's role
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'
    # assign each column's measurement level
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif df_train[f].dtype == float:
        level = 'interval'
    elif df_train[f].dtype == int:
        level = 'ordinal'
    # keep everything except id
    keep = True
    if f == 'id':
        keep = False
    # record the data type
    dtype = df_train[f].dtype
    # build a dict so the rows can be assembled into a DataFrame
    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)
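As a quick, illustrative check (not required for the rest of the pipeline), the meta table can be used to count how many variables fall into each role/level combination:
# count variables per role/level combination using the metadata built above
pd.DataFrame({'count': meta.groupby(['role', 'level'])['role'].size()}).reset_index()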
Interval = meta[(meta['level'] == 'interval') & (meta['keep'])].index
# check summary statistics of the interval variables with describe
df_train[Interval].describe()
Ordinal = meta[(meta['level'] == 'ordinal') & (meta['keep'])].index
df_train[Ordinal].describe()
Binary = meta[(meta['level'] == 'binary') & (meta['keep'])].index
df_train[Binary].describe()
-------- How to handle the imbalanced data --------
f, ax = plt.subplots(figsize = (8,8))
df_train['target'].value_counts().plot.pie(explode = [0, 0.1],
autopct = '%1.1f%%', shadow = True,
colors = ['lightcoral','lightskyblue'], textprops={'fontsize':18})
plt.title('Target PiePlot', size = 20)
# desired share of the positive class after undersampling
desired_apriori = 0.10
# indices for each class of the target variable
idx_0 = df_train[df_train['target'] == 0].index
idx_1 = df_train[df_train['target'] == 1].index
# number of records in each class
nb_0 = len(df_train.loc[idx_0])
nb_1 = len(df_train.loc[idx_1])
# undersampling
undersampling_rate = ((1 - desired_apriori) * nb_1) / (nb_0 * desired_apriori)
undersampled_nb_0 = int(undersampling_rate * nb_0)
print('Undersampling rate for target = 0: {}'.format(undersampling_rate))
print('Number of target = 0 records before undersampling: {}'.format(nb_0))
print('Number of target = 0 records after undersampling: {}'.format(undersampled_nb_0))
# randomly sample that many class-0 records and keep their indices
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)
# combine the undersampled class-0 indices with all class-1 indices
idx_list = list(undersampled_idx) + list(idx_1)
# index the train set with the combined list
df_train = df_train.loc[idx_list].reset_index(drop=True)
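As a quick sanity check (not part of the original output), the positive-class share after undersampling should now be close to the desired 10%:
# verify that target = 1 makes up roughly desired_apriori of the remaining rows
print('Share of target = 1 after undersampling: {:.4f}'.format(df_train['target'].mean()))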
vars_with_missing = []
# missing values in this dataset are encoded as -1
# for each variable, report how many records are missing and the missing share
for f in df_train.columns:
    missings = df_train[df_train[f] == -1][f].count()
    if missings > 0:
        vars_with_missing.append(f)
        missings_perc = missings / df_train.shape[0]
        print('Variable {}\t has {:10} records\t ({:.2%})\t with missing values'.format(f, missings, missings_perc))
print()
print('There are {} variables with missing values in total'.format(len(vars_with_missing)))
# drop the variables with too many missing values
vars_to_drop = ['ps_car_03_cat','ps_car_05_cat']
df_train.drop(vars_to_drop, inplace=True, axis=1)
# update the metadata built earlier (keep = True -> False for the dropped variables)
meta.loc[(vars_to_drop),'keep'] = False
# impute the remaining missing values with the mean or the mode
# using SimpleImputer
mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
mode_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')
df_train['ps_reg_03'] = mean_imp.fit_transform(df_train[['ps_reg_03']]).ravel()
df_train['ps_car_12'] = mean_imp.fit_transform(df_train[['ps_car_12']]).ravel()
df_train['ps_car_14'] = mean_imp.fit_transform(df_train[['ps_car_14']]).ravel()
df_train['ps_car_11'] = mode_imp.fit_transform(df_train[['ps_car_11']]).ravel()
Nominal = meta[(meta['level'] == 'nominal') & (meta['keep'])].index
for f in Nominal:
    dist_values = df_train[f].value_counts().shape[0]
    print('Variable {} has {} distinct values'.format(f, dist_values))
# add random noise to reduce overfitting
# smoothing blends each category's mean with the global prior to correct skewed means
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, tst_series=None, target=None, min_samples_leaf=1, smoothing=1, noise_level=0):
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean', 'count'])
    # smoothing to reduce overfitting on rare categories
    smoothing = 1 / (1 + np.exp(-(averages['count'] - min_samples_leaf) / smoothing))
    prior = target.mean()
    averages[target.name] = prior * (1 - smoothing) + averages['mean'] * smoothing
    averages.drop(['mean', 'count'], axis=1, inplace=True)
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name, how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name, how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
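To make the smoothing concrete, here is a small sketch with made-up numbers (prior 0.10, category mean 0.50, min_samples_leaf=100, smoothing=10): a rarely seen category is pulled toward the global prior, while a frequent one keeps its own mean.
# toy illustration of the smoothing weight used in target_encode (all numbers are made up)
prior, cat_mean = 0.10, 0.50
for count in (5, 100, 1000):
    s = 1 / (1 + np.exp(-(count - 100) / 10))
    print(count, round(prior * (1 - s) + cat_mean * s, 3))
# count = 5 -> 0.1 (almost the prior), 100 -> 0.3 (halfway), 1000 -> 0.5 (the category mean)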
# apply the function implemented above to ps_car_11_cat (104 distinct values)
# the feature is replaced, so the metadata is updated afterwards
train_encoded, test_encoded = target_encode(df_train['ps_car_11_cat'], df_test['ps_car_11_cat'],
                                            target=df_train.target, min_samples_leaf=100,
                                            smoothing=10, noise_level=0.01)
df_train['ps_car_11_cat_te'] = train_encoded
df_train.drop('ps_car_11_cat', axis=1, inplace=True)
meta.loc['ps_car_11_cat', 'keep'] = False
df_test['ps_car_11_cat_te'] = test_encoded
df_test.drop('ps_car_11_cat', axis=1, inplace=True)
Nominal = meta[(meta['level'] == 'nominal') & (meta['keep'])].index
# bar plot for each nominal variable
for f in Nominal:
    plt.figure()
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.grid(axis='y', linestyle='--')
    cat_perc = df_train[[f, 'target']].groupby([f], as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # the per-category mean of target shows which category values have the highest share of target = 1
    sns.barplot(ax=ax, x=f, y='target', palette='Pastel1', edgecolor='black', linewidth=0.8,
                data=cat_perc, order=cat_perc[f])
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
def corr_heatmap(cols):
    correlations = df_train[cols].corr()
    # create a diverging color map ranging between two colors
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
    plt.show()
Interval = meta[(meta["role"] == "target") | ((meta["level"] == 'interval') & (meta["keep"]))].index
corr_heatmap(Interval)
ps_reg_02 & ps_reg_03
sns.lmplot(x='ps_reg_02', y='ps_reg_03', data=df_train, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
ps_car_12 & ps_car_13
sns.lmplot(x='ps_car_12', y='ps_car_13', data=df_train, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
ps_car_12 & ps_car_14
sns.lmplot(x='ps_car_12', y='ps_car_14', data=df_train, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
ps_car_13 & ps_car_15
sns.lmplot(x='ps_car_15', y='ps_car_13', data=df_train, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
Ordinal = meta[(meta["role"] == "target") | ((meta["level"] == 'ordinal') & (meta["keep"]))].index
corr_heatmap(Ordinal)
Nominal = meta[(meta['level'] == 'nominal') & (meta['keep'])].index
print('Number of variables in the train set before one-hot encoding: {}'.format(df_train.shape[1]))
df_train = pd.get_dummies(df_train, columns=Nominal, drop_first=True)
df_test = pd.get_dummies(df_test, columns=Nominal, drop_first=True)
print('Number of variables in the train set after one-hot encoding: {}'.format(df_train.shape[1]))
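For reference, a toy sketch (the column name color is made up, not from the competition data) of what drop_first=True does: each categorical variable keeps k-1 dummy columns, with the first category as the implicit baseline.
# toy illustration of pd.get_dummies with drop_first=True
toy_cat = pd.DataFrame({'color': ['red', 'blue', 'green', 'blue']})
print(pd.get_dummies(toy_cat, columns=['color'], drop_first=True))
# keeps color_green and color_red; 'blue' (first category alphabetically) becomes the all-zeros baseline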
PolynomialFeatures parameters (a short example follows this list)
1. degree: polynomial degree
2. interaction_only: output only interaction terms (for x1 and x2, skip each variable's own square and keep only x1*x2)
3. include_bias: whether to include a constant (bias) column
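As a minimal sketch on made-up data (columns a and b are not from the competition), this is what PolynomialFeatures(degree=2, include_bias=False) generates for two input columns:
# toy illustration of PolynomialFeatures output
toy = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
toy_poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
print(toy_poly.fit_transform(toy))                  # columns: a, b, a^2, a*b, b^2
print(toy_poly.get_feature_names_out(['a', 'b']))   # ['a' 'b' 'a^2' 'a b' 'b^2']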
Interval = meta[(meta['level'] == 'interval') & (meta['keep'])].index
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
interactions = pd.DataFrame(data=poly.fit_transform(df_train[Interval]), columns=poly.get_feature_names_out(Interval))
interactions.drop(Interval, axis=1, inplace=True)
# build the same interaction features for the test set with the already-fitted poly,
# then concatenate the new variables onto the original data
interactions_test = pd.DataFrame(data=poly.transform(df_test[Interval]), columns=poly.get_feature_names_out(Interval))
interactions_test.drop(Interval, axis=1, inplace=True)
print('Number of variables in the train set before creating interaction features: {}'.format(df_train.shape[1]))
df_train = pd.concat([df_train, interactions], axis=1)
df_test = pd.concat([df_test, interactions_test], axis=1)
print('Number of variables in the train set after creating interaction features: {}'.format(df_train.shape[1]))