import numpy as np
import pandas as pd
import gc
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
def get_dataset():
app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')
apps = pd.concat([app_train, app_test])
prev = pd.read_csv('previous_application.csv')
bureau = pd.read_csv('bureau.csv')
bureau_bal = pd.read_csv('bureau_balance.csv')
return apps, prev, bureau, bureau_bal
apps, prev, bureau, bureau_bal = get_dataset()
<ipython-input-2-eb2bde2565da>:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
apps = pd.concat([app_train, app_test])
| Table | 컬럼명 | 컬럼 대분류 | 컬럼 중분류 | 컬럼 설명 |
|---|---|---|---|---|
| bureau.csv | SK_ID_CURR | 대출 | 고유ID | 현재 대출 고유 ID |
| bureau.csv | SK_BUREAU_ID | 대출 | 고유ID | 타 기관 대출 고유 ID |
| bureau.csv | CREDIT_ACTIVE | 대출 | 대출 상태 | 대출 상태(Active: 대출 상환중, Closed: 상환 완료) |
| bureau.csv | CREDIT_CURRENCY | 대출 | 대출 금액 | 대출 금액 화폐유형 |
| bureau.csv | DAYS_CREDIT | 대출 | 행동 | 현재 대출 신청 일 기준 과거 대출 신청 지난 기간 |
| bureau.csv | CREDIT_DAY_OVERDUE | 대출 | 행동 | 대출 신청 시 CB 크레딧 연체 일수 |
| bureau.csv | DAYS_CREDIT_ENDDATE | 대출 | 상태 | CB 크레딧 채무 완료까지 남아있는 일수(신청일 기준) |
| bureau.csv | DAYS_ENDDATE_FACT | 대출 | 상태 | CB 크레딧 채무 완료까지 걸린 실제 일수(신청일 기준, 상태가 Close일때만) |
| bureau.csv | AMT_CREDIT_MAX_OVERDUE | 대출 | 상태 | 최대 연체금액 |
| bureau.csv | CNT_CREDIT_PROLONG | 대출 | 상태 | 신용 연장 횟수 |
| bureau.csv | AMT_CREDIT_SUM | 대출 | 대출 금액 | 현재 크레딧 금액 총액 |
| bureau.csv | AMT_CREDIT_SUM_DEBT | 대출 | 대출 금액 | 현재 채무 금액 총액 |
| bureau.csv | AMT_CREDIT_SUM_LIMIT | 대출 | 대출 금액 | 신용 카드 현재 신용한도 |
| bureau.csv | AMT_CREDIT_SUM_OVERDUE | 대출 | 상태 | 현재 연체 금액 |
| bureau.csv | CREDIT_TYPE | 대출 | 대출 유형 | 크레딧 유형 |
| bureau.csv | DAYS_CREDIT_UPDATE | 대출 | 대출 행동 | 대출 신청전 마지막 정보 받은 기간 |
| bureau.csv | AMT_ANNUITY | 대출 | 대출 금액 | 월 대출 지급액 |
| bureau_balance.csv | SK_BUREAU_ID | 대출 | 고유ID | 타 기관 대출 고유 ID |
| bureau_balance.csv | MONTHS_BALANCE | 대출 | 상태 | 신청일 기준 잔액 월 |
| bureau_balance.csv | STATUS | 대출 | 상태 | 월별 대출 상태(Active: 대출 상환중, Closed: 상환 완료, DPD0-30: 30일 이전 연체 상태등 |
bureau.head(20)
| SK_ID_CURR | SK_ID_BUREAU | CREDIT_ACTIVE | CREDIT_CURRENCY | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | CREDIT_TYPE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 215354 | 5714462 | Closed | currency 1 | -497 | 0 | -153.0 | -153.0 | NaN | 0 | 91323.00 | 0.00 | NaN | 0.0 | Consumer credit | -131 | NaN |
| 1 | 215354 | 5714463 | Active | currency 1 | -208 | 0 | 1075.0 | NaN | NaN | 0 | 225000.00 | 171342.00 | NaN | 0.0 | Credit card | -20 | NaN |
| 2 | 215354 | 5714464 | Active | currency 1 | -203 | 0 | 528.0 | NaN | NaN | 0 | 464323.50 | NaN | NaN | 0.0 | Consumer credit | -16 | NaN |
| 3 | 215354 | 5714465 | Active | currency 1 | -203 | 0 | NaN | NaN | NaN | 0 | 90000.00 | NaN | NaN | 0.0 | Credit card | -16 | NaN |
| 4 | 215354 | 5714466 | Active | currency 1 | -629 | 0 | 1197.0 | NaN | 77674.5 | 0 | 2700000.00 | NaN | NaN | 0.0 | Consumer credit | -21 | NaN |
| 5 | 215354 | 5714467 | Active | currency 1 | -273 | 0 | 27460.0 | NaN | 0.0 | 0 | 180000.00 | 71017.38 | 108982.62 | 0.0 | Credit card | -31 | NaN |
| 6 | 215354 | 5714468 | Active | currency 1 | -43 | 0 | 79.0 | NaN | 0.0 | 0 | 42103.80 | 42103.80 | 0.00 | 0.0 | Consumer credit | -22 | NaN |
| 7 | 162297 | 5714469 | Closed | currency 1 | -1896 | 0 | -1684.0 | -1710.0 | 14985.0 | 0 | 76878.45 | 0.00 | 0.00 | 0.0 | Consumer credit | -1710 | NaN |
| 8 | 162297 | 5714470 | Closed | currency 1 | -1146 | 0 | -811.0 | -840.0 | 0.0 | 0 | 103007.70 | 0.00 | 0.00 | 0.0 | Consumer credit | -840 | NaN |
| 9 | 162297 | 5714471 | Active | currency 1 | -1146 | 0 | -484.0 | NaN | 0.0 | 0 | 4500.00 | 0.00 | 0.00 | 0.0 | Credit card | -690 | NaN |
| 10 | 162297 | 5714472 | Active | currency 1 | -1146 | 0 | -180.0 | NaN | 0.0 | 0 | 337500.00 | 0.00 | 0.00 | 0.0 | Credit card | -690 | NaN |
| 11 | 162297 | 5714473 | Closed | currency 1 | -2456 | 0 | -629.0 | -825.0 | NaN | 0 | 675000.00 | 0.00 | 0.00 | 0.0 | Consumer credit | -706 | NaN |
| 12 | 162297 | 5714474 | Active | currency 1 | -277 | 0 | 5261.0 | NaN | 0.0 | 0 | 7033500.00 | NaN | NaN | 0.0 | Mortgage | -31 | NaN |
| 13 | 402440 | 5714475 | Active | currency 1 | -96 | 0 | 269.0 | NaN | 0.0 | 0 | 89910.00 | 76905.00 | 0.00 | 0.0 | Consumer credit | -22 | NaN |
| 14 | 238881 | 5714482 | Closed | currency 1 | -318 | 0 | -187.0 | -187.0 | NaN | 0 | 0.00 | 0.00 | 0.00 | 0.0 | Credit card | -185 | NaN |
| 15 | 238881 | 5714484 | Closed | currency 1 | -2911 | 0 | -2607.0 | -2604.0 | NaN | 0 | 48555.00 | NaN | NaN | 0.0 | Consumer credit | -2601 | NaN |
| 16 | 238881 | 5714485 | Closed | currency 1 | -2148 | 0 | -1595.0 | -987.0 | NaN | 0 | 135000.00 | NaN | NaN | 0.0 | Consumer credit | -984 | NaN |
| 17 | 238881 | 5714486 | Active | currency 1 | -381 | 0 | NaN | NaN | NaN | 0 | 450000.00 | 520920.00 | NaN | 0.0 | Consumer credit | -4 | NaN |
| 18 | 238881 | 5714487 | Active | currency 1 | -95 | 0 | 1720.0 | NaN | NaN | 0 | 67500.00 | 8131.50 | NaN | 0.0 | Credit card | -7 | NaN |
| 19 | 238881 | 5714488 | Closed | currency 1 | -444 | 0 | -77.0 | -77.0 | 0.0 | 0 | 107184.06 | 0.00 | 0.00 | 0.0 | Consumer credit | -71 | NaN |
bureau.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 17 columns):
SK_ID_CURR int64
SK_ID_BUREAU int64
CREDIT_ACTIVE object
CREDIT_CURRENCY object
DAYS_CREDIT int64
CREDIT_DAY_OVERDUE int64
DAYS_CREDIT_ENDDATE float64
DAYS_ENDDATE_FACT float64
AMT_CREDIT_MAX_OVERDUE float64
CNT_CREDIT_PROLONG int64
AMT_CREDIT_SUM float64
AMT_CREDIT_SUM_DEBT float64
AMT_CREDIT_SUM_LIMIT float64
AMT_CREDIT_SUM_OVERDUE float64
CREDIT_TYPE object
DAYS_CREDIT_UPDATE int64
AMT_ANNUITY float64
dtypes: float64(8), int64(6), object(3)
memory usage: 222.6+ MB
# DAYS_ENDDATE_FACT, AMT_CREDIT_MAX_OVERDUE, AMT_CREDIT_SUM_LIMT, AMT_ANNUITY 등이 Null 컬럼이 많음.
bureau.isnull().sum()
SK_ID_CURR 0
SK_ID_BUREAU 0
CREDIT_ACTIVE 0
CREDIT_CURRENCY 0
DAYS_CREDIT 0
CREDIT_DAY_OVERDUE 0
DAYS_CREDIT_ENDDATE 105553
DAYS_ENDDATE_FACT 633653
AMT_CREDIT_MAX_OVERDUE 1124488
CNT_CREDIT_PROLONG 0
AMT_CREDIT_SUM 13
AMT_CREDIT_SUM_DEBT 257669
AMT_CREDIT_SUM_LIMIT 591780
AMT_CREDIT_SUM_OVERDUE 0
CREDIT_TYPE 0
DAYS_CREDIT_UPDATE 0
AMT_ANNUITY 1226791
dtype: int64
bureau_bal.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
SK_ID_BUREAU int64
MONTHS_BALANCE int64
STATUS object
dtypes: int64(2), object(1)
memory usage: 624.8+ MB
bureau_bal.head(30)
| SK_ID_BUREAU | MONTHS_BALANCE | STATUS | |
|---|---|---|---|
| 0 | 5715448 | 0 | C |
| 1 | 5715448 | -1 | C |
| 2 | 5715448 | -2 | C |
| 3 | 5715448 | -3 | C |
| 4 | 5715448 | -4 | C |
| 5 | 5715448 | -5 | C |
| 6 | 5715448 | -6 | C |
| 7 | 5715448 | -7 | C |
| 8 | 5715448 | -8 | C |
| 9 | 5715448 | -9 | 0 |
| 10 | 5715448 | -10 | 0 |
| 11 | 5715448 | -11 | X |
| 12 | 5715448 | -12 | X |
| 13 | 5715448 | -13 | X |
| 14 | 5715448 | -14 | 0 |
| 15 | 5715448 | -15 | 0 |
| 16 | 5715448 | -16 | 0 |
| 17 | 5715448 | -17 | 0 |
| 18 | 5715448 | -18 | 0 |
| 19 | 5715448 | -19 | 0 |
| 20 | 5715448 | -20 | X |
| 21 | 5715448 | -21 | X |
| 22 | 5715448 | -22 | X |
| 23 | 5715448 | -23 | X |
| 24 | 5715448 | -24 | X |
| 25 | 5715448 | -25 | X |
| 26 | 5715448 | -26 | X |
| 27 | 5715449 | 0 | C |
| 28 | 5715449 | -1 | C |
| 29 | 5715449 | -2 | C |
bureau[bureau['SK_ID_BUREAU'] == 5715448].head()
| SK_ID_CURR | SK_ID_BUREAU | CREDIT_ACTIVE | CREDIT_CURRENCY | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | CREDIT_TYPE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 768 | 380361 | 5715448 | Active | currency 1 | -820 | 0 | 31069.0 | NaN | NaN | 0 | 67500.0 | 0.0 | 67500.0 | 0.0 | Credit card | -183 | 0.0 |
# TARGET 값을 가져오기 위해 bureau를 apps와 조인
bureau_app = bureau.merge(apps[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')
num_columns = bureau_app.dtypes[bureau_app.dtypes != 'object'].index.tolist()
# 숫자형 컬럼중 ID와 TARGET은 제외
num_columns = [column for column in num_columns if column not in['SK_ID_BUREAU', 'SK_ID_CURR', 'TARGET']]
print(num_columns)
def show_hist_by_target(df, columns):
cond_1 = (df['TARGET'] == 1)
cond_0 = (df['TARGET'] == 0)
for column in columns:
print("column:", column)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), squeeze=False)
# bureau는 특정 컬럼값이 infinite로 들어가있는 경우가 있음. infinite일 경우 KDE histogram 시각화 시 문제 발생하여 이를 제거
sns.violinplot(x='TARGET', y=column, data=df[np.isfinite(df[column])], ax=axs[0][0])
sns.distplot(df[cond_0 & np.isfinite(df[column])][column], label='0', color='blue', ax=axs[0][1])
sns.distplot(df[cond_1 & np.isfinite(df[column])][column], label='1', color='red', ax=axs[0][1])
show_hist_by_target(bureau_app, num_columns)
['DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY']
column: DAYS_CREDIT
column: CREDIT_DAY_OVERDUE
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
column: DAYS_CREDIT_ENDDATE
column: DAYS_ENDDATE_FACT
column: AMT_CREDIT_MAX_OVERDUE
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
column: CNT_CREDIT_PROLONG
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
column: AMT_CREDIT_SUM
column: AMT_CREDIT_SUM_DEBT
column: AMT_CREDIT_SUM_LIMIT
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
column: AMT_CREDIT_SUM_OVERDUE
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
/Users/andylee/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:369: UserWarning: Default bandwidth for data is 0; skipping density estimation.
warnings.warn(msg, UserWarning)
column: DAYS_CREDIT_UPDATE
column: AMT_ANNUITY












np.isfinite(bureau_app['AMT_CREDIT_SUM_LIMIT']).sum()
1124648
object_columns = bureau.dtypes[bureau.dtypes=='object'].index.tolist()
print(object_columns)
def show_category_by_target(df, columns):
for column in columns:
chart = sns.catplot(x=column, col="TARGET", data=df, kind="count")
chart.set_xticklabels(rotation=65)
show_category_by_target(bureau_app, object_columns)
['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']



# 예정 채무 시작 및 완료일과 실제 채무 완료일간의 차이 및 날짜 비율 가공.
bureau['BUREAU_ENDDATE_FACT_DIFF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']
bureau['BUREAU_CREDIT_FACT_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_ENDDATE_FACT']
bureau['BUREAU_CREDIT_ENDDATE_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_CREDIT_ENDDATE']
# 채무 금액 대비/대출 금액 비율 및 차이 가공
bureau['BUREAU_CREDIT_DEBT_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / bureau['AMT_CREDIT_SUM']
bureau['BUREAU_CREDIT_DEBT_DIFF'] = bureau['AMT_CREDIT_SUM_DEBT'] - bureau['AMT_CREDIT_SUM']
bureau['AMT_CREDIT_SUM_DEBT'].value_counts().head(30)
0.0 1016434
4.5 653
-450.0 543
135000.0 344
90000.0 320
45000.0 316
22500.0 307
67500.0 238
225000.0 237
13500.0 205
450000.0 177
112500.0 156
18000.0 143
157500.0 139
27000.0 132
54000.0 125
9000.0 121
270000.0 107
22950.0 101
900000.0 98
675000.0 97
180000.0 97
9.0 94
225.0 88
-45.0 84
31500.0 84
36000.0 81
450.0 81
1350000.0 81
49500.0 81
Name: AMT_CREDIT_SUM_DEBT, dtype: int64
bureau['AMT_CREDIT_SUM_DEBT'].hist()

# 연체 건수가 많지 않음.
bureau['CREDIT_DAY_OVERDUE'].value_counts()
0 1712211
30 311
60 126
13 103
8 103
...
1548 1
1546 1
519 1
2565 1
372 1
Name: CREDIT_DAY_OVERDUE, Length: 942, dtype: int64
bureau[bureau['CREDIT_DAY_OVERDUE'] > 120].shape
(1143, 22)
bureau[bureau['CREDIT_DAY_OVERDUE'] > 120]['CREDIT_DAY_OVERDUE'].hist(bins=300)

bureau['BUREAU_IS_DPD'] = bureau['CREDIT_DAY_OVERDUE'].apply(lambda x: 1 if x > 0 else 0)
bureau['BUREAU_IS_DPD'].value_counts()
0 1712211
1 4217
Name: BUREAU_IS_DPD, dtype: int64
bureau['BUREAU_IS_DPD_OVER120'] = bureau['CREDIT_DAY_OVERDUE'].apply(lambda x: 1 if x >120 else 0)
bureau['BUREAU_IS_DPD_OVER120'].value_counts()
0 1715285
1 1143
Name: BUREAU_IS_DPD_OVER120, dtype: int64
bureau_agg_dict = {
# 기존 컬럼
'SK_ID_BUREAU':['count'],
'DAYS_CREDIT':['min', 'max', 'mean'],
'CREDIT_DAY_OVERDUE':['min', 'max', 'mean'],
'DAYS_CREDIT_ENDDATE':['min', 'max', 'mean'],
'DAYS_ENDDATE_FACT':['min', 'max', 'mean'],
'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean', 'sum'],
'AMT_ANNUITY': ['max', 'mean', 'sum'],
# 추가 가공 컬럼
'BUREAU_ENDDATE_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_ENDDATE_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_RATIO':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_DIFF':['min', 'max', 'mean'],
'BUREAU_IS_DPD':['mean', 'sum'],
'BUREAU_IS_DPD_OVER120':['mean', 'sum']
}
bureau_grp = bureau.groupby('SK_ID_CURR')
bureau_day_amt_agg = bureau_grp.agg(bureau_agg_dict)
# BUREAU_ 접두어로 하는 새로운 컬럼명 할당.
bureau_day_amt_agg.columns = ['BUREAU_'+('_').join(column).upper() for column in bureau_day_amt_agg.columns.ravel()]
# 조인을 위해 SK_ID_CURR을 reset_index()로 컬럼화
bureau_day_amt_agg = bureau_day_amt_agg.reset_index()
print(bureau_day_amt_agg.shape)
(305811, 47)
bureau_day_amt_agg.head(10)
| SK_ID_CURR | BUREAU_SK_ID_BUREAU_COUNT | BUREAU_DAYS_CREDIT_MIN | BUREAU_DAYS_CREDIT_MAX | BUREAU_DAYS_CREDIT_MEAN | BUREAU_CREDIT_DAY_OVERDUE_MIN | BUREAU_CREDIT_DAY_OVERDUE_MAX | BUREAU_CREDIT_DAY_OVERDUE_MEAN | BUREAU_DAYS_CREDIT_ENDDATE_MIN | BUREAU_DAYS_CREDIT_ENDDATE_MAX | BUREAU_DAYS_CREDIT_ENDDATE_MEAN | BUREAU_DAYS_ENDDATE_FACT_MIN | BUREAU_DAYS_ENDDATE_FACT_MAX | BUREAU_DAYS_ENDDATE_FACT_MEAN | BUREAU_AMT_CREDIT_MAX_OVERDUE_MAX | BUREAU_AMT_CREDIT_MAX_OVERDUE_MEAN | BUREAU_AMT_CREDIT_SUM_MAX | BUREAU_AMT_CREDIT_SUM_MEAN | BUREAU_AMT_CREDIT_SUM_SUM | BUREAU_AMT_CREDIT_SUM_DEBT_MAX | BUREAU_AMT_CREDIT_SUM_DEBT_MEAN | BUREAU_AMT_CREDIT_SUM_DEBT_SUM | BUREAU_AMT_CREDIT_SUM_OVERDUE_MAX | BUREAU_AMT_CREDIT_SUM_OVERDUE_MEAN | BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM | BUREAU_AMT_ANNUITY_MAX | BUREAU_AMT_ANNUITY_MEAN | BUREAU_AMT_ANNUITY_SUM | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MIN | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MAX | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MEAN | BUREAU_BUREAU_CREDIT_FACT_DIFF_MIN | BUREAU_BUREAU_CREDIT_FACT_DIFF_MAX | BUREAU_BUREAU_CREDIT_FACT_DIFF_MEAN | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MIN | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MAX | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MEAN | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MIN | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MAX | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MEAN | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MIN | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MAX | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MEAN | BUREAU_BUREAU_IS_DPD_MEAN | BUREAU_BUREAU_IS_DPD_SUM | BUREAU_BUREAU_IS_DPD_OVER120_MEAN | BUREAU_BUREAU_IS_DPD_OVER120_SUM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -1572 | -49 | -735.000000 | 0 | 0 | 0.0 | -1329.0 | 1778.0 | 82.428571 | -1328.0 | -544.0 | -825.500000 | NaN | NaN | 378000.0 | 207623.571429 | 1453365.000 | 373239.0 | 85240.928571 | 596686.5 | 0.0 | 0.0 | 0.0 | 10822.5 | 3545.357143 | 24817.5 | -1.0 | 698.0 | 197.000000 | -335.0 | -32.0 | -228.750000 | -1827.0 | -243.0 | -817.428571 | 0.0 | 0.987405 | 0.282518 | -279720.0 | -4761.0 | -122382.642857 | 0.0 | 0 | 0.0 | 0 |
| 1 | 100002 | 8 | -1437 | -103 | -874.000000 | 0 | 0 | 0.0 | -1072.0 | 780.0 | -349.000000 | -1185.0 | -36.0 | -697.500000 | 5043.645 | 1681.029 | 450000.0 | 108131.945625 | 865055.565 | 245781.0 | 49156.200000 | 245781.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1029.0 | 252.600000 | -609.0 | -76.0 | -277.000000 | -1822.0 | -87.0 | -719.833333 | 0.0 | 0.546180 | 0.136545 | -204219.0 | 0.0 | -98388.513000 | 0.0 | 0 | 0.0 | 0 |
| 2 | 100003 | 4 | -2586 | -606 | -1400.750000 | 0 | 0 | 0.0 | -2434.0 | 1216.0 | -544.500000 | -2131.0 | -540.0 | -1097.333333 | 0.000 | 0.000 | 810000.0 | 254350.125000 | 1017400.500 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -303.0 | 201.0 | -34.000000 | -1096.0 | -154.0 | -568.333333 | -1822.0 | -152.0 | -856.250000 | 0.0 | 0.000000 | 0.000000 | -810000.0 | -22248.0 | -254350.125000 | 0.0 | 0 | 0.0 | 0 |
| 3 | 100004 | 2 | -1326 | -408 | -867.000000 | 0 | 0 | 0.0 | -595.0 | -382.0 | -488.500000 | -683.0 | -382.0 | -532.500000 | 0.000 | 0.000 | 94537.8 | 94518.900000 | 189037.800 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 88.0 | 44.000000 | -643.0 | -26.0 | -334.500000 | -731.0 | -26.0 | -378.500000 | 0.0 | 0.000000 | 0.000000 | -94537.8 | -94500.0 | -94518.900000 | 0.0 | 0 | 0.0 | 0 |
| 4 | 100005 | 3 | -373 | -62 | -190.666667 | 0 | 0 | 0.0 | -128.0 | 1324.0 | 439.333333 | -123.0 | -123.0 | -123.000000 | 0.000 | 0.000 | 568800.0 | 219042.000000 | 657126.000 | 543087.0 | 189469.500000 | 568408.5 | 0.0 | 0.0 | 0.0 | 4261.5 | 1420.500000 | 4261.5 | -5.0 | -5.0 | -5.000000 | -250.0 | -250.0 | -250.000000 | -1461.0 | -184.0 | -630.000000 | 0.0 | 0.954794 | 0.601256 | -58500.0 | -4504.5 | -29572.500000 | 0.0 | 0 | 0.0 | 0 |
| 5 | 100007 | 1 | -1149 | -1149 | -1149.000000 | 0 | 0 | 0.0 | -783.0 | -783.0 | -783.000000 | -783.0 | -783.0 | -783.000000 | 0.000 | 0.000 | 146250.0 | 146250.000000 | 146250.000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.000000 | -366.0 | -366.0 | -366.000000 | -366.0 | -366.0 | -366.000000 | 0.0 | 0.000000 | 0.000000 | -146250.0 | -146250.0 | -146250.000000 | 0.0 | 0 | 0.0 | 0 |
| 6 | 100008 | 3 | -1097 | -78 | -757.333333 | 0 | 0 | 0.0 | -853.0 | 471.0 | -391.333333 | -1028.0 | -790.0 | -909.000000 | 0.000 | 0.000 | 267606.0 | 156148.500000 | 468445.500 | 240057.0 | 80019.000000 | 240057.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -2.0 | 175.0 | 86.500000 | -307.0 | -69.0 | -188.000000 | -549.0 | -244.0 | -366.000000 | 0.0 | 0.897054 | 0.299018 | -105705.0 | -27549.0 | -76129.500000 | 0.0 | 0 | 0.0 | 0 |
| 7 | 100009 | 18 | -2882 | -239 | -1271.500000 | 0 | 0 | 0.0 | -2152.0 | 1402.0 | -794.937500 | -2152.0 | -313.0 | -1108.500000 | 0.000 | 0.000 | 1777500.0 | 266711.750000 | 4800811.500 | 557959.5 | 76953.535714 | 1077349.5 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -713.0 | 1459.0 | 114.785714 | -893.0 | -92.0 | -357.214286 | -1826.0 | -30.0 | -529.000000 | 0.0 | 0.967787 | 0.169369 | -1777500.0 | -10872.0 | -227088.000000 | 0.0 | 0 | 0.0 | 0 |
| 8 | 100010 | 2 | -2741 | -1138 | -1939.500000 | 0 | 0 | 0.0 | -928.0 | 689.0 | -119.500000 | -1138.0 | -1138.0 | -1138.000000 | NaN | NaN | 675000.0 | 495000.000000 | 990000.000 | 348007.5 | 174003.750000 | 348007.5 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 210.0 | 210.0 | 210.000000 | -1603.0 | -1603.0 | -1603.000000 | -1827.0 | -1813.0 | -1820.000000 | 0.0 | 0.515567 | 0.257783 | -326992.5 | -315000.0 | -320996.250000 | 0.0 | 0 | 0.0 | 0 |
| 9 | 100011 | 4 | -2508 | -1309 | -1773.000000 | 0 | 0 | 0.0 | -2173.0 | -860.0 | -1293.250000 | -2197.0 | -968.0 | -1463.250000 | 10147.230 | 5073.615 | 145242.0 | 108807.075000 | 435228.300 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -102.0 | 758.0 | 170.000000 | -347.0 | -239.0 | -309.750000 | -1100.0 | -239.0 | -479.750000 | 0.0 | 0.000000 | 0.000000 | -145242.0 | -54000.0 | -96662.100000 | 0.0 | 0 | 0.0 | 0 |
cond_active = bureau['CREDIT_ACTIVE'] == 'Active'
bureau_active_grp = bureau[cond_active].groupby('SK_ID_CURR')
bureau_agg_dict = {
# 기존 컬럼
'SK_ID_BUREAU':['count'],
'DAYS_CREDIT':['min', 'max', 'mean'],
'CREDIT_DAY_OVERDUE':['min', 'max', 'mean'],
'DAYS_CREDIT_ENDDATE':['min', 'max', 'mean'],
'DAYS_ENDDATE_FACT':['min', 'max', 'mean'],
'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean', 'sum'],
'AMT_ANNUITY': ['max', 'mean', 'sum'],
# 추가 가공 컬럼
'BUREAU_ENDDATE_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_ENDDATE_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_RATIO':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_DIFF':['min', 'max', 'mean'],
'BUREAU_IS_DPD':['mean', 'sum'],
'BUREAU_IS_DPD_OVER120':['mean', 'sum']
}
bureau_active_agg = bureau_active_grp.agg(bureau_agg_dict)
# BUREAU_ACT을 접두어로 하는 새로운 컬럼명 할당.
bureau_active_agg.columns = ['BUREAU_ACT_'+('_').join(column).upper() for column in bureau_active_agg.columns.ravel()]
# 조인을 위해 SK_ID_CURR을 reset_index()로 컬럼화
bureau_active_agg = bureau_active_agg.reset_index()
print(bureau_active_agg.shape)
bureau_active_agg.head(10)
(251815, 47)
| SK_ID_CURR | BUREAU_ACT_SK_ID_BUREAU_COUNT | BUREAU_ACT_DAYS_CREDIT_MIN | BUREAU_ACT_DAYS_CREDIT_MAX | BUREAU_ACT_DAYS_CREDIT_MEAN | BUREAU_ACT_CREDIT_DAY_OVERDUE_MIN | BUREAU_ACT_CREDIT_DAY_OVERDUE_MAX | BUREAU_ACT_CREDIT_DAY_OVERDUE_MEAN | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MIN | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MAX | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MEAN | BUREAU_ACT_DAYS_ENDDATE_FACT_MIN | BUREAU_ACT_DAYS_ENDDATE_FACT_MAX | BUREAU_ACT_DAYS_ENDDATE_FACT_MEAN | BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MAX | BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_MAX | BUREAU_ACT_AMT_CREDIT_SUM_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_SUM | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MAX | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_SUM | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MAX | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_SUM | BUREAU_ACT_AMT_ANNUITY_MAX | BUREAU_ACT_AMT_ANNUITY_MEAN | BUREAU_ACT_AMT_ANNUITY_SUM | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MIN | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MAX | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MIN | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MAX | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MEAN | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MEAN | BUREAU_ACT_BUREAU_IS_DPD_MEAN | BUREAU_ACT_BUREAU_IS_DPD_SUM | BUREAU_ACT_BUREAU_IS_DPD_OVER120_MEAN | BUREAU_ACT_BUREAU_IS_DPD_OVER120_SUM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 3 | -559 | -49 | -309.333333 | 0 | 0 | 0.0 | 411.0 | 1778.0 | 1030.333333 | NaN | NaN | NaN | NaN | NaN | 378000.0 | 294675.0000 | 884025.000 | 373239.00 | 198895.500 | 596686.50 | 0.0 | 0.0 | 0.0 | 10822.5 | 8272.50 | 24817.5 | NaN | NaN | NaN | NaN | NaN | NaN | -1827.0 | -731.0 | -1339.666667 | 0.335128 | 0.987405 | 0.659208 | -224514.0 | -4761.000 | -95779.5000 | 0.0 | 0 | 0.0 | 0 |
| 1 | 100002 | 2 | -1042 | -103 | -572.500000 | 0 | 0 | 0.0 | 780.0 | 780.0 | 780.000000 | NaN | NaN | NaN | 40.50 | 40.50 | 450000.0 | 240994.2825 | 481988.565 | 245781.00 | 122890.500 | 245781.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1822.0 | -1822.0 | -1822.000000 | 0.000000 | 0.546180 | 0.273090 | -204219.0 | -31988.565 | -118103.7825 | 0.0 | 0 | 0.0 | 0 |
| 2 | 100003 | 1 | -606 | -606 | -606.000000 | 0 | 0 | 0.0 | 1216.0 | 1216.0 | 1216.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 810000.0 | 810000.0000 | 810000.000 | 0.00 | 0.000 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1822.0 | -1822.0 | -1822.000000 | 0.000000 | 0.000000 | 0.000000 | -810000.0 | -810000.000 | -810000.0000 | 0.0 | 0 | 0.0 | 0 |
| 3 | 100005 | 2 | -137 | -62 | -99.500000 | 0 | 0 | 0.0 | 122.0 | 1324.0 | 723.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 568800.0 | 299313.0000 | 598626.000 | 543087.00 | 284204.250 | 568408.50 | 0.0 | 0.0 | 0.0 | 4261.5 | 2130.75 | 4261.5 | NaN | NaN | NaN | NaN | NaN | NaN | -1461.0 | -184.0 | -822.500000 | 0.848974 | 0.954794 | 0.901884 | -25713.0 | -4504.500 | -15108.7500 | 0.0 | 0 | 0.0 | 0 |
| 4 | 100008 | 1 | -78 | -78 | -78.000000 | 0 | 0 | 0.0 | 471.0 | 471.0 | 471.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 267606.0 | 267606.0000 | 267606.000 | 240057.00 | 240057.000 | 240057.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -549.0 | -549.0 | -549.000000 | 0.897054 | 0.897054 | 0.897054 | -27549.0 | -27549.000 | -27549.0000 | 0.0 | 0 | 0.0 | 0 |
| 5 | 100009 | 4 | -1293 | -239 | -591.750000 | 0 | 0 | 0.0 | -209.0 | 1402.0 | 596.500000 | NaN | NaN | NaN | NaN | NaN | 642861.0 | 381890.2500 | 1527561.000 | 557959.50 | 269337.375 | 1077349.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1826.0 | -30.0 | -928.000000 | 0.000000 | 0.967787 | 0.592792 | -187200.0 | -10872.000 | -112552.8750 | 0.0 | 0 | 0.0 | 0 |
| 6 | 100010 | 1 | -1138 | -1138 | -1138.000000 | 0 | 0 | 0.0 | 689.0 | 689.0 | 689.000000 | NaN | NaN | NaN | NaN | NaN | 675000.0 | 675000.0000 | 675000.000 | 348007.50 | 348007.500 | 348007.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1827.0 | -1827.0 | -1827.000000 | 0.515567 | 0.515567 | 0.515567 | -326992.5 | -326992.500 | -326992.5000 | 0.0 | 0 | 0.0 | 0 |
| 7 | 100014 | 2 | -423 | -376 | -399.500000 | 0 | 0 | 0.0 | 704.0 | 723.0 | 713.500000 | NaN | NaN | NaN | 12752.28 | 6376.14 | 571500.0 | 502875.0000 | 1005750.000 | 420201.00 | 379107.000 | 758214.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1127.0 | -1099.0 | -1113.000000 | 0.735260 | 0.778383 | 0.756822 | -151299.0 | -96237.000 | -123768.0000 | 0.0 | 0 | 0.0 | 0 |
| 8 | 100016 | 4 | -262 | -128 | -168.250000 | 0 | 0 | 0.0 | 223.0 | 845.0 | 381.750000 | NaN | NaN | NaN | NaN | NaN | 91264.5 | 84189.3750 | 336757.500 | 63724.50 | 31862.250 | 63724.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1107.0 | -364.0 | -550.000000 | 0.000000 | 0.716034 | 0.358017 | -67500.0 | -25272.000 | -46386.0000 | 0.0 | 0 | 0.0 | 0 |
| 9 | 100019 | 2 | -495 | -495 | -495.000000 | 0 | 0 | 0.0 | 419.0 | 10463.0 | 5441.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 450000.0 | 360000.0000 | 720000.000 | 245470.14 | 122735.070 | 245470.14 | 0.0 | 0.0 | 0.0 | 27000.0 | 27000.00 | 54000.0 | NaN | NaN | NaN | NaN | NaN | NaN | -10958.0 | -914.0 | -5936.000000 | 0.000000 | 0.545489 | 0.272745 | -270000.0 | -204529.860 | -237264.9300 | 0.0 | 0 | 0.0 | 0 |
bureau_agg = bureau_day_amt_agg.merge(bureau_active_agg, on='SK_ID_CURR', how='left')
bureau_agg.head(20)
| SK_ID_CURR | BUREAU_SK_ID_BUREAU_COUNT | BUREAU_DAYS_CREDIT_MIN | BUREAU_DAYS_CREDIT_MAX | BUREAU_DAYS_CREDIT_MEAN | BUREAU_CREDIT_DAY_OVERDUE_MIN | BUREAU_CREDIT_DAY_OVERDUE_MAX | BUREAU_CREDIT_DAY_OVERDUE_MEAN | BUREAU_DAYS_CREDIT_ENDDATE_MIN | BUREAU_DAYS_CREDIT_ENDDATE_MAX | BUREAU_DAYS_CREDIT_ENDDATE_MEAN | BUREAU_DAYS_ENDDATE_FACT_MIN | BUREAU_DAYS_ENDDATE_FACT_MAX | BUREAU_DAYS_ENDDATE_FACT_MEAN | BUREAU_AMT_CREDIT_MAX_OVERDUE_MAX | BUREAU_AMT_CREDIT_MAX_OVERDUE_MEAN | BUREAU_AMT_CREDIT_SUM_MAX | BUREAU_AMT_CREDIT_SUM_MEAN | BUREAU_AMT_CREDIT_SUM_SUM | BUREAU_AMT_CREDIT_SUM_DEBT_MAX | BUREAU_AMT_CREDIT_SUM_DEBT_MEAN | BUREAU_AMT_CREDIT_SUM_DEBT_SUM | BUREAU_AMT_CREDIT_SUM_OVERDUE_MAX | BUREAU_AMT_CREDIT_SUM_OVERDUE_MEAN | BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM | BUREAU_AMT_ANNUITY_MAX | BUREAU_AMT_ANNUITY_MEAN | BUREAU_AMT_ANNUITY_SUM | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MIN | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MAX | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MEAN | BUREAU_BUREAU_CREDIT_FACT_DIFF_MIN | BUREAU_BUREAU_CREDIT_FACT_DIFF_MAX | BUREAU_BUREAU_CREDIT_FACT_DIFF_MEAN | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MIN | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MAX | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MEAN | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MIN | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MAX | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MEAN | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MIN | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MAX | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MEAN | BUREAU_BUREAU_IS_DPD_MEAN | BUREAU_BUREAU_IS_DPD_SUM | BUREAU_BUREAU_IS_DPD_OVER120_MEAN | BUREAU_BUREAU_IS_DPD_OVER120_SUM | BUREAU_ACT_SK_ID_BUREAU_COUNT | BUREAU_ACT_DAYS_CREDIT_MIN | BUREAU_ACT_DAYS_CREDIT_MAX | BUREAU_ACT_DAYS_CREDIT_MEAN | BUREAU_ACT_CREDIT_DAY_OVERDUE_MIN | BUREAU_ACT_CREDIT_DAY_OVERDUE_MAX | BUREAU_ACT_CREDIT_DAY_OVERDUE_MEAN | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MIN | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MAX | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MEAN | BUREAU_ACT_DAYS_ENDDATE_FACT_MIN | BUREAU_ACT_DAYS_ENDDATE_FACT_MAX | BUREAU_ACT_DAYS_ENDDATE_FACT_MEAN | BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MAX | BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_MAX | BUREAU_ACT_AMT_CREDIT_SUM_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_SUM | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MAX | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_SUM | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MAX | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_SUM | BUREAU_ACT_AMT_ANNUITY_MAX | BUREAU_ACT_AMT_ANNUITY_MEAN | BUREAU_ACT_AMT_ANNUITY_SUM | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MIN | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MAX | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MIN | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MAX | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MEAN | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MEAN | BUREAU_ACT_BUREAU_IS_DPD_MEAN | BUREAU_ACT_BUREAU_IS_DPD_SUM | BUREAU_ACT_BUREAU_IS_DPD_OVER120_MEAN | BUREAU_ACT_BUREAU_IS_DPD_OVER120_SUM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -1572 | -49 | -735.000000 | 0 | 0 | 0.0 | -1329.0 | 1778.0 | 82.428571 | -1328.0 | -544.0 | -825.500000 | NaN | NaN | 378000.0 | 2.076236e+05 | 1453365.000 | 373239.00 | 8.524093e+04 | 596686.50 | 0.0 | 0.0 | 0.0 | 10822.5 | 3545.357143 | 24817.5 | -1.0 | 698.0 | 197.000000 | -335.0 | -32.0 | -228.750000 | -1827.0 | -243.0 | -817.428571 | 0.000000 | 0.987405 | 0.282518 | -279720.00 | -4761.00 | -122382.642857 | 0.0 | 0 | 0.0 | 0 | 3.0 | -559.0 | -49.0 | -309.333333 | 0.0 | 0.0 | 0.0 | 411.0 | 1778.0 | 1030.333333 | NaN | NaN | NaN | NaN | NaN | 378000.0 | 2.946750e+05 | 884025.000 | 373239.00 | 198895.500 | 596686.50 | 0.0 | 0.0 | 0.0 | 10822.5 | 8272.50 | 24817.5 | NaN | NaN | NaN | NaN | NaN | NaN | -1827.0 | -731.0 | -1339.666667 | 0.335128 | 0.987405 | 0.659208 | -224514.0 | -4761.000 | -95779.5000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 100002 | 8 | -1437 | -103 | -874.000000 | 0 | 0 | 0.0 | -1072.0 | 780.0 | -349.000000 | -1185.0 | -36.0 | -697.500000 | 5043.645 | 1681.0290 | 450000.0 | 1.081319e+05 | 865055.565 | 245781.00 | 4.915620e+04 | 245781.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1029.0 | 252.600000 | -609.0 | -76.0 | -277.000000 | -1822.0 | -87.0 | -719.833333 | 0.000000 | 0.546180 | 0.136545 | -204219.00 | 0.00 | -98388.513000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -1042.0 | -103.0 | -572.500000 | 0.0 | 0.0 | 0.0 | 780.0 | 780.0 | 780.000000 | NaN | NaN | NaN | 40.50 | 40.50 | 450000.0 | 2.409943e+05 | 481988.565 | 245781.00 | 122890.500 | 245781.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1822.0 | -1822.0 | -1822.000000 | 0.000000 | 0.546180 | 0.273090 | -204219.0 | -31988.565 | -118103.7825 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100003 | 4 | -2586 | -606 | -1400.750000 | 0 | 0 | 0.0 | -2434.0 | 1216.0 | -544.500000 | -2131.0 | -540.0 | -1097.333333 | 0.000 | 0.0000 | 810000.0 | 2.543501e+05 | 1017400.500 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -303.0 | 201.0 | -34.000000 | -1096.0 | -154.0 | -568.333333 | -1822.0 | -152.0 | -856.250000 | 0.000000 | 0.000000 | 0.000000 | -810000.00 | -22248.00 | -254350.125000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -606.0 | -606.0 | -606.000000 | 0.0 | 0.0 | 0.0 | 1216.0 | 1216.0 | 1216.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 810000.0 | 8.100000e+05 | 810000.000 | 0.00 | 0.000 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1822.0 | -1822.0 | -1822.000000 | 0.000000 | 0.000000 | 0.000000 | -810000.0 | -810000.000 | -810000.0000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100004 | 2 | -1326 | -408 | -867.000000 | 0 | 0 | 0.0 | -595.0 | -382.0 | -488.500000 | -683.0 | -382.0 | -532.500000 | 0.000 | 0.0000 | 94537.8 | 9.451890e+04 | 189037.800 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 88.0 | 44.000000 | -643.0 | -26.0 | -334.500000 | -731.0 | -26.0 | -378.500000 | 0.000000 | 0.000000 | 0.000000 | -94537.80 | -94500.00 | -94518.900000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100005 | 3 | -373 | -62 | -190.666667 | 0 | 0 | 0.0 | -128.0 | 1324.0 | 439.333333 | -123.0 | -123.0 | -123.000000 | 0.000 | 0.0000 | 568800.0 | 2.190420e+05 | 657126.000 | 543087.00 | 1.894695e+05 | 568408.50 | 0.0 | 0.0 | 0.0 | 4261.5 | 1420.500000 | 4261.5 | -5.0 | -5.0 | -5.000000 | -250.0 | -250.0 | -250.000000 | -1461.0 | -184.0 | -630.000000 | 0.000000 | 0.954794 | 0.601256 | -58500.00 | -4504.50 | -29572.500000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -137.0 | -62.0 | -99.500000 | 0.0 | 0.0 | 0.0 | 122.0 | 1324.0 | 723.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 568800.0 | 2.993130e+05 | 598626.000 | 543087.00 | 284204.250 | 568408.50 | 0.0 | 0.0 | 0.0 | 4261.5 | 2130.75 | 4261.5 | NaN | NaN | NaN | NaN | NaN | NaN | -1461.0 | -184.0 | -822.500000 | 0.848974 | 0.954794 | 0.901884 | -25713.0 | -4504.500 | -15108.7500 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 100007 | 1 | -1149 | -1149 | -1149.000000 | 0 | 0 | 0.0 | -783.0 | -783.0 | -783.000000 | -783.0 | -783.0 | -783.000000 | 0.000 | 0.0000 | 146250.0 | 1.462500e+05 | 146250.000 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.000000 | -366.0 | -366.0 | -366.000000 | -366.0 | -366.0 | -366.000000 | 0.000000 | 0.000000 | 0.000000 | -146250.00 | -146250.00 | -146250.000000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | 100008 | 3 | -1097 | -78 | -757.333333 | 0 | 0 | 0.0 | -853.0 | 471.0 | -391.333333 | -1028.0 | -790.0 | -909.000000 | 0.000 | 0.0000 | 267606.0 | 1.561485e+05 | 468445.500 | 240057.00 | 8.001900e+04 | 240057.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -2.0 | 175.0 | 86.500000 | -307.0 | -69.0 | -188.000000 | -549.0 | -244.0 | -366.000000 | 0.000000 | 0.897054 | 0.299018 | -105705.00 | -27549.00 | -76129.500000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -78.0 | -78.0 | -78.000000 | 0.0 | 0.0 | 0.0 | 471.0 | 471.0 | 471.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 267606.0 | 2.676060e+05 | 267606.000 | 240057.00 | 240057.000 | 240057.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -549.0 | -549.0 | -549.000000 | 0.897054 | 0.897054 | 0.897054 | -27549.0 | -27549.000 | -27549.0000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 7 | 100009 | 18 | -2882 | -239 | -1271.500000 | 0 | 0 | 0.0 | -2152.0 | 1402.0 | -794.937500 | -2152.0 | -313.0 | -1108.500000 | 0.000 | 0.0000 | 1777500.0 | 2.667118e+05 | 4800811.500 | 557959.50 | 7.695354e+04 | 1077349.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -713.0 | 1459.0 | 114.785714 | -893.0 | -92.0 | -357.214286 | -1826.0 | -30.0 | -529.000000 | 0.000000 | 0.967787 | 0.169369 | -1777500.00 | -10872.00 | -227088.000000 | 0.0 | 0 | 0.0 | 0 | 4.0 | -1293.0 | -239.0 | -591.750000 | 0.0 | 0.0 | 0.0 | -209.0 | 1402.0 | 596.500000 | NaN | NaN | NaN | NaN | NaN | 642861.0 | 3.818902e+05 | 1527561.000 | 557959.50 | 269337.375 | 1077349.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1826.0 | -30.0 | -928.000000 | 0.000000 | 0.967787 | 0.592792 | -187200.0 | -10872.000 | -112552.8750 | 0.0 | 0.0 | 0.0 | 0.0 |
| 8 | 100010 | 2 | -2741 | -1138 | -1939.500000 | 0 | 0 | 0.0 | -928.0 | 689.0 | -119.500000 | -1138.0 | -1138.0 | -1138.000000 | NaN | NaN | 675000.0 | 4.950000e+05 | 990000.000 | 348007.50 | 1.740038e+05 | 348007.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 210.0 | 210.0 | 210.000000 | -1603.0 | -1603.0 | -1603.000000 | -1827.0 | -1813.0 | -1820.000000 | 0.000000 | 0.515567 | 0.257783 | -326992.50 | -315000.00 | -320996.250000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -1138.0 | -1138.0 | -1138.000000 | 0.0 | 0.0 | 0.0 | 689.0 | 689.0 | 689.000000 | NaN | NaN | NaN | NaN | NaN | 675000.0 | 6.750000e+05 | 675000.000 | 348007.50 | 348007.500 | 348007.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1827.0 | -1827.0 | -1827.000000 | 0.515567 | 0.515567 | 0.515567 | -326992.5 | -326992.500 | -326992.5000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 9 | 100011 | 4 | -2508 | -1309 | -1773.000000 | 0 | 0 | 0.0 | -2173.0 | -860.0 | -1293.250000 | -2197.0 | -968.0 | -1463.250000 | 10147.230 | 5073.6150 | 145242.0 | 1.088071e+05 | 435228.300 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -102.0 | 758.0 | 170.000000 | -347.0 | -239.0 | -309.750000 | -1100.0 | -239.0 | -479.750000 | 0.000000 | 0.000000 | 0.000000 | -145242.00 | -54000.00 | -96662.100000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 10 | 100013 | 4 | -2070 | -1210 | -1737.500000 | 0 | 0 | 0.0 | -1707.0 | -567.0 | -1068.000000 | -1334.0 | -549.0 | -1054.750000 | 19305.000 | 19305.0000 | 1262250.0 | 5.180700e+05 | 2072280.060 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | -489.0 | 425.0 | -13.250000 | -1116.0 | -92.0 | -682.750000 | -1098.0 | -121.0 | -669.500000 | 0.000000 | 0.000000 | 0.000000 | -26490.06 | -26490.06 | -26490.060000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 11 | 100014 | 8 | -2308 | -376 | -1095.375000 | 0 | 0 | 0.0 | -2004.0 | 723.0 | -387.375000 | -1992.0 | -346.0 | -821.333333 | 12752.280 | 3726.3525 | 900000.0 | 3.412416e+05 | 2729932.425 | 420201.00 | 1.516428e+05 | 758214.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -29.0 | 469.0 | 67.000000 | -1357.0 | -89.0 | -506.000000 | -1826.0 | -60.0 | -708.000000 | 0.000000 | 0.778383 | 0.302729 | -900000.00 | -96237.00 | -343778.985000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -423.0 | -376.0 | -399.500000 | 0.0 | 0.0 | 0.0 | 704.0 | 723.0 | 713.500000 | NaN | NaN | NaN | 12752.28 | 6376.14 | 571500.0 | 5.028750e+05 | 1005750.000 | 420201.00 | 379107.000 | 758214.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1127.0 | -1099.0 | -1113.000000 | 0.735260 | 0.778383 | 0.756822 | -151299.0 | -96237.000 | -123768.0000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 12 | 100015 | 4 | -1409 | -319 | -947.750000 | 0 | 0 | 0.0 | -1045.0 | -16.0 | -598.250000 | -907.0 | -8.0 | -555.500000 | NaN | NaN | 131103.0 | 1.023739e+05 | 409495.500 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -138.0 | 0.0 | -42.750000 | -502.0 | -311.0 | -392.250000 | -366.0 | -303.0 | -349.500000 | 0.000000 | 0.000000 | 0.000000 | -94320.00 | -94320.00 | -94320.000000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 13 | 100016 | 7 | -1634 | -128 | -618.428571 | 0 | 0 | 0.0 | -1369.0 | 845.0 | -217.142857 | -1369.0 | -347.0 | -929.666667 | 0.000 | 0.0000 | 91264.5 | 6.785486e+04 | 474984.000 | 63724.50 | 1.274490e+04 | 63724.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -258.0 | 0.0 | -86.000000 | -561.0 | -125.0 | -289.000000 | -1107.0 | -125.0 | -401.285714 | 0.000000 | 0.716034 | 0.143207 | -67500.00 | -22086.00 | -46199.700000 | 0.0 | 0 | 0.0 | 0 | 4.0 | -262.0 | -128.0 | -168.250000 | 0.0 | 0.0 | 0.0 | 223.0 | 845.0 | 381.750000 | NaN | NaN | NaN | NaN | NaN | 91264.5 | 8.418938e+04 | 336757.500 | 63724.50 | 31862.250 | 63724.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1107.0 | -364.0 | -550.000000 | 0.000000 | 0.716034 | 0.358017 | -67500.0 | -25272.000 | -46386.0000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 14 | 100017 | 6 | -2881 | -909 | -1944.333333 | 0 | 0 | 0.0 | -2546.0 | 197.0 | -1512.333333 | -2575.0 | -738.0 | -1677.833333 | 0.000 | 0.0000 | 315000.0 | 1.432950e+05 | 859770.000 | 0.00 | 0.000000e+00 | 0.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 935.0 | 165.500000 | -366.0 | -139.0 | -266.500000 | -1106.0 | -139.0 | -432.000000 | 0.000000 | 0.000000 | 0.000000 | -225000.00 | -63990.00 | -132073.500000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 15 | 100019 | 2 | -495 | -495 | -495.000000 | 0 | 0 | 0.0 | 419.0 | 10463.0 | 5441.000000 | NaN | NaN | NaN | 0.000 | 0.0000 | 450000.0 | 3.600000e+05 | 720000.000 | 245470.14 | 1.227351e+05 | 245470.14 | 0.0 | 0.0 | 0.0 | 27000.0 | 27000.000000 | 54000.0 | NaN | NaN | NaN | NaN | NaN | NaN | -10958.0 | -914.0 | -5936.000000 | 0.000000 | 0.545489 | 0.272745 | -270000.00 | -204529.86 | -237264.930000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -495.0 | -495.0 | -495.000000 | 0.0 | 0.0 | 0.0 | 419.0 | 10463.0 | 5441.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 450000.0 | 3.600000e+05 | 720000.000 | 245470.14 | 122735.070 | 245470.14 | 0.0 | 0.0 | 0.0 | 27000.0 | 27000.00 | 54000.0 | NaN | NaN | NaN | NaN | NaN | NaN | -10958.0 | -914.0 | -5936.000000 | 0.000000 | 0.545489 | 0.272745 | -270000.0 | -204529.860 | -237264.9300 | 0.0 | 0.0 | 0.0 | 0.0 |
| 16 | 100020 | 4 | -492 | -90 | -261.500000 | 0 | 0 | 0.0 | -311.0 | 277.0 | -32.500000 | -311.0 | -135.0 | -223.000000 | 0.000 | 0.0000 | 135000.0 | 4.987125e+04 | 199485.000 | 117243.00 | 3.077212e+04 | 123088.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 31.0 | 15.500000 | -181.0 | -153.0 | -167.000000 | -367.0 | -181.0 | -229.000000 | 0.000000 | 0.868467 | 0.307325 | -29160.00 | -10354.50 | -19099.125000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -176.0 | -90.0 | -133.000000 | 0.0 | 0.0 | 0.0 | 8.0 | 277.0 | 142.500000 | NaN | NaN | NaN | 0.00 | 0.00 | 135000.0 | 7.560000e+04 | 151200.000 | 117243.00 | 61544.250 | 123088.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -367.0 | -184.0 | -275.500000 | 0.360833 | 0.868467 | 0.614650 | -17757.0 | -10354.500 | -14055.7500 | 0.0 | 0.0 | 0.0 | 0.0 |
| 17 | 100022 | 2 | -385 | -289 | -337.000000 | 0 | 0 | 0.0 | 441.0 | 1439.0 | 940.000000 | NaN | NaN | NaN | 0.000 | 0.0000 | 765000.0 | 5.287500e+05 | 1057500.000 | 205276.50 | 2.052765e+05 | 205276.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1824.0 | -730.0 | -1277.000000 | 0.701800 | 0.701800 | 0.701800 | -87223.50 | -87223.50 | -87223.500000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -385.0 | -289.0 | -337.000000 | 0.0 | 0.0 | 0.0 | 441.0 | 1439.0 | 940.000000 | NaN | NaN | NaN | 0.00 | 0.00 | 765000.0 | 5.287500e+05 | 1057500.000 | 205276.50 | 205276.500 | 205276.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1824.0 | -730.0 | -1277.000000 | 0.701800 | 0.701800 | 0.701800 | -87223.5 | -87223.500 | -87223.5000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 18 | 100023 | 13 | -2624 | -157 | -1164.384615 | 0 | 0 | 0.0 | -2228.0 | 1669.0 | -364.916667 | -2335.0 | -138.0 | -997.300000 | 5980.050 | 2720.8500 | 444078.0 | 1.265917e+05 | 1645692.345 | 70443.00 | 1.370385e+04 | 137038.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -16.0 | 1456.0 | 287.111111 | -653.0 | -35.0 | -296.900000 | -1827.0 | -184.0 | -833.750000 | 0.000000 | 0.930700 | 0.188812 | -444078.00 | -3118.50 | -101004.934500 | 0.0 | 0 | 0.0 | 0 | 4.0 | -2370.0 | -157.0 | -1141.250000 | 0.0 | 0.0 | 0.0 | -1250.0 | 1669.0 | 313.000000 | -2335.0 | -2335.0 | -2335.0 | 0.00 | 0.00 | 444078.0 | 1.644289e+05 | 657715.500 | 70443.00 | 28081.125 | 112324.50 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 1085.0 | 1085.0 | 1085.0 | -35.0 | -35.0 | -35.0 | -1827.0 | -1044.0 | -1454.250000 | 0.000000 | 0.930700 | 0.375114 | -444078.0 | -3118.500 | -136347.7500 | 0.0 | 0.0 | 0.0 | 0.0 |
| 19 | 100025 | 1 | -700 | -700 | -700.000000 | 0 | 0 | 0.0 | 1124.0 | 1124.0 | 1124.000000 | NaN | NaN | NaN | NaN | NaN | 2452500.0 | 2.452500e+06 | 2452500.000 | 1886544.00 | 1.886544e+06 | 1886544.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1824.0 | -1824.0 | -1824.000000 | 0.769233 | 0.769233 | 0.769233 | -565956.00 | -565956.00 | -565956.000000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -700.0 | -700.0 | -700.000000 | 0.0 | 0.0 | 0.0 | 1124.0 | 1124.0 | 1124.000000 | NaN | NaN | NaN | NaN | NaN | 2452500.0 | 2.452500e+06 | 2452500.000 | 1886544.00 | 1886544.000 | 1886544.00 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1824.0 | -1824.0 | -1824.000000 | 0.769233 | 0.769233 | 0.769233 | -565956.0 | -565956.000 | -565956.0000 | 0.0 | 0.0 | 0.0 | 0.0 |
bureau_agg.shape
(305811, 93)
select_columns = ['SK_ID_CURR', 'BUREAU_SK_ID_BUREAU_COUNT', 'BUREAU_BUREAU_IS_DPD_SUM', 'BUREAU_BUREAU_IS_DPD_OVER120_SUM',
'BUREAU_ACT_BUREAU_IS_DPD_SUM', 'BUREAU_ACT_BUREAU_IS_DPD_OVER120_SUM']
bureau_agg[select_columns].head(20)
bureau_agg['BUREAU_BUREAU_IS_DPD_SUM'].value_counts()
0 301947
1 3615
2 186
3 39
4 13
5 6
6 4
7 1
Name: BUREAU_BUREAU_IS_DPD_SUM, dtype: int64
bureau_agg.columns.tolist()
['SK_ID_CURR',
'BUREAU_SK_ID_BUREAU_COUNT',
'BUREAU_DAYS_CREDIT_MIN',
'BUREAU_DAYS_CREDIT_MAX',
'BUREAU_DAYS_CREDIT_MEAN',
'BUREAU_CREDIT_DAY_OVERDUE_MIN',
'BUREAU_CREDIT_DAY_OVERDUE_MAX',
'BUREAU_CREDIT_DAY_OVERDUE_MEAN',
'BUREAU_DAYS_CREDIT_ENDDATE_MIN',
'BUREAU_DAYS_CREDIT_ENDDATE_MAX',
'BUREAU_DAYS_CREDIT_ENDDATE_MEAN',
'BUREAU_DAYS_ENDDATE_FACT_MIN',
'BUREAU_DAYS_ENDDATE_FACT_MAX',
'BUREAU_DAYS_ENDDATE_FACT_MEAN',
'BUREAU_AMT_CREDIT_MAX_OVERDUE_MAX',
'BUREAU_AMT_CREDIT_MAX_OVERDUE_MEAN',
'BUREAU_AMT_CREDIT_SUM_MAX',
'BUREAU_AMT_CREDIT_SUM_MEAN',
'BUREAU_AMT_CREDIT_SUM_SUM',
'BUREAU_AMT_CREDIT_SUM_DEBT_MAX',
'BUREAU_AMT_CREDIT_SUM_DEBT_MEAN',
'BUREAU_AMT_CREDIT_SUM_DEBT_SUM',
'BUREAU_AMT_CREDIT_SUM_OVERDUE_MAX',
'BUREAU_AMT_CREDIT_SUM_OVERDUE_MEAN',
'BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM',
'BUREAU_AMT_ANNUITY_MAX',
'BUREAU_AMT_ANNUITY_MEAN',
'BUREAU_AMT_ANNUITY_SUM',
'BUREAU_BUREAU_ENDDATE_FACT_DIFF_MIN',
'BUREAU_BUREAU_ENDDATE_FACT_DIFF_MAX',
'BUREAU_BUREAU_ENDDATE_FACT_DIFF_MEAN',
'BUREAU_BUREAU_CREDIT_FACT_DIFF_MIN',
'BUREAU_BUREAU_CREDIT_FACT_DIFF_MAX',
'BUREAU_BUREAU_CREDIT_FACT_DIFF_MEAN',
'BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MIN',
'BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MAX',
'BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MEAN',
'BUREAU_BUREAU_CREDIT_DEBT_RATIO_MIN',
'BUREAU_BUREAU_CREDIT_DEBT_RATIO_MAX',
'BUREAU_BUREAU_CREDIT_DEBT_RATIO_MEAN',
'BUREAU_BUREAU_CREDIT_DEBT_DIFF_MIN',
'BUREAU_BUREAU_CREDIT_DEBT_DIFF_MAX',
'BUREAU_BUREAU_CREDIT_DEBT_DIFF_MEAN',
'BUREAU_BUREAU_IS_DPD_MEAN',
'BUREAU_BUREAU_IS_DPD_SUM',
'BUREAU_BUREAU_IS_DPD_OVER120_MEAN',
'BUREAU_BUREAU_IS_DPD_OVER120_SUM',
'BUREAU_ACT_SK_ID_BUREAU_COUNT',
'BUREAU_ACT_DAYS_CREDIT_MIN',
'BUREAU_ACT_DAYS_CREDIT_MAX',
'BUREAU_ACT_DAYS_CREDIT_MEAN',
'BUREAU_ACT_CREDIT_DAY_OVERDUE_MIN',
'BUREAU_ACT_CREDIT_DAY_OVERDUE_MAX',
'BUREAU_ACT_CREDIT_DAY_OVERDUE_MEAN',
'BUREAU_ACT_DAYS_CREDIT_ENDDATE_MIN',
'BUREAU_ACT_DAYS_CREDIT_ENDDATE_MAX',
'BUREAU_ACT_DAYS_CREDIT_ENDDATE_MEAN',
'BUREAU_ACT_DAYS_ENDDATE_FACT_MIN',
'BUREAU_ACT_DAYS_ENDDATE_FACT_MAX',
'BUREAU_ACT_DAYS_ENDDATE_FACT_MEAN',
'BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MAX',
'BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MEAN',
'BUREAU_ACT_AMT_CREDIT_SUM_MAX',
'BUREAU_ACT_AMT_CREDIT_SUM_MEAN',
'BUREAU_ACT_AMT_CREDIT_SUM_SUM',
'BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MAX',
'BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MEAN',
'BUREAU_ACT_AMT_CREDIT_SUM_DEBT_SUM',
'BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MAX',
'BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MEAN',
'BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_SUM',
'BUREAU_ACT_AMT_ANNUITY_MAX',
'BUREAU_ACT_AMT_ANNUITY_MEAN',
'BUREAU_ACT_AMT_ANNUITY_SUM',
'BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MIN',
'BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MAX',
'BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MEAN',
'BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MIN',
'BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MAX',
'BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MEAN',
'BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MIN',
'BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MAX',
'BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MEAN',
'BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MIN',
'BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MAX',
'BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MEAN',
'BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MIN',
'BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MAX',
'BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MEAN',
'BUREAU_ACT_BUREAU_IS_DPD_MEAN',
'BUREAU_ACT_BUREAU_IS_DPD_SUM',
'BUREAU_ACT_BUREAU_IS_DPD_OVER120_MEAN',
'BUREAU_ACT_BUREAU_IS_DPD_OVER120_SUM']
# SK_ID_CURR 레벨로 DPD 비율, DPD > 120 비율 확률, Active 상태에서 DPD 비율, DPD > 120 비율 확률
bureau_agg['BUREAU_IS_DPD_RATIO'] = bureau_agg['BUREAU_BUREAU_IS_DPD_SUM']/bureau_agg['BUREAU_SK_ID_BUREAU_COUNT']
bureau_agg['BUREAU_IS_DPD_OVER120_RATIO'] = bureau_agg['BUREAU_BUREAU_IS_DPD_OVER120_SUM']/bureau_agg['BUREAU_SK_ID_BUREAU_COUNT']
bureau_agg['BUREAU_ACT_IS_DPD_RATIO'] = bureau_agg['BUREAU_ACT_BUREAU_IS_DPD_SUM']/bureau_agg['BUREAU_SK_ID_BUREAU_COUNT']
bureau_agg['BUREAU_ACT_IS_DPD_OVER120_RATIO'] = bureau_agg['BUREAU_ACT_BUREAU_IS_DPD_OVER120_SUM']/bureau_agg['BUREAU_SK_ID_BUREAU_COUNT']
bureau_agg.head(10)
| SK_ID_CURR | BUREAU_SK_ID_BUREAU_COUNT | BUREAU_DAYS_CREDIT_MIN | BUREAU_DAYS_CREDIT_MAX | BUREAU_DAYS_CREDIT_MEAN | BUREAU_CREDIT_DAY_OVERDUE_MIN | BUREAU_CREDIT_DAY_OVERDUE_MAX | BUREAU_CREDIT_DAY_OVERDUE_MEAN | BUREAU_DAYS_CREDIT_ENDDATE_MIN | BUREAU_DAYS_CREDIT_ENDDATE_MAX | BUREAU_DAYS_CREDIT_ENDDATE_MEAN | BUREAU_DAYS_ENDDATE_FACT_MIN | BUREAU_DAYS_ENDDATE_FACT_MAX | BUREAU_DAYS_ENDDATE_FACT_MEAN | BUREAU_AMT_CREDIT_MAX_OVERDUE_MAX | BUREAU_AMT_CREDIT_MAX_OVERDUE_MEAN | BUREAU_AMT_CREDIT_SUM_MAX | BUREAU_AMT_CREDIT_SUM_MEAN | BUREAU_AMT_CREDIT_SUM_SUM | BUREAU_AMT_CREDIT_SUM_DEBT_MAX | BUREAU_AMT_CREDIT_SUM_DEBT_MEAN | BUREAU_AMT_CREDIT_SUM_DEBT_SUM | BUREAU_AMT_CREDIT_SUM_OVERDUE_MAX | BUREAU_AMT_CREDIT_SUM_OVERDUE_MEAN | BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM | BUREAU_AMT_ANNUITY_MAX | BUREAU_AMT_ANNUITY_MEAN | BUREAU_AMT_ANNUITY_SUM | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MIN | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MAX | BUREAU_BUREAU_ENDDATE_FACT_DIFF_MEAN | BUREAU_BUREAU_CREDIT_FACT_DIFF_MIN | BUREAU_BUREAU_CREDIT_FACT_DIFF_MAX | BUREAU_BUREAU_CREDIT_FACT_DIFF_MEAN | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MIN | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MAX | BUREAU_BUREAU_CREDIT_ENDDATE_DIFF_MEAN | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MIN | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MAX | BUREAU_BUREAU_CREDIT_DEBT_RATIO_MEAN | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MIN | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MAX | BUREAU_BUREAU_CREDIT_DEBT_DIFF_MEAN | BUREAU_BUREAU_IS_DPD_MEAN | BUREAU_BUREAU_IS_DPD_SUM | BUREAU_BUREAU_IS_DPD_OVER120_MEAN | BUREAU_BUREAU_IS_DPD_OVER120_SUM | BUREAU_ACT_SK_ID_BUREAU_COUNT | BUREAU_ACT_DAYS_CREDIT_MIN | BUREAU_ACT_DAYS_CREDIT_MAX | BUREAU_ACT_DAYS_CREDIT_MEAN | BUREAU_ACT_CREDIT_DAY_OVERDUE_MIN | BUREAU_ACT_CREDIT_DAY_OVERDUE_MAX | BUREAU_ACT_CREDIT_DAY_OVERDUE_MEAN | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MIN | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MAX | BUREAU_ACT_DAYS_CREDIT_ENDDATE_MEAN | BUREAU_ACT_DAYS_ENDDATE_FACT_MIN | BUREAU_ACT_DAYS_ENDDATE_FACT_MAX | BUREAU_ACT_DAYS_ENDDATE_FACT_MEAN | BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MAX | BUREAU_ACT_AMT_CREDIT_MAX_OVERDUE_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_MAX | BUREAU_ACT_AMT_CREDIT_SUM_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_SUM | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MAX | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_DEBT_SUM | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MAX | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_MEAN | BUREAU_ACT_AMT_CREDIT_SUM_OVERDUE_SUM | BUREAU_ACT_AMT_ANNUITY_MAX | BUREAU_ACT_AMT_ANNUITY_MEAN | BUREAU_ACT_AMT_ANNUITY_SUM | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MIN | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MAX | BUREAU_ACT_BUREAU_ENDDATE_FACT_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_FACT_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_ENDDATE_DIFF_MEAN | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MIN | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MAX | BUREAU_ACT_BUREAU_CREDIT_DEBT_RATIO_MEAN | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MIN | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MAX | BUREAU_ACT_BUREAU_CREDIT_DEBT_DIFF_MEAN | BUREAU_ACT_BUREAU_IS_DPD_MEAN | BUREAU_ACT_BUREAU_IS_DPD_SUM | BUREAU_ACT_BUREAU_IS_DPD_OVER120_MEAN | BUREAU_ACT_BUREAU_IS_DPD_OVER120_SUM | BUREAU_IS_DPD_RATIO | BUREAU_IS_DPD_OVER120_RATIO | BUREAU_ACT_IS_DPD_RATIO | BUREAU_ACT_IS_DPD_OVER120_RATIO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001 | 7 | -1572 | -49 | -735.000000 | 0 | 0 | 0.0 | -1329.0 | 1778.0 | 82.428571 | -1328.0 | -544.0 | -825.500000 | NaN | NaN | 378000.0 | 207623.571429 | 1453365.000 | 373239.0 | 85240.928571 | 596686.5 | 0.0 | 0.0 | 0.0 | 10822.5 | 3545.357143 | 24817.5 | -1.0 | 698.0 | 197.000000 | -335.0 | -32.0 | -228.750000 | -1827.0 | -243.0 | -817.428571 | 0.0 | 0.987405 | 0.282518 | -279720.0 | -4761.0 | -122382.642857 | 0.0 | 0 | 0.0 | 0 | 3.0 | -559.0 | -49.0 | -309.333333 | 0.0 | 0.0 | 0.0 | 411.0 | 1778.0 | 1030.333333 | NaN | NaN | NaN | NaN | NaN | 378000.0 | 294675.0000 | 884025.000 | 373239.0 | 198895.500 | 596686.5 | 0.0 | 0.0 | 0.0 | 10822.5 | 8272.50 | 24817.5 | NaN | NaN | NaN | NaN | NaN | NaN | -1827.0 | -731.0 | -1339.666667 | 0.335128 | 0.987405 | 0.659208 | -224514.0 | -4761.000 | -95779.5000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 100002 | 8 | -1437 | -103 | -874.000000 | 0 | 0 | 0.0 | -1072.0 | 780.0 | -349.000000 | -1185.0 | -36.0 | -697.500000 | 5043.645 | 1681.029 | 450000.0 | 108131.945625 | 865055.565 | 245781.0 | 49156.200000 | 245781.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1029.0 | 252.600000 | -609.0 | -76.0 | -277.000000 | -1822.0 | -87.0 | -719.833333 | 0.0 | 0.546180 | 0.136545 | -204219.0 | 0.0 | -98388.513000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -1042.0 | -103.0 | -572.500000 | 0.0 | 0.0 | 0.0 | 780.0 | 780.0 | 780.000000 | NaN | NaN | NaN | 40.5 | 40.5 | 450000.0 | 240994.2825 | 481988.565 | 245781.0 | 122890.500 | 245781.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1822.0 | -1822.0 | -1822.000000 | 0.000000 | 0.546180 | 0.273090 | -204219.0 | -31988.565 | -118103.7825 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100003 | 4 | -2586 | -606 | -1400.750000 | 0 | 0 | 0.0 | -2434.0 | 1216.0 | -544.500000 | -2131.0 | -540.0 | -1097.333333 | 0.000 | 0.000 | 810000.0 | 254350.125000 | 1017400.500 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -303.0 | 201.0 | -34.000000 | -1096.0 | -154.0 | -568.333333 | -1822.0 | -152.0 | -856.250000 | 0.0 | 0.000000 | 0.000000 | -810000.0 | -22248.0 | -254350.125000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -606.0 | -606.0 | -606.000000 | 0.0 | 0.0 | 0.0 | 1216.0 | 1216.0 | 1216.000000 | NaN | NaN | NaN | 0.0 | 0.0 | 810000.0 | 810000.0000 | 810000.000 | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1822.0 | -1822.0 | -1822.000000 | 0.000000 | 0.000000 | 0.000000 | -810000.0 | -810000.000 | -810000.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100004 | 2 | -1326 | -408 | -867.000000 | 0 | 0 | 0.0 | -595.0 | -382.0 | -488.500000 | -683.0 | -382.0 | -532.500000 | 0.000 | 0.000 | 94537.8 | 94518.900000 | 189037.800 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 88.0 | 44.000000 | -643.0 | -26.0 | -334.500000 | -731.0 | -26.0 | -378.500000 | 0.0 | 0.000000 | 0.000000 | -94537.8 | -94500.0 | -94518.900000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN |
| 4 | 100005 | 3 | -373 | -62 | -190.666667 | 0 | 0 | 0.0 | -128.0 | 1324.0 | 439.333333 | -123.0 | -123.0 | -123.000000 | 0.000 | 0.000 | 568800.0 | 219042.000000 | 657126.000 | 543087.0 | 189469.500000 | 568408.5 | 0.0 | 0.0 | 0.0 | 4261.5 | 1420.500000 | 4261.5 | -5.0 | -5.0 | -5.000000 | -250.0 | -250.0 | -250.000000 | -1461.0 | -184.0 | -630.000000 | 0.0 | 0.954794 | 0.601256 | -58500.0 | -4504.5 | -29572.500000 | 0.0 | 0 | 0.0 | 0 | 2.0 | -137.0 | -62.0 | -99.500000 | 0.0 | 0.0 | 0.0 | 122.0 | 1324.0 | 723.000000 | NaN | NaN | NaN | 0.0 | 0.0 | 568800.0 | 299313.0000 | 598626.000 | 543087.0 | 284204.250 | 568408.5 | 0.0 | 0.0 | 0.0 | 4261.5 | 2130.75 | 4261.5 | NaN | NaN | NaN | NaN | NaN | NaN | -1461.0 | -184.0 | -822.500000 | 0.848974 | 0.954794 | 0.901884 | -25713.0 | -4504.500 | -15108.7500 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 100007 | 1 | -1149 | -1149 | -1149.000000 | 0 | 0 | 0.0 | -783.0 | -783.0 | -783.000000 | -783.0 | -783.0 | -783.000000 | 0.000 | 0.000 | 146250.0 | 146250.000000 | 146250.000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.000000 | -366.0 | -366.0 | -366.000000 | -366.0 | -366.0 | -366.000000 | 0.0 | 0.000000 | 0.000000 | -146250.0 | -146250.0 | -146250.000000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN |
| 6 | 100008 | 3 | -1097 | -78 | -757.333333 | 0 | 0 | 0.0 | -853.0 | 471.0 | -391.333333 | -1028.0 | -790.0 | -909.000000 | 0.000 | 0.000 | 267606.0 | 156148.500000 | 468445.500 | 240057.0 | 80019.000000 | 240057.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -2.0 | 175.0 | 86.500000 | -307.0 | -69.0 | -188.000000 | -549.0 | -244.0 | -366.000000 | 0.0 | 0.897054 | 0.299018 | -105705.0 | -27549.0 | -76129.500000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -78.0 | -78.0 | -78.000000 | 0.0 | 0.0 | 0.0 | 471.0 | 471.0 | 471.000000 | NaN | NaN | NaN | 0.0 | 0.0 | 267606.0 | 267606.0000 | 267606.000 | 240057.0 | 240057.000 | 240057.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -549.0 | -549.0 | -549.000000 | 0.897054 | 0.897054 | 0.897054 | -27549.0 | -27549.000 | -27549.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 7 | 100009 | 18 | -2882 | -239 | -1271.500000 | 0 | 0 | 0.0 | -2152.0 | 1402.0 | -794.937500 | -2152.0 | -313.0 | -1108.500000 | 0.000 | 0.000 | 1777500.0 | 266711.750000 | 4800811.500 | 557959.5 | 76953.535714 | 1077349.5 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -713.0 | 1459.0 | 114.785714 | -893.0 | -92.0 | -357.214286 | -1826.0 | -30.0 | -529.000000 | 0.0 | 0.967787 | 0.169369 | -1777500.0 | -10872.0 | -227088.000000 | 0.0 | 0 | 0.0 | 0 | 4.0 | -1293.0 | -239.0 | -591.750000 | 0.0 | 0.0 | 0.0 | -209.0 | 1402.0 | 596.500000 | NaN | NaN | NaN | NaN | NaN | 642861.0 | 381890.2500 | 1527561.000 | 557959.5 | 269337.375 | 1077349.5 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1826.0 | -30.0 | -928.000000 | 0.000000 | 0.967787 | 0.592792 | -187200.0 | -10872.000 | -112552.8750 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 8 | 100010 | 2 | -2741 | -1138 | -1939.500000 | 0 | 0 | 0.0 | -928.0 | 689.0 | -119.500000 | -1138.0 | -1138.0 | -1138.000000 | NaN | NaN | 675000.0 | 495000.000000 | 990000.000 | 348007.5 | 174003.750000 | 348007.5 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | 210.0 | 210.0 | 210.000000 | -1603.0 | -1603.0 | -1603.000000 | -1827.0 | -1813.0 | -1820.000000 | 0.0 | 0.515567 | 0.257783 | -326992.5 | -315000.0 | -320996.250000 | 0.0 | 0 | 0.0 | 0 | 1.0 | -1138.0 | -1138.0 | -1138.000000 | 0.0 | 0.0 | 0.0 | 689.0 | 689.0 | 689.000000 | NaN | NaN | NaN | NaN | NaN | 675000.0 | 675000.0000 | 675000.000 | 348007.5 | 348007.500 | 348007.5 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1827.0 | -1827.0 | -1827.000000 | 0.515567 | 0.515567 | 0.515567 | -326992.5 | -326992.500 | -326992.5000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 9 | 100011 | 4 | -2508 | -1309 | -1773.000000 | 0 | 0 | 0.0 | -2173.0 | -860.0 | -1293.250000 | -2197.0 | -968.0 | -1463.250000 | 10147.230 | 5073.615 | 145242.0 | 108807.075000 | 435228.300 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | -102.0 | 758.0 | 170.000000 | -347.0 | -239.0 | -309.750000 | -1100.0 | -239.0 | -479.750000 | 0.0 | 0.000000 | 0.000000 | -145242.0 | -54000.0 | -96662.100000 | 0.0 | 0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN |
# 데이터 세트 재로딩.
apps, prev, bureau, bureau_bal = get_dataset()
<ipython-input-2-eb2bde2565da>:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
apps = pd.concat([app_train, app_test])
print(bureau_bal.shape, bureau.shape)
# bureau_bal과 bureau join
bureau_bal = bureau_bal.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], on='SK_ID_BUREAU', how='left')
bureau_bal.shape
(27299925, 3) (1716428, 17)
(27299925, 4)
bureau_bal.head()
| SK_ID_BUREAU | MONTHS_BALANCE | STATUS | SK_ID_CURR | |
|---|---|---|---|---|
| 0 | 5715448 | 0 | C | 380361.0 |
| 1 | 5715448 | -1 | C | 380361.0 |
| 2 | 5715448 | -2 | C | 380361.0 |
| 3 | 5715448 | -3 | C | 380361.0 |
| 4 | 5715448 | -4 | C | 380361.0 |
bureau_bal['STATUS'].value_counts()
C 13646993
0 7499507
X 5810482
1 242347
5 62406
2 23419
3 8924
4 5847
Name: STATUS, dtype: int64
bureau_bal['BUREAU_BAL_IS_DPD'] = bureau_bal['STATUS'].apply(lambda x: 1 if x in['1','2','3','4','5'] else 0)
bureau_bal['BUREAU_BAL_IS_DPD_OVER120'] = bureau_bal['STATUS'].apply(lambda x: 1 if x =='5' else 0)
bureau_bal['BUREAU_BAL_IS_DPD'].value_counts()
0 26956982
1 342943
Name: BUREAU_BAL_IS_DPD, dtype: int64
bureau_bal_grp = bureau_bal.groupby('SK_ID_CURR')
# SK_ID_CURR 레벨로 건수와 MONTHS_BALANCE의 aggregation 가공
bureau_bal_agg_dict = {
'SK_ID_CURR':['count'],
'MONTHS_BALANCE':['min', 'max', 'mean'],
'BUREAU_BAL_IS_DPD':['mean', 'sum'],
'BUREAU_BAL_IS_DPD_OVER120':['mean', 'sum']
}
bureau_bal_agg = bureau_bal_grp.agg(bureau_bal_agg_dict)
# BUREAU_BAL을 접두어로 하는 새로운 컬럼명 할당.
bureau_bal_agg.columns = [ 'BUREAU_BAL_'+('_').join(column).upper() for column in bureau_bal_agg.columns.ravel() ]
# 조인을 위해 SK_ID_CURR을 reset_index()로 컬럼화
bureau_bal_agg = bureau_bal_agg.reset_index()
print(bureau_bal_agg.shape)
bureau_bal_agg.head()
(134542, 9)
| SK_ID_CURR | BUREAU_BAL_SK_ID_CURR_COUNT | BUREAU_BAL_MONTHS_BALANCE_MIN | BUREAU_BAL_MONTHS_BALANCE_MAX | BUREAU_BAL_MONTHS_BALANCE_MEAN | BUREAU_BAL_BUREAU_BAL_IS_DPD_MEAN | BUREAU_BAL_BUREAU_BAL_IS_DPD_SUM | BUREAU_BAL_BUREAU_BAL_IS_DPD_OVER120_MEAN | BUREAU_BAL_BUREAU_BAL_IS_DPD_OVER120_SUM | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 100001.0 | 172 | -51 | 0 | -16.279070 | 0.005814 | 1 | 0.0 | 0 |
| 1 | 100002.0 | 110 | -47 | 0 | -24.554545 | 0.245455 | 27 | 0.0 | 0 |
| 2 | 100005.0 | 21 | -12 | 0 | -4.333333 | 0.000000 | 0 | 0.0 | 0 |
| 3 | 100010.0 | 72 | -90 | -2 | -46.000000 | 0.000000 | 0 | 0.0 | 0 |
| 4 | 100013.0 | 230 | -68 | 0 | -29.373913 | 0.030435 | 7 | 0.0 | 0 |
bureau_bal_agg.columns
Index(['SK_ID_CURR', 'BUREAU_BAL_SK_ID_CURR_COUNT',
'BUREAU_BAL_MONTHS_BALANCE_MIN', 'BUREAU_BAL_MONTHS_BALANCE_MAX',
'BUREAU_BAL_MONTHS_BALANCE_MEAN', 'BUREAU_BAL_BUREAU_BAL_IS_DPD_MEAN',
'BUREAU_BAL_BUREAU_BAL_IS_DPD_SUM',
'BUREAU_BAL_BUREAU_BAL_IS_DPD_OVER120_MEAN',
'BUREAU_BAL_BUREAU_BAL_IS_DPD_OVER120_SUM'],
dtype='object')
# SK_ID_CURR 레벨로 DPD 비율, DPD > 120 비율 가공
bureau_bal_agg['BUREAU_BAL_IS_DPD_RATIO'] = bureau_bal_agg['BUREAU_BAL_BUREAU_BAL_IS_DPD_SUM']/bureau_bal_agg['BUREAU_BAL_SK_ID_CURR_COUNT']
bureau_bal_agg['BUREAU_BAL_IS_DPD_OVER120_RATIO'] = bureau_bal_agg['BUREAU_BAL_BUREAU_BAL_IS_DPD_OVER120_SUM']/bureau_bal_agg['BUREAU_BAL_SK_ID_CURR_COUNT']
##### bureau_day_amt_agg, bureau_bal_agg 조인.
bureau_agg = bureau_day_amt_agg.merge(bureau_active_agg, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(bureau_bal_agg, on='SK_ID_CURR', how='left')
print(bureau_agg.shape)
(305811, 103)
# bureau 채무 완료 날짜 및 대출 금액 대비 채무 금액 관련 컬럼 가공.
def get_bureau_processed(bureau):
# 예정 채무 시작 및 완료일과 실제 채무 완료일간의 차이 및 날짜 비율 가공.
bureau['BUREAU_ENDDATE_FACT_DIFF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']
bureau['BUREAU_CREDIT_FACT_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_ENDDATE_FACT']
bureau['BUREAU_CREDIT_ENDDATE_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_CREDIT_ENDDATE']
# 채무 금액 대비/대출 금액 비율 및 차이 가공
bureau['BUREAU_CREDIT_DEBT_RATIO']=bureau['AMT_CREDIT_SUM_DEBT']/bureau['AMT_CREDIT_SUM']
#bureau['BUREAU_CREDIT_DEBT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
bureau['BUREAU_CREDIT_DEBT_DIFF'] = bureau['AMT_CREDIT_SUM_DEBT'] - bureau['AMT_CREDIT_SUM']
bureau['BUREAU_IS_DPD'] = bureau['CREDIT_DAY_OVERDUE'].apply(lambda x: 1 if x > 0 else 0)
bureau['BUREAU_IS_DPD_OVER120'] = bureau['CREDIT_DAY_OVERDUE'].apply(lambda x: 1 if x >120 else 0)
return bureau
# bureau 주요 컬럼 및 앞에서 채무 및 대출금액 관련 컬럼들로 SK_ID_CURR 레벨의 aggregation 컬럼 생성.
def get_bureau_day_amt_agg(bureau):
bureau_agg_dict = {
'SK_ID_BUREAU':['count'],
'DAYS_CREDIT':['min', 'max', 'mean'],
'CREDIT_DAY_OVERDUE':['min', 'max', 'mean'],
'DAYS_CREDIT_ENDDATE':['min', 'max', 'mean'],
'DAYS_ENDDATE_FACT':['min', 'max', 'mean'],
'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean', 'sum'],
'AMT_ANNUITY': ['max', 'mean', 'sum'],
# 추가 가공 컬럼
'BUREAU_ENDDATE_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_ENDDATE_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_RATIO':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_DIFF':['min', 'max', 'mean'],
'BUREAU_IS_DPD':['mean', 'sum'],
'BUREAU_IS_DPD_OVER120':['mean', 'sum']
}
bureau_grp = bureau.groupby('SK_ID_CURR')
bureau_day_amt_agg = bureau_grp.agg(bureau_agg_dict)
bureau_day_amt_agg.columns = ['BUREAU_'+('_').join(column).upper() for column in bureau_day_amt_agg.columns.ravel()]
# 조인을 위해 SK_ID_CURR을 reset_index()로 컬럼화
bureau_day_amt_agg = bureau_day_amt_agg.reset_index()
print('bureau_day_amt_agg shape:', bureau_day_amt_agg.shape)
return bureau_day_amt_agg
# Bureau의 CREDIT_ACTIVE='Active' 인 데이터만 filtering 후 주요 컬럼 및 앞에서 채무 및 대출금액 관련 컬럼들로 SK_ID_CURR 레벨의 aggregation 컬럼 생성
def get_bureau_active_agg(bureau):
# CREDIT_ACTIVE='Active' 인 데이터만 filtering
cond_active = bureau['CREDIT_ACTIVE'] == 'Active'
bureau_active_grp = bureau[cond_active].groupby(['SK_ID_CURR'])
bureau_agg_dict = {
'SK_ID_BUREAU':['count'],
'DAYS_CREDIT':['min', 'max', 'mean'],
'CREDIT_DAY_OVERDUE':['min', 'max', 'mean'],
'DAYS_CREDIT_ENDDATE':['min', 'max', 'mean'],
'DAYS_ENDDATE_FACT':['min', 'max', 'mean'],
'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean', 'sum'],
'AMT_ANNUITY': ['max', 'mean', 'sum'],
# 추가 가공 컬럼
'BUREAU_ENDDATE_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_FACT_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_ENDDATE_DIFF':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_RATIO':['min', 'max', 'mean'],
'BUREAU_CREDIT_DEBT_DIFF':['min', 'max', 'mean'],
'BUREAU_IS_DPD':['mean', 'sum'],
'BUREAU_IS_DPD_OVER120':['mean', 'sum']
}
bureau_active_agg = bureau_active_grp.agg(bureau_agg_dict)
bureau_active_agg.columns = ['BUREAU_ACT_'+('_').join(column).upper() for column in bureau_active_agg.columns.ravel()]
# 조인을 위해 SK_ID_CURR을 reset_index()로 컬럼화
bureau_active_agg = bureau_active_agg.reset_index()
print('bureau_active_agg shape:', bureau_active_agg.shape)
return bureau_active_agg
# bureau_bal을 SK_ID_CURR 레벨로 건수와 MONTHS_BALANCE의 aggregation 가공
def get_bureau_bal_agg(bureau, bureau_bal):
bureau_bal = bureau_bal.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], on='SK_ID_BUREAU', how='left')
bureau_bal['BUREAU_BAL_IS_DPD'] = bureau_bal['STATUS'].apply(lambda x: 1 if x in['1','2','3','4','5'] else 0)
bureau_bal['BUREAU_BAL_IS_DPD_OVER120'] = bureau_bal['STATUS'].apply(lambda x: 1 if x =='5' else 0)
bureau_bal_grp = bureau_bal.groupby('SK_ID_CURR')
# SK_ID_CURR 레벨로 건수와 MONTHS_BALANCE의 aggregation 가공
bureau_bal_agg_dict = {
'SK_ID_CURR':['count'],
'MONTHS_BALANCE':['min', 'max', 'mean'],
'BUREAU_BAL_IS_DPD':['mean', 'sum'],
'BUREAU_BAL_IS_DPD_OVER120':['mean', 'sum']
}
bureau_bal_agg = bureau_bal_grp.agg(bureau_bal_agg_dict)
bureau_bal_agg.columns = [ 'BUREAU_BAL_'+('_').join(column).upper() for column in bureau_bal_agg.columns.ravel() ]
# 조인을 위해 SK_ID_CURR을 reset_index()로 컬럼화
bureau_bal_agg = bureau_bal_agg.reset_index()
print('bureau_bal_agg shape:', bureau_bal_agg.shape)
return bureau_bal_agg
# 가공된 bureau관련 aggregation 컬럼들을 모두 결합
def get_bureau_agg(bureau, bureau_bal):
bureau = get_bureau_processed(bureau)
bureau_day_amt_agg = get_bureau_day_amt_agg(bureau)
bureau_active_agg = get_bureau_active_agg(bureau)
bureau_bal_agg = get_bureau_bal_agg(bureau, bureau_bal)
# 가공된 bureau관련 aggregation 컬럼들을 모두 조인하여 결합 후 return
bureau_agg = bureau_day_amt_agg.merge(bureau_active_agg, on='SK_ID_CURR', how='left')
bureau_agg = bureau_agg.merge(bureau_bal_agg, on='SK_ID_CURR', how='left')
print('bureau_agg shape:', bureau_agg.shape)
return bureau_agg
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
def get_apps_processed(apps):
# EXT_SOURCE_X FEATURE 가공
apps['APPS_EXT_SOURCE_MEAN'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
apps['APPS_EXT_SOURCE_STD'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(apps['APPS_EXT_SOURCE_STD'].mean())
# AMT_CREDIT 비율로 Feature 가공
apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_CREDIT']
apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_CREDIT']
# AMT_INCOME_TOTAL 비율로 Feature 가공
apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_INCOME_TOTAL']
apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT']/apps['AMT_INCOME_TOTAL']
apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_INCOME_TOTAL']
apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['CNT_FAM_MEMBERS']
# DAYS_BIRTH, DAYS_EMPLOYED 비율로 Feature 가공
apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED']/apps['DAYS_BIRTH']
apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_EMPLOYED']
apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_BIRTH']
apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']
return apps
def get_prev_processed(prev):
# 대출 신청 금액과 실제 대출액/대출 상품금액 차이 및 비율
prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT']/prev['AMT_APPLICATION']
# prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE']/prev['AMT_APPLICATION']
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
# 첫번째 만기일과 마지막 만기일까지의 기간
prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
# 매월 납부 금액과 납부 횟수 곱해서 전체 납부 금액 구함.
all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
# 전체 납부 금액 대비 AMT_CREDIT 비율을 구하고 여기에 다시 납부횟수로 나누어서 이자율 계산.
prev['PREV_INTERESTS_RATE'] = (all_pay/prev['AMT_CREDIT'] - 1)/prev['CNT_PAYMENT']
return prev
def get_prev_amt_agg(prev):
# 새롭게 생성된 대출 신청액 대비 다른 금액 차이 및 비율로 aggregation 수행.
agg_dict = {
# 기존 컬럼 aggregation.
'SK_ID_CURR':['count'],
'AMT_CREDIT':['mean', 'max', 'sum'],
'AMT_ANNUITY':['mean', 'max', 'sum'],
'AMT_APPLICATION':['mean', 'max', 'sum'],
'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
'DAYS_DECISION': ['min', 'max', 'mean'],
'CNT_PAYMENT': ['mean', 'sum'],
# 가공 컬럼 aggregation
'PREV_CREDIT_DIFF':['mean', 'max', 'sum'],
'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
'PREV_GOODS_APPL_RATIO':['mean', 'max'],
'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
'PREV_INTERESTS_RATE':['mean', 'max']
}
prev_group = prev.groupby('SK_ID_CURR')
prev_amt_agg = prev_group.agg(agg_dict)
# multi index 컬럼을 '_'로 연결하여 컬럼명 변경
prev_amt_agg.columns = ["PREV_"+ "_".join(x).upper() for x in prev_amt_agg.columns.ravel()]
return prev_amt_agg
def get_prev_refused_appr_agg(prev):
# 원래 groupby 컬럼 + 세부 기준 컬럼으로 groupby 수행. 세분화된 레벨로 aggregation 수행 한 뒤에 unstack()으로 컬럼레벨로 변형.
prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby([ 'SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
# 컬럼명 변경.
prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT' ]
# NaN값은 모두 0으로 변경.
prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
return prev_refused_appr_agg
def get_prev_agg(prev):
prev = get_prev_processed(prev)
prev_amt_agg = get_prev_amt_agg(prev)
prev_refused_appr_agg = get_prev_refused_appr_agg(prev)
# prev_amt_agg와 조인.
prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')
# SK_ID_CURR별 과거 대출건수 대비 APPROVED_COUNT 및 REFUSED_COUNT 비율 생성.
prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
# 'PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT' 컬럼 drop
prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
return prev_agg
def get_apps_all_with_prev_agg(apps, prev):
apps_all = get_apps_processed(apps)
prev_agg = get_prev_agg(prev)
print('prev_agg shape:', prev_agg.shape)
print('apps_all before merge shape:', apps_all.shape)
apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
print('apps_all after merge with prev_agg shape:', apps_all.shape)
return apps_all
def get_apps_all_encoded(apps_all):
object_columns = apps_all.dtypes[apps_all.dtypes == 'object'].index.tolist()
for column in object_columns:
apps_all[column] = pd.factorize(apps_all[column])[0]
return apps_all
def get_apps_all_train_test(apps_all):
apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
apps_all_test = apps_all[apps_all['TARGET'].isnull()]
apps_all_test = apps_all_test.drop('TARGET', axis=1)
return apps_all_train, apps_all_test
def train_apps_all(apps_all_train):
ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_all_train['TARGET']
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
clf = LGBMClassifier(
nthread=4,
n_estimators=2000,
learning_rate=0.02,
max_depth = 15,
num_leaves=64,
colsample_bytree=0.582,
subsample=0.638,
max_bin=371,
reg_alpha=3.633,
reg_lambda=0.323,
min_child_weight=24,
min_child_samples=168,
silent=-1,
verbose=-1,
)
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100,
early_stopping_rounds= 200)
return clf
def get_apps_all_with_prev_bureau_agg(apps, prev, bureau, bureau_bal):
apps_all = get_apps_processed(apps)
prev_agg = get_prev_agg(prev)
bureau_agg = get_bureau_agg(bureau, bureau_bal)
print('prev_agg shape:', prev_agg.shape)
print('bueau_agg shape:', bureau_agg.shape)
print('apps_all before merge shape:', apps_all.shape)
apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
apps_all = apps_all.merge(bureau_agg, on='SK_ID_CURR', how='left')
print('apps_all after merge with prev_agg, bureau_agg shape:', apps_all.shape)
return apps_all
apps, prev, bureau, bureau_bal = get_dataset()
# application, previous, bureau, bureau_bal 관련 데이터셋 가공 및 취합.
apps_all = get_apps_all_with_prev_bureau_agg(apps, prev, bureau, bureau_bal)
# Category 컬럼을 모두 Label 인코딩 수행.
apps_all = get_apps_all_encoded(apps_all)
# 학습과 테스트 데이터로 분리.
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
#학습수행.
clf = train_apps_all(apps_all_train)
<ipython-input-2-eb2bde2565da>:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
apps = pd.concat([app_train, app_test])
bureau_day_amt_agg shape: (305811, 47)
bureau_active_agg shape: (251815, 47)
bureau_bal_agg shape: (134542, 9)
bureau_agg shape: (305811, 101)
prev_agg shape: (338857, 41)
bueau_agg shape: (305811, 101)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg, bureau_agg shape: (356255, 276)
train shape: (215257, 274) valid shape: (92254, 274)
[LightGBM] [Warning] num_threads is set with nthread=4, will be overridden by n_jobs=-1. Current value: num_threads=-1
Training until validation scores don't improve for 200 rounds
[100] training's auc: 0.789523 training's binary_logloss: 0.240129 valid_1's auc: 0.763475 valid_1's binary_logloss: 0.246581
[200] training's auc: 0.813736 training's binary_logloss: 0.229194 valid_1's auc: 0.774261 valid_1's binary_logloss: 0.241461
[300] training's auc: 0.832406 training's binary_logloss: 0.221605 valid_1's auc: 0.779681 valid_1's binary_logloss: 0.239434
[400] training's auc: 0.848317 training's binary_logloss: 0.215235 valid_1's auc: 0.782548 valid_1's binary_logloss: 0.238382
[500] training's auc: 0.862026 training's binary_logloss: 0.209586 valid_1's auc: 0.784241 valid_1's binary_logloss: 0.237762
[600] training's auc: 0.873683 training's binary_logloss: 0.204622 valid_1's auc: 0.78505 valid_1's binary_logloss: 0.237438
[700] training's auc: 0.884374 training's binary_logloss: 0.199969 valid_1's auc: 0.785275 valid_1's binary_logloss: 0.237331
[800] training's auc: 0.893925 training's binary_logloss: 0.195589 valid_1's auc: 0.785348 valid_1's binary_logloss: 0.237297
[900] training's auc: 0.902655 training's binary_logloss: 0.191434 valid_1's auc: 0.785455 valid_1's binary_logloss: 0.237258
[1000] training's auc: 0.910528 training's binary_logloss: 0.187463 valid_1's auc: 0.78579 valid_1's binary_logloss: 0.237153
[1100] training's auc: 0.91796 training's binary_logloss: 0.183568 valid_1's auc: 0.785789 valid_1's binary_logloss: 0.237141
[1200] training's auc: 0.925033 training's binary_logloss: 0.179834 valid_1's auc: 0.785853 valid_1's binary_logloss: 0.237124
[1300] training's auc: 0.931293 training's binary_logloss: 0.17629 valid_1's auc: 0.785849 valid_1's binary_logloss: 0.237143
Early stopping, best iteration is:
[1180] training's auc: 0.923646 training's binary_logloss: 0.180584 valid_1's auc: 0.785882 valid_1's binary_logloss: 0.237111
preds = clf.predict_proba(apps_all_test.drop(['SK_ID_CURR'], axis=1))[:, 1 ]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('bureau_baseline_01.csv', index=False)
| 차수 | Private Score | Public Score | 설명 |
|---|---|---|---|
| 1차 | 0.74088 | 0.74448 | application 데이터 세트 기본 preprocessing |
| 2차 | 0.75458 | 0.75882 | application 데이터 세트 Feature Engineering |
| 3차 | 0.76396 | 0.77579 | previous 데이터 세트 Feature Engineering |
| 4차 | 0.76420 | 0.77583 | previous 데이터 세트 Feature Engineering 함수화 |
| 5차 | 0.76710 | 0.77630 | Bayesian Optimization을 이용한 하이퍼 파라미터 튜닝 |
| 6차 | 0.77680 | 0.78641 | Bureau, Bureau_bal 데이터 세트 Feature Engineering |
캐글 Advanced 머신러닝 실전 박치기 / 인프런