transcription
Introduction to Manual Feature Engineering.ipynb
`.ix`는 `.loc`으로 대체 가능...
# Excerpt as written in the original notebook: rows are selected with the
# `.ix` indexer, split by the value of the TARGET column.
# NOTE(review): `.ix` was deprecated in pandas 0.20 and removed in 1.0, so
# this version is expected to fail on current pandas; the notes replace it
# with `.loc` in the excerpt that follows.
# Median of `var_name` among rows with TARGET == 0 (despite the `avg_` name).
avg_repaid = df.ix[df['TARGET'] == 0, var_name].median()
# Median of `var_name` among rows with TARGET == 1.
avg_not_repaid = df.ix[df['TARGET'] == 1, var_name].median()
plt.figure(figsize = (12, 6))
# One KDE curve per TARGET value, drawn on the same figure for comparison.
sns.kdeplot(df.ix[df['TARGET'] == 0, var_name], label = 'TARGET == 0')
sns.kdeplot(df.ix[df['TARGET'] == 1, var_name], label = 'TARGET == 1')
...
...
# Same excerpt with every `.ix` lookup replaced by `.loc`: a boolean row mask
# on TARGET combined with the column label `var_name`.
# Median of `var_name` among rows with TARGET == 0 (despite the `avg_` name).
avg_repaid = df.loc[df['TARGET'] == 0, var_name].median()
# Median of `var_name` among rows with TARGET == 1.
avg_not_repaid = df.loc[df['TARGET'] == 1, var_name].median()
plt.figure(figsize = (12, 6))
# One KDE curve per TARGET value, drawn on the same figure for comparison.
sns.kdeplot(df.loc[df['TARGET'] == 0, var_name], label = 'TARGET == 0')
sns.kdeplot(df.loc[df['TARGET'] == 1, var_name], label = 'TARGET == 1')
...
# First attempt (the note right after this snippet reports a column-count
# mismatch for it).
# Added: select only the numeric columns (SK_ID_BUREAU is dropped first).
numeric_cols = bureau.drop(columns=['SK_ID_BUREAU']).select_dtypes(include='number').columns
# Group by SK_ID_CURR, then compute the five statistics for each numeric column.
# NOTE(review): this version ends with a plain `.reset_index()`.
bureau_agg = bureau.drop(columns=['SK_ID_BUREAU']).groupby('SK_ID_CURR', as_index=False)[numeric_cols].agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
# Inspect the result.
bureau_agg.head()
`bureau_agg.columns`의 열 개수와 `columns` 변수의 값 개수가 일치하지 않아 발생. 해결: `.reset_index(drop=True)`
# Revised attempt: identical to the first one except that the index is
# discarded with `.reset_index(drop=True)`.
# Added: select only the numeric columns (SK_ID_BUREAU is dropped first).
numeric_cols = bureau.drop(columns=['SK_ID_BUREAU']).select_dtypes(include='number').columns
# Group by SK_ID_CURR, then compute the five statistics for each numeric column.
bureau_agg = bureau.drop(columns=['SK_ID_BUREAU']).groupby('SK_ID_CURR', as_index=False)[numeric_cols].agg(['count', 'mean', 'max', 'min', 'sum']).reset_index(drop=True)
# Inspect the result.
bureau_agg.head()
# Added: drop the unnecessary statistics that were computed on SK_ID_CURR itself.
# Each column label is unpacked as a (column, statistic) pair, so this assumes
# bureau_agg.columns is a two-level MultiIndex.
columns_to_remove = [(col, agg) for col, agg in bureau_agg.columns if col == 'SK_ID_CURR' and agg in ['count', 'mean', 'max', 'min', 'sum']]
bureau_agg.drop(columns=columns_to_remove, inplace=True)
def agg_numeric(df, group_var, df_name):
    """Aggregate the numeric columns of a dataframe per grouping value.

    This can be used to create features for each instance of the grouping
    variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            one row per value of ``group_var``. The first column is
            ``group_var``; every other column holds one statistic
            (count, mean, max, min, sum) of one numeric column and is named
            ``<df_name>_<column>_<statistic>``.
    """
    # Identifier columns other than the grouping key carry no numeric meaning,
    # so they are excluded from the aggregation.
    id_cols = [col for col in df.columns
               if col != group_var and 'SK_ID' in col]
    df = df.drop(columns=id_cols)

    # Work on an explicit copy so that adding the grouping key never writes
    # into (or warns about) a view of the caller's dataframe.
    numeric_df = df.select_dtypes('number').copy()
    # Re-attach the key in case it is not numeric and was filtered out.
    numeric_df[group_var] = df[group_var]

    agg = numeric_df.groupby(group_var).agg(
        ['count', 'mean', 'max', 'min', 'sum'])

    # Build the flat names from the actual (column, statistic) pairs, in the
    # order they appear in the result. MultiIndex ``levels`` are not used
    # because their ordering is not guaranteed to match the column order.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat)
                   for var, stat in agg.columns]

    # Turn the grouping key back into the first regular column.
    return agg.reset_index()