Data Overview
Practice data for credit card fraud detection (classification).
The column named Class indicates whether a transaction is fraudulent.
The Class column is extremely imbalanced: only about 0.172% of all rows have the value 1 (Fraud).
Data Characteristics
This is financial data, so the names of most features have been removed to protect company confidentiality.
- Amount : transaction amount
- Class : whether the transaction is fraud (1 = Fraud)
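These properties match the well-known Kaggle "Credit Card Fraud Detection" dataset released by ULB, which is where the 0.172% figure comes from. If creditcard.csv is not already on disk, a minimal download sketch, assuming the kagglehub package is installed and the dataset handle is mlg-ulb/creditcardfraud:
import kagglehub  # assumption: kagglehub is installed (pip install kagglehub)
# Downloads the dataset and returns the local directory containing creditcard.csv
path = kagglehub.dataset_download('mlg-ulb/creditcardfraud')
print(path)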
# Read the data
import pandas as pd
data_path = './creditcard.csv'
raw_data = pd.read_csv(data_path)
raw_data.head()
# Features
# The feature names in this data are hidden.
raw_data.columns
# The data labels are heavily imbalanced
raw_data['Class'].value_counts()
# fraud rate : 0.17%
fraud_rate = round(raw_data['Class'].value_counts()[1]/len(raw_data) * 100, 2)
print('Frauds', fraud_rate, '% of the dataset')
# The imbalance is so extreme that the Fraud class barely shows on a plot (a log-scale variant follows below)
import seaborn as sns
import matplotlib.pyplot as plt
sns.countplot(x='Class', data=raw_data)
plt.title('Class Distributions \n (0 : No Fraud || 1 : Fraud)', fontsize=14)
plt.show()
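A log-scaled y-axis is one simple way to make the tiny Fraud bar visible; a minimal sketch reusing the imports above:
# Same count plot, but with a log-scaled y-axis so the Fraud bar is visible
sns.countplot(x='Class', data=raw_data)
plt.yscale('log')
plt.title('Class Distributions (log scale)')
plt.show()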
# First, organize the data into X and y
X = raw_data.iloc[:, 1:-1]   # drop the Time column; keep V1..V28 and Amount
y = raw_data.iloc[:, -1]     # Class label
X.shape, y.shape
# Split the data
# stratify : splits while preserving the class distribution
# https://wikidocs.net/43332
# https://yeko90.tistory.com/entry/what-is-stratify-in-traintestsplit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=13,
                                                    stratify=y)
# Check the imbalance of the split data
import numpy as np
np.unique(y_train, return_counts=True)
tmp = np.unique(y_train, return_counts=True)[1]
print(tmp[1] / len(y_train) * 100, '%')
print(np.unique(y_test, return_counts=True)[1][1] / len(y_test) * 100, '%')
# Write a function that returns classifier performance metrics
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score)

def get_clf_eval(y_test, pred):
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    re = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred)   # AUC from hard labels; see the note below
    return acc, pre, re, f1, auc
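One caveat: roc_auc_score is fed the hard 0/1 predictions here, which collapses the ROC curve to a single operating point and usually understates the AUC. A minimal score-based variant (the helper name get_clf_eval_proba is made up for illustration):
def get_clf_eval_proba(y_test, pred, pred_proba):
    # Same metrics as above, but AUC computed from predicted probabilities
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    re = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred_proba)   # pred_proba = model.predict_proba(X_test)[:, 1]
    return acc, pre, re, f1, auc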
# Write a function that prints the performance
from sklearn.metrics import confusion_matrix

def print_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)
    print('=> confusion matrix')
    print(confusion)
    print('====================')
    print('Accuracy : {0:.4f}, Precision : {1:.4f}'.format(acc, pre))
    print('Recall : {0:.4f}, F1 : {1:.4f}, AUC : {2:.4f}'.format(re, f1, auc))
# Logistic Regression
# recall is below 60%
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state=13, solver='liblinear')
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print_clf_eval(y_test, lr_pred)
# Decision Tree
# recall is 71%
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print_clf_eval(y_test, dt_pred)
# Random Forest
# recall is 74%
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print_clf_eval(y_test, rf_pred)
# LightGBM
# recall is 77%
from lightgbm import LGBMClassifier
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1,
                          boost_from_average=False)  # recommended for highly imbalanced data
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)
print_clf_eval(y_test, lgbm_pred)
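Before moving on, note that these estimators also offer built-in weighting knobs for imbalanced data, which can be tried without resampling; a minimal sketch (the parameter choices are assumptions, not tuned values):
# class_weight='balanced' reweights the loss by inverse class frequency (untuned assumption)
lr_w = LogisticRegression(random_state=13, solver='liblinear', class_weight='balanced')
rf_w = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100,
                              class_weight='balanced')
# LightGBM exposes the same idea via is_unbalance (or scale_pos_weight)
lgbm_w = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, is_unbalance=True)
print_clf_eval(y_test, lr_w.fit(X_train, y_train).predict(X_test))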
# Write a function that fits a given model and returns its performance
def get_result(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return get_clf_eval(y_test, pred)
# Write a function that collects the performance of multiple models into a DataFrame
def get_result_pd(models, model_names, X_train, y_train, X_test, y_test):
    col_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    tmp = []
    for model in models:
        tmp.append(get_result(model, X_train, y_train, X_test, y_test))
    return pd.DataFrame(tmp, columns=col_names, index=model_names)
# Summarize the four classifiers in a single table
# The ensemble models perform best (RandomForest, LightGBM)
import time
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# Look at the Amount column of raw_data
# Amount is the credit card transaction amount
# Its distribution is heavily concentrated in a narrow band
plt.figure(figsize=(10, 5))
sns.histplot(raw_data['Amount'], kde=True, color='r')  # sns.distplot is deprecated in recent seaborn
plt.show()
# Apply StandardScaler to the Amount column
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
amount_n = scaler.fit_transform(raw_data['Amount'].values.reshape(-1, 1))
raw_data_copy = raw_data.iloc[:, 1:-2]   # keep V1..V28 only (drop Time, Amount, Class)
raw_data_copy['Amount_Scaled'] = amount_n
raw_data_copy.head()
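A quick sanity check that the scaling behaved as expected (StandardScaler output should have mean close to 0 and standard deviation close to 1):
print(amount_n.mean(), amount_n.std())   # expect roughly 0 and 1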
# Split the data
X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3,
                                                    random_state=13, stratify=y)
# Re-evaluate the models
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# ROC curves per model
from sklearn.metrics import roc_curve

def draw_roc_curve(models, model_names, X_test, y_test):
    plt.figure(figsize=(10, 10))
    for model, model_name in zip(models, model_names):
        pred = model.predict_proba(X_test)[:, 1]   # fraud-class probability
        fpr, tpr, thresholds = roc_curve(y_test, pred)
        plt.plot(fpr, tpr, label=model_name)
    plt.plot([0, 1], [0, 1], 'k--', label='random guess')
    plt.title('ROC')
    plt.legend()
    plt.grid()
    plt.show()
draw_roc_curve(models, model_names, X_test, y_test)
# Another attempt: log scale
amount_log = np.log1p(raw_data['Amount'])   # log(1 + x); safe even for zero amounts
raw_data_copy['Amount_Scaled'] = amount_log
raw_data_copy.head()
# The distribution changes
plt.figure(figsize=(10, 5))
sns.histplot(raw_data_copy['Amount_Scaled'], kde=True, color='r')
plt.show()
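Unlike a plain log, log1p is defined at zero and has an exact inverse (np.expm1), so the transform is easy to verify; a minimal round-trip check:
# np.expm1 inverts np.log1p, so this should print True
print(np.allclose(np.expm1(raw_data_copy['Amount_Scaled']), raw_data['Amount']))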
# Check the performance again
# Slight differences appear, but no clear improvement is observed.
X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3,
                                                    random_state=13, stratify=y)
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# ROC curve results
draw_roc_curve(models, model_names, X_test, y_test)
# Unusual data points (outliers)
plt.figure(figsize=(10, 7))
sns.boxenplot(data=raw_data[['V13', 'V14', 'V15']])
# Code that finds the indices of the outliers so they can be removed
def get_outlier(df=None, column=None, weight=1.5):
    fraud = df[df['Class'] == 1][column]   # look for outliers among the fraud rows only
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index
    return outlier_index
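To see the IQR rule in isolation, a tiny worked example on made-up numbers (the toy values are illustrative only):
# Toy data: Q1 = 2.25, Q3 = 4.75, IQR = 2.5, so the 1.5*IQR fence is [-1.5, 8.5]
toy = np.array([1, 2, 3, 4, 5, 100])
q1, q3 = np.percentile(toy, [25, 75])
iqr = q3 - q1
print(toy[(toy < q1 - 1.5 * iqr) | (toy > q3 + 1.5 * iqr)])   # -> [100]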
# Find the outliers
get_outlier(df=raw_data, column='V14', weight=1.5)
# Remove the outliers
raw_data_copy.shape
outlier_index = get_outlier(df=raw_data, column='V14', weight=1.5)
raw_data_copy.drop(outlier_index, axis=0, inplace=True)
raw_data_copy.shape
# Split the data after removing the outliers
X = raw_data_copy
raw_data.drop(outlier_index, axis=0, inplace=True)   # drop the same rows from raw_data to build a matching y
y = raw_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=13, stratify=y)
# Check the performance again
# Recall improves further, up to about 80%
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# ROC ์ปค๋ธ
draw_roc_curve(models, model_names, X_test, y_test)
※ Reference (written by someone else): https://velog.io/@jaylnne/ML-%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C-%EC%82%AC%EA%B8%B0-%ED%83%90%EC%A7%80-%EB%AA%A8%EB%8D%B8-%EB%A7%8C%EB%93%A4%EC%96%B4%EB%B3%B4%EA%B8%B0
# Install imbalanced-learn
#!pip install imbalanced-learn
# Apply SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=13)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
# fit_sample was renamed to fit_resample in recent imbalanced-learn versions
# Oversample the train data only; resampling the test set would distort the evaluation
# (a scaler, by contrast, can be applied to the test set once it is fitted on train)
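A tidier way to guarantee the train-only rule is imblearn's own Pipeline, which applies the sampler during fit and skips it at predict time; a minimal sketch (this pipeline is an illustration, not part of the original flow):
from imblearn.pipeline import Pipeline

# The SMOTE step runs only inside fit(); predict() bypasses it,
# so the test set is never resampled.
smote_pipe = Pipeline([('smote', SMOTE(random_state=13)),
                       ('clf', LogisticRegression(random_state=13, solver='liblinear'))])
smote_pipe.fit(X_train, y_train)
print_clf_eval(y_test, smote_pipe.predict(X_test))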
# Effect of the oversampling
X_train.shape, y_train.shape
X_train_over.shape, y_train_over.shape
# Result
print(np.unique(y_train, return_counts=True))
print(np.unique(y_train_over, return_counts=True))
# Check the performance again
# Recall clearly improves (but precision for LogisticReg and DecisionTree drops sharply)
# Error that can appear here: Found input variables with inconsistent numbers of samples
# Fix reference: https://lovelydiary.tistory.com/425
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train_over, y_train_over, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
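Since oversampling trades precision for recall, the decision threshold is another lever worth inspecting; a minimal sketch with precision_recall_curve (the 0.9 recall target is an arbitrary illustration):
from sklearn.metrics import precision_recall_curve

# Precision/recall trade-off of the logistic model across decision thresholds
proba = lr_clf.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, proba)
# Largest threshold that still keeps recall >= 0.9 (illustrative target, not tuned)
mask = recalls[:-1] >= 0.9
if mask.any():
    print('threshold:', thresholds[mask][-1], 'precision:', precisions[:-1][mask][-1])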
# ROC curves
draw_roc_curve(models, model_names, X_test, y_test)
Tricky... while oversampling I kept getting an error saying the row counts of X_train_over and y_train_over didn't match, but after rerunning the whole notebook from the top it worked...?
Source: Zerobase Data School