Data Overview
Practice data for credit card fraud detection (classification).
The column named Class indicates whether a transaction is fraudulent.
The Class column is extremely imbalanced: only about 0.172% of all rows have the value 1 (Fraud).
Data Characteristics
This is financial data, so the names of most features have been removed to protect company confidentiality.
- Amount : transaction amount
- Class : whether the transaction is fraud (1 = Fraud)
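These properties match the well-known Kaggle "Credit Card Fraud Detection" dataset released by ULB, which is where the 0.172% figure comes from. If creditcard.csv is not already on disk, a minimal download sketch, assuming the kagglehub package is installed and the dataset handle is mlg-ulb/creditcardfraud:
import kagglehub  # assumption: kagglehub is installed (pip install kagglehub)
# Downloads the dataset and returns the local directory containing creditcard.csv
path = kagglehub.dataset_download('mlg-ulb/creditcardfraud')
print(path)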
# Read the data
import pandas as pd
data_path = './creditcard.csv'
raw_data = pd.read_csv(data_path)
raw_data.head()
# Features
# The feature names in this data are hidden.
raw_data.columns
# The data labels are heavily imbalanced
raw_data['Class'].value_counts()
# fraud rate : 0.17%
fraud_rate = round(raw_data['Class'].value_counts()[1]/len(raw_data) * 100, 2)
print('Frauds', fraud_rate, '% of the dataset')
# The imbalance is so extreme that the Fraud class barely shows on a plot (a log-scale variant follows below)
import seaborn as sns
import matplotlib.pyplot as plt
sns.countplot(x='Class', data=raw_data)
plt.title('Class Distributions \n (0 : No Fraud || 1 : Fraud)', fontsize=14)
plt.show()
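A log-scaled y-axis is one simple way to make the tiny Fraud bar visible; a minimal sketch reusing the imports above:
# Same count plot, but with a log-scaled y-axis so the Fraud bar is visible
sns.countplot(x='Class', data=raw_data)
plt.yscale('log')
plt.title('Class Distributions (log scale)')
plt.show()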
# First, organize the data into X and y
X = raw_data.iloc[:, 1:-1]   # drop the Time column; keep V1..V28 and Amount
y = raw_data.iloc[:, -1]     # Class label
X.shape, y.shape
# Split the data
# stratify : splits while preserving the class distribution
# https://wikidocs.net/43332
# https://yeko90.tistory.com/entry/what-is-stratify-in-traintestsplit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=13,
                                                    stratify=y)
# Check the imbalance of the split data
import numpy as np
np.unique(y_train, return_counts=True)
tmp = np.unique(y_train, return_counts=True)[1]
print(tmp[1] / len(y_train) * 100, '%')
print(np.unique(y_test, return_counts=True)[1][1] / len(y_test) * 100, '%')
# Write a function that returns classifier performance metrics
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score)

def get_clf_eval(y_test, pred):
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    re = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred)   # AUC from hard labels; see the note below
    return acc, pre, re, f1, auc
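One caveat: roc_auc_score is fed the hard 0/1 predictions here, which collapses the ROC curve to a single operating point and usually understates the AUC. A minimal score-based variant (the helper name get_clf_eval_proba is made up for illustration):
def get_clf_eval_proba(y_test, pred, pred_proba):
    # Same metrics as above, but AUC computed from predicted probabilities
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    re = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred_proba)   # pred_proba = model.predict_proba(X_test)[:, 1]
    return acc, pre, re, f1, auc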
# Write a function that prints the performance
from sklearn.metrics import confusion_matrix

def print_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)
    print('=> confusion matrix')
    print(confusion)
    print('====================')
    print('Accuracy : {0:.4f}, Precision : {1:.4f}'.format(acc, pre))
    print('Recall : {0:.4f}, F1 : {1:.4f}, AUC : {2:.4f}'.format(re, f1, auc))
# Logistic Regression
# recall is below 60%
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state=13, solver='liblinear')
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print_clf_eval(y_test, lr_pred)
# Decision Tree
# recall is 71%
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print_clf_eval(y_test, dt_pred)
# Random Forest
# recall is 74%
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print_clf_eval(y_test, rf_pred)
# LightGBM
# recall is 77%
from lightgbm import LGBMClassifier
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1,
                          boost_from_average=False)  # recommended for highly imbalanced data
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)
print_clf_eval(y_test, lgbm_pred)
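Before moving on, note that these estimators also offer built-in weighting knobs for imbalanced data, which can be tried without resampling; a minimal sketch (the parameter choices are assumptions, not tuned values):
# class_weight='balanced' reweights the loss by inverse class frequency (untuned assumption)
lr_w = LogisticRegression(random_state=13, solver='liblinear', class_weight='balanced')
rf_w = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100,
                              class_weight='balanced')
# LightGBM exposes the same idea via is_unbalance (or scale_pos_weight)
lgbm_w = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, is_unbalance=True)
print_clf_eval(y_test, lr_w.fit(X_train, y_train).predict(X_test))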
# Write a function that fits a given model and returns its performance
def get_result(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return get_clf_eval(y_test, pred)
# Write a function that collects the performance of multiple models into a DataFrame
def get_result_pd(models, model_names, X_train, y_train, X_test, y_test):
    col_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    tmp = []
    for model in models:
        tmp.append(get_result(model, X_train, y_train, X_test, y_test))
    return pd.DataFrame(tmp, columns=col_names, index=model_names)
# Summarize the four classifiers in a single table
# The ensemble models perform best (RandomForest, LightGBM)
import time
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# Look at the Amount column of raw_data
# Amount is the credit card transaction amount
# Its distribution is heavily concentrated in a narrow band
plt.figure(figsize=(10, 5))
sns.histplot(raw_data['Amount'], kde=True, color='r')  # sns.distplot is deprecated in recent seaborn
plt.show()
# Apply StandardScaler to the Amount column
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
amount_n = scaler.fit_transform(raw_data['Amount'].values.reshape(-1, 1))
raw_data_copy = raw_data.iloc[:, 1:-2]   # keep V1..V28 only (drop Time, Amount, Class)
raw_data_copy['Amount_Scaled'] = amount_n
raw_data_copy.head()
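A quick sanity check that the scaling behaved as expected (StandardScaler output should have mean close to 0 and standard deviation close to 1):
print(amount_n.mean(), amount_n.std())   # expect roughly 0 and 1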
# Split the data
X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3,
                                                    random_state=13, stratify=y)
# Re-evaluate the models
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# ROC curves per model
from sklearn.metrics import roc_curve

def draw_roc_curve(models, model_names, X_test, y_test):
    plt.figure(figsize=(10, 10))
    for model, model_name in zip(models, model_names):
        pred = model.predict_proba(X_test)[:, 1]   # fraud-class probability
        fpr, tpr, thresholds = roc_curve(y_test, pred)
        plt.plot(fpr, tpr, label=model_name)
    plt.plot([0, 1], [0, 1], 'k--', label='random guess')
    plt.title('ROC')
    plt.legend()
    plt.grid()
    plt.show()
draw_roc_curve(models, model_names, X_test, y_test)
# Another attempt: log scale
amount_log = np.log1p(raw_data['Amount'])   # log(1 + x); safe even for zero amounts
raw_data_copy['Amount_Scaled'] = amount_log
raw_data_copy.head()
# The distribution changes
plt.figure(figsize=(10, 5))
sns.histplot(raw_data_copy['Amount_Scaled'], kde=True, color='r')
plt.show()
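Unlike a plain log, log1p is defined at zero and has an exact inverse (np.expm1), so the transform is easy to verify; a minimal round-trip check:
# np.expm1 inverts np.log1p, so this should print True
print(np.allclose(np.expm1(raw_data_copy['Amount_Scaled']), raw_data['Amount']))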
# Check the performance again
# Slight differences appear, but no clear improvement is observed.
X_train, X_test, y_train, y_test = train_test_split(raw_data_copy, y, test_size=0.3,
                                                    random_state=13, stratify=y)
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# ROC curve results
draw_roc_curve(models, model_names, X_test, y_test)
# Unusual data points (outliers)
plt.figure(figsize=(10, 7))
sns.boxenplot(data=raw_data[['V13', 'V14', 'V15']])
# Code that finds the indices of the outliers so they can be removed
def get_outlier(df=None, column=None, weight=1.5):
    fraud = df[df['Class'] == 1][column]   # look for outliers among the fraud rows only
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index
    return outlier_index
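To see the IQR rule in isolation, a tiny worked example on made-up numbers (the toy values are illustrative only):
# Toy data: Q1 = 2.25, Q3 = 4.75, IQR = 2.5, so the 1.5*IQR fence is [-1.5, 8.5]
toy = np.array([1, 2, 3, 4, 5, 100])
q1, q3 = np.percentile(toy, [25, 75])
iqr = q3 - q1
print(toy[(toy < q1 - 1.5 * iqr) | (toy > q3 + 1.5 * iqr)])   # -> [100]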
# Find the outliers
get_outlier(df=raw_data, column='V14', weight=1.5)
# Remove the outliers
raw_data_copy.shape
outlier_index = get_outlier(df=raw_data, column='V14', weight=1.5)
raw_data_copy.drop(outlier_index, axis=0, inplace=True)
raw_data_copy.shape
# Split the data after removing the outliers
X = raw_data_copy
raw_data.drop(outlier_index, axis=0, inplace=True)   # drop the same rows from raw_data to build a matching y
y = raw_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=13, stratify=y)
# Check the performance again
# Recall improves further, up to about 80%
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
# ROC ์ปค๋ธ
draw_roc_curve(models, model_names, X_test, y_test)
※ Reference (written by someone else): https://velog.io/@jaylnne/ML-%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C-%EC%82%AC%EA%B8%B0-%ED%83%90%EC%A7%80-%EB%AA%A8%EB%8D%B8-%EB%A7%8C%EB%93%A4%EC%96%B4%EB%B3%B4%EA%B8%B0
# Install imbalanced-learn
#!pip install imbalanced-learn
# Apply SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=13)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
# fit_sample was renamed to fit_resample in recent imbalanced-learn versions
# Oversample the train data only; resampling the test set would distort the evaluation
# (a scaler, by contrast, can be applied to the test set once it is fitted on train)
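A tidier way to guarantee the train-only rule is imblearn's own Pipeline, which applies the sampler during fit and skips it at predict time; a minimal sketch (this pipeline is an illustration, not part of the original flow):
from imblearn.pipeline import Pipeline

# The SMOTE step runs only inside fit(); predict() bypasses it,
# so the test set is never resampled.
smote_pipe = Pipeline([('smote', SMOTE(random_state=13)),
                       ('clf', LogisticRegression(random_state=13, solver='liblinear'))])
smote_pipe.fit(X_train, y_train)
print_clf_eval(y_test, smote_pipe.predict(X_test))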
# Effect of the oversampling
X_train.shape, y_train.shape
X_train_over.shape, y_train_over.shape
# Result
print(np.unique(y_train, return_counts=True))
print(np.unique(y_train_over, return_counts=True))
# Check the performance again
# Recall clearly improves (but precision for LogisticReg and DecisionTree drops sharply)
# Error that can appear here: Found input variables with inconsistent numbers of samples
# Fix reference: https://lovelydiary.tistory.com/425
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LogisticReg', 'DecisionTree', 'RandomForest', 'LightGBM']
start_time = time.time()
results = get_result_pd(models, model_names, X_train_over, y_train_over, X_test, y_test)
print('Fit time : ', time.time() - start_time)
results
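Since oversampling trades precision for recall, the decision threshold is another lever worth inspecting; a minimal sketch with precision_recall_curve (the 0.9 recall target is an arbitrary illustration):
from sklearn.metrics import precision_recall_curve

# Precision/recall trade-off of the logistic model across decision thresholds
proba = lr_clf.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, proba)
# Largest threshold that still keeps recall >= 0.9 (illustrative target, not tuned)
mask = recalls[:-1] >= 0.9
if mask.any():
    print('threshold:', thresholds[mask][-1], 'precision:', precisions[:-1][mask][-1])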
# ROC curves
draw_roc_curve(models, model_names, X_test, y_test)
Tricky... while oversampling I kept getting an error saying the row counts of X_train_over and y_train_over didn't match, but after rerunning the whole notebook from the top it worked...?
Source: Zerobase Data School