This post covers the Node 10 project (main quest).
🔗 github.com/hayannn/AIFFEL_MAIN_QUEST/MainProject08
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# Data path
DATA_PATH = '/aiffel/aiffel/fnguide/data/'
# Load the data
modify_data = pd.read_csv(os.path.join(DATA_PATH, 'sub_upbit_eth_min_tick.csv'), index_col=0, parse_dates=True)
# Visualize the loaded data
modify_data.loc['2017-11-01':'2017-12-31','close'].plot()
momentum_signal: labels based on the difference between the current price and the price a fixed number of business days earlier.
Compare modify_data with momentum_signal.
# Set the window
window = 10
# Build momentum_signal
momentum_signal = np.sign(np.sign(modify_data['close'] - modify_data['close'].shift(window)) + 1)
# Build s_momentum_signal
s_momentum_signal = pd.Series(momentum_signal, index=modify_data.index)
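As a quick sanity check (a toy example I added, not part of the original notebook), the nested np.sign call maps a falling price difference to 0 and a flat or rising one to 1:
# Toy values: sign(sign(diff) + 1) -> 0 for a drop, 1 for flat or rising
diff = pd.Series([-3.0, 0.0, 2.5])
print(np.sign(np.sign(diff) + 1).tolist())  # [0.0, 1.0, 1.0]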
- Take close from modify_data
- Check how far s_momentum_signal diverges from the close price
- Color points red where momentum_signal is 1, blue otherwise
# Base data (close prices for the selected day)
sub_data = modify_data.loc['2017-11-21', 'close']
# Signal series for the same day
c_sig = s_momentum_signal.loc['2017-11-21']
# Colors for comparing the two series
colors = np.where(c_sig == 1, 'red', 'blue')
# Visualize
plt.figure(figsize=(10, 5))
plt.scatter(sub_data.index, sub_data.values, c=colors)
plt.show()
# momentum_signal: distance from the rolling mean instead of a lagged price
momentum_signal = np.sign(np.sign(modify_data['close'] - modify_data['close'].rolling(window).mean()) + 1)
# s_momentum_signal
s_momentum_signal = pd.Series(momentum_signal, index=modify_data.index)
# Base data (close prices for the selected day)
sub_data = modify_data.loc['2017-11-21', 'close']
# Signal series for the same day
c_sig = s_momentum_signal.loc['2017-11-21']
# Colors for comparing the two series
colors = np.where(c_sig == 1, 'red', 'blue')
# Visualize
plt.figure(figsize=(10, 5))
plt.scatter(sub_data.index, sub_data.values, c=colors)
plt.show()
Label by chaining min-max segments while continuously updating the local minimum and maximum.
Because the pass is sequential, computation time grows linearly as the data grows (a drawback).
If the wait coefficient is too small the labels become highly volatile, so it must be chosen carefully.
Pseudo-code
initialize variables
for i in dataset:
    if current_price < previous min_price:
        min_price ← current_price    # passing through the falling trend
        accumulate min_price
    if current_price > previous max_price:
        max_price ← current_price    # passing through the rising trend
        accumulate max_price
    if the falling trend ends:
        max_price ← min_price
    if the rising trend ends:
        min_price ← max_price
[1] Set the initial maximum to the first close value
[2] If a value exceeds the current maximum -> update max_value
[3] Append the maximum to the list
[4] Set the max-update flag
[5] If the minimum was not updated, clear the max-update flag
[6] Update the minimum
[7] Return the minima and maxima with their indexes
def get_local_min_max(close, wait=3):
    # [1] initialise the running minimum and maximum with the first close value
    min_value = close.iloc[0]
    max_value = close.iloc[0]
    n_cnt_min, n_cnt_max = 0, 0
    mins, maxes = [], []
    min_idxes, max_idxes = [], []
    b_min_update, b_max_update = False, False
    for idx, val in zip(close.index[1:], close.values[1:]):
        if val < min_value:  # [6] new local low: record it and flag the update
            min_value = val
            mins.append(min_value)
            min_idxes.append(idx)
            n_cnt_min = 0
            b_min_update = True
        if val > max_value:  # [2]-[4] new local high: update max_value, record it, set the flag
            max_value = val
            maxes.append(max_value)
            max_idxes.append(idx)
            n_cnt_max = 0
            b_max_update = True
        if not b_max_update:  # [5] the high was not refreshed this step
            b_min_update = False
            n_cnt_min += 1
            if n_cnt_min >= wait:  # falling trend ended: restart the high from the current low
                max_value = min_value
                n_cnt_min = 0
        if not b_min_update:
            b_max_update = False
            n_cnt_max += 1
            if n_cnt_max >= wait:  # rising trend ended: restart the low from the current high
                min_value = max_value
                n_cnt_max = 0
    # [7] return the local minima and maxima with their timestamps
    return pd.DataFrame.from_dict({'min_time': min_idxes, 'local_min': mins}), pd.DataFrame.from_dict({'max_time': max_idxes, 'local_max': maxes})
mins, maxes = get_local_min_max(sub_data, wait=3)
print(mins)
print('--'*20)
print(maxes)
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.plot(sub_data, 'c')
ax.scatter(mins.min_time, mins.local_min, c='blue')
ax.scatter(maxes.max_time, maxes.local_max, c='red')
ax.set_ylim([sub_data.min() * 0.99, sub_data.max() * 1.01])
Labeling method introduced in Machine Learning for Asset Managers
(time-series data) = stock (coin) price
Label by the sign of the t-value with the largest absolute value (max t-value)
What is the t-value?
The t-statistic of the slope from a linear-trend regression
def t_val_lin_r(close):
import statsmodels.api as sml
# t-value from a linear trend
x = np.ones((close.shape[0], 2))
x[:, 1] = np.arange(close.shape[0])
ols = sml.OLS(close, x).fit()
return ols.tvalues[1]
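A quick check of the sign convention on synthetic data (my own example, assuming only numpy and the function above): an upward-drifting series should produce a positive t-value and a downward one a negative t-value.
# Synthetic sanity check: rising trend -> positive t-value, falling trend -> negative
rng = np.random.RandomState(0)
up = 0.5 * np.arange(100) + rng.normal(0, 1, 100)
down = -0.5 * np.arange(100) + rng.normal(0, 1, 100)
print(t_val_lin_r(up), t_val_lin_r(down))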
- look_forward_window: size of the window observed from the current point to a given future point
- min_sample_length: minimum length of a sample
- step: interval by which the sliding window moves
- t1_array: stores the result for each reference time point
- t_values_array: stores the t-value of each window
look_forward_window = 60
min_sample_length = 5
step = 1
t1_array = []
t_values_array = []
Extract the sample data from the current point (ind) out to look_forward_window steps into the future.
molecule = modify_data['2017-11-01':'2017-11-30'].index
label = pd.DataFrame(index=molecule, columns=['t1', 't_val', 'bin'])
tmp_out = []
for ind in tqdm(molecule):
    subset = modify_data.loc[ind:, 'close'].iloc[:look_forward_window]  # forward-looking sample
    if look_forward_window > subset.shape[0]:
        continue
    tmp_subset = pd.Series(index=subset.index[min_sample_length-1:subset.shape[0]-1])
    tval = []
    # estimate the trend with the regression t-statistic on each expanding window
    for forward_window in np.arange(min_sample_length, subset.shape[0]):
        df = subset.iloc[:forward_window]
        tval.append(t_val_lin_r(df.values))  # use the t-value
    tmp_subset.loc[tmp_subset.index] = np.array(tval)
    idx_max = tmp_subset.replace([-np.inf, np.inf, np.nan], 0).abs().idxmax()
    tmp_t_val = tmp_subset[idx_max]
    tmp_out.append([tmp_subset.index[-1], tmp_t_val, np.sign(tmp_t_val)])
label.loc[molecule] = np.array(tmp_out)  # prevent leakage
label['t1'] = pd.to_datetime(label['t1'])
label['bin'] = pd.to_numeric(label['bin'], downcast='signed')
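Before visualizing, a quick class-distribution check (my addition) helps spot imbalance between up and down labels early:
# How many up (+1) vs down (-1) trend-scanning labels were produced
print(label['bin'].value_counts(dropna=False))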
sub_data = modify_data.loc['2017-11-21', 'close']
c_sig = label['bin'].loc['2017-11-21']
colors = np.where(c_sig == 1, 'red', 'blue')
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.scatter(sub_data.index, sub_data.values, c=colors)
!pip install ta==0.9.0
!pip install shap
import datetime
import sys
import os
import re
import io
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ta
sys.path.append('/aiffel/aiffel/fnguide/data/')
from libs.feature_importance import importance as imp
from sklearn.feature_selection import SequentialFeatureSelector, RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
DATA_PATH = '/aiffel/aiffel/fnguide/data/'
anno_file_name = os.path.join(DATA_PATH, 'sub_upbit_eth_min_tick_label.pkl')
target_file_name = os.path.join(DATA_PATH, 'sub_upbit_eth_min_tick.csv')
df_modify_data = pd.read_csv(target_file_name, index_col=0, parse_dates=True)
df_label_data = pd.read_pickle(anno_file_name)
df_sub_modify_data = df_modify_data.loc[df_label_data.index]
df_sub_modify_data = df_sub_modify_data.iloc[:1000]
mt = 1
fillna = False
df_ = df_sub_modify_data.copy()
open, high, low, close, volume = 'open', 'high', 'low', 'close', 'volume'
cols = [open, high, low, close, volume]
## Volume Index
# Chaikin Money Flow
df_["volume_cmf"] = ta.volume.ChaikinMoneyFlowIndicator(
high=df_[high], low=df_[low], close=df_[close], volume=df_[volume], window=20*mt, fillna=fillna
).chaikin_money_flow()
# Force Index
df_["volume_fi"] = ta.volume.ForceIndexIndicator(
close=df_[close], volume=df_[volume], window=15*mt, fillna=fillna
).force_index()
# Money Flow Indicator
df_["volume_mfi"] = ta.volume.MFIIndicator(
high=df_[high],
low=df_[low],
close=df_[close],
volume=df_[volume],
window=15*mt,
fillna=fillna,
).money_flow_index()
# Ease of Movement
df_["volume_sma_em"] = ta.volume.EaseOfMovementIndicator(
high=df_[high], low=df_[low], volume=df_[volume], window=15*mt, fillna=fillna
).sma_ease_of_movement()
# Volume Price Trend
df_["volume_vpt"] = ta.volume.VolumePriceTrendIndicator(
close=df_[close], volume=df_[volume], fillna=fillna
).volume_price_trend()
## volatility index
# Average True Range
df_["volatility_atr"] = ta.volatility.AverageTrueRange(
close=df_[close], high=df_[high], low=df_[low], window=10*mt, fillna=fillna
).average_true_range()
# Ulcer Index
df_["volatility_ui"] = ta.volatility.UlcerIndex(
close=df_[close], window=15*mt, fillna=fillna
).ulcer_index()
## trend index
# MACD
df_["trend_macd_diff"] = ta.trend.MACD(
close=df_[close], window_slow=25*mt, window_fast=10*mt, window_sign=9, fillna=fillna
).macd_diff()
# Average Directional Movement Index (ADX)
df_["trend_adx"] = ta.trend.ADXIndicator(
high=df_[high], low=df_[low], close=df_[close], window=15*mt, fillna=fillna
).adx()
# TRIX Indicator
df_["trend_trix"] = ta.trend.TRIXIndicator(
close=df_[close], window=15*mt, fillna=fillna
).trix()
# Mass Index
df_["trend_mass_index"] = ta.trend.MassIndex(
high=df_[high], low=df_[low], window_fast=10*mt, window_slow=25*mt, fillna=fillna
).mass_index()
# DPO Indicator
df_["trend_dpo"] = ta.trend.DPOIndicator(
close=df_[close], window=20*mt, fillna=fillna
).dpo()
# Aroon Indicator
df_["trend_aroon_ind"] = ta.trend.AroonIndicator(close=df_[close], window=20, fillna=fillna).aroon_indicator()
## momentum index
# Relative Strength Index (RSI)
df_["momentum_rsi"] = ta.momentum.RSIIndicator(close=df_[close], window=15*mt, fillna=fillna).rsi()
# Williams R Indicator
df_["momentum_wr"] = ta.momentum.WilliamsRIndicator(
high=df_[high], low=df_[low], close=df_[close], lbp=15*mt, fillna=fillna
).williams_r()
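A quick look at the engineered columns (a check I added) confirms that all indicator groups were created before joining the labels:
# List the generated indicator columns by prefix
print(df_.filter(regex='^(volume|volatility|trend|momentum)_').columns.tolist())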
df_tmp_data = df_.join(df_label_data).dropna()
X = df_tmp_data.iloc[:, 5:-1]
y = df_tmp_data.iloc[:, -1]
sc = StandardScaler()
X_sc = sc.fit_transform(X)
X_sc = pd.DataFrame(X_sc, index=X.index, columns=X.columns)
rfc = RandomForestClassifier(class_weight='balanced')
rfc.fit(X_sc, y)
feat_imp = imp.mean_decrease_impurity(rfc, X.columns)
feat_imp
svc_rbf = SVC(kernel='rbf', probability=True)
cv = KFold(n_splits=5)
feat_imp_mda = imp.mean_decrease_accuracy(svc_rbf, X_sc, y, cv_gen=cv)
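As a cross-check on the MDA numbers, scikit-learn's permutation_importance gives a comparable model-agnostic score. This is an alternative I added, not part of the libs.feature_importance module, and it reuses the already-fitted random forest rather than the SVC:
# Optional cross-check with scikit-learn's permutation importance (uses the rfc fitted above)
from sklearn.inspection import permutation_importance
perm = permutation_importance(rfc, X_sc, y, n_repeats=5, random_state=42, n_jobs=-1)
perm_imp = pd.Series(perm.importances_mean, index=X_sc.columns).sort_values(ascending=False)
print(perm_imp.head(10))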
plot_feature_importance function
def plot_feature_importance(importance_df, save_fig=False, output_path=None):
    # Plot mean importance bars with std error bars
    plt.figure(figsize=(10, importance_df.shape[0] / 5))
    importance_df.sort_values('mean', ascending=True, inplace=True)
    importance_df['mean'].plot(kind='barh', color='b', alpha=0.25, xerr=importance_df['std'], error_kw={'ecolor': 'r'})
    if save_fig:
        plt.savefig(output_path)
    else:
        plt.show()
plot_feature_importance(feat_imp)
plot_feature_importance(feat_imp_mda)
svc_rbf = SVC(kernel='linear', probability=True)
rfe_cv = RFECV(svc_rbf, cv=cv)
rfe_fitted = rfe_cv.fit(X_sc, y)
rfe_df = pd.DataFrame([rfe_fitted.support_, rfe_fitted.ranking_], columns=X_sc.columns).T.rename(columns={0:"Optimal_Features", 1:"Ranking"})
rfe_df
Filter only the rows where Optimal_Features is True.
rfe_df[rfe_df["Optimal_Features"]==True]
n = 2
sfs_forward = SequentialFeatureSelector(svc_rbf, n_features_to_select=n, direction='forward')
sfs_fitted = sfs_forward.fit(X_sc, y)
sfs_rank = sfs_fitted.get_support()
sfs_df = pd.DataFrame(sfs_rank, index=X_sc.columns, columns=["Optimal_Features"])
sfs_df[sfs_df["Optimal_Features"]==True].index
import shap
explainer = shap.TreeExplainer(rfc)
shap_value = explainer.shap_values(X_sc)
shap.summary_plot(shap_value, X_sc)
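For a more compact view, the same SHAP values can also be summarized as mean absolute contributions per feature; plot_type='bar' is a standard option of shap.summary_plot:
# Bar-style summary: mean |SHAP value| per feature
shap.summary_plot(shap_value, X_sc, plot_type='bar')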
output_file_name = os.path.join(DATA_PATH, 'sub_upbit_eth_min_feature_labels.pkl')
df_tmp_data.to_pickle(output_file_name)
import datetime
import sys
import os
import re
import io
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score, roc_curve
sys.path.append('/aiffel/aiffel/fnguide/data/')
from libs.mlutil.pkfold import PKFold
DATA_PATH = '/aiffel/aiffel/fnguide/data/'
data_file_name = os.path.join(DATA_PATH, 'sub_upbit_eth_min_feature_labels.pkl')
# Convert the t-value label to binary (added): keep 1, map everything else to 0
df_data = pd.read_pickle(data_file_name)
df_data['t_value'] = df_data['t_value'].apply(lambda x: x if x == 1 else 0)
df_data['t_value'].value_counts()
# Train/test split ratios
train_ratio, test_ratio = 0.7, 0.2
n_train = int(np.round(len(df_data) * train_ratio))
n_test = int(np.round(len(df_data) * test_ratio))
X, y = df_data.iloc[:, 5:-1], df_data.iloc[:, -1]
# Scaling
sc = StandardScaler()
X_sc = sc.fit_transform(X)
# Split the dataset (time-ordered: first 70% train, last 20% test)
train_x, test_x, train_y, test_y = X_sc[:n_train, :], X_sc[-n_test:, :], y.iloc[:n_train], y.iloc[-n_test:]
train_x = pd.DataFrame(train_x, index=train_y.index, columns=X.columns)
train_y = pd.Series(train_y, index=train_y.index)
test_x = pd.DataFrame(test_x, index=test_y.index, columns=X.columns)
test_y = pd.Series(test_y, index=test_y.index)
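One caveat worth noting: the scaler above is fit on the full series before the split, which can leak test-period statistics into training. A minimal variant (my own adjustment, not the LMS code) fits the scaler on the training slice only:
# Variant: fit the scaler on the training slice only to avoid look-ahead bias
sc_train_only = StandardScaler()
train_x_no_leak = sc_train_only.fit_transform(X.iloc[:n_train])
test_x_no_leak = sc_train_only.transform(X.iloc[-n_test:])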
# # Original
# train_x = train_x[:1000]
# train_y = train_y[:1000]
# Modified (set to 2000 samples)
train_x = train_x[:2000]
train_y = train_y[:2000]
n_cv = 5
t1 = pd.Series(train_y.index.values, index=train_y.index)
# purged K-Fold
cv = PKFold(n_cv, t1, 0)
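To see what the purged split actually does, the fold boundaries can be printed. This assumes PKFold exposes the same split() interface used with GridSearchCV below; the exact signature lives in libs.mlutil.pkfold:
# Inspect fold boundaries (assumes PKFold.split behaves like sklearn's KFold.split)
for i, (tr_idx, va_idx) in enumerate(cv.split(train_x)):
    print(f'fold {i}: train {tr_idx.min()}-{tr_idx.max()}, val {va_idx.min()}-{va_idx.max()}')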
Hyperparameter tuning
- Extra values were added to the numbers in the example code (see the grid below)
- Reference: the original code from the LMS
# Set the parameters used by GridSearchCV. Try changing the values.
bc_params = {'n_estimators': [5, 10, 20], 'max_features': [0.5, 0.7], 'base_estimator__max_depth': [3, 5, 10, 20], 'base_estimator__max_features': [None, 'auto'], 'base_estimator__min_samples_leaf': [3, 5, 10], 'bootstrap_features': [False, True]}
With added parameter values:
bc_params = {'n_estimators': [5, 10, 20],
'max_features': [0.3, 0.5, 0.7, 0.9],
'base_estimator__max_depth': [3, 5, 10, 20],
'base_estimator__max_features': [None, 'auto'],
'base_estimator__min_samples_leaf': [3, 5, 10],
'bootstrap_features': [False, True]
}
rfc = RandomForestClassifier(class_weight='balanced')
bag_rfc = BaggingClassifier(rfc)
gs_rfc = GridSearchCV(bag_rfc, bc_params, cv=cv, n_jobs=-1, verbose=1)
gs_rfc.fit(train_x, train_y)
# best estimator
gs_rfc_best = gs_rfc.best_estimator_
gs_rfc_best.fit(train_x, train_y)
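For reference, the parameter combination that won the search and its cross-validated score can be read straight off the fitted search object:
# Best parameter set and its cross-validated accuracy
print(gs_rfc.best_params_)
print(gs_rfc.best_score_)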
Note: differences from the original example
- n_cv = 4 -> 5
- data samples 1000 -> 2000
- additional hyperparameter values
pred_y = gs_rfc_best.predict(test_x)
prob_y = gs_rfc_best.predict_proba(test_x)
confusion = confusion_matrix(test_y, pred_y)
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
print('================= confusion matrix ====================')
print(confusion)
print('=======================================================')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
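# (Added) f1_score is imported above but unused; it summarizes the precision/recall trade-off in one number
f1 = f1_score(test_y, pred_y)
print(f'F1: {f1}')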
# ROC computed here from the hard predictions; the later sections use the class-1 probabilities instead
fpr, tpr, thresholds = roc_curve(test_y, pred_y)
auc = roc_auc_score(test_y, pred_y)
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('fpr')
plt.ylabel('tpr')
print(f'auc:{auc}')
!pip install optuna
import optuna
def objective(trial):
    # Hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 5, 100)
    max_features = trial.suggest_float('max_features', 0.3, 0.9)
    max_depth = trial.suggest_int('base_estimator__max_depth', 3, 20)
    min_samples_leaf = trial.suggest_int('base_estimator__min_samples_leaf', 3, 20)
    bootstrap_features = trial.suggest_categorical('bootstrap_features', [False, True])
    rfc = RandomForestClassifier(
        class_weight='balanced',
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=None
    )
    bag_rfc = BaggingClassifier(
        base_estimator=rfc,
        n_estimators=n_estimators,
        max_features=max_features,
        bootstrap_features=bootstrap_features
    )
    # Cross-validation with the purged K-fold splitter
    scores = []
    for train_idx, val_idx in cv.split(train_x):
        X_train, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        bag_rfc.fit(X_train, y_train)
        preds = bag_rfc.predict(X_val)
        scores.append(accuracy_score(y_val, preds))
    return np.mean(scores)
# Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)
# Extract the best hyperparameters
best_params = study.best_params
print("Best Parameters:", best_params)
Best Parameters: {
'n_estimators': 62,
'max_features': 0.8285727979016057,
'base_estimator__max_depth': 8,
'base_estimator__min_samples_leaf': 3,
'bootstrap_features': False
}
rfc = RandomForestClassifier(
class_weight='balanced',
max_depth=best_params['base_estimator__max_depth'],
min_samples_leaf=best_params['base_estimator__min_samples_leaf'],
max_features=None
)
bag_rfc = BaggingClassifier(
base_estimator=rfc,
n_estimators=best_params['n_estimators'],
max_features=best_params['max_features'],
bootstrap_features=best_params['bootstrap_features']
)
bag_rfc.fit(train_x, train_y)
pred_y = bag_rfc.predict(test_x)
prob_y = bag_rfc.predict_proba(test_x)
confusion = confusion_matrix(test_y, pred_y)
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
print('================= Confusion Matrix ====================')
print(confusion)
print('=======================================================')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
fpr, tpr, thresholds = roc_curve(test_y, prob_y[:, 1])
auc = roc_auc_score(test_y, prob_y[:, 1])
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title(f'AUC: {auc}')
plt.show()
Only the tuning library was swapped, with the two items below kept the same:
Before
After
Measured by accuracy, the score rose from about 0.64 to 0.67.
However, recall dropped sharply, so the cause (e.g., class imbalance) still needs to be investigated.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(train_x, train_y)
# Predictions and predicted probabilities
pred_y = model.predict(test_x)
prob_y = model.predict_proba(test_x)
# Compute metrics
confusion = confusion_matrix(test_y, pred_y)
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
# Print results
print('================= confusion matrix ====================')
print(confusion)
print('=======================================================')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(test_y, prob_y[:, 1])  # probability of class 1
auc = roc_auc_score(test_y, prob_y[:, 1])
# Plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label=f'AUC = {auc}')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
print(f'AUC: {auc}')
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Train the LightGBM model
model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model.fit(train_x, train_y)
# Predictions and predicted probabilities
pred_y = model.predict(test_x)
prob_y = model.predict_proba(test_x)
# Compute metrics
confusion = confusion_matrix(test_y, pred_y)
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
# Print results
print('================= confusion matrix ====================')
print(confusion)
print('=======================================================')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(test_y, prob_y[:, 1])  # probability of class 1
auc = roc_auc_score(test_y, prob_y[:, 1])
# Plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label=f'AUC = {auc}')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
print(f'AUC: {auc}')
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Train the CatBoost model
model = CatBoostClassifier(iterations=100, random_state=42, verbose=0)
model.fit(train_x, train_y)
# Predictions and predicted probabilities
pred_y = model.predict(test_x)
prob_y = model.predict_proba(test_x)
# Compute metrics
confusion = confusion_matrix(test_y, pred_y)
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
# Print results
print('================= confusion matrix ====================')
print(confusion)
print('=======================================================')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(test_y, prob_y[:, 1])  # probability of class 1
auc = roc_auc_score(test_y, prob_y[:, 1])
# Plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label=f'AUC = {auc}')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
print(f'AUC: {auc}')
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Train the Naive Bayes model
model = GaussianNB()
model.fit(train_x, train_y)
# Predictions and predicted probabilities
pred_y = model.predict(test_x)
prob_y = model.predict_proba(test_x)
# Compute metrics
confusion = confusion_matrix(test_y, pred_y)
accuracy = accuracy_score(test_y, pred_y)
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
# Print results
print('================= confusion matrix ====================')
print(confusion)
print('=======================================================')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(test_y, prob_y[:, 1])  # probability of class 1
auc = roc_auc_score(test_y, prob_y[:, 1])
# Plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label=f'AUC = {auc}')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()
print(f'AUC: {auc}')
Figures compared against the run that included hyperparameter tuning (Optuna):
Logistic Regression
LightGBM
CatBoost
Naive Bayes
As expected, Logistic Regression recorded the highest accuracy, and its recall also improved over the earlier tuned figures.
Naive Bayes also achieved decent accuracy while preserving recall well.
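Since the four baseline blocks above repeat the same evaluation code, a small helper (my own refactor sketch; it assumes each model has already been fitted and kept in its own variable) makes the comparison easier to reproduce:
# Collect the metrics used above into one comparison table
def evaluate(name, model, test_x, test_y):
    pred = model.predict(test_x)
    prob = model.predict_proba(test_x)[:, 1]
    return {'model': name,
            'accuracy': accuracy_score(test_y, pred),
            'precision': precision_score(test_y, pred),
            'recall': recall_score(test_y, pred),
            'auc': roc_auc_score(test_y, prob)}

# Hypothetical usage, assuming the fitted models are stored under separate names:
# results = pd.DataFrame([evaluate('LogisticRegression', lr_model, test_x, test_y),
#                         evaluate('LightGBM', lgbm_model, test_x, test_y)])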
In the gs_rfc_best = gs_rfc.best_estimator_ step, I made the mistake of leaving off the trailing underscore and running the fit, so I waited an hour only to be greeted by an AttributeError.