데이터 수집/가공/변환 → 모델 학습/예측 → 모델평가 반복
다양한 모델, 다양한 파라미터를 두고, 상대적으로 비교한다.
- ํ๊ท๋ชจ๋ธ๋ค์ ์ค์ ๊ฐ๊ณผ ์๋ฌ์น๋ฅผ ๊ฐ์ง๊ณ ๊ณ์ฐ
- ๋ถ๋ฅ ๋ชจ๋ธ์ ํ๊ฐ ํญ๋ชฉ์ด ์กฐ๊ธ ๋ง๋ค (์ ํ๋, ์ค์ฐจํ๋ ฌ, ์ ๋ฐ๋, ์ฌํ์จ, F1 score, ROC AUC ๋ฑ๋ฑ...)
- ์ด์ง ๋ถ๋ฅ ๋ชจ๋ธ์ ํ๊ฐ
- TP True Positive : ์ค์ Positive๋ฅผ Positive๋ผ๊ณ ๋ง์ถ ๊ฒฝ์ฐ
- FN False Negative : ์ค์ Positive๋ฅผ Negative๋ผ๊ณ ํ๋ฆฌ๊ฒ ์์ธกํ ๊ฒฝ์ฐ
- TN True Negative : ์ค์ Negative๋ฅผ Negative๋ผ๊ณ ๋ง์ถ ๊ฒฝ์ฐ
- FP False Positive : ์ค์ Negative๋ฅผ Positive๋ผ๊ณ ํ๋ฆฌ๊ฒ ์์ธกํ ๊ฒฝ์ฐ
Accuracy : ์ ์ฒด ๋ฐ์ดํฐ ์ค์ ๋ง๊ฒ ์์ธกํ ๊ฒ์ ๋น์จ
Precision : ์ฐธ(์์ฑ)์ด๋ผ๊ณ ์์ธกํ ๊ฒ ์ค์์ ์ค์ ์ฐธ(์์ฑ)์ ๋น์จ (precision = TP / (TP + FP)) (ex) ์คํธ๋ฉ์ผ์ด๋ผ๊ณ ์์ธกํ๋๋ฐ ์คํธ์ด ์๋ ๊ฒฝ์ฐ ๊ณค๋ํ๋ค)
โ RECALL(TPR TRUE POSITIVE RATIO) : ์ฐธ์ธ ๋ฐ์ดํฐ๋ค ์ค์์ ์ฐธ์ด๋ผ๊ณ ์์ธกํ ๊ฒ (recall = TP / (TP + FN)) โ
FALL_OUT(FPR FALSE POSITIVE RATIO) : ์ค์ ์์ฑ์ด ์๋๋ฐ, ์์ฑ์ด๋ผ๊ณ ์๋ชป ์์ธกํ ๊ฒฝ์ฐ
- F1 - Score : Recall๊ณผ Precision์ ๊ฒฐํฉํ ์งํ. Recall๊ณผ Precision์ด ์ด๋ ํ์ชฝ์ผ๋ก ์น์ฐ์น์ง ์๊ณ ๋ ๋ค ๋์ ๊ฐ์ ๊ฐ์ง ์๋ก ๋์ ๊ฐ์ ๊ฐ์ง
ROC ๊ณก์ :
โป FPR : FALL_OUT(FPR FALSE POSITIVE RATIO)
โป TPR : RECALL(TPR TRUE POSITIVE RATIO)
AUC : ROC ๊ณก์ ์ ์๋์ ๋ฉด์ . ์ผ๋ฐ์ ์ผ๋ก 1์ ๊ฐ๊น์ธ์๋ก ์ข์ ์์น. ๊ธฐ์ธ๊ธฐ๊ฐ 1์ธ ์ง์ ์๋์ ๋ฉด์ ์ด 0.5 โ AUC๋ 0.5๋ณด๋ค ์ปค์ผ ํจ
# Back to the wine-taste classification data.
# Load the data.
import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

# Both files are read with ';' as the field separator.
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Tag every row with its source before stacking: 1 = red, 0 = white.
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])

# Binary target: 1 when quality is above 5, otherwise 0.
wine['taste'] = [int(grade > 5) for grade in wine['quality']]

# Features leave out the target and the column it was derived from.
X = wine.drop(columns=['taste', 'quality'])
Y = wine['taste']
# Quick decision-tree baseline.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=13
)

# Depth-2 tree fitted on the training split only (fit returns the estimator).
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13).fit(
    X_train, Y_train
)

# Predict both splits first, then report the accuracy of each.
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# Compute each evaluation metric on the test split.
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score, roc_curve)

# predict_proba yields one column per class (P(class 0), P(class 1)).
# Keep the positive-class column as the ranking score; it must be bound
# here because roc_auc_score and roc_curve below both consume it.
pred_proba = wine_tree.predict_proba(X_test)[:, 1]

print('Accuracy : ', accuracy_score(Y_test, y_pred_test))
print('Recall : ', recall_score(Y_test, y_pred_test))
print('Precision : ', precision_score(Y_test, y_pred_test))
# AUC is the area under the ROC curve, so it is scored with the same
# probabilities handed to roc_curve rather than with thresholded 0/1 labels.
print('AUC Score : ', roc_auc_score(Y_test, pred_proba))
print('F1 Score : ', f1_score(Y_test, y_pred_test))

wine_tree.predict_proba(X_test)  # probability of class 0, probability of class 1
roc_curve(Y_test, pred_proba)  # (fpr, tpr, thresholds)
# Plot the ROC curve
import matplotlib.pyplot as plt
%matplotlib inline
# Positive-class probability (second predict_proba column) per test row.
class_probabilities = wine_tree.predict_proba(X_test)
pred_proba = class_probabilities[:, 1]

fpr, tpr, thresholds = roc_curve(Y_test, pred_proba)
fpr
tpr
thresholds

plt.figure(figsize=(10, 8))
# Red dashed diagonal from (0, 0) to (1, 1), drawn as a reference line.
plt.plot([0, 1], [0, 1], 'r', linestyle='dashed')
plt.plot(fpr, tpr)
plt.grid()
plt.show()
# Polynomial function
import numpy as np
import matplotlib.pyplot as plt

# 100 evenly spaced samples on [-3, 2] and the parabola y = 3x^2 + 2.
x = np.linspace(-3, 2, num=100)
y = 3 * np.square(x) + 2
x
y

plt.figure(figsize=(12, 8))
plt.plot(x, y)
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.show()
# style
import matplotlib as mpl

# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' and later releases dropped the old names, so use the new
# name when this installation lists it and the legacy name otherwise.
_grid_style = 'seaborn-v0_8-whitegrid'
if _grid_style not in mpl.style.available:
    _grid_style = 'seaborn-whitegrid'
mpl.style.use(_grid_style)

# Same curve as before, redrawn under the new style with larger axis labels.
plt.figure(figsize=(12, 8))
plt.plot(x, y)
plt.xlabel('$x$', fontsize=25)
plt.ylabel('$y$', fontsize=25)
plt.show()
# Shifting a polynomial along the x axis
x = np.linspace(-5, 5, num=100)
y1 = 3 * np.square(x) + 2        # reference parabola
y2 = 3 * np.square(x + 1) + 2    # same parabola with x replaced by x + 1

# plot
plt.figure(figsize=(12, 8))
plt.plot(x, y1, linewidth=2, linestyle='dashed', label='$y=3x^2 + 2$')
plt.plot(x, y2, label='$y=3(x+1)^2 + 2$')
plt.legend(fontsize=15)
plt.xlabel('$x$', fontsize=25)
plt.ylabel('$y$', fontsize=25)
plt.show()
# Exponential functions
# Several cases, to get a feel for how exponential functions behave
x = np.linspace(-2, 2, num=100)

# Bases greater than 1.
a11, a12, a13 = 2, 3, 4
y11, y12, y13 = (base**x for base in (a11, a12, a13))

# Bases between 0 and 1.
a21, a22, a23 = 1/2, 1/3, 1/4
y21, y22, y23 = (base**x for base in (a21, a22, a23))

# Graph: one panel per family; each curve is (values, format args, label).
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
curves_per_panel = (
    [(y11, (), r'$2^x$'), (y12, ('--',), r'$3^x$'), (y13, (':',), r'$4^x$')],
    [(y21, (), r'$(1/2)^x$'), (y22, ('--',), r'$(1/3)^x$'),
     (y23, (':',), r'$(1/4)^x$')],
)
for panel, curves in zip(ax, curves_per_panel):
    for values, fmt, text in curves:
        panel.plot(x, values, *fmt, color='k', label=text)
    panel.legend(fontsize=20)
# Compare the polynomial x^2 with the exponential 2^x on [0, 10].
x = np.linspace(0, 10)
squared = np.square(x)
doubled = np.power(2, x)

plt.figure(figsize=(6, 6))
plt.plot(x, squared, '--', color='k', label=r'$x^2$')
plt.plot(x, doubled, color='k', label=r'$2^x$')
plt.legend(loc='center left', fontsize=25)
plt.xlabel('$x$', fontsize=25)
plt.ylabel('$y$', fontsize=25)
plt.show()
# Evaluate (1 + 1/x)^x for x = 10, 100, ..., 100000.
x = np.array([10 ** exponent for exponent in range(1, 6)])
np.power(1 + 1 / x, x)
# Data for plotting logarithm functions
def log(x, base):
    """Return the logarithm of ``x`` in the given ``base``.

    Uses the change-of-base identity ``log_base(x) = ln(x) / ln(base)``.
    Both logarithms are taken with ``np.log``, so ``x`` may be a scalar or
    a NumPy array, in which case the result is computed element-wise.
    """
    return np.log(x) / np.log(base)
# Two sampling grids over (0, 5]; x1 starts closer to zero and is denser.
x1 = np.linspace(0.0001, 5, 1000)
x2 = np.linspace(0.01, 5, 100)
# Bases 10 and e for the left panel, their reciprocals for the right panel.
y11, y12 = log(x1, 10), log(x2, np.e)
y21, y22 = log(x1, 1/10), log(x2, 1/np.e)

# Prepare the figure
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

ax[0].plot(x1, y11, label=r'$log_{10} x$', color='k')
ax[0].plot(x2, y12, '--', label=r'$log_{e} x$', color='k')
ax[0].set_xlabel('$x$', fontsize=25)
ax[0].set_ylabel('$y$', fontsize=25)
ax[0].legend(fontsize=20, loc='lower right')

# y21 is computed with base 1/10, so the legend entry names that base.
ax[1].plot(x1, y21, label=r'$log_{1/10} x$', color='k')
ax[1].plot(x2, y22, '--', label=r'$log_{1/e} x$', color='k')
ax[1].set_xlabel('$x$', fontsize=25)
ax[1].set_ylabel('$y$', fontsize=25)
ax[1].legend(fontsize=20, loc='upper right')

plt.show()
# Sigmoid
z = np.linspace(-10, 10, 100)
sigma = 1 / (1 + np.exp(-z))

plt.figure(figsize=(12, 8))
plt.plot(z, sigma)
plt.xlabel('$z$', fontsize=25)
# Raw string: '\s' is not a valid escape sequence in a plain string literal,
# and the backslash has to reach the math-text renderer unchanged.
plt.ylabel(r'$\sigma(z)$', fontsize=25)
plt.show()
벡터의 표현
단일 변수 스칼라 표현
다중 변수 스칼라 함수
다변수 벡터 함수
다변수 벡터함수 예제)
# Multivariable vector function example
u = np.linspace(0, 1, 30)
v = np.linspace(0, 1, 30)

# A single meshgrid call builds the 30x30 coordinate grids; X and Y are
# kept as additional names for the same grids.
U, V = np.meshgrid(u, v)
X = U
Y = V
U, V

# Surface value at every grid point.
Z = (1 + U**2) + (V / (1 + V**2))
Z  # show the surface just computed (upper-case Z, not the sigmoid input z)

# A small 3x3 grid that makes the layout returned by meshgrid easy to read.
t = np.linspace(0, 4, 3)
p = np.linspace(0, 4, 3)
T, P = np.meshgrid(t, p)
T, P
# Plot
fig = plt.figure(figsize=(7, 7))
ax = plt.axes(projection='3d')

# Same tick-label size on all three axes.
ax.xaxis.set_tick_params(labelsize=15)
ax.yaxis.set_tick_params(labelsize=15)
ax.zaxis.set_tick_params(labelsize=15)

ax.set_xlabel(r'$x$', fontsize=20)
ax.set_ylabel(r'$y$', fontsize=20)
# The third label goes on the z axis; a second set_xlabel call would
# overwrite the x label and leave z unlabelled.
ax.set_zlabel(r'$z$', fontsize=20)

ax.scatter3D(U, V, Z, marker='.', color='gray')
plt.show()
# Function composition example
x = np.linspace(-4, 4, num=100)
y = x**3 - 15 * x + 30   # inner function f(x)
z = np.log(y)            # outer function applied to f(x)

# Shape of each function on its own
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
inner_panel, outer_panel = ax

inner_panel.plot(x, y, label=r'$x^3 - 15x + 30$', color='k')
inner_panel.legend(fontsize=18)

# The logarithm is drawn against y, i.e. against its own argument.
outer_panel.plot(y, z, label=r'$\log(y)$', color='k')
outer_panel.legend(fontsize=18)

plt.show()
# Shape of the composed function
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
composed_panel, overlay_panel = ax

# Left: log(f(x)) plotted directly against x.
composed_panel.plot(x, z, '--', label=r'$\log(f(x))$', color='k')
composed_panel.legend(fontsize=18)

# Right: f(x) on the panel's own axes ...
overlay_panel.plot(x, y, label=r'$x^3 - 15x + 30$', color='k')
overlay_panel.legend(fontsize=18)

# ... and log(f(x)) on a twin Axes created from the same panel, so the two
# curves are drawn over the same x range.
ax_tmp = overlay_panel.twinx()
ax_tmp.plot(x, z, '--', label=r'$\log(f(x))$', color='k')

plt.show()
# Simple data
sample = [1, 7, 9, 16, 36, 39, 45, 45, 46, 48, 51, 100, 101]
# Constant height of 1 per point so the values line up on one horizontal row.
tmp_y = [1 for _ in sample]
tmp_y

# plot
plt.figure(figsize=(12, 4))
plt.scatter(sample, tmp_y)
plt.grid()
plt.show()
# How to find the summary indicators
np.median(sample)
np.percentile(sample, 25)

# 25th and 75th percentiles in one call; their difference is the IQR.
first_quartile, third_quartile = np.percentile(sample, [25, 75])
third_quartile - first_quartile
iqr = third_quartile - first_quartile
1.5 * iqr
# Plot
q1 = np.percentile(sample, 25)
q2 = np.median(sample)
q3 = np.percentile(sample, 75)
iqr = q3 - q1

# Fences sit 1.5 * IQR beyond the quartiles on either side; the lower one
# mirrors the upper one (multiplying by 1.5, not subtracting it).
upper_fence = q3 + iqr * 1.5
lower_fence = q1 - iqr * 1.5

plt.figure(figsize=(12, 4))
plt.scatter(sample, tmp_y)
# Solid lines mark the quartiles (median in red), dashed lines the fences.
plt.axvline(x=q1, color='black')
plt.axvline(x=q2, color='red')
plt.axvline(x=q3, color='black')
plt.axvline(x=upper_fence, color='black', ls='dashed')
plt.axvline(x=lower_fence, color='black', ls='dashed')
plt.grid()
plt.show()
# Using a plotting framework
import seaborn as sns

plt.figure(figsize=(12, 4))
# Pass the values as the ``x`` keyword: a bare positional argument is read
# as ``x`` by older seaborn releases but as ``data`` by newer ones, which
# changes the orientation of the plot.
# NOTE(review): boxenplot draws a letter-value plot; if a classic box plot
# matching the quartile/fence figure was intended, sns.boxplot may be the
# call that was meant - confirm.
sns.boxenplot(x=sample)
plt.grid()
plt.show()
어렵..
💻 출처 : 제로베이스 데이터 취업 스쿨