# logistic function
import numpy as np
z = np.arange(-10, 10, 0.01)
g = 1 / (1+np.exp(-z))
z, g
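# Quick sanity check (illustrative, not in the original): the sigmoid maps 0 to 0.5,
# saturates toward 0 and 1 at the extremes, and satisfies g(-z) = 1 - g(z).
assert np.isclose(1 / (1 + np.exp(0)), 0.5)
assert np.allclose(1 / (1 + np.exp(z)), 1 - g)  # symmetry about (0, 0.5)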
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(z, g)
plt.figure(figsize=(12, 8))
ax = plt.gca()
# gca() : get the current Axes so its settings can be modified
ax.plot(z, g)
# spines : position or hide the axis spines
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
plt.show()
# Plot of the logistic regression cost function
h = np.arange(0.01, 1, 0.01)
C0 = -np.log(1-h)
C1 = -np.log(h)
plt.figure(figsize=(12, 8))
plt.plot(h, C0, label = 'y=0')
plt.plot(h, C1, label = 'y=1')
plt.legend()
plt.show()
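# The two curves above are the pieces of the single-sample cross-entropy cost,
# cost(h, y) = -y*log(h) - (1-y)*log(1-h). A minimal sketch combining them:
def log_loss_sample(h, y):
    """Cross-entropy cost for one prediction h in (0, 1) and label y in {0, 1}."""
    return -y * np.log(h) - (1 - y) * np.log(1 - h)

print(log_loss_sample(0.9, 1), log_loss_sample(0.9, 0))  # small cost vs. large cost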
# ๋ฐ์ดํฐ ๋ฐ๊ธฐ
import pandas as pd
wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'
wine = pd.read_csv(wine_url, index_col=0)
wine.head()
# ๋ง ๋ฑ๊ธ ๋ง๋ค๊ธฐ
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
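# Illustrative check (not in the original): see how balanced the new binary target is.
wine['taste'].value_counts(normalize=True)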
# ๋ฐ์ดํฐ ๋ถ๋ฆฌ
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# ๊ฐ๋จ ๋ก์ง์คํฑ ํ๊ท ํ
์คํธ
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression(solver='liblinear', random_state=13)
# solver='liblinear' : use the liblinear optimization algorithm
lr.fit(X_train, y_train)
y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
# ์ค์ผ์ผ๋ฌ๊น์ง ์ ์ฉํด์ ํ์ดํ๋ผ์ธ ๊ตฌ์ถ
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
estimators = [('scaler', StandardScaler()),  # StandardScaler : standardize each feature to zero mean and unit variance
              ('clf', LogisticRegression(solver='liblinear', random_state=13))]  # classifier
estimators
pipe = Pipeline(estimators)
pipe
# fit the pipeline
pipe.fit(X_train, y_train)
# ์์นํจ๊ณผ๊ฐ ์๊ธด ํ๋ค
y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
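# Illustrative (not in the original): the fitted scaler is accessible by its step
# name, so we can confirm what it learned (per-feature means and scales).
pipe['scaler'].mean_, pipe['scaler'].scale_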
# Decision Tree์์ ๋น๊ต
from sklearn.tree import DecisionTreeClassifier
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
models = {'logistic regression' : pipe, 'decision tree' : wine_tree}
models
# AUC ๊ทธ๋ํ๋ฅผ ์ด์ฉํ ๋ชจ๋ธ๊ฐ ๋น๊ต
from sklearn.metrics import roc_curve
plt.figure(figsize=(10, 8))
plt.plot([0, 1], [0, 1], label = 'random_guess')
for model_name, model in models.items():
    pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    plt.plot(fpr, tpr, label=model_name)
plt.grid()
plt.legend()
plt.show()
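# The ROC plot can also be summarized numerically: roc_auc_score gives the
# area under each curve (higher is better). A short sketch:
from sklearn.metrics import roc_auc_score
for model_name, model in models.items():
    pred = model.predict_proba(X_test)[:, 1]
    print(model_name, ':', roc_auc_score(y_test, pred))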
# ๋ฐ์ดํฐ ์ฝ๊ธฐ
import pandas as pd
PIMA_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/diabetes.csv'
PIMA = pd.read_csv(PIMA_url)
PIMA.head()
# ๋ฐ์ดํฐ ํ์ธ
PIMA.info()
# float์ผ๋ก ๋ฐ์ดํฐ ๋ณํ
PIMA = PIMA.astype('float')
PIMA.info()
# ์๊ด๊ด๊ณ ํ์ธ
# Outcome๊ณผ ๋ค๋ฅธ ํน์ฑ๊ณผ์ ๊ด๊ณ
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(12, 10))
sns.heatmap(PIMA.corr(), cmap='YlGnBu')
plt.show()
# 0์ธ ๋ฐ์ดํฐ ์กด์ฌ โ
# 0์ด๋ผ๋ ์ซ์๊ฐ ํ์์ ์๋ ๊ฒ์ ๋ฌธ์ ๋ก ๋ณด์
(PIMA==0).astype(int).sum()
# ์ํ์ ์ง์๊ณผ PIMA ์ธ๋์ธ์ ๋ํ ์ ๋ณด๊ฐ ์์ผ๋ฏ๋ก ์ผ๋จ ํ๊ท ๊ฐ์ผ๋ก ๋์ฒด
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
PIMA[zero_features] = PIMA[zero_features].replace(0, PIMA[zero_features].mean())
(PIMA==0).astype(int).sum()
# ๋ฐ์ดํฐ ๋๋๊ธฐ
from sklearn.model_selection import train_test_split
X = PIMA.drop(['Outcome'], axis=1)
y = PIMA['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13,
                                                    stratify=y)  # stratify keeps the class ratio the same in both splits
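# Illustrative check (not in the original) that stratify preserved the class ratio.
print(y.mean(), y_train.mean(), y_test.mean())  # positive-class proportion in each set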
# Pipeline ๋ง๋ค๊ธฐ
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
estimators = [('scaler', StandardScaler()),
              ('clf', LogisticRegression(solver='liblinear', random_state=13))]
estimators
pipe_lr = Pipeline(estimators)
pipe_lr
pipe_lr.fit(X_train, y_train)
pred = pipe_lr.predict(X_test)
pred
# ์์น ํ์ธ
# ์๋์ ์๋ฏธ๋ฅผ ๊ฐ์ง ์ ์์ด์ ์ด ์์น ์์ฒด๋ฅผ ํ๊ฐํ ์๋ ์๋ค.
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
roc_auc_score, f1_score)
print('Accuracy : ', accuracy_score(y_test, pred))
print('Recall : ', recall_score(y_test, pred))
print('Precision : ', precision_score(y_test, pred))
print('AUC score : ', roc_auc_score(y_test, pred))
print('f1 score : ', f1_score(y_test, pred))
# ๋ค๋ณ์ ๋ฐฉ์ ์์ ๊ฐ ๊ณ์ ๊ฐ ํ์ธ
coeff = list(pipe_lr['clf'].coef_[0])
# coef_ : array of coefficients, one per feature
labels = list(X_train.columns)
coeff
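# A common way to read logistic-regression coefficients (illustrative, not part
# of the original analysis): exp(coef) is the multiplicative change in the odds
# of the positive class per unit increase in the (scaled) feature.
import numpy as np
odds_ratios = pd.Series(np.exp(coeff), index=labels).sort_values()
odds_ratios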
# ์ค์ feature ๊ทธ๋ํ
# ํฌ๋๋น, BMI ๋ฑ์ ๋น๋จ์ ์ํฅ์ ๋ฏธ์น๋ ์ ๋๊ฐ ๋๋ค.
# ํ์์ ์์ธก์ ๋ถ์ ์ ์ํฅ์ ์ค๋ค.
# ์ฐ๋ น์ด BMI๋ณด๋ค ์ธจ๋ ฅ ๋ณ์์ ๋ ๊ด๋ จ๋์ด ์์์ง๋ง, ๋ชจ๋ธ์ BMI์ Glucose์ ๋ ์์กดํจ
features = pd.DataFrame({'Features':labels, 'importance':coeff})
features.sort_values(by=['importance'], ascending=True, inplace=True)
features['positive'] = features['importance'] > 0
features.set_index('Features', inplace=True)
features['importance'].plot(kind='barh', figsize=(11, 6),
                            color=features['positive'].map({True: 'blue', False: 'red'}))
plt.xlabel('Importance')
plt.show()
features
# ๋ฐ์ดํฐ ๋ฐ๊ธฐ
import pandas as pd
wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'
wine = pd.read_csv(wine_url, index_col=0)
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
# ๋ฐ์ดํฐ ๋ถ๋ฆฌ
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# ๊ฐ๋จํ ๋ก์ง์คํฑ ํ๊ท ์ ์ฉ
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression(solver='liblinear', random_state=13)
lr.fit(X_train, y_train)
y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
# classification_report
# macro avg : unweighted average over the classes
# weighted avg : average weighted by each class's support (sample count)
from sklearn.metrics import classification_report
print(classification_report(y_test, lr.predict(X_test)))
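# Illustrative: the macro and weighted averages in the report can be reproduced
# directly with the `average` argument of the metric functions.
from sklearn.metrics import f1_score
print('macro f1    :', f1_score(y_test, lr.predict(X_test), average='macro'))
print('weighted f1 :', f1_score(y_test, lr.predict(X_test), average='weighted'))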
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lr.predict(X_test))
# precision_recall curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
%matplotlib inline
plt.figure(figsize=(10, 8))
pred = lr.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, pred)
plt.plot(thresholds, precisions[:len(thresholds)], label = 'precision')
plt.plot(thresholds, recalls[:len(thresholds)], label = 'recall')
plt.grid()
plt.legend()
plt.show()
# threshold = 0.5 is the default decision boundary
pred_proba = lr.predict_proba(X_test)
pred_proba[:3]  # [probability of class 0, probability of class 1]
# ๊ฐ๋จํ ํ์ธํด๋ณด๊ธฐ
import numpy as np
np.concatenate([pred_proba, y_pred_test.reshape(-1, 1)], axis=1)
# reshape(-1, 1) : infer the number of rows, force the last dimension to 1 (a column vector)
pred_proba
y_pred_test
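# Illustrative check (not in the original): for a binary classifier, predict()
# is equivalent to thresholding the positive-class probability at 0.5.
np.all((pred_proba[:, 1] > 0.5).astype(int) == y_pred_test)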
# threshold ๋ฐ๊ฟ๋ณด๊ธฐ - Binarizer
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.6).fit(pred_proba)
pred_bin = binarizer.transform(pred_proba)[:, 1]
pred_bin
print(classification_report(y_test, lr.predict(X_test)))  # report at the default threshold, for comparison
# classification report at the new threshold
print(classification_report(y_test, pred_bin))
# ๋ค์ confusion matrix
confusion_matrix(y_test, pred_bin)
๐ป ์ถ์ฒ : ์ ๋ก๋ฒ ์ด์ค ๋ฐ์ดํฐ ์ทจ์ ์ค์ฟจ