😢 Study Note (Machine Learning 7)

zoe · May 22, 2023

Logistic Regression

  • A classification problem must predict 0 or 1, but if Linear Regression is applied as-is, the prediction hθ(x) can take values below 0 or above 1
  • Modify the hypothesis function so that hθ(x) always stays between 0 and 1

# logistic function

import numpy as np

z = np.arange(-10, 10, 0.01)
g = 1 / (1+np.exp(-z))
z, g
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(z, g)  # quick preview of the sigmoid
plt.figure(figsize=(12, 8))
ax = plt.gca()
# gca(): get the current Axes so its settings can be changed

ax.plot(z, g)
# spines: reposition the axis lines
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')

plt.show()


  • Decision Boundary: the line where hθ(x) = 0.5, i.e. θᵀx = 0; one side is predicted 1, the other 0 (see the sketch below)
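
A minimal sketch, assuming a hypothetical one-feature model with weights θ = (-3, 1): the predicted class flips exactly where θᵀx = 0.

# Decision boundary sketch (hypothetical weights, not from the original notes)
import numpy as np

theta0, theta1 = -3, 1                        # assumed parameters
x = np.arange(0, 6, 0.01)
h = 1 / (1 + np.exp(-(theta0 + theta1 * x)))  # hypothesis h(x) = g(theta0 + theta1*x)

boundary = -theta0 / theta1                   # solve theta0 + theta1*x = 0
print(boundary)                               # 3.0 -> predict 1 for x >= 3, else 0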


  • Redefine the cost function for Logistic Regression: squared error through the sigmoid is non-convex, so use Cost(hθ(x), y) = -log(hθ(x)) if y = 1, and -log(1 - hθ(x)) if y = 0

  • The learning algorithm (gradient descent) stays the same

# Graph of the Logistic Regression cost function

h = np.arange(0.01, 1, 0.01)

C0 = -np.log(1-h)
C1 = -np.log(h)

plt.figure(figsize=(12, 8))
plt.plot(h, C0, label = 'y=0')
plt.plot(h, C1, label = 'y=1')
plt.legend()

plt.show()
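
Averaging these per-example terms gives the full cross-entropy cost. A minimal vectorized sketch (my addition, reusing the h array above):

# Averaged cross-entropy cost, J(theta) = -(1/m) * sum(y*log(h) + (1-y)*log(1-h))
# (my addition; h holds probabilities, y holds 0/1 labels of the same shape)
def cross_entropy_cost(h, y):
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

y_demo = np.ones_like(h)               # pretend every label is 1
print(cross_entropy_cost(h, y_demo))   # reduces to the mean of -log(h)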




Practice - Wine Data

  • Wine quality grades
# Load the data

import pandas as pd

wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'

wine = pd.read_csv(wine_url, index_col=0)
wine.head()
# Create a binary taste label

wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]

X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
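A quick look at the label balance before splitting (my addition); the quality > 5 cut decides how many 1s vs 0s we get:

# Class balance of the new taste label (my addition)
y.value_counts()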
# ๋ฐ์ดํ„ฐ ๋ถ„๋ฆฌ

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# Quick logistic regression test

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(solver='liblinear', random_state=13)
# solver='liblinear': choose liblinear as the optimization algorithm
lr.fit(X_train, y_train)
y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
# Build a pipeline that also applies a scaler

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

estimators = [('scaler', StandardScaler()),  # StandardScaler: standardize each feature to zero mean and unit variance
              ('clf', LogisticRegression(solver='liblinear', random_state=13))]  # the classifier
estimators
pipe = Pipeline(estimators)
pipe
# fit

pipe.fit(X_train, y_train)
# ์ƒ์Šนํšจ๊ณผ๊ฐ€ ์žˆ๊ธด ํ•˜๋‹ค

y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
# Compare with a Decision Tree

from sklearn.tree import DecisionTreeClassifier

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

models = {'logistic regression' : pipe, 'decision tree' : wine_tree}

models
# Compare the models with ROC curves (AUC)

from sklearn.metrics import roc_curve

plt.figure(figsize=(10, 8))
plt.plot([0, 1], [0, 1], label = 'random_guess')

for model_name, model in models.items():
    pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    plt.plot(fpr, tpr, label = model_name)
    
plt.grid()
plt.legend()
plt.show()
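
To attach a number to each curve, the same predicted probabilities can be passed to roc_auc_score (a small addition to the original comparison):

# Numeric AUC per model (my addition)
from sklearn.metrics import roc_auc_score

for model_name, model in models.items():
    proba = model.predict_proba(X_test)[:, 1]
    print(model_name, ':', roc_auc_score(y_test, proba))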




Practice - PIMA Indian Diabetes Prediction

  • The original dataset comes from Kaggle

# Read the data

import pandas as pd

PIMA_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/diabetes.csv'

PIMA = pd.read_csv(PIMA_url)
PIMA.head()
# ๋ฐ์ดํ„ฐ ํ™•์ธ

PIMA.info()
# Convert the columns to float

PIMA = PIMA.astype('float')
PIMA.info()
# ์ƒ๊ด€๊ด€๊ณ„ ํ™•์ธ
# Outcome๊ณผ ๋‹ค๋ฅธ ํŠน์„ฑ๊ณผ์˜ ๊ด€๊ณ„

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

plt.figure(figsize=(12, 10))
sns.heatmap(PIMA.corr(), cmap='YlGnBu')
plt.show()
# Zero values exist ★
# A literal 0 for blood pressure looks like a data problem

(PIMA==0).astype(int).sum()
# With no medical domain knowledge or background on the PIMA people, replace the zeros with the column means for now

zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
PIMA[zero_features] = PIMA[zero_features].replace(0, PIMA[zero_features].mean())
(PIMA==0).astype(int).sum()
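Mean imputation is only a stopgap; one alternative sketch (my addition, on a fresh copy so the notes above keep the mean-imputed version) uses the median, which is less sensitive to outliers:

# Median imputation as an alternative (my addition), same replace pattern as above
PIMA_alt = pd.read_csv(PIMA_url).astype('float')
PIMA_alt[zero_features] = PIMA_alt[zero_features].replace(0, PIMA_alt[zero_features].median())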
# ๋ฐ์ดํ„ฐ ๋‚˜๋ˆ„๊ธฐ

from sklearn.model_selection import train_test_split

X = PIMA.drop(['Outcome'], axis=1)
y = PIMA['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13,
                                                    stratify=y)  # stratify: keep the class ratio the same in both splits
# Build the pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

estimators = [('scaler', StandardScaler()), 
              ('clf', LogisticRegression(solver='liblinear', random_state=13))]

estimators
pipe_lr = Pipeline(estimators)
pipe_lr
pipe_lr.fit(X_train, y_train)
pred = pipe_lr.predict(X_test)
pred
# Check the metrics
# These numbers are hard to judge on their own; they only gain meaning relative to another model or baseline.

from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             roc_auc_score, f1_score)

print('Accuracy : ', accuracy_score(y_test, pred))
print('Recall : ', recall_score(y_test, pred))
print('Precision : ', precision_score(y_test, pred))
print('AUC score : ', roc_auc_score(y_test, pred))
print('f1 score : ', f1_score(y_test, pred))
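One caveat: roc_auc_score above receives hard 0/1 predictions; feeding it the class-1 probability instead gives the usual ROC AUC (a sketch, my addition):

# AUC from predicted probabilities instead of hard labels (my addition)
proba_test = pipe_lr.predict_proba(X_test)[:, 1]
print('AUC (from probabilities) : ', roc_auc_score(y_test, proba_test))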
# Inspect each coefficient of the fitted multivariate equation

coeff = list(pipe_lr['clf'].coef_[0])
# coef_: array holding the coefficient for each feature
labels = list(X_train.columns)
coeff
# Feature-importance graph
# Glucose, BMI, etc. have a strong influence on diabetes.
# Blood pressure contributes negatively to the prediction.
# Age was more correlated with the target variable than BMI, yet the model relies more on BMI and Glucose.


features = pd.DataFrame({'Features': labels, 'importance': coeff})
features.sort_values(by=['importance'], ascending=True, inplace=True)
features['positive'] = features['importance'] > 0
features.set_index('Features', inplace=True)
features['importance'].plot(kind='barh', figsize=(11, 6),
                            color=features['positive'].map({True: 'blue', False: 'red'}))

plt.xlabel('Importance')
plt.show()
features
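
Because the pipeline standardizes the features first, the coefficients are directly comparable, and exponentiating them gives odds ratios per one standard deviation of each feature (a sketch, my addition):

# Odds ratios (my addition): exp(coef) = multiplicative change in the odds
import numpy as np

odds = pd.Series(np.exp(coeff), index=labels).sort_values(ascending=False)
odds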




Precision and Recall - the trade-off between precision and recall

# ๋ฐ์ดํ„ฐ ๋ฐ›๊ธฐ

import pandas as pd

wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'

wine = pd.read_csv(wine_url, index_col=0)
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]

X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
# ๋ฐ์ดํ„ฐ ๋ถ„๋ฆฌ

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# Apply a simple logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(solver='liblinear', random_state=13)
lr.fit(X_train, y_train)

y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))

  • classification_report
  • macro avg: simple (unweighted) average over the classes
  • weighted avg: average weighted by each class's distribution (i.e. reflecting support)
# classification_report

from sklearn.metrics import classification_report

print(classification_report(y_test, lr.predict(X_test)))
  • confusion matrix
# confusion matrix

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, lr.predict(X_test))
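
sklearn arranges the matrix with true labels as rows and predictions as columns, so the cells read [[TN, FP], [FN, TP]]; labeling it makes that explicit (my addition):

# Labeled confusion matrix (my addition): rows = true class, columns = predicted
import pandas as pd

cm = confusion_matrix(y_test, lr.predict(X_test))
pd.DataFrame(cm, index=['true 0', 'true 1'], columns=['pred 0', 'pred 1'])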

  • precision_recall curve
# precision_recall curve

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

%matplotlib inline

plt.figure(figsize=(10, 8))
pred = lr.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, pred)
plt.plot(thresholds, precisions[:len(thresholds)], label='precision')
plt.plot(thresholds, recalls[:len(thresholds)], label='recall')
# precisions/recalls each have one more entry than thresholds, hence the slicing
plt.grid()
plt.legend()
plt.show()

# threshold = 0.5 is the default classification cutoff

pred_proba = lr.predict_proba(X_test)
pred_proba[:3]  # each row: [probability of class 0, probability of class 1]
# Quick sanity check

import numpy as np

np.concatenate([pred_proba, y_pred_test.reshape(-1, 1)], axis=1)
# reshape(-1, 1): reshape into a single column, letting numpy infer the row count
pred_proba
y_pred_test
# Try a different threshold - Binarizer

from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=0.6).fit(pred_proba)
pred_bin = binarizer.transform(pred_proba)[:, 1]
pred_bin
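The same cut works without Binarizer by thresholding the class-1 probability directly (an equivalent one-liner, my addition):

# Equivalent without Binarizer (my addition): values above 0.6 map to 1
pred_bin_alt = (pred_proba[:, 1] > 0.6).astype(int)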
print(classification_report(y_test, lr.predict(X_test)))  # default 0.5 threshold, for comparison
# classification report again, now at threshold 0.6

print(classification_report(y_test, pred_bin))
# confusion matrix again, at threshold 0.6

confusion_matrix(y_test, pred_bin)

💻 Source: Zerobase Data Job School
