๐Ÿ˜ข ์Šคํ„ฐ๋””๋…ธํŠธ (Machine Learning 4)

zoeยท2023๋…„ 5์›” 17์ผ
0

Decision Tree๋ฅผ ์ด์šฉํ•œ ์™€์ธ ๋ฐ์ดํ„ฐ ๋ถ„์„ - Wine

# ๋ฐ์ดํ„ฐ ์ฝ๊ธฐ

import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
# ๋‘ ๋ฐ์ดํ„ฐ์˜ ๊ตฌ์กฐ๋Š” ๋™์ผํ•˜๋‹ค

red_wine.head()
white_wine.head()
# ์ปฌ๋Ÿผ ์ข…๋ฅ˜

white_wine.columns
# ๋‘ ๋ฐ์ดํ„ฐ ํ•ฉ์น˜๊ธฐ

red_wine['color'] = 1
white_wine['color'] = 0

wine = pd.concat([red_wine, white_wine])
wine.info()
# quality ์ปฌ๋Ÿผ์€ 3๋ถ€ํ„ฐ 9๋“ฑ๊ธ‰๊นŒ์ง€ ์กด์žฌ

wine['quality'].unique()
# histogram

import plotly.express as px

fig = px.histogram(wine, x='quality')
fig.show()
# ๋ ˆ๋“œ/ํ™”์ดํŠธ ์™€์ธ๋ณ„๋กœ ๋“ฑ๊ธ‰ Histogram

fig = px.histogram(wine, x='quality', color = 'color')
fig.show()




Decision Tree๋ฅผ ์ด์šฉํ•œ ์™€์ธ ๋ฐ์ดํ„ฐ ๋ถ„์„ - ๋ ˆ๋“œ์™€์ธ ํ™”์ดํŠธ ์™€์ธ ๋ถ„๋ฅ˜๊ธฐ

# Separate the label (color) from the features

X = wine.drop(['color'], axis=1)
Y = wine['color']
# Split the data into training and test sets

from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

np.unique(Y_train, return_counts=True)
# Are the train/test sets reasonably balanced across red/white wine?

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(x=X_train['quality'], name='Train'))
fig.add_trace(go.Histogram(x=X_test['quality'], name='Test'))

fig.update_layout(barmode = 'overlay')
fig.update_traces(opacity = 0.75)
fig.show()
# Decision tree classifier

from sklearn.tree import DecisionTreeClassifier

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
from sklearn.metrics import accuracy_score

y_pred_tr  = wine_tree.predict(X_train) # predict on train too, to inspect the training score
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))




Decision Tree๋ฅผ ์ด์šฉํ•œ ์™€์ธ ๋ฐ์ดํ„ฐ ๋ถ„์„ - ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ

# ์™€์ธ ๋ฐ์ดํ„ฐ์˜ ๋ช‡ ๊ฐœ ํ•ญ๋ชฉ์˜ Boxplot
# ์ปฌ๋Ÿผ๋“ค์˜ ์ตœ๋Œ€/์ตœ์†Œ ๋ฒ”์œ„๊ฐ€ ๊ฐ๊ฐ ๋‹ค๋ฅด๊ณ , ํ‰๊ท ๊ณผ ๋ถ„์‚ฐ์ด ๊ฐ๊ฐ ๋‹ค๋ฅด๋‹ค.
# ํŠน์„ฑ(feature)์˜ ํŽธํ–ฅ ๋ฌธ์ œ๋Š” ์ตœ์ ์˜ ๋ชจ๋ธ์„ ์ฐพ๋Š”๋ฐ ๋ฐฉํ•ด๊ฐ€ ๋  ์ˆ˜๋„ ์žˆ๋‹ค.

fig = go.Figure()
fig.add_trace(go.Box(y=X['fixed acidity'], name = 'fixed acidity'))
fig.add_trace(go.Box(y=X['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X['quality'], name='quality'))

fig.show()
# ์ด๋Ÿด ๋•Œ ์“ฐ๋Š” ๊ฒƒ์ด MinMaxScaler์™€ StandardScaler์ด๋‹ค
# ๊ฒฐ์ •๋‚˜๋ฌด์—์„œ๋Š” ์ด๋Ÿฐ ์ „์ฒ˜๋ฆฌ๋Š” ์˜๋ฏธ๋ฅผ ๊ฐ€์ง€์ง€ ์•Š๋Š”๋‹ค.
# ์ฃผ๋กœ Cost Function์„ ์ตœ์ ํ™”ํ•  ๋•Œ ์œ ํšจํ•  ๋•Œ๊ฐ€ ์žˆ๋‹ค.
# MinMaxScaler์™€ StandardScaler ์ค‘ ์–ด๋–ค ๊ฒƒ์ด ์ข‹์„์ง€๋Š” ํ•ด๋ด์•ผ ์•ˆ๋‹ค.

from sklearn.preprocessing import MinMaxScaler, StandardScaler

MMS = MinMaxScaler()
SS = StandardScaler()

SS.fit(X)
MMS.fit(X)

X_ss = SS.transform(X)
X_mms = MMS.transform(X)

X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)
X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)
# MinMaxScaler : ์ตœ๋Œ€ ์ตœ์†Œ๊ฐ’์„ 1๊ณผ 0์œผ๋กœ ๊ฐ•์ œ๋กœ ๋งž์ถ”๋Š” ๊ฒƒ

fig = go.Figure()
fig.add_trace(go.Box(y=X_mms_pd['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X_mms_pd['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X_mms_pd['quality'], name = 'quality'))

fig.show()
# StandardScaler : centers each feature to mean 0 and standard deviation 1


def px_box(target_df, columns=('fixed acidity', 'chlorides', 'quality')):
    """Show boxplots of selected columns of *target_df*.

    Generalized: the column list was hard-coded; it is now a parameter
    whose default matches the columns inspected earlier, so existing
    calls behave identically.

    Parameters
    ----------
    target_df : pandas.DataFrame containing the requested columns.
    columns : iterable of column names to plot.
    """
    fig = go.Figure()
    # one Box trace per column, named after the column
    for col in columns:
        fig.add_trace(go.Box(y=target_df[col], name=col))

    fig.show()

px_box(X_ss_pd)
# Retrain using the MinMaxScaler-transformed data
# Again: for decision trees this preprocessing has almost no effect

X_train, X_test, Y_train, Y_test = train_test_split(X_mms_pd, Y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test) )
# Apply StandardScaler instead

X_train, X_test, Y_train, Y_test = train_test_split(X_ss_pd, Y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test) )
# How does the decision tree separate white wine from red wine?
# total sulfur dioxide appears to play the key role.
# It is the important feature for distinguishing red vs white wine.
# Raising max_depth changes these importance values too.

dict(zip(X_train.columns, wine_tree.feature_importances_))




Decision Tree๋ฅผ ์ด์šฉํ•œ ์™€์ธ ๋ฐ์ดํ„ฐ ๋ถ„์„ - ๋ง›์˜ ์ด์ง„ ๋ถ„๋ฅ˜

# quality ์ปฌ๋Ÿผ์„ ์ด์ง„ํ™”

wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']] # โ˜…
wine.info()
# ๋ ˆ๋“œ/ํ™”์ดํŠธ ์™€์ธ ๋ถ„๋ฅ˜์™€ ๋™์ผ ๊ณผ์ •์„ ๊ฑฐ์น˜์ž

X = wine.drop(['taste'], axis=1)
Y = wine['taste']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
# 100ํ”„๋กœ ๊ฐ€๋Šฅํ•œ๊ฐ€? โ†’ ์˜์‹ฌํ•ด์•ผ ํ•œ๋‹ค
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# ์™œ ์ด๋Ÿฐ ์ผ์ด ์ƒ๊ฒผ๋Š”์ง€ ํ™•์ธํ•ด๋ณด์ž
# quality ์ปฌ๋Ÿผ์œผ๋กœ taste ์ปฌ๋Ÿผ์„ ๋งŒ๋“ค์—ˆ์œผ๋‹ˆ quality ์ปฌ๋Ÿผ์€ ์ œ๊ฑฐ ํ–ˆ์–ด์•ผ ํ–ˆ๋‹ค

import matplotlib.pyplot as plt
import sklearn.tree as tree

plt.figure(figsize=(12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns)
# ๋‹ค์‹œ ์ง„ํ–‰

X = wine.drop(['taste', 'quality'], axis=1)
Y = wine['taste']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# ์–ด๋–ค ์™€์ธ์„ โ€œ๋ง›์žˆ๋‹คโ€๊ณ  ํ•  ์ˆ˜ ์žˆ๋‚˜?

import matplotlib.pyplot as plt
import sklearn.tree as tree

plt.figure(figsize=(12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns)




Pipeline

# Reload the wine data and rebuild features/label for the Pipeline section

import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

red_wine['color'] = 1
white_wine['color'] = 0

wine = pd.concat([red_wine, white_wine])
X = wine.drop(['color'], axis=1)
Y = wine['color']
  • ๋ ˆ๋“œ/ํ™”์ดํŠธ ์™€์ธ ๋ถ„๋ฅ˜๊ธฐ์˜ ๋™์ž‘ Process
    [scaler] StandardScaler() → train_test_split() → [clf] DecisionTreeClassifier()
  • ์—ฌ๊ธฐ์„œ train_test_split์€ Pipeline ๋‚ด๋ถ€๊ฐ€ ์•„๋‹ˆ๋‹ค




ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹ - ๊ต์ฐจ๊ฒ€์ฆ

  • ๊ต์ฐจ๊ฒ€์ฆ
    - ๋‚˜์—๊ฒŒ ์ฃผ์–ด์ง„ ๋ฐ์ดํ„ฐ์— ์ ์šฉํ•œ ๋ชจ๋ธ์˜ ์„ฑ๋Šฅ์„ ์ •ํ™•ํžˆ ํ‘œํ˜„ํ•˜๊ธฐ ์œ„ํ•ด์„œ๋„ ์œ ์šฉํ•˜๋‹ค.

  • ๊ณผ์ ํ•ฉ : ๋ชจ๋ธ์ด ํ•™์Šต ๋ฐ์ดํ„ฐ์—๋งŒ ๊ณผ๋„ํ•˜๊ฒŒ ์ตœ์ ํ™”๋œ ํ˜„์ƒ. ๊ทธ๋กœ ์ธํ•ด ์ผ๋ฐ˜ํ™”๋œ ๋ฐ์ดํ„ฐ์—์„œ๋Š” ์˜ˆ์ธก ์„ฑ๋Šฅ์ด ๊ณผํ•˜๊ฒŒ ๋–จ์–ด์ง€๋Š” ํ˜„์ƒ

  • holdout

  • k(์ˆซ์ž)-fold cross validation

  • stratified k-fold cross validation

  • ๊ฒ€์ฆ validation์ด ๋๋‚œ ํ›„ test์šฉ ๋ฐ์ดํ„ฐ๋กœ ์ตœ์ข… ํ‰๊ฐ€

# holdout
# k-fold cross validation
# stratified k-fold cross validation
# after validation, run the final evaluation on the test data
# Implementing cross validation
# simple example

import numpy as np
from sklearn.model_selection import KFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
Y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)

print(kf.get_n_splits(X))
print(kf)
# KFold.split yields (train_indices, validation_indices) index pairs
for train_idx, test_idx in kf.split(X):
    print('--- idx')
    # fix: dropped the stray trailing comma that turned this statement
    # into a useless (None,) tuple expression
    print(train_idx, test_idx)
    print('--- train data')
    print(X[train_idx])
    print('--- val data')
    print(X[test_idx])
# ๋‹ค์‹œ ์™€์ธ ๋ง› ๋ถ„๋ฅ˜ํ•˜๋˜ ๋ฐ์ดํ„ฐ๋กœ\
# ๋ฐ์ดํ„ฐ ์ฝ๊ธฐ

import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

red_wine['color'] = 1
white_wine['color'] = 0

wine = pd.concat([red_wine, white_wine])
# ์™€์ธ ๋ง› ๋ถ„๋ฅ˜๊ธฐ๋ฅผ ์œ„ํ•œ ๋ฐ์ดํ„ฐ ์ •๋ฆฌ

wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]

X = wine.drop(['taste', 'quality'], axis=1)
Y = wine['taste']
# How does the earlier decision tree model do here?
# But wait — who says splitting the data this one way is the best choice?
# If someone asks "how can we trust that accuracy?" ...

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# KFold

from sklearn.model_selection import KFold

# fix: the original rebound the name `KFold` to the instance, shadowing
# the imported class; use a lowercase instance name instead
kfold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
# KFold returns index arrays

for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))
# Fit on each fold's training split and record the accuracy
# A model does not have just one single accuracy value.

cv_accuracy = []

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    wine_tree_cv.fit(X_train, Y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(Y_test, pred))

cv_accuracy
# If the per-fold accuracies do not vary much, use the mean as the representative value

np.mean(cv_accuracy)
# StratifiedKFold keeps the class ratio of Y the same in every fold
# reference: https://continuous-development.tistory.com/166

from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cv_accuracy = []

for train_idx, test_idx in skfold.split(X, Y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    Y_train = Y.iloc[train_idx]
    Y_test = Y.iloc[test_idx]
    
    wine_tree_cv.fit(X_train, Y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(Y_test, pred))

cv_accuracy
# The mean accuracy turns out worse here

np.mean(cv_accuracy)
# cross validation์„ ๋ณด๋‹ค ๊ฐ„ํŽธํžˆ ํ•˜๋Š” ๋ฐฉ๋ฒ•

from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cross_val_score(wine_tree_cv, X, Y, scoring=None, cv=skfold)
# depth๊ฐ€ ๋†’๋‹ค๊ณ  ๋ฌด์กฐ๊ฑด acc๊ฐ€ ์ข‹์•„์ง€๋Š” ๊ฒƒ๋„ ์•„๋‹ˆ๋‹ค

wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)

cross_val_score(wine_tree_cv, X, Y, scoring=None, cv=skfold)
def skfold_dt(depth):
    from sklearn.model_selection import cross_val_score

    skfold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)

    print(cross_val_score(wine_tree_cv, X, Y, scoring=None, cv=skfold))
skfold_dt(3)
# train score์™€ ํ•จ๊ป˜ ๋ณด๊ณ  ์‹ถ์„ ๊ฒฝ์šฐ
# ํ˜„์žฌ ์šฐ๋ฆฌ๋Š” ๊ณผ์ ํ•ฉ ํ˜„์ƒ๋„ ํ•จ๊ป˜ ๋ชฉ๊ฒฉํ•˜๊ณ  ์žˆ๋‹ค
from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, Y, scoring=None, cv=skfold, return_train_score=True)




ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹

  • ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹ : ๋ชจ๋ธ์˜ ์„ฑ๋Šฅ์„ ํ™•๋ณดํ•˜๊ธฐ ์œ„ํ•ด ์กฐ์ ˆํ•˜๋Š” ์„ค์ • ๊ฐ’
  • ํŠœ๋‹ ๋Œ€์ƒ : ๊ฒฐ์ •๋‚˜๋ฌด์—์„œ ํŠœ๋‹ํ•ด ๋ณผ๋งŒํ•œ ๊ฒƒ์€ max_depth์ด๋‹ค.
# Reload the wine data and build the taste label for hyperparameter tuning
import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

red_wine['color'] = 1
white_wine['color'] = 0

wine = pd.concat([red_wine, white_wine])
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]

X = wine.drop(['taste', 'quality'], axis=1)
Y = wine['taste']
# GridSearchCV
# cv is the number of cross-validation folds

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

params = {'max_depth' : [2, 4, 7, 10]}
# fix: build a fresh estimator here instead of reusing the already-fitted
# `wine_tree` from a much earlier section; results are identical because
# GridSearchCV clones the estimator and sets max_depth from the grid anyway
wine_tree = DecisionTreeClassifier(random_state=13)
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv = 5)
gridsearch.fit(X, Y)
# GridSearchCV results

import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)
# The best-performing model

gridsearch.best_estimator_
gridsearch.best_score_
gridsearch.best_params_
# ๋งŒ์•ฝ pipeline์„ ์ ์šฉํ•œ ๋ชจ๋ธ์— GridSearch๋ฅผ ์ ์šฉํ•˜๊ณ  ์‹ถ๋‹ค๋ฉด

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

estimators = [('scaler', StandardScaler()), ('clf', DecisionTreeClassifier(random_state=13))]

pipe = Pipeline(estimators)
param_grid = [{'clf__max_depth': [2, 4, 7, 10]}]

GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
GridSearch.fit(X, Y)
# best ๋ชจ๋ธ
GridSearch.best_estimator_
# best_score_

GridSearch.best_score_
GridSearch.cv_results_


ํ‘œ๋กœ ์„ฑ๋Šฅ ๊ฒฐ๊ณผ๋ฅผ ์ •๋ฆฌ

# A handy trick — summarize the performance results in a table
# Check the mean and standard deviation of the accuracy

import pandas as pd

score_df = pd.DataFrame(GridSearch.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]

๐Ÿ’ป ์ถœ์ฒ˜ : ์ œ๋กœ๋ฒ ์ด์Šค ๋ฐ์ดํ„ฐ ์ทจ์—… ์Šค์ฟจ

profile
#๋ฐ์ดํ„ฐ๋ถ„์„ #ํผํฌ๋จผ์Šค๋งˆ์ผ€ํŒ… #๋ฐ์ดํ„ฐ #๋””์ง€ํ„ธ๋งˆ์ผ€ํŒ…

0๊ฐœ์˜ ๋Œ“๊ธ€