# Read the data
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
# The two datasets have the same structure
red_wine.head()
white_wine.head()
# Column names
white_wine.columns
# Combine the two datasets
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])
wine.info()
# The quality column has grades from 3 to 9
wine['quality'].unique()
# histogram
import plotly.express as px
fig = px.histogram(wine, x='quality')
fig.show()
# Histogram of quality, split by red/white wine
fig = px.histogram(wine, x='quality', color = 'color')
fig.show()
# Separate the label
X = wine.drop(['color'], axis=1)
Y = wine['color']
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)
np.unique(Y_train, return_counts=True)
# How evenly were red and white wines distributed between the train and test sets?
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Histogram(x=X_train['quality'], name='Train'))
fig.add_trace(go.Histogram(x=X_test['quality'], name='Test'))
fig.update_layout(barmode = 'overlay')
fig.update_traces(opacity = 0.75)
fig.show()
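# A hedged side note: if you want to force the red/white class ratio to be the
# same in both splits, train_test_split can stratify on the label. The _s-suffixed
# variable names below are only for this illustration and are not used later.
X_train_s, X_test_s, Y_train_s, Y_test_s = train_test_split(
    X, Y, test_size=0.2, random_state=13, stratify=Y)
print(np.unique(Y_train_s, return_counts=True))
print(np.unique(Y_test_s, return_counts=True))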
# Decision tree
from sklearn.tree import DecisionTreeClassifier
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
from sklearn.metrics import accuracy_score
y_pred_tr = wine_tree.predict(X_train)  # to check the training score as well
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# Boxplots of a few features of the wine data
# The columns have different min/max ranges, and different means and variances.
# This kind of feature scale imbalance can get in the way of finding the optimal model.
fig = go.Figure()
fig.add_trace(go.Box(y=X['fixed acidity'], name = 'fixed acidity'))
fig.add_trace(go.Box(y=X['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X['quality'], name='quality'))
fig.show()
# This is where MinMaxScaler and StandardScaler come in.
# For decision trees this kind of preprocessing has little effect.
# It mainly helps when a model is trained by optimizing a cost function.
# Whether MinMaxScaler or StandardScaler works better has to be checked empirically.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
MMS = MinMaxScaler()
SS = StandardScaler()
SS.fit(X)
MMS.fit(X)
X_ss = SS.transform(X)
X_mms = MMS.transform(X)
X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)
X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)
# MinMaxScaler: forces the maximum and minimum of each column to 1 and 0
fig = go.Figure()
fig.add_trace(go.Box(y=X_mms_pd['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X_mms_pd['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X_mms_pd['quality'], name = 'quality'))
fig.show()
# StandardScaler: shifts each column to mean 0 and scales it to standard deviation 1
def px_box(target_df):
    fig = go.Figure()
    fig.add_trace(go.Box(y=target_df['fixed acidity'], name='fixed acidity'))
    fig.add_trace(go.Box(y=target_df['chlorides'], name='chlorides'))
    fig.add_trace(go.Box(y=target_df['quality'], name='quality'))
    fig.show()
px_box(X_ss_pd)
# Retrain with MinMaxScaler applied
# As noted above, this preprocessing has almost no effect on decision trees.
X_train, X_test, Y_train, Y_test = train_test_split(X_mms_pd, Y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test) )
# Apply StandardScaler
X_train, X_test, Y_train, Y_test = train_test_split(X_ss_pd, Y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test) )
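# A hedged side experiment to illustrate the earlier note that scaling mainly
# matters for models trained by optimizing a cost function. LogisticRegression is
# used here purely as an example of such a model (an assumption, not part of the
# original notes); the same split is made from raw and standardized features.
from sklearn.linear_model import LogisticRegression

X_train_raw, X_test_raw, Y_train_lr, Y_test_lr = train_test_split(X, Y, test_size=0.2, random_state=13)
X_train_sc, X_test_sc, _, _ = train_test_split(X_ss_pd, Y, test_size=0.2, random_state=13)
lr_raw = LogisticRegression(max_iter=1000, random_state=13).fit(X_train_raw, Y_train_lr)
lr_sc = LogisticRegression(max_iter=1000, random_state=13).fit(X_train_sc, Y_train_lr)
print('LogReg raw    Acc : ', accuracy_score(Y_test_lr, lr_raw.predict(X_test_raw)))
print('LogReg scaled Acc : ', accuracy_score(Y_test_lr, lr_sc.predict(X_test_sc)))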
# How does the decision tree separate white wine from red wine?
# total sulfur dioxide seems to play an important role.
# Important features for telling red wine and white wine apart
# Increasing max_depth changes these numbers as well.
dict(zip(X_train.columns, wine_tree.feature_importances_))
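# An optional one-liner on top of the dict above: sorting the same importances
# makes the dominant feature easier to spot at a glance.
sorted(zip(wine_tree.feature_importances_, X_train.columns), reverse=True)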
# Binarize the quality column
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]  # 1 = tasty (quality above 5)
wine.info()
# Go through the same steps as the red/white wine classification
X = wine.drop(['taste'], axis=1)
Y = wine['taste']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
# Is 100% accuracy possible? We should be suspicious.
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# Let's check why this happened
# The taste column was derived from quality, so the quality column should have been removed.
import matplotlib.pyplot as plt
import sklearn.tree as tree
plt.figure(figsize=(12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns)
# Try again
X = wine.drop(['taste', 'quality'], axis=1)
Y = wine['taste']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# Which wines can be called "tasty"?
import matplotlib.pyplot as plt
import sklearn.tree as tree
plt.figure(figsize=(12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns)
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])
X = wine.drop(['color'], axis=1)
Y = wine['color']
Cross validation
- Useful for accurately describing the performance of a model applied to the data we were given.
- Overfitting: the model becomes excessively optimized for the training data only, so its prediction performance on unseen (generalized) data drops sharply.
- holdout
- k-fold cross validation
- stratified k-fold cross validation
- After validation is finished, the final evaluation is done on the test data.
# Implementing cross validation
# simple example
import numpy as np
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
Y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
print(kf.get_n_splits(X))
print(kf)
for train_idx, test_idx in kf.split(X):
    print('--- idx')
    print(train_idx, test_idx)
    print('--- train data')
    print(X[train_idx])
    print('--- val data')
    print(X[test_idx])
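# Note that KFold does not shuffle by default, so the folds follow the row order.
# A minimal sketch of the shuffled variant (shuffle=True and random_state=13 here
# are illustrative choices, not from the original notes):
kf_shuffled = KFold(n_splits=2, shuffle=True, random_state=13)
for train_idx, test_idx in kf_shuffled.split(X):
    print(train_idx, test_idx)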
# Back to the wine taste classification data
# Read the data
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])
# Prepare the data for the wine taste classifier
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
Y = wine['taste']
# How does the previous decision tree model do?
# But wait: what if someone asks, "Is splitting the data that way really optimal?"
# or "How can we trust that accuracy?"
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, Y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(Y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(Y_test, y_pred_test))
# KFold
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)  # use a lowercase name so the KFold class is not shadowed
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
# KFold returns indices
for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))
# Accuracy after training on each fold
# A model's accuracy is not necessarily a single number.
cv_accuracy = []
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    wine_tree_cv.fit(X_train, Y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(Y_test, pred))
cv_accuracy
# If the variance across the fold accuracies is not large, the mean can serve as the representative value
np.mean(cv_accuracy)
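# To support the comment above, a quick look at the spread of the fold scores
# (if the standard deviation is small, the mean is a fair single-number summary):
np.std(cv_accuracy)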
# StratifiedKFold
# https://continuous-development.tistory.com/166
from sklearn.model_selection import StratifiedKFold
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cv_accuracy = []
for train_idx, test_idx in skfold.split(X, Y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    Y_train = Y.iloc[train_idx]
    Y_test = Y.iloc[test_idx]
    wine_tree_cv.fit(X_train, Y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(Y_test, pred))
cv_accuracy
# Here the mean accuracy actually comes out a bit lower than with plain KFold; stratifying does not guarantee a better score
np.mean(cv_accuracy)
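# A hedged sketch of why the two splitters behave differently: compare the
# positive-class ratio in each validation fold (this check is illustrative,
# not part of the original notes).
for name, splitter in [('KFold', KFold(n_splits=5)), ('StratifiedKFold', StratifiedKFold(n_splits=5))]:
    ratios = [round(Y.iloc[test_idx].mean(), 3) for _, test_idx in splitter.split(X, Y)]
    print(name, ratios)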
# A more convenient way to run cross validation
from sklearn.model_selection import cross_val_score
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cross_val_score(wine_tree_cv, X, Y, scoring=None, cv=skfold)
# A larger depth does not automatically mean better accuracy
wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)
cross_val_score(wine_tree_cv, X, Y, scoring=None, cv=skfold)
def skfold_dt(depth):
    from sklearn.model_selection import cross_val_score
    skfold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)
    print(cross_val_score(wine_tree_cv, X, Y, scoring=None, cv=skfold))
skfold_dt(3)
# If you want to see the train scores as well
# Here we are also witnessing the overfitting itself
from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, Y, scoring=None, cv=skfold, return_train_score=True)
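# A small follow-up sketch: putting the cross_validate output into a DataFrame
# makes the train/test gap (the overfitting mentioned above) easier to read.
cv_results = cross_validate(wine_tree_cv, X, Y, scoring=None, cv=skfold, return_train_score=True)
pd.DataFrame(cv_results)[['train_score', 'test_score']]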
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
Y = wine['taste']
# GridSearchCV
# cv stands for cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
params = {'max_depth' : [2, 4, 7, 10]}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)  # redefined so this block is self-contained; the grid overrides max_depth
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
gridsearch.fit(X, Y)
# GridSearchCV results
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)
# The model with the best performance
gridsearch.best_estimator_
gridsearch.best_score_
gridsearch.best_params_
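# Optional cross-check: best_score_ is simply the mean cross-validated test score
# of the cv_results_ row selected by best_index_.
gridsearch.cv_results_['mean_test_score'][gridsearch.best_index_]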
# If you want to run GridSearch on a model wrapped in a pipeline
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
estimators = [('scaler', StandardScaler()), ('clf', DecisionTreeClassifier(random_state=13))]
pipe = Pipeline(estimators)
param_grid = [{'clf__max_depth': [2, 4, 7, 10]}]
GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
GridSearch.fit(X, Y)
# Best model
GridSearch.best_estimator_
# best_score_
GridSearch.best_score_
GridSearch.cv_results_
# One handy trick: summarize the tuning results in a table
# Check the mean and standard deviation of the accuracy
import pandas as pd
score_df = pd.DataFrame(GridSearch.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
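# An optional sketch (using plotly, as in the charts above) to visualize the same
# table: mean test accuracy per max_depth with its standard deviation as error bars.
# Extracting 'clf__max_depth' assumes the pipeline param_grid defined earlier.
import plotly.express as px
score_df['max_depth'] = [p['clf__max_depth'] for p in score_df['params']]
fig = px.scatter(score_df, x='max_depth', y='mean_test_score', error_y='std_test_score')
fig.show()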
💻 Source: Zerobase Data School