import pandas as pd
df = pd.DataFrame({
    'A' : ['a', 'b', 'c', 'a', 'b'],
    'B' : [1, 2, 3, 1, 0]
})
df
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['A'])
le.classes_
-> array(['a', 'b', 'c'], dtype=object)
df['le_A'] = le.transform(df['A'])
df
-> mapped as 'a'->0, 'b'->1, 'c'->2
le.transform(['a', 'b'])
-> array([0, 1])
le.fit_transform(df['A'])
-> array([0, 1, 2, 0, 1])
le.inverse_transform([1, 2, 2, 2])
-> array(['b', 'c', 'c', 'c'], dtype=object)
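A caveat worth remembering: LabelEncoder.transform raises an error for any label it did not see during fit, so unseen categories at prediction time need separate handling. A minimal sketch using the le fitted above:
try:
    le.transform(['d'])   # 'd' never appeared in the data passed to fit
except ValueError as e:
    print(e)              # message like "y contains previously unseen labels: ..."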
df = pd.DataFrame({
    'A' : [10, 20, -10, 0, 25],
    'B' : [1, 2, 3, 1, 0]
})
df
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit(df)
mms.data_max_, mms.data_min_, mms.data_range_
-> (array([25., 3.]), array([-10., 0.]), array([35., 3.]))
df_mms = mms.transform(df)
df_mms
mms.inverse_transform(df_mms)
mms.fit_transform(df)
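As a sanity check, the transform can be reproduced by hand from the fitted attributes: with the default feature_range, MinMaxScaler computes (x - data_min_) / data_range_ per column. A small sketch using mms and df from above:
import numpy as np
manual = (df.values - mms.data_min_) / mms.data_range_
np.allclose(manual, mms.transform(df))   # -> True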
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(df)
ss.mean_, ss.scale_
-> (array([9. , 1.4]), array([12.80624847, 1.0198039 ]))
df_ss = ss.transform(df)
df_ss
ss.fit_transform(df)
ss.inverse_transform(df_ss)
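The reported values can be checked the same way: mean_ is the column mean and scale_ is the population standard deviation (ddof=0), so the transform is (x - mean_) / scale_. A small sketch with ss and df from above:
import numpy as np
np.allclose(ss.mean_, df.mean(axis=0))           # -> True
np.allclose(ss.scale_, df.std(axis=0, ddof=0))   # -> True (note: pandas defaults to ddof=1)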
df = pd.DataFrame({
    'A' : [-0.1, 0., 0.1, 0.2, 0.3, 0.4, 1.0, 1.1, 5]
})
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
mm = MinMaxScaler()
ss = StandardScaler()
rs = RobustScaler()
df_scaler = df.copy()
df_scaler['MinMax'] = mm.fit_transform(df)
df_scaler['Standard'] = ss.fit_transform(df)
df_scaler['Robust'] = rs.fit_transform(df)
df_scaler
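RobustScaler is the newcomer here: it centers on the median and scales by the IQR (75th minus 25th percentile), which is why the outlier at 5 barely distorts the remaining values. A small verification sketch against the table above:
import numpy as np
q1, q2, q3 = np.percentile(df['A'], [25, 50, 75])
manual = (df['A'] - q2) / (q3 - q1)          # (x - median) / IQR
np.allclose(manual, df_scaler['Robust'])     # -> True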
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style='whitegrid')
plt.figure(figsize=(16,6))
sns.boxplot(data=df_scaler, orient='h');
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial' + \
'/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial' + \
'/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
white_wine.columns
-> Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
red_wine['color'] = 1.
white_wine['color'] = 0.
wine = pd.concat([red_wine, white_wine])
wine.info()
wine['quality'].unique()
-> array([5, 6, 7, 4, 8, 3, 9], dtype=int64)
import plotly.express as px
fig = px.histogram(wine, x='quality')
fig.show()
fig = px.histogram(wine, x='quality', color='color')
fig.show()
Red wine vs. white wine classifier
Separating the label
X = wine.drop(['color'], axis=1)
y = wine['color']
from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
np.unique(y_train, return_counts=True)
-> (array([0., 1.]), array([3913, 1284], dtype=int64))
-> confirms 3,913 samples of class 0 and 1,284 of class 1 (the classes are imbalanced)
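Given the imbalance just confirmed, train_test_split can be asked to preserve the 0/1 ratio in both splits via stratify (the same option is used with the Pipeline later on). A minimal sketch with new names (X_tr, etc.) so the split above stays intact:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)
np.unique(y_tr, return_counts=True)   # class proportions now mirror the full dataset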
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Histogram(x=X_train['quality'], name='Train'))
fig.add_trace(go.Histogram(x=X_test['quality'], name='Test'))
fig.update_layout(barmode='overlay') # draw the two histograms on top of each other
fig.update_traces(opacity=0.75) # opacity, so both remain visible
fig.show()
from sklearn.tree import DecisionTreeClassifier
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
fig = go.Figure()
fig.add_trace(go.Box(y=X['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X['quality'], name='quality'))
fig.show()
-> The columns have different min/max ranges, and different means and variances
-> Such differences in feature scale can get in the way of finding the optimal model
-> This is what MinMaxScaler and StandardScaler are for
from sklearn.preprocessing import MinMaxScaler, StandardScaler
MMS = MinMaxScaler()
SS = StandardScaler()
SS.fit(X)
MMS.fit(X)
X_ss = SS.transform(X)
X_mms = MMS.transform(X)
X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)
X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)
-> For a decision tree this preprocessing is pointless: splits are simple thresholds, and a monotonic rescaling does not change which side of a split a sample falls on
-> Scaling mainly matters when optimizing a cost function (e.g. gradient-based models)
-> Whether MinMaxScaler or StandardScaler works better can only be settled by trying both
fig = go.Figure()
fig.add_trace(go.Box(y=X_mms_pd['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X_mms_pd['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X_mms_pd['quality'], name='quality'))
fig.show()
fig = go.Figure()
fig.add_trace(go.Box(y=X_ss_pd['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X_ss_pd['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X_ss_pd['quality'], name='quality'))
fig.show()
X_train, X_test, y_train, y_test = train_test_split(X_mms_pd, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
X_train, X_test, y_train, y_test = train_test_split(X_ss_pd, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
dict(zip(X_train.columns, wine_tree.feature_importances_))
{'fixed acidity': 0.0,
'volatile acidity': 0.0,
'citric acid': 0.0,
'residual sugar': 0.0,
'chlorides': 0.24230360549660776,
'free sulfur dioxide': 0.0,
'total sulfur dioxide': 0.7576963945033922,
'density': 0.0,
'pH': 0.0,
'sulphates': 0.0,
'alcohol': 0.0,
'quality': 0.0}
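For readability, the importances can be wrapped in a pandas Series and sorted; at max_depth=2 only two features carry any weight. A small convenience sketch:
imp = pd.Series(wine_tree.feature_importances_, index=X_train.columns)
imp.sort_values(ascending=False).head()   # total sulfur dioxide and chlorides dominate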
wine['taste'] = [1. if grade>5 else 0. for grade in wine['quality']]
wine.info()
X = wine.drop(['taste'], axis=1)
y = wine['taste']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
-> Result:
Train Acc : 1.0
Test Acc : 1.0
import matplotlib.pyplot as plt
import sklearn.tree as tree
plt.figure(figsize=(12,6))
tree.plot_tree(wine_tree, feature_names=X.columns)
-> Since the taste column was derived from the quality column, quality alone could predict it 100% (data leakage)
-> The quality column must be dropped
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
plt.figure(figsize=(12,8))
tree.plot_tree(wine_tree, feature_names=X.columns, rounded=True, filled=True)
plt.show()
-> The tree shows the criteria that separate a 'tasty' wine
-> Higher 'alcohol' and 'free sulfur dioxide' indicate a tasty wine
Pipeline: a feature to use when the execution order of preprocessing and modeling code could otherwise get confused
Process of the red/white wine classifier:
-> StandardScaler() -> train_test_split() -> DecisionTreeClassifier()
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
estimators = [('scaler', StandardScaler()),
              ('clf', DecisionTreeClassifier())]
pipe = Pipeline(estimators)
pipe.steps
-> [('scaler', StandardScaler()), ('clf', DecisionTreeClassifier())]
pipe.set_params(clf__max_depth=2)
pipe.set_params(clf__random_state=13)
-> step name 'clf' + double underscore '__' + parameter name
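All settable names follow this step__param convention and can be listed with get_params; the two calls above can also be collapsed into one. A short sketch on the pipe above:
sorted(pipe.get_params().keys())   # includes e.g. 'clf__max_depth', 'clf__random_state'
pipe.set_params(clf__max_depth=2, clf__random_state=13)   # same effect in a single call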
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)
pipe.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
Cross-validation
: Overfitting is the phenomenon where a model becomes excessively optimized to the training data,
so its predictive performance drops sharply on data it must generalize to.
Cross-validation is therefore useful for reporting more accurately how a model performs on the given data.
Comparing the accuracy on the train data against the test data -> holdout
Splitting the train data into several parts, validating on each part, averaging the results, then doing a final evaluation on the test data -> k-fold (cross-validation)
import numpy as np
from sklearn.model_selection import KFold
X = np.array([[1,2], [3,4], [1,2], [3,4]])
y = np.array([1,2,3,4])
kf = KFold(n_splits=2) # choose how many folds to split into
print(kf.get_n_splits(X)) # reports the number of folds
print(kf)
for train_idx, test_idx in kf.split(X):
    print('--- idx')
    print(train_idx, test_idx)
    print('--- train data')
    print(X[train_idx])
    print('--- val data')
    print(X[test_idx])
2
KFold(n_splits=2, random_state=None, shuffle=False)
--- idx
[2 3] [0 1]
--- train data
[[1 2]
[3 4]]
--- val data
[[1 2]
[3 4]]
--- idx
[0 1] [2 3]
--- train data
[[1 2]
[3 4]]
--- val data
[[1 2]
[3 4]]
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial' + \
'/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial' + \
'/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
red_wine['color'] = 1.
white_wine['color'] = 0.
wine = pd.concat([red_wine, white_wine])
wine['taste'] = [1. if grade>5 else 0. for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))
cv_accuracy = []
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(X_train, y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy
[0.6007692307692307,
0.6884615384615385,
0.7090069284064665,
0.7628945342571208,
0.7867590454195535]
np.mean(cv_accuracy)
-> 0.709578255462782
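One caveat: KFold does not shuffle by default, and wine was built by concatenating red wines before white wines, so the unshuffled folds are not representative; this plausibly explains why the first fold scores only about 0.60. A one-line sketch of the fix:
kfold = KFold(n_splits=5, shuffle=True, random_state=13)   # shuffle rows before splitting into folds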
from sklearn.model_selection import StratifiedKFold
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cv_accuracy = []
for train_idx, test_idx in skfold.split(X, y): # StratifiedKFold needs y as well, to stratify on it
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(X_train, y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy
[0.5523076923076923,
0.6884615384615385,
0.7143956889915319,
0.7321016166281755,
0.7567359507313318]
np.mean(cv_accuracy)
-> 0.6888004974240539
-> The mean accuracy is actually worse!! (stratified is not automatically better)
from sklearn.model_selection import cross_val_score
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold)
-> array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])
-> A higher max_depth does not automatically improve accuracy!!
def skfold_dt(depth):
    skfold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)
    print(cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold))
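Calling the helper over a few depths makes the note above concrete; a short usage sketch:
for depth in [2, 4, 7, 10]:
    skfold_dt(depth)   # the per-fold accuracies do not uniformly improve with depth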
from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, y, scoring=None, cv=skfold, return_train_score=True)
{'fit_time': array([0.01027703, 0.01000023, 0.00897837, 0.0099802 , 0.01002574]),
'score_time': array([0.0030005 , 0.00200129, 0.00200009, 0.00100374, 0.0010016 ]),
'test_score': array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772]),
'train_score': array([0.78795459, 0.78045026, 0.77568295, 0.76356291, 0.76279338])}
-> Overfitting shows up (the test_score of each fold is noticeably lower than its train_score)
Hyperparameter tuning: adjusting a model's settings to secure its performance
Tuning target: in a decision tree, the parameter worth tuning so far is max_depth. We could simply loop over max_depth values and test each one, but there is a more convenient and useful tool:
GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
params = {'max_depth' : [2, 4, 7, 10]} # the parameter grid to search
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
gridsearch.fit(X, y)
-> The CV in the name stands for cross-validation
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)
{ 'mean_fit_time': array([0.00559764, 0.00799551, 0.01352868, 0.01753268]),
'mean_score_time': array([0.00120192, 0.00100427, 0.00099716, 0.00079865]),
'mean_test_score': array([0.6888005 , 0.66356523, 0.65340854, 0.64401587]),
'param_max_depth': masked_array(data=[2, 4, 7, 10],
mask=[False, False, False, False],
fill_value='?',
dtype=object),
'params': [ {'max_depth': 2}, # rank 1
            {'max_depth': 4}, # rank 2
            {'max_depth': 7}, # rank 3
            {'max_depth': 10}], # rank 4
'rank_test_score': array([1, 2, 3, 4]),
'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
'split1_test_score': array([0.68846154, 0.63153846, 0.60307692, 0.60076923]),
'split2_test_score': array([0.71439569, 0.72363356, 0.68360277, 0.66743649]),
'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.72517321]),
'std_fit_time': array([0.00120125, 0.00063204, 0.00072068, 0.00057751]),
'std_score_time': array([3.99685845e-04, 2.39085219e-06, 8.46516908e-06, 3.99455922e-04]),
'std_test_score': array([0.07179934, 0.08390453, 0.08727223, 0.07717557])}
gridsearch.best_estimator_
-> DecisionTreeClassifier(max_depth=2, random_state=13)
gridsearch.best_score_
-> 0.6888004974240539
gridsearch.best_params_
-> {'max_depth': 2}
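Because refit=True by default, GridSearchCV retrains the best model on all of X and y, and the fitted search object can then predict and score directly. A small sketch:
gridsearch.predict(X.iloc[:5])   # delegates to best_estimator_
gridsearch.score(X, y)           # accuracy of the refit best model on the full data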
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
estimators = [('scaler', StandardScaler()),
              ('clf', DecisionTreeClassifier(random_state=13))]
pipe = Pipeline(estimators)
param_grid = [{'clf__max_depth' : [2, 4, 7, 10]}]
GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
GridSearch.fit(X, y)
GridSearch.best_estimator_
-> Pipeline(steps=[('scaler', StandardScaler()), ('clf', DecisionTreeClassifier(max_depth=2, random_state=13))])
GridSearch.cv_results_
{'mean_fit_time': array([0.00711594, 0.01000628, 0.01406202, 0.01889477]),
'std_fit_time': array([0.00067744, 0.00089818, 0.00067699, 0.00080829]),
'mean_score_time': array([0.00170584, 0.00159731, 0.00120444, 0.00160127]),
'std_score_time': array([0.00040065, 0.0004857 , 0.00039914, 0.00049447]),
'param_clf__max_depth': masked_array(data=[2, 4, 7, 10],
mask=[False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'clf__max_depth': 2},
{'clf__max_depth': 4},
{'clf__max_depth': 7},
{'clf__max_depth': 10}],
'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
'split1_test_score': array([0.68846154, 0.63153846, 0.60461538, 0.60230769]),
'split2_test_score': array([0.71439569, 0.72363356, 0.68206313, 0.66589684]),
'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.72517321]),
'mean_test_score': array([0.6888005 , 0.66356523, 0.6534083 , 0.64401563]),
'std_test_score': array([0.07179934, 0.08390453, 0.08699322, 0.0769154 ]),
'rank_test_score': array([1, 2, 3, 4])}
import pandas as pd
score_df = pd.DataFrame(GridSearch.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
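To inspect the winning pipeline from the inside (for instance the tree's importances), the fitted steps are reachable through named_steps. A short sketch:
best_clf = GridSearch.best_estimator_.named_steps['clf']
dict(zip(X.columns, best_clf.feature_importances_))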
<Zerobase Data Job School>