from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

def exec_fold(clf, fold=5):
    # Run K-fold cross-validation for the given classifier and report per-fold accuracy
    kfold = KFold(n_splits=fold)
    scores = []
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print("Cross-validation {0} accuracy: {1:.4f}".format(iter_count, accuracy))
    mean_score = np.mean(scores)
    print('Mean accuracy: {0:.4f}'.format(mean_score))
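A minimal usage sketch, assuming X_titanic_df and y_titanic_df are the preprocessed Titanic feature and label DataFrames from the earlier steps (the choice of classifier here is only illustrative):

from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(random_state=11)  # illustrative estimator
exec_fold(dt_clf, fold=5)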
If the data is not well shuffled and the class distribution is imbalanced, accuracy is not a suitable metric for evaluating the model.
Example)
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import pandas as pd

class MyFakeClassifier(BaseEstimator):
    # A fake classifier: learns nothing and always predicts 0 (the negative class)
    def fit(self, X, y):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

digits = load_digits()
# Binary target: 1 if the digit is 7, else 0 -> roughly 90% of the labels are 0
y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=11)

print('Label test set size:', y_test.shape)
print('Distribution of labels 0 and 1 in the test set')
print(pd.Series(y_test).value_counts())

fakeClf = MyFakeClassifier()
fakeClf.fit(X_train, y_train)
fakePred = fakeClf.predict(X_test)
print('Accuracy even when every prediction is 0: {:.3f}'.format(accuracy_score(y_test, fakePred)))
Even predicting 0 for every sample still gives about 90% accuracy, because 405 of the 450 test labels are 0.
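scikit-learn ships the same kind of majority-class baseline out of the box; a minimal sketch with DummyClassifier (using the X_train, y_train, X_test, y_test split created above) reproduces this ~0.9 accuracy:

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy='most_frequent')  # always predicts the majority class (0)
dummy_clf.fit(X_train, y_train)
print('DummyClassifier accuracy: {:.3f}'.format(dummy_clf.score(X_test, y_test)))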
The confusion matrix for the all-zero predictions makes the imbalance explicit:

print(confusion_matrix(y_test, fakePred))
[[405   0]
 [ 45   0]]

The layout of the matrix is:
[[TN FP]
 [FN TP]]
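The four cells can be unpacked directly, which shows exactly where the 90% figure comes from (continuing with the variables above):

tn, fp, fn, tp = confusion_matrix(y_test, fakePred).ravel()
print(tn, fp, fn, tp)                    # 405 0 45 0: every positive sample (digit 7) is missed
print((tn + tp) / (tn + fp + fn + tp))   # accuracy = 405 / 450 = 0.9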