빅데이터 분석 절차
1. 기획 (위험성 분석)
2. 데이터 수집
3. 데이터 전처리
4. 모델 선택
5. 평가 및 적용
위험성 분석
모델의 정확도
데이터 확인
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = sns.load_dataset('titanic') # seaborn에 유명한 데이터셋 몇 개가 이미 올라와 있다
df
df.shape
(891, 15)
df.pclass.value_counts()
3 491
1 216
2 184
Name: pclass, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
for i in df.columns:
print('feature: ',i)
print(df[i].value_counts())
plt.figure(figsize=(20,10))
df[i].value_counts().plot(kind='bar')
plt.show()
df.age.plot(kind='hist', bins=40)
df.describe() # include='all' 옵션을 주면 숫자형이 아닌 컬럼도 나온다.
결측치 처리
print('피쳐별 결측치: \n',df.isna().sum())
print('전체 결측치 수: ', df.isna().sum().sum())
피쳐별 결측치:
survived 0
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
deck 688
embark_town 2
alive 0
alone 0
dtype: int64
전체 결측치 수: 869
df.embark_town.value_counts().plot(kind='bar')
df.drop('embark_town', axis=1, inplace=True)
df.embarked.fillna('S', inplace=True) # S가 대부분이기 때문에 2개의 결측치를 S로 채움
grouped = df.groupby(['sex','pclass'])
df_sp_grouped = grouped.age.median().unstack()
df_sp_grouped
# df_sp_grouped has sex ('female','male') on the rows and pclass (1,2,3) on the
# columns (from groupby(['sex','pclass']).age.median().unstack()).
# BUG FIX: the original read positional cells iloc[0,0]..iloc[1,2], which paired
# the wrong sex/pclass with each variable (e.g. m2_md got the female/pclass-3
# median). Select by label so every median matches its group.
m1_md = df_sp_grouped.loc['male', 1]
f1_md = df_sp_grouped.loc['female', 1]
m2_md = df_sp_grouped.loc['male', 2]
f2_md = df_sp_grouped.loc['female', 2]
m3_md = df_sp_grouped.loc['male', 3]
f3_md = df_sp_grouped.loc['female', 3]
# pandas recommends putting the isna condition inside the .loc indexer
df.loc[(df.sex=='male') & (df.pclass==1) & (df.age.isna()), 'age'] = m1_md
df.loc[(df.sex=='female') & (df.pclass==1) & (df.age.isna()), 'age'] = f1_md
df.loc[(df.sex=='male') & (df.pclass==2) & (df.age.isna()), 'age'] = m2_md
df.loc[(df.sex=='female') & (df.pclass==2) & (df.age.isna()), 'age'] = f2_md
df.loc[(df.sex=='male') & (df.pclass==3) & (df.age.isna()), 'age'] = m3_md
df.loc[(df.sex=='female') & (df.pclass==3) & (df.age.isna()), 'age'] = f3_md
# deck은 77%정도가 결측치라서 삭제로 결정
df.drop(columns='deck', inplace=True)
df.isna().sum().sum()
0
결측치 제거 추가 연습
df1 = sns.load_dataset('titanic')
# 1. drop embark_town
df1.drop(columns='embark_town', inplace=True)
# 2. fill age's missing values with the column mean
# BUG FIX: the original used df.age.mean() — df's ages were already imputed
# above, so its mean differs; use df1's own mean as the comment intends.
df1.age.fillna(df1.age.mean(), inplace=True)
# 3. fill embarked and deck with 'N'
df1.embarked.fillna('N', inplace=True)
df1.deck = df1.deck.astype('object') # deck is a category dtype, so 'N' (a new category) cannot be assigned until the column is cast to object
df1.deck.fillna('N', inplace=True)
데이터 뜯어보기
df.corr() # 상관계수, 숫자형 데이터만 나옴
sns.heatmap(df.corr(), cmap='coolwarm', annot=True, cbar=True)
# annot=True 옵션은 칸 안에 숫자를 표시한다
# cmap='coolwarm' 옵션을 주면 음수는 파란색, 양수는 빨간색 계열로 나온다
가정
# pclass와 생존여부를 따로 보자
pd.crosstab(df.pclass, df.survived, margins=True) # margins=True 옵션을 주면 각 행, 열 별 합이 표시된다
pd.crosstab(df.pclass, df.survived).plot(kind='bar')
# 성별과 생존 여부
pd.crosstab(df.sex, df.survived, margins=True)
pd.crosstab(df.sex, df.survived).plot(kind='bar')
# pclass, 성별과 생존 여부
pd.crosstab([df.pclass, df.sex], df.survived, margins=True)
sns.violinplot(x='pclass', y='age', hue='survived', data=df, inner='quartile', split=True) # boxplot의 변형
# inner='quartile' 옵션은 분위수별로 선을 표시해준다
인코딩
Label Encoding
from sklearn.preprocessing import LabelEncoder
# df.dtypes[(df.dtypes=='object') | (df.dtypes=='bool') | (df.dtypes=='category')]
column_list = ['sex', 'embarked','class','who','adult_male','alive','alone']
for i in column_list:
encoder = LabelEncoder()
encoder.fit(df[i])
df[i] = encoder.transform(df[i])
column_list2 = ['sex','embarked','class','who','adult_male','deck','alive','alone']
for i in column_list2:
encoder = LabelEncoder()
encoder.fit(df1[i])
df1[i] = encoder.transform(df1[i])
데이터 전처리의 3단계
def standard_deviation(x):
    """Z-score standardize x: subtract its mean and divide by its std.

    (Despite the name, this returns the standardized values, not the
    standard deviation itself; the name is kept for callers.)
    """
    centered = x - x.mean()
    return centered / x.std()
def normalization(x):
    """Min-max scale x into the [0, 1] range."""
    lo = x.min()
    return (x - lo) / (x.max() - lo)
alive와 class 삭제
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', cbar=False)
df.drop(['alive','class'], axis=1, inplace=True)
df1.drop(['alive','class'], axis=1, inplace=True)
데이터 분리
X = df.drop('survived',axis=1)
y = df.survived
X1 = df1.drop('survived', axis=1)
y1 = df1.survived
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
학습
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier(n_jobs=2) # n_jobs는 CPU의 코어를 몇개 사용할지 결정, -1은 전체 사용
lr_clf = LogisticRegression(n_jobs=2)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTree accuracy score: %.2f' %accuracy_score(y_test, dt_pred))
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForest accuracy score: %.2f' %accuracy_score(y_test, rf_pred))
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('LogisticRegression accuracy score: %.2f' %accuracy_score(y_test, lr_pred))
DecisionTree accuracy score: 0.77
RandomForest accuracy score: 0.82
LogisticRegression accuracy score: 0.80
K-Fold
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
model_list = [dt_clf, rf_clf, lr_clf]
model_name_list = ['DecisionTree Classifier','RandomForest Classifier','LogisticRegression Classifier']
for j, val in enumerate(model_name_list):
scores = []
for i, (train, test) in enumerate(kfold.split(X)):
X_train = X.values[train]
X_test = X.values[test]
y_train = y.values[train]
y_test = y.values[test]
model_list[j].fit(X_train, y_train)
pred = model_list[j].predict(X_test)
print('%s accuracy score: %.2f' %(model_name_list[j], accuracy_score(y_test, pred)))
scores.append(accuracy_score(y_test, pred))
print('kfold %s 평균 정확도: %.2f' %(model_name_list[j], np.mean(scores)))
DecisionTree Classifier accuracy score: 0.78
DecisionTree Classifier accuracy score: 0.79
DecisionTree Classifier accuracy score: 0.83
DecisionTree Classifier accuracy score: 0.78
DecisionTree Classifier accuracy score: 0.74
kfold DecisionTree Classifier 평균 정확도: 0.78
RandomForest Classifier accuracy score: 0.78
RandomForest Classifier accuracy score: 0.82
RandomForest Classifier accuracy score: 0.85
RandomForest Classifier accuracy score: 0.78
RandomForest Classifier accuracy score: 0.84
kfold RandomForest Classifier 평균 정확도: 0.81
LogisticRegression Classifier accuracy score: 0.82
LogisticRegression Classifier accuracy score: 0.81
LogisticRegression Classifier accuracy score: 0.79
LogisticRegression Classifier accuracy score: 0.78
LogisticRegression Classifier accuracy score: 0.88
kfold LogisticRegression Classifier 평균 정확도: 0.81
Stratified K-Fold
from sklearn.model_selection import StratifiedKFold
skfold = StratifiedKFold()
model_list = [dt_clf, rf_clf, lr_clf]
model_name_list = ['DecisionTree Classifier','RandomForest Classifier','LogisticRegression Classifier']
for j, val in enumerate(model_name_list):
scores = []
for i, (train, test) in enumerate(skfold.split(X, y)):
X_train = X.values[train]
X_test = X.values[test]
y_train = y.values[train]
y_test = y.values[test]
model_list[j].fit(X_train, y_train)
pred = model_list[j].predict(X_test)
print('%s accuracy score: %.2f' %(model_name_list[j], accuracy_score(y_test, pred)))
scores.append(accuracy_score(y_test, pred))
print('Stratifiedkfold %s 평균 정확도: %.2f' %(model_name_list[j], np.mean(scores)))
DecisionTree Classifier accuracy score: 0.79
DecisionTree Classifier accuracy score: 0.79
DecisionTree Classifier accuracy score: 0.83
DecisionTree Classifier accuracy score: 0.78
DecisionTree Classifier accuracy score: 0.78
Stratifiedkfold DecisionTree Classifier 평균 정확도: 0.79
RandomForest Classifier accuracy score: 0.79
RandomForest Classifier accuracy score: 0.80
RandomForest Classifier accuracy score: 0.85
RandomForest Classifier accuracy score: 0.78
RandomForest Classifier accuracy score: 0.83
Stratifiedkfold RandomForest Classifier 평균 정확도: 0.81
LogisticRegression Classifier accuracy score: 0.82
LogisticRegression Classifier accuracy score: 0.81
LogisticRegression Classifier accuracy score: 0.80
LogisticRegression Classifier accuracy score: 0.80
LogisticRegression Classifier accuracy score: 0.86
Stratifiedkfold LogisticRegression Classifier 평균 정확도: 0.82
K-Fold와 Stratified K-Fold
cross_val_score
from sklearn.model_selection import cross_val_score
cvs = cross_val_score(dt_clf, X, y, cv=5)
np.mean(cvs)
0.7867553825874082
cvs1 = cross_val_score(dt_clf, X1, y1, cv=5)
np.mean(cvs1)
0.7912748728893353
for model in model_list:
cvs = cross_val_score(model, X, y, cv=5)
cvs1 = cross_val_score(model, X1, y1, cv=5)
print('cvs:',np.mean(cvs))
print('cvs1:', np.mean(cvs1))
cvs: 0.7901387232439896
cvs1: 0.7957629778419434
cvs: 0.8069801016885318
cvs1: 0.8092210156299039
cvs: 0.8181783943255289
cvs1: 0.817048521750047
첨언
하이퍼 파라미터 튜닝
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train_1, X_val, y_train_1, y_val = train_test_split(X_train, y_train, test_size=0.2)
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':[2, 4, 5],
'min_samples_split':[2, 4, 6],
'min_samples_leaf':[1, 3, 5]}
dt_clf = DecisionTreeClassifier()
grid_dtclf = GridSearchCV(dt_clf, param_grid=param_grid, scoring='accuracy', n_jobs=-1, cv=5)
import time
start_time = time.time()
grid_dtclf.fit(X_train, y_train)
print('걸린 시간: ', time.time() - start_time)
걸린 시간: 2.61286997795105
grid_dtclf.best_params_
{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 6}
grid_dtclf.best_score_
0.8302018633540372
best_dtclf = grid_dtclf.best_estimator_
best_pred = best_dtclf.predict(X_test)
lr_clf.get_params()
{'C': 1.0,
'class_weight': None,
'dual': False,
'fit_intercept': True,
'intercept_scaling': 1,
'l1_ratio': None,
'max_iter': 100,
'multi_class': 'auto',
'n_jobs': 2,
'penalty': 'l2',
'random_state': None,
'solver': 'lbfgs',
'tol': 0.0001,
'verbose': 0,
'warm_start': False}
회귀 성능 평가 지표
분류 성능 평가 지표
Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, best_pred)
array([[49, 9],
[ 5, 27]], dtype=int64)
정밀도(Precision) & 재현율(Recall)
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
def get_clf_eval(y_test, pred):
    """Print a labeled confusion matrix plus accuracy, precision and recall."""
    cm = pd.DataFrame(confusion_matrix(y_test, pred),
                      index=['실제 0', '실제 1'],
                      columns=['예측 0', '예측 1'])
    metrics = (accuracy_score(y_test, pred),
               precision_score(y_test, pred),
               recall_score(y_test, pred))
    print('오차 행렬')
    print(cm)
    print('-'*30)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(*metrics))
get_clf_eval(y_test, best_pred)
오차 행렬
예측 0 예측 1
실제 0 49 9
실제 1 5 27
------------------------------
정확도: 0.8444, 정밀도: 0.7500, 재현율: 0.8438
lr_clf.fit(X_train, y_train)
pred_proba = lr_clf.predict_proba(X_test)
pred = lr_clf.predict(X_test)
# BUG FIX: the message misspelled the method name as 'pred_praba()'
print('predict_proba() 결과 Shape: {0}'.format(pred_proba.shape))
print('pred_proba array에서 앞 3개만 샘플로 추출\n', pred_proba[:3])
pred_praba() 결과 Shape: (90, 2)
pred_proba array에서 앞 3개만 샘플로 추출
[[0.89749612 0.10250388]
[0.11478398 0.88521602]
[0.77018145 0.22981855]]
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1,1)], axis=1)
print('두 개의 class 중에서 더 큰 확률을 클래스 값으로 예측\n', pred_proba_result[:3])
두 개의 class 중에서 더 큰 확률을 클래스 값으로 예측
[[0.89749612 0.10250388 0. ]
[0.11478398 0.88521602 1. ]
[0.77018145 0.22981855 0. ]]
from sklearn.preprocessing import Binarizer
for i in range(1, 10):
binarizer = Binarizer(threshold=i*0.1)
lr_pred_new = binarizer.fit_transform(pred_proba)[:,1]
print('threshold: {}'.format(i*0.1))
get_clf_eval(y_test, lr_pred_new)
threshold: 0.1
오차 행렬
예측 0 예측 1
실제 0 17 41
실제 1 1 31
------------------------------
정확도: 0.5333, 정밀도: 0.4306, 재현율: 0.9688
threshold: 0.2
오차 행렬
예측 0 예측 1
실제 0 36 22
실제 1 3 29
------------------------------
정확도: 0.7222, 정밀도: 0.5686, 재현율: 0.9062
...
threshold: 0.8
오차 행렬
예측 0 예측 1
실제 0 55 3
실제 1 16 16
------------------------------
정확도: 0.7889, 정밀도: 0.8421, 재현율: 0.5000
threshold: 0.9
오차 행렬
예측 0 예측 1
실제 0 57 1
실제 1 20 12
------------------------------
정확도: 0.7667, 정밀도: 0.9231, 재현율: 0.3750
F1 스코어
from sklearn.metrics import f1_score
def clf_eval(y_test, pred):
    """Print the confusion matrix and accuracy/precision/recall/F1."""
    frame = pd.DataFrame(
        confusion_matrix(y_test, pred),
        index=['실제 0', '실제 1'],
        columns=['예측 0', '예측 1'],
    )
    print('오차 행렬')
    print(frame)
    print('-'*30)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1스코어: {3:.4f}'.format(
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
    ))
clf_eval(y_test, best_pred)
오차 행렬
예측 0 예측 1
실제 0 49 9
실제 1 5 27
------------------------------
정확도: 0.8444, 정밀도: 0.7500, 재현율: 0.8438, F1스코어: 0.7941
a_score, r_score, p_score, f1score = [], [], [], []
for i in range(101):
binarizer = Binarizer(threshold=i*0.01)
lr_pred_new = binarizer.fit_transform(pred_proba)[:,1]
a_score.append(accuracy_score(y_test, lr_pred_new))
r_score.append(recall_score(y_test, lr_pred_new))
p_score.append(precision_score(y_test, lr_pred_new))
f1score.append(f1_score(y_test, lr_pred_new))
np.argmax(np.array(f1score))
30
f1score[30] # f1_score가 최대인 threshold 값
0.7733333333333334
ROC AUC
from sklearn.metrics import roc_auc_score
def clf_eval(y_test, pred):
    """Print the confusion matrix plus accuracy/precision/recall/F1/AUC."""
    table = pd.DataFrame(confusion_matrix(y_test, pred),
                         columns=['예측 0', '예측 1'],
                         index=['실제 0', '실제 1'])
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc = roc_auc_score(y_test, pred)
    print('오차 행렬')
    print(table)
    print('-'*30)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1스코어: {3:.4f}, AUC스코어: {4:.4f}'.format(acc, prec, rec, f1, roc))
clf_eval(y_test, best_pred)
오차 행렬
예측 0 예측 1
실제 0 49 9
실제 1 5 27
------------------------------
정확도: 0.8444, 정밀도: 0.7500, 재현율: 0.8438, F1스코어: 0.7941, AUC스코어: 0.8443
a_score, r_score, p_score, f1score, auc = [], [], [], [], []
for i in range(101):
binarizer = Binarizer(threshold=i*0.01)
lr_pred_new = binarizer.fit_transform(pred_proba)[:,1]
a_score.append(accuracy_score(y_test, lr_pred_new))
r_score.append(recall_score(y_test, lr_pred_new))
p_score.append(precision_score(y_test, lr_pred_new))
f1score.append(f1_score(y_test, lr_pred_new))
auc.append(roc_auc_score(y_test, lr_pred_new))
plt.figure(figsize=(20,10))
# BUG FIX: every curve was labeled 'a_score', making the legend useless;
# give each metric its own label (same labels as the later plots).
plt.plot(a_score, label='a_score')
plt.plot(r_score, label='r_score')
plt.plot(p_score, label='p_score')
plt.plot(f1score, label='f1_score')
plt.plot(auc, label='auc')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df_white = pd.read_csv('./data/winequality-white (1).csv', sep=';')
df_red = pd.read_csv('./data/winequality-red (1).csv', sep=';')
df_red['redwhite'] = 0
df_white['redwhite'] = 1
df = pd.concat([df_red, df_white]) # 둘을 이어주지만 인덱스는 자신의 것을 그대로 사용하여 중복된 인덱스가 있을 것이다
df.reset_index(inplace=True) # 인덱스를 재조정한다
df.drop('index', axis=1, inplace=True)
df.isna().sum()
fixed acidity 0
volatile acidity 0
citric acid 0
residual sugar 0
chlorides 0
free sulfur dioxide 0
total sulfur dioxide 0
density 0
pH 0
sulphates 0
alcohol 0
quality 0
redwhite 0
dtype: int64
# Per-feature EDA: bar chart of counts for low-cardinality columns,
# summary stats + histogram for the rest.
for i in df.columns:
    print('feature: ', i)
    # BUG FIX: the original tested df.iloc[:, 0] (always the FIRST column),
    # so every feature took the same branch; inspect the current column i.
    if len(df[i].unique()) < 10:
        print(df[i].value_counts())
        plt.figure(figsize=(10, 5))
        # BUG FIX: 'values_counts' is not a pandas method (AttributeError);
        # the correct name is value_counts.
        df[i].value_counts().plot(kind='bar')
    else:
        print(df[i].describe())
        plt.figure(figsize=(10, 5))
        df[i].plot(kind='hist', bins=100)
    plt.show()
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), cmap='coolwarm', cbar=False, annot=True)
# 다중 분류 문제 -> 이진 분류 문제
df.quality.unique()
array([5, 6, 7, 4, 8, 3, 9], dtype=int64)
df['quality_new'] = 0
df.loc[df.quality > 6, 'quality_new'] = 1
df.quality_new.value_counts()
0 5220
1 1277
Name: quality_new, dtype: int64
df.drop('quality',axis=1, inplace=True)
for i in ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol']:
sns.violinplot(x='redwhite',y=i,hue='quality_new', data=df, inner='quartile',split=True)
plt.show()
X = df.drop('quality_new', axis=1)
y = df.quality_new
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier(n_jobs=2)
lr_clf = LogisticRegression(n_jobs=2)
rf_params = {'max_depth' : range(1, 11),
'min_samples_leaf' : range(1, 11),
'min_samples_split' : range(1, 11)}
dt_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
lr_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
lr_pred = lr_clf.predict(X_test)
def clf_eval(y_test, pred):
    """Print the confusion matrix and five classification metrics."""
    confusion = pd.DataFrame(confusion_matrix(y_test, pred),
                             columns=['예측 0', '예측 1'],
                             index=['실제 0', '실제 1'])
    scores = [
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        roc_auc_score(y_test, pred),
    ]
    print('오차 행렬')
    print(confusion)
    print('-'*30)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1스코어: {3:.4f}, AUC스코어: {4:.4f}'.format(*scores))
clf_eval(y_test, dt_pred)
오차 행렬
예측 0 예측 1
실제 0 475 50
실제 1 51 74
------------------------------
정확도: 0.8446, 정밀도: 0.5968, 재현율: 0.5920, F1스코어: 0.5944, AUC스코어: 0.7484
clf_eval(y_test, rf_pred)
오차 행렬
예측 0 예측 1
실제 0 508 17
실제 1 50 75
------------------------------
정확도: 0.8969, 정밀도: 0.8152, 재현율: 0.6000, F1스코어: 0.6912, AUC스코어: 0.7838
clf_eval(y_test, lr_pred)
오차 행렬
예측 0 예측 1
실제 0 511 14
실제 1 97 28
------------------------------
정확도: 0.8292, 정밀도: 0.6667, 재현율: 0.2240, F1스코어: 0.3353, AUC스코어: 0.5987
dt_pred_prob = dt_clf.predict_proba(X_test)
rf_pred_prob = rf_clf.predict_proba(X_test)
lr_pred_prob = lr_clf.predict_proba(X_test)
a_score, r_score, p_score, f1score, auc = [], [], [], [], []
for i in range(1, 100):
binarizer = Binarizer(threshold=i*0.01)
dt_pred_new = binarizer.fit_transform(dt_pred_prob)[:,1]
a_score.append(accuracy_score(y_test, dt_pred_new))
r_score.append(recall_score(y_test, dt_pred_new))
p_score.append(precision_score(y_test, dt_pred_new))
f1score.append(f1_score(y_test, dt_pred_new))
auc.append(roc_auc_score(y_test, dt_pred_new))
plt.figure(figsize=(20,10))
plt.plot(a_score, label='a_score')
plt.plot(r_score, label='r_score')
plt.plot(p_score, label='p_score')
plt.plot(f1score, label='f1_score')
plt.plot(auc, label='auc')
plt.legend()
plt.show()
a_score, r_score, p_score, f1score, auc = [], [], [], [], []
for i in range(1, 100):
binarizer = Binarizer(threshold=i*0.01)
rf_pred_new = binarizer.fit_transform(rf_pred_prob)[:,1]
a_score.append(accuracy_score(y_test, rf_pred_new))
r_score.append(recall_score(y_test, rf_pred_new))
p_score.append(precision_score(y_test, rf_pred_new))
f1score.append(f1_score(y_test, rf_pred_new))
auc.append(roc_auc_score(y_test, rf_pred_new))
plt.figure(figsize=(20,10))
plt.plot(a_score, label='a_score')
plt.plot(r_score, label='r_score')
plt.plot(p_score, label='p_score')
plt.plot(f1score, label='f1_score')
plt.plot(auc, label='auc')
plt.legend()
plt.show()
a_score, r_score, p_score, f1score, auc = [], [], [], [], []
for i in range(1, 100):
binarizer = Binarizer(threshold=i*0.01)
lr_pred_new = binarizer.fit_transform(lr_pred_prob)[:,1]
a_score.append(accuracy_score(y_test, lr_pred_new))
r_score.append(recall_score(y_test, lr_pred_new))
p_score.append(precision_score(y_test, lr_pred_new))
f1score.append(f1_score(y_test, lr_pred_new))
auc.append(roc_auc_score(y_test, lr_pred_new))
plt.figure(figsize=(20,10))
plt.plot(a_score, label='a_score')
plt.plot(r_score, label='r_score')
plt.plot(p_score, label='p_score')
plt.plot(f1score, label='f1_score')
plt.plot(auc, label='auc')
plt.legend()
plt.show()
구성
결정 트리는 규칙을 정할 때 균일도를 고려한다.
Gini Index
Entropy Index
문제점
제약
제약 파라미터
장점
단점
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# DecisionTree Classifier 생성
dt_clf = DecisionTreeClassifier(random_state=156)
# 붓꽃 데이터를 로딩하고, 학습과 테스트 데이터 셋으로 분리
iris_data = load_iris()
X_train , X_test , y_train , y_test = train_test_split(iris_data.data, iris_data.target,
test_size=0.2, random_state=11)
# DecisionTreeClassifer 학습.
dt_clf.fit(X_train , y_train)
from sklearn.tree import export_graphviz
# export_graphviz()의 호출 결과로 out_file로 지정된 tree.dot 파일을 생성함.
export_graphviz(dt_clf, out_file="tree.dot", class_names=iris_data.target_names,
feature_names = iris_data.feature_names, impurity=True, filled=True)
import graphviz
# 위에서 생성된 tree.dot 파일을 Graphviz 읽어서 Jupyter Notebook상에서 시각화
with open("tree.dot") as f:
dot_graph = f.read()
graphviz.Source(dot_graph)
import seaborn as sns
import numpy as np
%matplotlib inline
# extract feature importances
# BUG FIX: the printed message misspelled 'importances' as 'importanes'
print("Feature importances:\n{0}".format(np.round(dt_clf.feature_importances_, 3)))
# map each feature name to its importance
# build a dictionary {feature name: feature importance}
iris_fi = {}
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print('{0} : {1:.3f}'.format(name, value))
    iris_fi[name] = value
# items 데이터 생성 후 value 자리의 데이터로 내림차순 정렬
# key, value를 따로 저장
iris_fi_sorted = sorted(iris_fi.items(), key=lambda x: x[1], reverse=True)
iris_fi_sorted_feature = [feature for feature, importance in iris_fi_sorted]
iris_fi_sorted_importance = [importance for feature, importance in iris_fi_sorted]
# feature importance를 column 별로 시각화하기
# 가로막대 그래프는 반드시 내림차순으로 정렬할 것
sns.barplot(x=iris_fi_sorted_importance, y=iris_fi_sorted_feature)
과적합이 발생하면 feature importance는 신뢰도를 잃는다.
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
%matplotlib inline
plt.title("3 Class values with 2 Features Sample data creation")
# 2차원 시각화를 위해서 피처는 2개, 클래스는 3가지 유형의 분류 샘플 데이터 생성.
X_features, y_labels = make_classification(n_features=2, n_redundant=0, n_informative=2, n_classes=3, n_clusters_per_class=1, random_state=0)
# 그래프 형태로 2개의 피처로 2차원 좌표 시각화, 각 클래스 값은 다른 색깔로 표시됨.
plt.scatter(X_features[:, 0], X_features[:, 1], c=y_labels, s=25, edgecolor='k')
import numpy as np
# Classifier의 Decision Boundary를 시각화 하는 함수
def visualize_boundary(model, X, y):
    """Fit *model* on (X, y) and draw its decision boundary as filled contours.

    X is expected to have exactly two feature columns (only X[:, 0] and
    X[:, 1] are read); y holds integer class labels used for coloring.
    NOTE(review): the model is fit *inside* this function, so any earlier
    fit on `model` is overwritten.
    """
    fig,ax = plt.subplots()
    # show the training data as a scatter plot
    ax.scatter(X[:, 0], X[:, 1], c=y, s=25, cmap='rainbow', edgecolor='k',
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim_start , xlim_end = ax.get_xlim()
    ylim_start , ylim_end = ax.get_ylim()
    # fit the model on the training data passed in
    model.fit(X, y)
    # predict on every coordinate of a 200x200 meshgrid spanning the axes
    xx, yy = np.meshgrid(np.linspace(xlim_start,xlim_end, num=200),np.linspace(ylim_start,ylim_end, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    # visualize the class boundaries with contourf()
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap='rainbow', clim=(y.min(), y.max()), zorder=1)
from sklearn.tree import DecisionTreeClassifier
# 특정한 트리 생성 제약 없는 결정 트리의 Decision Boundary 시각화.
dt_clf = DecisionTreeClassifier().fit(X_features, y_labels)
visualize_boundary(dt_clf, X_features, y_labels)
앙상블 모델
# 데이터 로드 및 패키지 임포트
# import Library
import pandas as pd
import seaborn as sns
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
df = sns.load_dataset('titanic')
df.drop(['embark_town', 'alive', 'class'], axis=1, inplace=True)
df.age.fillna(df.age.mean(), inplace=True)
df.embarked.fillna("N", inplace=True)
df.deck = df.deck.astype("object")
df.deck.fillna("N", inplace=True)
en_list = df.dtypes[(df.dtypes=='object') | (df.dtypes=='bool') | (df.dtypes=='category')].index
for i in en_list:
encoder = LabelEncoder()
encoder.fit(df[i])
df[i] = encoder.transform(df[i])
# 입력데이터 X와 출력데이터 y를 분리
X = df.drop('survived', axis=1)
y = df.survived
# 개별 모델은 로지스틱 회귀와 KNN 임.
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier(n_neighbors=8)
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기
vo_clf = VotingClassifier(estimators=[('LR', lr_clf),('KNN', knn_clf),
('DT', dt_clf)] , voting='soft')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 , random_state= 156)
# VotingClassifier 학습/예측/평가.
vo_clf.fit(X_train , y_train)
pred = vo_clf.predict(X_test)
print('Voting 분류기 정확도: {0:.4f}'.format(accuracy_score(y_test , pred)))
# 개별 모델의 학습/예측/평가.
classifiers = [lr_clf, knn_clf, dt_clf, rf_clf]
for classifier in classifiers:
classifier.fit(X_train , y_train)
pred = classifier.predict(X_test)
class_name= classifier.__class__.__name__
print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_test , pred)))
Voting 분류기 정확도: 0.8436
LogisticRegression 정확도: 0.8212
KNeighborsClassifier 정확도: 0.7039
DecisionTreeClassifier 정확도: 0.8436
RandomForestClassifier 정확도: 0.8324
단순/가중 평균
Bagging
Random Forest
Hard Voting과 Soft Voting
트리 기반 앙상블 알고리즘의 단점
Boosting
Boosting이 Bagging에 밀렸던 이유
Gradient Boosting
XGBoost (eXtreme Gradient Boost)
언더 샘플링
Stacking
잘 읽었습니다