์์๋ธ ๊ธฐ๋ฒ : Voting, Bagging, Boosting, ์คํ๊น ๋ฑ์ผ๋ก ๋๋๋ค. ๋ณดํ ๊ณผ ๋ฐฐ๊น ์ ์ฌ๋ฌ ๊ฐ์ ๋ถ๋ฅ๊ธฐ๊ฐ ํฌํ๋ฅผ ํตํด ์ต์ข ์์ธก ๊ฒฐ๊ณผ๋ฅผ ๊ฒฐ์ ํ๋ ๋ฐฉ์. ๋ณดํ ๊ณผ ๋ฐฐ๊น ์ ์ฐจ์ด์ ์ ๋ณดํ ์ ๊ฐ๊ฐ ๋ค๋ฅธ ๋ถ๋ฅ๊ธฐ, ๋ฐฐ๊น ์ ๊ฐ์ ๋ถ๋ฅ๊ธฐ๋ฅผ ์ฌ์ฉ. ๋ํ์ ์ธ ๋ฐฐ๊น ๋ฐฉ์์ด ๋๋ค ํฌ๋ ์คํธ์ด๋ค
Boosting: multiple (weak) classifiers learn sequentially; each next classifier puts extra weight on the samples the previous classifier mispredicted and continues training from there. Its predictive performance is outstanding, so boosting currently leads ensemble learning → Gradient Boosting, XGBoost, LightGBM, etc.
๋ฐฐ๊น ๊ณผ ๋ถ์คํ ์ ์ฐจ์ด
AdaBoost - STEP 1) Fit the first weak classifier and decide an initial boundary (all samples weighted equally)
AdaBoost - STEP 2) Increase the weights of the + samples that Step 1 got wrong and decide the next boundary
AdaBoost - STEP 3) Again increase the weights of the misclassified − samples and decide the next boundary
AdaBoost - STEP 4) Combine the boundaries decided above (see the sketch below)
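A minimal runnable sketch of these steps, assuming scikit-learn's AdaBoostClassifier (whose default weak learner is a depth-1 decision stump) and a made-up toy dataset:
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
X_demo, y_demo = make_classification(n_samples=300, random_state=13)  # toy data
# Each new stump re-weights toward the samples the previous stumps got wrong,
# and the final prediction combines all the learned boundaries (STEP 4)
ada = AdaBoostClassifier(n_estimators=50, random_state=13)
ada.fit(X_demo, y_demo)
print(ada.score(X_demo, y_demo))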
# Wine data
# Read the data and create the target column
import pandas as pd
wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'
wine = pd.read_csv(wine_url, index_col=0)
wine.head()
wine['taste'] = [1 if grade > 5 else 0 for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
# Apply StandardScaler directly
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)
X_sc
# Split the data after applying the scaler
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size=0.2, random_state=13)
# ๋ชจ๋ ์ปฌ๋ผ์ ํ์คํ ๊ทธ๋จ ์กฐ์ฌ
# ์ ๋ถํฌ๋์ด ์๋ ์ปฌ๋ผ์ด ์ข์ ๋๊ฐ ๋ง๋ค
import matplotlib.pyplot as plt
%matplotlib inline
wine.hist(bins=10, figsize=(15, 10))
plt.show()
# Check how the other features behave per quality level
column_names = ['fixed acidity', 'volatile acidity', 'citric acid',
                'residual sugar', 'chlorides', 'free sulfur dioxide',
                'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
df_pivot_table = wine.pivot_table(values=column_names, index=['quality'], aggfunc='median')
print(df_pivot_table)
# Correlation between quality and the remaining features
# Correlation strength should be judged by absolute value
corr_matrix = wine.corr()
print(corr_matrix['quality'].sort_values(ascending=False))
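Since the printout above is sorted by signed value, one extra line (my addition) ranks by absolute value instead, as the comment suggests:
# Strong negative correlations matter as much as positive ones
print(corr_matrix['quality'].drop('quality').abs().sort_values(ascending=False))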
# Distribution of the taste column
import seaborn as sns
sns.countplot(x='taste', data=wine)
plt.show()
# ๋ค์ํ ๋ชจ๋ธ์ ํ๋ฒ์ ํ
์คํธ โ
โ
โ
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
RandomForestClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
models = []
models.append(('RandomForestClassifier', RandomForestClassifier())) # append as (name, model) tuples
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
models.append(('LogisticRegression', LogisticRegression()))
models
# ๊ฒฐ๊ณผ ์ ์ฅ ์์
โ
โ
โ
from sklearn.model_selection import KFold, cross_val_score
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=5, random_state=13, shuffle=True)  # shuffle=True: shuffle the data before splitting
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(name, cv_results.mean(), cv_results.std())
results
# Review the cross-validation results at a glance
# For now, Random Forest looks the most favorable
fig = plt.figure(figsize=(14, 8))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# ํ
์คํธ ๋ฐ์ดํฐ์ ๋ํ ํ๊ฐ ๊ฒฐ๊ณผ
from sklearn.metrics import accuracy_score
for name, model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(name, accuracy_score(y_test, pred))
# Iris data
from sklearn.datasets import load_iris
iris = load_iris()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
test_size=0.2, random_state=13,
stratify=iris.target)
# Train kNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# accuracy
from sklearn.metrics import accuracy_score
pred = knn.predict(X_test)
print(accuracy_score(y_test, pred))
# ๊ฐ๋จํ ์ฑ๊ณผ
# confusion_matrix : https://wikidocs.net/194464
# classification_report : https://wikidocs.net/193994
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
# Read the HAR data
import pandas as pd
url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt'
feature_name_df = pd.read_csv(url, sep='\s+', header=None,
names=['column_index', 'column_name'])
# sep='\s+' : when fields are separated by a variable amount of whitespace, use the regular-expression string \s+ as the separator
# Reference: https://datascienceschool.net/01%20python/04.02%20%EB%8D%B0%EC%9D%B4%ED%84%B0%20%EC%9E%85%EC%B6%9C%EB%A0%A5.html
# names= : sets the column names
feature_name = feature_name_df.iloc[:, 1].values.tolist()
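A tiny self-contained demo (my addition) of what sep='\s+' does; the two inline rows are made up to mimic features.txt:
import io
demo_txt = io.StringIO('1   tBodyAcc-mean()-X\n2  tBodyAcc-mean()-Y')
# A variable-width run of whitespace between the two fields is treated as one separator
print(pd.read_csv(demo_txt, sep='\s+', header=None, names=['column_index', 'column_name']))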
X_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/X_train.txt'
X_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/X_test.txt'
X_train = pd.read_csv(X_train_url, sep='\s+', header=None)
X_test = pd.read_csv(X_test_url, sep='\s+', header=None)
X_train.columns = feature_name
X_test.columns = feature_name
y_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/y_train.txt'
y_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/y_test.txt'
y_train = pd.read_csv(y_train_url, sep='\s+', header=None, names=['action'])
y_test = pd.read_csv(y_test_url, sep='\s+', header=None, names=['action'])
# ํ์ ๋ชจ๋ import
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')
# acc๊ฐ 1 %, ๊ณ์ฐ์๊ฐ 388์ด
# ์ผ๋ฐ์ ์ผ๋ก GBM์ด ์ฑ๋ฅ ์์ฒด๋ ๋๋ค ํฌ๋ ์คํธ๋ณด๋ค ์ข๋ค๊ณ ์๋ ค์ ธ ์๋ค
# sckit-learn์ GBM์ ์๋๊ฐ ์์ฃผ ๋๋ฆฐ ๊ฒ์ผ๋ก ์๋ ค์ ธ ์๋ค.
start_time = time.time()
gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
print('ACC : ', accuracy_score(y_test, gb_pred))
print('Fit time : ', time.time() - start_time)
# GridSearch
# ์๊ฐ์ด ์ค๋ ๊ฑธ๋ฆผ!! โ
โ
โ
โ
โ
from sklearn.model_selection import GridSearchCV
params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1]
}
start_time = time.time()
grid = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1, n_jobs=-1)
# cv : number of cross-validation folds
# verbose : how chatty the progress output is; verbose=0 (default) prints nothing, verbose=1 prints brief messages, verbose=2 prints a message per hyperparameter candidate
# Source: https://www.inflearn.com/questions/62112/gridsearchcv%EC%97%90%EC%84%9C-verbose
grid.fit(X_train, y_train)
print('Fit time : ', time.time() - start_time)
# Best parameters
grid.best_score_
grid.best_params_
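The full per-candidate breakdown can also be inspected as a table (my addition; cv_results_ is a standard GridSearchCV attribute):
# Mean/std test score and rank for every parameter combination
score_df = pd.DataFrame(grid.cv_results_)
print(score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']])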
# Performance on the test data
accuracy_score(y_test, grid.best_estimator_.predict(X_test))
XGBoost: one of the most popular tree-based ensemble algorithms. It builds on GBM but addresses GBM's slow speed with various devices, notably a design that allows parallel training. At each boosting round XGBoost can internally evaluate on held-out validation data, and it offers an early-stopping feature that halts the iterations once the validation score stops improving.
์ฃผ์ ํ๋ผ๋ฏธํฐ
- nthread : CPU์ ์คํ ์ค๋ ๋ ๊ฐ์๋ฅผ ์กฐ์ . ๋ํดํธ๋ CPU์ ์ ์ฒด ์ค๋ ๋๋ฅผ ์ฌ์ฉํ๋ ๊ฒ
- eta : GBM ํ์ต๋ฅ
- num_bosst_rounds : n_estimators์ ๊ฐ์ ํ๋ผ๋ฏธํฐ
- max_depth
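The code below uses the scikit-learn wrapper (n_estimators, learning_rate), but the native-API names above appear in a rough sketch like this (random toy data, my assumption, purely illustrative):
import numpy as np
import xgboost
dtrain = xgboost.DMatrix(np.random.rand(50, 3), label=np.random.randint(0, 2, 50))  # toy data
params = {'eta': 0.1, 'max_depth': 3, 'nthread': 4, 'objective': 'binary:logistic'}
booster = xgboost.train(params, dtrain, num_boost_round=100)  # num_boost_round ~ n_estimators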
# install
# pip install xgboost
# if that errors: conda install py-xgboost
# xgboost must be installed first
!pip install xgboost
# ์ฑ๋ฅ ํ์ธ
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)  # encode y_test the same way, so the accuracy checks below compare like with like
# Why the encoding is needed (XGBoost expects class labels 0..n-1): https://stackoverflow.com/questions/71996617/invalid-classes-inferred-from-unique-values-of-y-expected-0-1-2-3-4-5-got
start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(X_train.values, y_train)
print('Fit time : ', time.time()- start_time)
accuracy_score(y_test, xgb.predict(X_test.values))
# ์กฐ๊ธฐ ์ข
๋ฃ ์กฐ๊ฑด๊ณผ ๊ฒ์ฆ ๋ฐ์ดํฐ ์ง์ ๊ฐ๋ฅ
from xgboost import XGBClassifier
evals = [(X_test.values, y_test)]
start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3,
                    early_stopping_rounds=10)  # recent xgboost takes early_stopping_rounds in the constructor, not fit()
xgb.fit(X_train.values, y_train, eval_set=evals)
print('Fit time : ', time.time() - start_time)
accuracy_score(y_test, xgb.predict(X_test.values))
# Install for Mac users
# brew install lightgbm
# pip install lightgbm
!pip install lightgbm
import numpy as np
from sklearn.preprocessing import LabelEncoder
# ๋ผ๋ฒจ ์ธ์ฝ๋ ์์ฑ
encoder = LabelEncoder()
# X_train๋ฐ์ดํฐ๋ฅผ ์ด์ฉ ํผํ
ํ๊ณ ๋ผ๋ฒจ์ซ์๋ก ๋ณํํ๋ค
encoder.fit(X_train)
X_train_encoded = encoder.transform(X_train)
# X_test๋ฐ์ดํฐ์๋ง ์กด์ฌํ๋ ์๋ก ์ถํํ ๋ฐ์ดํฐ๋ฅผ ์ ๊ท ํด๋์ค๋ก ์ถ๊ฐํ๋ค (์ค์!!!)
for label in np.unique(X_test):
if label not in encoder.classes_: # unseen label ๋ฐ์ดํฐ์ธ ๊ฒฝ์ฐ( )
encoder.classes_ = np.append(encoder.classes_, label) # ๋ฏธ์ฒ๋ฆฌ ์ ValueError๋ฐ์
X_test_encoded = encoder.transform(X_test)
from lightgbm import LGBMClassifier, early_stopping
start_time = time.time()
lgbm = LGBMClassifier(n_estimators=400)
# Recent lightgbm passes early stopping as a callback rather than a fit() keyword
lgbm.fit(X_train.values, y_train, eval_set=evals, callbacks=[early_stopping(stopping_rounds=100)])
print('Fit time : ', time.time() - start_time)

์ด๋ ต..์คํ์ด ์๋จ;; ์ถํ ๋ค์ ํ์ธ ํ์ ใ ใ
๐ป ์ถ์ฒ : ์ ๋ก๋ฒ ์ด์ค ๋ฐ์ดํฐ ์ทจ์ ์ค์ฟจ