Task Type 2: Ensemble Learning

SOOYEON · May 14, 2022

Voting

Hard voting: each base classifier votes with its predicted class label, and the majority label becomes the ensemble's prediction.

# hard voting
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

rnf_model = RandomForestClassifier(random_state=42)
logit_model = LogisticRegression(random_state=42)
svm_model = SVC(random_state=42)

voting_hard = VotingClassifier(
    estimators=[('lr', logit_model),
                ('rf', rnf_model),
                ('svc', svm_model)],
    voting='hard')
voting_hard.fit(X_scaled_train, y_train)

# accuracy_score
from sklearn.metrics import accuracy_score

for clf in (logit_model, rnf_model, svm_model, voting_hard):
    clf.fit(X_scaled_train, y_train)
    y_pred = clf.predict(X_scaled_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
    

# confusion matrix
from sklearn.metrics import confusion_matrix

voting_pred_train = voting_hard.predict(X_scaled_train)
voting_confusion_train = confusion_matrix(y_train, voting_pred_train)
print("Voting classifier train-data confusion matrix:\n", voting_confusion_train)

voting_pred_test = voting_hard.predict(X_scaled_test)
voting_confusion_test = confusion_matrix(y_test, voting_pred_test)
print("Voting classifier test-data confusion matrix:\n", voting_confusion_test)


Soft voting: the base classifiers' predicted class probabilities are averaged, and the class with the highest mean probability becomes the prediction. SVC does not expose predict_proba by default, so it must be created with probability=True.

# soft voting
logit_model = LogisticRegression(random_state=42)
rnf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(probability=True, random_state=42)

voting_soft = VotingClassifier(
    estimators=[('lr', logit_model),
                ('rf', rnf_model),
                ('svc', svm_model)],
    voting='soft')
voting_soft.fit(X_scaled_train, y_train)

# accuracy_score
from sklearn.metrics import accuracy_score
for clf in (logit_model, rnf_model, svm_model, voting_soft):
    clf.fit(X_scaled_train, y_train)
    y_pred = clf.predict(X_scaled_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

# confusion matrix (classification)
voting_pred_train = voting_soft.predict(X_scaled_train)
voting_confusion_train = confusion_matrix(y_train, voting_pred_train)
print("Voting classifier train-data confusion matrix:\n", voting_confusion_train)

voting_pred_test = voting_soft.predict(X_scaled_test)
voting_confusion_test = confusion_matrix(y_test, voting_pred_test)
print("Voting classifier test-data confusion matrix:\n", voting_confusion_test)

# RMSE (Root Mean Squared Error) -- regression
# pred_train / pred_test are assumed to be predictions from a fitted regressor
import numpy as np
from sklearn.metrics import mean_squared_error

MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("Train RMSE:", np.sqrt(MSE_train))
print("Test  RMSE:", np.sqrt(MSE_test))

Bagging

Bagging (bootstrap aggregating) trains copies of the same base estimator on bootstrap samples of the training data and aggregates their predictions.

# applying the base model (classification)
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

# note: in scikit-learn >= 1.2 the base_estimator argument is named estimator
model = BaggingClassifier(base_estimator=SVC(),
                          n_estimators=10, random_state=0)
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

# classification metrics: confusion_matrix, classification_report
from sklearn.metrics import classification_report
cfreport_train = classification_report(y_train, pred_train)
print(cfreport_train)
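A matching check on the test data, following the same pattern (a sketch assuming the same X_scaled_test / y_test split):

# test-set report for the bagging classifier (a sketch)
from sklearn.metrics import classification_report

pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)
cfreport_test = classification_report(y_test, pred_test)
print(cfreport_test)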

# applying the base model (regression)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

model = BaggingRegressor(base_estimator=KNeighborsRegressor(),
                         n_estimators=10, random_state=0)
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

# regression metric: RMSE
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("Train RMSE:", np.sqrt(MSE_train))
print("Test  RMSE:", np.sqrt(MSE_test))

Boosting

Boosting trains estimators sequentially, with each new estimator concentrating on the samples its predecessors got wrong.

AdaBoost


# applying the base model (classification)
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=100, random_state=0)
model.fit(X_scaled_train, y_train)
model.score(X_scaled_train, y_train)


# applying the base model (regression)
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor(random_state=0, n_estimators=100)
model.fit(X_scaled_train, y_train)
model.score(X_scaled_train, y_train)

Gradient Boosting

# applying the base model (classification)
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100,
                                   learning_rate=1.0,
                                   max_depth=1,
                                   random_state=0)
model.fit(X_scaled_train, y_train)
model.score(X_scaled_train, y_train)

# applying the base model (regression)
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(random_state=0)
model.fit(X_scaled_train, y_train)
model.score(X_scaled_train, y_train)

Stacking

Stacking: a method that builds a generalized final model by learning from the predictions of several learners, rather than from the dataset itself.

  • Optimizing the individual algorithms that make up the stack, as well as the order of the models, plays an important role in a stacking ensemble.

Classification

  • StackingClassifier
  • Base estimators (estimators): random forest, SVC
  • Final estimator (final_estimator): logistic regression
# applying the model (classification)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svc', SVC(random_state=42))]

model = StackingClassifier(estimators=estimators,
                           final_estimator=LogisticRegression())
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)
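A test-set check in the same style as the Voting section's confusion matrices (a sketch; X_scaled_test and y_test are assumed from the earlier split):

# test-set evaluation for the stacking classifier (a sketch)
from sklearn.metrics import confusion_matrix

pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)
print("Stacking classifier test-data confusion matrix:\n",
      confusion_matrix(y_test, pred_test))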

Regression

  • StackingRegressor
  • Base estimators (estimators): linear regression, K-nearest neighbors
  • Final estimator (final_estimator): random forest
# applying the model (regression)
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

estimators = [('lr', LinearRegression()),
              ('knn', KNeighborsRegressor())]
model = StackingRegressor(estimators=estimators,
                          final_estimator=RandomForestRegressor(n_estimators=10, random_state=42))
model.fit(X_scaled_train, y_train)

pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)
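Held-out performance is what ultimately matters; a minimal test-set RMSE check in the style of the RMSE snippet above (assumes X_scaled_test / y_test):

# test-set RMSE for the stacking regressor (a sketch)
import numpy as np
from sklearn.metrics import mean_squared_error

pred_test = model.predict(X_scaled_test)
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, pred_test)))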
