[Zerobase Data Job School]
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = load_iris()
features = iris.data[:, 2:]  # petal length, petal width (the columns predicted on below)
labels = iris.target
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=13)
# stratify: keep the same ratio of each class (setosa, versicolor, virginica) in both splits
np.unique(y_test, return_counts=True)
iris_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
# random_state: consistent results every time the model is trained
iris_tree.fit(X_train, y_train)
y_pred_tr = iris_tree.predict(iris.data[:, 2:])  # predict on the full dataset, not just the train split
accuracy_score(iris.target, y_pred_tr)
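To see what the depth-2 tree learned, scikit-learn's built-in plot_tree is one option (a minimal sketch; the course materials may use a different visualizer):

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(10, 6))
plot_tree(iris_tree, feature_names=['petal length (cm)', 'petal width (cm)'],
          class_names=iris.target_names, filled=True)
plt.show()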
from sklearn.preprocessing import LabelEncoder

# df['A'] is a hypothetical string column, e.g. df = pd.DataFrame({'A': ['a', 'b', 'c', 'a', 'b']})
le = LabelEncoder()
le.fit_transform(df['A'])           # encode string labels as integers
le.inverse_transform([1, 2, 2, 2])  # map integers back to the original labels
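The learned mapping lives in le.classes_: each label's position in that array is its integer code (values shown assume the hypothetical sample column above).

print(le.classes_)          # e.g. ['a' 'b' 'c'] -> a=0, b=1, c=2
print(le.transform(['b']))  # e.g. [1]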
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
mms = MinMaxScaler()
df_mms = mms.fit_transform(df)  # note: fit_transform returns a NumPy array, not a DataFrame
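The three scalers differ in how they rescale: MinMaxScaler maps each column to [0, 1], StandardScaler to zero mean and unit variance, and RobustScaler centers on the median and scales by the IQR so outliers pull less. A minimal comparison sketch, assuming df holds numeric columns here:

ss = StandardScaler()
rs = RobustScaler()
df_ss = ss.fit_transform(df)  # per column: (x - mean) / std
df_rs = rs.fit_transform(df)  # per column: (x - median) / IQR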
from sklearn.pipeline import Pipeline
estimators = [('scaler', StandardScaler()), ('clf', DecisionTreeClassifier())]
pipe = Pipeline(estimators)
# parameters of a named step are addressed as <step name>__<param name>
pipe.set_params(clf__max_depth=2)
pipe.set_params(clf__random_state=13)
# X, y: feature matrix and label vector prepared beforehand
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)
pipe.fit(X_train, y_train)  # the scaler is fit on the train split only, avoiding leakage
y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)
print('Train acc :', accuracy_score(y_train, y_pred_tr))
print('Test acc :', accuracy_score(y_test, y_pred_test))
KFold and StratifiedKFold return indices into the data, not the data itself.
StratifiedKFold: use when the class distribution is uneven, so every fold keeps the same label ratio.
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cv_accuracy = []
for train_idx, test_idx in skfold.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(X_train, y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
np.mean(cv_accuracy)  # average accuracy across the 5 folds
# the convenient way: let scikit-learn run the fold loop
cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold)  # scoring=None uses the estimator's default scorer (accuracy for classifiers)
cross_validate(wine_tree_cv, X, y, scoring=None, cv=skfold, return_train_score=True)
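cross_val_score returns only the per-fold test scores, while cross_validate returns a dict that also records timing and, with return_train_score=True, the train scores. A quick sketch of what comes back:

cv_results = cross_validate(wine_tree_cv, X, y, cv=skfold, return_train_score=True)
print(cv_results.keys())         # fit_time, score_time, test_score, train_score
print(cv_results['test_score'])  # same per-fold accuracies as the manual loop above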
from sklearn.model_selection import GridSearchCV
params = {'max_depth': [2, 4, 7, 10]}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)  # max_depth here is overridden by the grid
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
gridsearch.fit(X, y)
gridsearch.best_estimator_  # estimator refit on the whole data with the best parameters
gridsearch.best_score_      # best mean cross-validated score
gridsearch.best_params_     # the winning parameter combination
estimators = [('scaler', StandardScaler()), ('clf', DecisionTreeClassifier(random_state=13))]
pipe = Pipeline(estimators)
param_grid = [{'clf__max_depth': [2, 4, 7, 10]}]
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid_search.fit(X, y)
score_df = pd.DataFrame(grid_search.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
Regression models: evaluated by the error between the predicted and actual values.
Classification models: Accuracy, Confusion Matrix, Precision, Recall, F1 score, ROC AUC.
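As a reminder of how these metrics fall out of the confusion matrix, a minimal sketch with hypothetical labels (positive class = 1):

from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_true = [0, 0, 1, 1, 1, 0, 1]
y_hat  = [0, 1, 1, 1, 0, 0, 1]
tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
print(tp / (tp + fp), precision_score(y_true, y_hat))  # precision = TP / (TP + FP)
print(tp / (tp + fn), recall_score(y_true, y_hat))     # recall    = TP / (TP + FN)
# F1 score is the harmonic mean of precision and recall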
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# statistical regression with statsmodels
X = np.c_[X, [1] * len(X)]  # append a constant (intercept) column; sm.add_constant(X) does the same
lm = sm.OLS(y, X).fit()
lm.summary()
# model evaluation (RMSE)
reg = LinearRegression()
reg.fit(X_train, y_train)
pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)
rmse_tr = np.sqrt(mean_squared_error(y_train, pred_tr))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
print('RMSE of Train Data :', rmse_tr)
print('RMSE of Test Data :', rmse_test)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import Binarizer
lr = LogisticRegression(solver='liblinear', random_state=13)
# solver: the optimization algorithm; liblinear is a common choice when the dataset is not large
lr.fit(X_train, y_train)
y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
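The Binarizer and precision_recall_curve imports above are for threshold tuning: predict() cuts probabilities at 0.5, but any threshold can be applied to the output of predict_proba. A minimal sketch, assuming binary labels:

pred_proba = lr.predict_proba(X_test)[:, 1].reshape(-1, 1)  # P(class 1), reshaped to 2D for Binarizer
custom_pred = Binarizer(threshold=0.6).fit_transform(pred_proba)
print('Recall at threshold 0.6 :', recall_score(y_test, custom_pred.flatten()))
precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba.flatten())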
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
params = {
    'max_depth': [6, 8, 10],
    'n_estimators': [50, 100, 200],  # number of decision trees in the forest
    'min_samples_leaf': [8, 12],     # minimum number of samples required at a leaf node
    'min_samples_split': [8, 12]     # minimum number of samples required to split an internal node
}
rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)
grid_cv.best_score_
rf_clf_best = grid_cv.best_estimator_
rf_clf_best.fit(X_train, y_train)  # optional: best_estimator_ is already refit by GridSearchCV (refit=True by default)
pred1 = rf_clf_best.predict(X_test)
accuracy_score(y_test, pred1)
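With the best forest chosen, feature_importances_ shows which inputs drove the splits (a sketch assuming X_train is a DataFrame, so column names are available):

feature_imp = pd.Series(rf_clf_best.feature_importances_, index=X_train.columns)
print(feature_imp.sort_values(ascending=False).head(10))  # top 10 features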
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)  # fit() only stores the training data; KNN is a lazy learner that defers work to predict()
pred = knn.predict(X_test)
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
from sklearn.decomposition import PCA

def get_pca_data(ss_data, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    return pca.transform(ss_data), pca

def print_variance_ratio(pca):
    print('variance_ratio: ', pca.explained_variance_ratio_)
    print('sum of variance_ratio: ', np.sum(pca.explained_variance_ratio_))
# pca.explained_variance_: the amount of variance explained by each component (explained_variance_ratio_ is its share of the total)
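A minimal usage sketch tying the helpers together, assuming df is a numeric feature DataFrame: standardize first (PCA is sensitive to feature scale), then project to two components:

ss = StandardScaler()
df_ss = ss.fit_transform(df)
pca_data, pca = get_pca_data(df_ss, n_components=2)
print_variance_ratio(pca)  # how much of the total variance the two components retain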