Example: splitting iris into its three species (setosa, versicolor, virginica).
A split on petal length > 2.5 separates setosa from versicolor and virginica (a quick check of this rule is sketched below).
What criterion then separates versicolor from virginica? That is what the decision tree fitted below has to learn.
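Before fitting anything, the first rule can be checked directly on the data: setosa's petal lengths top out well below 2.5 cm while the other two species start above it. A minimal sketch (assuming scikit-learn and NumPy are installed):

import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
petal_length = iris.data[:, 2]                # third column = petal length (cm)

print(petal_length[iris.target == 0].max())   # setosa: well below 2.5
print(petal_length[iris.target != 0].min())   # versicolor/virginica: above 2.5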
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions

iris = load_iris()

# fit a tree on the petal columns only (petal length, petal width)
iris_tree = DecisionTreeClassifier()
iris_tree.fit(iris.data[:, 2:], iris.target)

# accuracy measured on the same data the tree was trained on
y_pred_tr = iris_tree.predict(iris.data[:, 2:])
accuracy_score(iris.target, y_pred_tr)

# visualize the decision regions in the petal plane
plt.figure(figsize=(14, 8))
plot_decision_regions(X=iris.data[:, 2:], y=iris.target, clf=iris_tree, legend=2)
plt.show()
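A brief aside (not in the original notes): the tree above was grown without any depth limit and then scored on the very data it was trained on, so the high accuracy mostly reflects memorization. Inspecting the fitted tree's size (scikit-learn exposes get_depth() and get_n_leaves() on fitted trees) makes the motivation for the max_depth=2 model below concrete:

# how deep and how many leaves the unconstrained tree grew
print(iris_tree.get_depth(), iris_tree.get_n_leaves())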
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
feature = iris.data[:, 2:]   # petal length, petal width
labels = iris.target

# stratified 80/20 split so every class is represented in both sets
X_train, X_test, y_train, y_test = train_test_split(feature, labels, test_size=0.2,
                                                    stratify=labels, random_state=13)
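Because stratify=labels is passed, the 50/50/50 class balance of the iris data should be preserved on both sides of the split (40 per class for training, 10 per class for test). This quick check is a sketch, not part of the original notes:

# class counts after the stratified split
print(np.unique(y_train, return_counts=True))   # expect 40 of each class
print(np.unique(y_test, return_counts=True))    # expect 10 of each class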
from sklearn.tree import DecisionTreeClassifier, plot_tree

# cap the depth at 2 to keep the tree simple and reduce overfitting
iris_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
iris_tree.fit(X_train, y_train)

plt.figure(figsize=(12, 8))
plot_tree(iris_tree)
plt.show()
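The same tree can be read as plain text with scikit-learn's export_text, which answers the question from the top of the section: after the petal-length split that isolates setosa, the second split (typically on petal width) separates versicolor from virginica. A small sketch, not part of the original notes:

from sklearn.tree import export_text

# print the fitted depth-2 rules with readable feature names
print(export_text(iris_tree, feature_names=list(iris.feature_names[2:])))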
from sklearn.metrics import accuracy_score

# accuracy on the training data
y_pred_tr = iris_tree.predict(X_train)
accuracy_score(y_train, y_pred_tr)

# decision regions as seen by the training data
from mlxtend.plotting import plot_decision_regions
plt.figure(figsize=(14, 8))
plot_decision_regions(X=X_train, y=y_train, clf=iris_tree, legend=2)
plt.show()

# accuracy on the held-out test data
y_pred_test = iris_tree.predict(X_test)
accuracy_score(y_test, y_pred_test)
# 0.9666666666666667
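A single accuracy number hides which classes get confused. As an optional sketch (not part of the original notes), scikit-learn's confusion_matrix and classification_report give a per-class view of the same test predictions:

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=iris.target_names))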
# highlight the test points on top of the full data set
scatter_highlight_kwargs = {'s': 150, 'label': 'Test data', 'alpha': 0.9}
scatter_kwargs = {'s': 120, 'edgecolor': None, 'alpha': 0.7}

plt.figure(figsize=(12, 8))
plot_decision_regions(X=feature, y=labels, X_highlight=X_test, clf=iris_tree, legend=2,
                      scatter_highlight_kwargs=scatter_highlight_kwargs,
                      scatter_kwargs=scatter_kwargs,
                      contourf_kwargs={'alpha': 0.2})
plt.show()
# now use all four features instead of just the petal columns
features = iris.data
label = iris.target
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2,
                                                    stratify=label, random_state=13)

iris_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
iris_tree.fit(X_train, y_train)
plt.figure(figsize=(12, 8))
plot_tree(iris_tree)
plt.show()
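Now that the tree sees all four measurements, it is worth checking which of them the depth-2 tree actually uses. This quick sketch (not in the original notes) pairs the fitted feature_importances_ with the feature names; unused features show up as 0:

# importance of each feature in the depth-2 tree
dict(zip(iris.feature_names, iris_tree.feature_importances_))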
# one new sample: sepal length 4.3, sepal width 2.0, petal length 1.2, petal width 1.0
test_data = [[4.3, 2., 1.2, 1.0]]

# predicted class label
iris.target_names[iris_tree.predict(test_data)]
# per-class probabilities
iris_tree.predict_proba(test_data)
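predict_proba returns a bare array in class-index order; pairing it with iris.target_names makes it readable, and this is exactly the zip/dict pattern reviewed in the aside below. A small sketch, not part of the original notes:

# label each probability with its class name
proba = iris_tree.predict_proba(test_data)[0]
dict(zip(iris.target_names, proba))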
list1 = [1, 2, 3]
list2 = ['a', 'b', 'c']
pairs = [pair for pair in zip(list1, list2)]
# [(1, 'a'), (2, 'b'), (3, 'c')]
dict(pairs)
# {1: 'a', 2: 'b', 3: 'c'}
dict(zip(list1, list2))
# {1: 'a', 2: 'b', 3: 'c'}
# unzip the pairs back into two tuples
a, b = zip(*pairs)
# a == (1, 2, 3)
# b == ('a', 'b', 'c')