# Generate simple data
import numpy as np
import seaborn as sns
sns.set_style('whitegrid')
rng = np.random.RandomState(13) # numpy.random.RandomState is a class name
X = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
X.shape
rng.rand(2, 2) # rand : uniform random numbers between 0 and 1
rng.randn(2, 200) # randn : draws from the standard normal distribution (mean 0, std 1)
import matplotlib.pyplot as plt
plt.scatter(X[:,0], X[:, 1])
plt.axis('equal')
# fit
from sklearn.decomposition import PCA
pca = PCA(n_components = 2, random_state = 13)
# n_components = 2 : express the data with two principal components
pca.fit(X)
# The component vectors and their variances
pca.components_ # 2 rows -> 2 component vectors
pca.explained_variance_ # explanatory power (variance) of each component
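# (A quick check of my own, not from the lecture) explained_variance_ should match the
# variance of the centered data projected onto each component vector
X_centered = X - X.mean(axis=0)
(X_centered @ pca.components_.T).var(axis=0, ddof=1) # ~= pca.explained_variance_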
# Prepare to draw the principal-component vectors
def draw_vector(v0, v1, ax=None):
    ax = ax or plt.gca()
    # if ax is None (no axes passed in), use plt.gca(); otherwise use the given ax
    arrowprops = dict(arrowstyle='->', # arrow style
                      linewidth=2, color='black', shrinkA=0, shrinkB=0)
    ax.annotate('', v1, v0, arrowprops=arrowprops)
# Draw
plt.scatter(X[:, 0], X[:, 1], alpha=0.4) # alpha : transparency
for length, vector in zip(pca.explained_variance_, pca.components_):
    v = vector * 3 * np.sqrt(length) # 3 : an arbitrary factor, just to scale the arrows to a sensible size
    draw_vector(pca.mean_, pca.mean_ + v)
plt.axis('equal')
plt.show()
pca.mean_ # the center (mean) of the data; the component arrows above start from this point
# Set n_components = 1
pca = PCA(n_components=1, random_state=13)
pca.fit(X)
X_pca = pca.transform(X)
X_pca
print(pca.components_)
print(pca.explained_variance_)
pca.mean_
pca.explained_variance_ratio_ # a single component can account for about 93% of the total variance
# will this end up looking like a linear regression fit?
X_new = pca.inverse_transform(X_pca) # map back to the original data, i.e. 2-D form...
plt.scatter(X[:, 0], X[:, 1], alpha=0.3)
plt.scatter(X_new[:,0], X_new[:, 1], alpha=0.9)
plt.axis('equal')
plt.show()
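# (Sketch, my own check) inverse_transform is just the linear map back into 2-D:
# X_new == X_pca @ pca.components_ + pca.mean_
np.allclose(X_new, X_pca @ pca.components_ + pca.mean_) # True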
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
iris_pd = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_pd['species'] = iris.target
iris_pd.head()
# It is hard to check all four features at once
sns.pairplot(iris_pd, hue='species', height=3,
             x_vars=['sepal length (cm)', 'sepal width (cm)'],
             y_vars=['petal length (cm)', 'petal width (cm)'])
# Apply the scaler
from sklearn.preprocessing import StandardScaler
iris_ss = StandardScaler().fit_transform(iris.data)
iris_ss[:3]
# Write a function that returns the PCA result
from sklearn.decomposition import PCA
def get_pca_data(ss_data, n_components=2): # n_components = 2 : look at all 4 features and extract 2 component vectors
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    return pca.transform(ss_data), pca
iris_pca, pca = get_pca_data(iris_ss, 2)
iris_pca.shape
pca.mean_
pca.components_
# components_ looks like 4 values per row, but these are really just 2 vectors
# the entries carry no feature names, but PCA has reduced the 4 features to 2 vectors of length 4
# Organize the PCA result with pandas
def get_pd_from_pca(pca_data, cols=['pca_component_1', 'pca_component_2']):
    return pd.DataFrame(pca_data, columns=cols)
# Condense the four features into two
iris_pd_pca = get_pd_from_pca(iris_pca)
iris_pd_pca['species'] = iris.target
iris_pd_pca.head()
# Plot the two features
sns.pairplot(iris_pd_pca, hue='species', height=5,
             x_vars=['pca_component_1'], y_vars=['pca_component_2'])
# Apply a random forest using all four features
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
def rf_scores(X, y, cv=5):
    rf = RandomForestClassifier(random_state=13, n_estimators=100)
    # n_estimators=100 : the number of decision trees
    scores_rf = cross_val_score(rf, X, y, scoring='accuracy', cv=cv)
    print('Score : ', np.mean(scores_rf))
rf_scores(iris_ss, iris.target)
# Using only the two PCA features
pca_X = iris_pd_pca[['pca_component_1', 'pca_component_2']]
rf_scores(pca_X, iris.target)
wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'
wine = pd.read_csv(wine_url, sep=',', index_col=0)
wine.head()
# Wine color classification (red/white)
wine_y = wine['color']
wine_X = wine.drop(['color'], axis=1)
wine_X.head()
# Apply StandardScaler
wine_ss = StandardScaler().fit_transform(wine_X)
wine_ss[:3]
def print_variance_ratio(pca):
    print('variance_ratio : ', pca.explained_variance_ratio_)
    print('sum of variance_ratio : ', np.sum(pca.explained_variance_ratio_))
# two principal components capture only about half of the variance
pca_wine, pca = get_pca_data(wine_ss, n_components=2)
print_variance_ratio(pca)
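# (Sketch, my own addition) to pick n_components by variance coverage, look at the
# cumulative ratio of a full-rank fit, or hand PCA a float target directly
full_pca = PCA().fit(wine_ss)
np.cumsum(full_pca.explained_variance_ratio_) # cumulative coverage per component
PCA(n_components=0.95).fit(wine_ss).n_components_ # smallest k that reaches 95% coverage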
# Plot
pca_columns = ['pca_component_1','pca_component_2']
pca_wine_pd = pd.DataFrame(pca_wine, columns=pca_columns)
pca_wine_pd['color'] = wine_y.values
pca_wine_pd
sns.pairplot(pca_wine_pd, hue='color', height=5,
             x_vars=['pca_component_1'], y_vars=['pca_component_2'])
# Applied to a Random Forest, there is no big difference from the original data.
rf_scores(wine_ss, wine_y)
pca_X = pca_wine_pd[['pca_component_1', 'pca_component_2']]
rf_scores(pca_X, wine_y)
# Expressed with just three principal components, over 98% is still achievable
pca_wine, pca = get_pca_data(wine_ss, n_components=3)
print_variance_ratio(pca)
cols = ['pca_1', 'pca_2', 'pca_3']
pca_wine_pd = get_pd_from_pca(pca_wine, cols=cols)
pca_X = pca_wine_pd[cols]
pca_X
rf_scores(pca_X, wine_y)
# Organize the three-component representation
pca_wine_plot = pca_X
pca_wine_plot['color'] = wine_y.values
pca_wine_plot.head()
# Draw in 3D
from mpl_toolkits.mplot3d import Axes3D
markers = ['^', 'o']
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
for i, marker in enumerate(markers):
    x_axis_data = pca_wine_plot[pca_wine_plot['color']==i]['pca_1']
    y_axis_data = pca_wine_plot[pca_wine_plot['color']==i]['pca_2']
    z_axis_data = pca_wine_plot[pca_wine_plot['color']==i]['pca_3']
    ax.scatter(x_axis_data, y_axis_data, z_axis_data, s=20, alpha=0.5, marker=marker)
ax.view_init(30, 80)
plt.show()
# plotly
import plotly.express as px
fig = px.scatter_3d(pca_wine_plot, x='pca_1', y='pca_2', z='pca_3',
                    color='color', symbol='color', opacity=0.4)
fig.update_layout(margin = dict(l=0, r=0, b= 0, t= 0))
fig.show()
# Load the data
from sklearn.datasets import fetch_olivetti_faces
faces_all = fetch_olivetti_faces()
print(faces_all.DESCR)
# Select only one person's samples (target == K)
K = 20
faces = faces_all.images[faces_all.target == K]
faces
import matplotlib.pyplot as plt
N = 2
M = 5
fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(faces[n], cmap=plt.cm.bone)
    ax.grid(False)
    ax.xaxis.set_ticks([]) # remove the tick marks on the axes
    ax.yaxis.set_ticks([])
plt.suptitle('Olivetti')
plt.tight_layout()
plt.show()
# Analyze with two components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X = faces_all.data[faces_all.target == K]
W = pca.fit_transform(X) # the vectors (weights) that represent the 10 photos
W
X.shape
import numpy as np
np.sqrt(4096) # each image is 64*64 pixels, and there are 10 of them
X_inv = pca.inverse_transform(W)
X_inv
# Check the reconstructed result
N = 2
M = 5
fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(X_inv[n].reshape(64, 64), cmap=plt.cm.bone) # reshape : restructure the flat array back into a 64x64 image
    ax.grid(False)
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
plt.suptitle('PCA result')
plt.tight_layout()
plt.show()
# The mean face and the two eigenfaces
# all 10 photos can be expressed with just these three images
face_mean = pca.mean_.reshape(64, 64)
face_p1 = pca.components_[0].reshape(64, 64)
face_p2 = pca.components_[1].reshape(64, 64)
plt.figure(figsize=(12,7))
plt.subplot(131)
plt.imshow(face_mean, cmap=plt.cm.bone)
plt.grid(False) ; plt.xticks([]); plt.yticks([]); plt.title('mean')
plt.subplot(132)
plt.imshow(face_p1, cmap=plt.cm.bone)
plt.grid(False) ; plt.xticks([]); plt.yticks([]); plt.title('face_p1')
plt.subplot(133)
plt.imshow(face_p2, cmap=plt.cm.bone)
plt.grid(False) ; plt.xticks([]); plt.yticks([]); plt.title('face_p2')
plt.show()
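# (Sketch, my own check) each reconstructed photo really is mean + weighted eigenfaces;
# the weights are the rows of W from fit_transform above
recon0 = face_mean + W[0, 0] * face_p1 + W[0, 1] * face_p2
np.allclose(recon0, X_inv[0].reshape(64, 64), atol=1e-5) # True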
# Choose the weights
import numpy as np
N = 2
M = 5
w = np.linspace(-5, 10, N*M)
w
# Variation along the first component (using mean and face_p1)
fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(face_mean + w[n] * face_p1, cmap=plt.cm.bone) # each weight from above multiplies face_p1
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.title('Weight : ' + str(round(w[n])))
plt.tight_layout()
plt.show()
# Variation along the second component (using mean and face_p2)
fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(face_mean + w[n] * face_p2, cmap=plt.cm.bone) # each weight from above multiplies face_p2
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.title('Weight : ' + str(round(w[n])))
plt.tight_layout()
plt.show()
# Vary both components at once
nx, ny = (5, 5)
x = np.linspace(-5, 8, nx)
y = np.linspace(-5, 8, ny)
w1, w2 = np.meshgrid(x, y)
w1, w2
w1.shape
w1 = w1.reshape(-1, ) # reshape(-1, ) : flatten; numpy infers the length on its own
w2 = w2.reshape(-1, )
w1, w2
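# (Tiny sketch) reshape(-1, ) just flattens, letting numpy infer the length
np.arange(6).reshape(2, 3).reshape(-1, ) # array([0, 1, 2, 3, 4, 5])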
# Synthesize the faces again
fig = plt.figure(figsize=(12, 10))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
N = 5
M = 5
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(face_mean + w1[n] * face_p1 + w2[n] * face_p2, cmap=plt.cm.bone) # mean face plus the weighted sum of face_p1 and face_p2
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.title('Weight : ' + str(round(w1[n],1)) + ', ' + str(round(w2[n],1)))
plt.tight_layout()
plt.show()
# Load the HAR data
import pandas as pd
url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt'
feature_name_df = pd.read_csv(url, sep=r'\s+', header=None, names=['column_index', 'column_name'])
feature_name = feature_name_df.iloc[:, 1].values.tolist()
X_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
'master/dataset/HAR_dataset/train/X_train.txt'
X_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
'master/dataset/HAR_dataset/test/X_test.txt'
X_train = pd.read_csv(X_train_url, sep=r'\s+', header=None)
X_test = pd.read_csv(X_test_url, sep=r'\s+', header=None)
X_train.columns = feature_name
X_test.columns = feature_name
y_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
'master/dataset/HAR_dataset/train/y_train.txt'
y_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
'master/dataset/HAR_dataset/test/y_test.txt'
y_train = pd.read_csv(y_train_url, sep=r'\s+', header=None)
y_test = pd.read_csv(y_test_url, sep=r'\s+', header=None)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# Make functions for reuse
from sklearn.decomposition import PCA
def get_pca_data(ss_data, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    return pca.transform(ss_data), pca
# PCA fit
HAR_pca, pca = get_pca_data(X_train, n_components=2)
HAR_pca.shape
pca.mean_.shape, pca.components_.shape
cols = ['pca_'+str(n) for n in range(pca.components_.shape[0])]
cols
# A function that stores the PCA result in a DataFrame
def get_pd_from_pca(pca_data, col_num):
    cols = ['pca_' + str(n) for n in range(col_num)]
    return pd.DataFrame(pca_data, columns=cols)
# 2 components
HAR_pca, pca = get_pca_data(X_train, n_components=2)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
HAR_pd_pca.head()
import seaborn as sns
sns.pairplot(HAR_pd_pca, hue='action', height=5, x_vars=['pca_0'], y_vars=['pca_1'])
# reducing the 500-plus features to just two gives about this much separation
import numpy as np
def print_variance_ratio(pca):
    print('variance_ratio : ', pca.explained_variance_ratio_)
    print('sum of variance_ratio : ', np.sum(pca.explained_variance_ratio_))
print_variance_ratio(pca)
# Proceed with 3 components
HAR_pca, pca = get_pca_data(X_train, n_components=3)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
print_variance_ratio(pca)
# 10 components
HAR_pca, pca = get_pca_data(X_train, n_components=10)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
print_variance_ratio(pca)
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
params = {
    'max_depth' : [6, 8, 10],
    'n_estimators' : [50, 100, 200],
    'min_samples_leaf' : [8, 12],
    'min_samples_split' : [8, 12]
}
rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(HAR_pca, y_train.values.reshape(-1,))
# the score comes out a bit lower than with the full feature set
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_results_df.columns
target_col = ['rank_test_score', 'mean_test_score', 'param_n_estimators', 'param_max_depth']
cv_results_df[target_col].sort_values('rank_test_score').head()
# best parameters
grid_cv.best_params_
grid_cv.best_score_
# Apply to the test data
from sklearn.metrics import accuracy_score
rf_clf_best = grid_cv.best_estimator_
rf_clf_best
rf_clf_best.fit(HAR_pca, y_train.values.reshape(-1,))
pred1 = rf_clf_best.predict(pca.transform(X_test))
# pca.transform(X_test) : fitting PCA again on the test set would only make things worse;
# X_test has to be transformed with the parameters already fitted on the training data
accuracy_score(y_test, pred1)
# If xgboost errors out, downgrade the version and reinstall
# https://stackoverflow.com/questions/71996617/invalid-classes-inferred-from-unique-values-of-y-expected-0-1-2-3-4-5-got
#!pip uninstall xgboost
#!pip install xgboost==1.5.0
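# (Alternative sketch, my own assumption, if you stay on a newer xgboost instead:
# multiclass labels must then be 0-based, and in xgboost >= 2.0
# early_stopping_rounds moves from fit() into the constructor)
#from sklearn.preprocessing import LabelEncoder
#le = LabelEncoder()
#y_tr = le.fit_transform(y_train.values.reshape(-1,)) # 1..6 -> 0..5
#y_te = le.transform(y_test.values.reshape(-1,))
#xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3,
#                    early_stopping_rounds=10)
#xgb.fit(HAR_pca, y_tr, eval_set=[(pca.transform(X_test), y_te)])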
# xgboost
import time
from xgboost import XGBClassifier
evals = [(pca.transform(X_test), y_test)]
start_time = time.time()
xgb = XGBClassifier(n_estimators = 400, learning_rate=0.1, max_depth = 3)
xgb.fit(HAR_pca, y_train.values.reshape(-1,),
        early_stopping_rounds=10, eval_set=evals)
print('Fit time : ', time.time() - start_time)
# accuracy
accuracy_score(y_test, xgb.predict(pca.transform(X_test)))
MNIST : 28*28-pixel images of the digits 0 ~ 9 together with their labels; 60,000 training samples and 10,000 test samples
# Load the data
import pandas as pd
df_train = pd.read_csv('./MNIST/mnist_train.csv')
df_test = pd.read_csv('./MNIST/mnist_test.csv')
df_train.shape, df_test.shape
# the training data
df_train.head()
# the test data
df_test
# Organize the data
import numpy as np
X_train = np.array(df_train.iloc[:, 1:])
y_train = np.array(df_train['label'])
X_test = np.array(df_test.iloc[:, 1:])
y_test = np.array(df_test['label'])
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# Inspect the data
import random
samples = random.choices(population=range(0, 60000), k=16)
samples
# Check just 16 random samples
import matplotlib.pyplot as plt
plt.figure(figsize=(14, 12))
for idx, n in enumerate(samples):
    plt.subplot(4, 4, idx+1)
    plt.imshow(X_train[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.title(y_train[n])
plt.show()
# fit
from sklearn.neighbors import KNeighborsClassifier
import time
start_time = time.time()
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
print('Fit time : ', time.time() - start_time)
# predict on the test data
# kNN has to compute the distance to every training point... prediction takes a while
# let's reduce the dimensionality with PCA
from sklearn.metrics import accuracy_score
start_time = time.time()
pred = clf.predict(X_test)
print('Prediction time : ', time.time() - start_time)
print(accuracy_score(y_test, pred))
# Reduce the dimensionality with PCA
from sklearn.pipeline import Pipeline
# Pipeline : connects everything from preprocessing to training into a single object
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# GridSearchCV : checks candidate hyperparameter values with cross-validation, i.e. hyperparameter tuning
# Register the pipeline steps
pipe = Pipeline([
    ('pca', PCA()), ('clf', KNeighborsClassifier())])
parameters = {
    'pca__n_components':[2, 5, 10], 'clf__n_neighbors':[5, 10, 15]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
grid = GridSearchCV(pipe, parameters, cv=kf, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
# best score
print('Best score : %0.3f' % grid.best_score_)
print('Best parameters set : ')
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s : %r' % (param_name, best_parameters[param_name]))
# about 93% accuracy is achieved
accuracy_score(y_test, grid.best_estimator_.predict(X_test))
# Check the results
# e.g., 9 : precision 0.90 -> of the samples predicted as 9, 90% really are 9
def results(y_pred, y_test):
    from sklearn.metrics import classification_report, confusion_matrix
    print(classification_report(y_test, y_pred))
results(grid.predict(X_train), y_train)
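# (Sketch, my own addition) the confusion_matrix imported above shows where the errors
# land: rows are the true labels, columns the predicted labels
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, grid.predict(X_train)))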
# If you want to look at a digit again
n = 1000
plt.imshow(X_test[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
plt.show()
print('Answer is : ', grid.best_estimator_.predict(X_test[n].reshape(1,784)))
print('Real Label is : ', y_test[n])
# Check the misclassified data
preds = grid.best_estimator_.predict(X_test)
preds
y_test
# Pull out the misclassified samples (using preds from the tuned model)
wrong_results = X_test[y_test != preds]
wrong_results
wrong_results.shape[0]
samples = random.choices(population=range(0, wrong_results.shape[0]), k=16)
plt.figure(figsize=(14, 12))
for idx, n in enumerate(samples):
    plt.subplot(4, 4, idx+1)
    plt.imshow(wrong_results[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
    # interpolation='nearest' : interpolation fills in values between the pixel grid positions
    # so the image renders smoothly; 'nearest' simply uses the value of the nearest pixel
    # source : https://bentist.tistory.com/23
    plt.title(grid.best_estimator_.predict(wrong_results[n].reshape(1, 784))[0])
plt.show()
# The Titanic data
import pandas as pd
titanic_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/titanic.xls'
titanic = pd.read_excel(titanic_url)
titanic.head()
# Split out the name to build a title column (social status is embedded in the name)
import re
title = []
for idx, dataset in titanic.iterrows():
    title.append(re.search(r'\,\s\w+(\s\w+)?\.', dataset['name']).group()[2:-1])
titanic['title'] = title
titanic.head()
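# (Sketch with a sample name) what the regex captures is the text between ', ' and '.',
# which is exactly the title
re.search(r'\,\s\w+(\s\w+)?\.', 'Allen, Miss. Elisabeth Walton').group()[2:-1] # 'Miss'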
# Separate noble and commoner titles
titanic['title'] = titanic['title'].replace('Mlle', 'Miss')
titanic['title'] = titanic['title'].replace('Ms', 'Miss')
titanic['title'] = titanic['title'].replace('Mme', 'Mrs')
Rare_f = ['Dona', 'Dr', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Master']
for each in Rare_f:
    titanic['title'] = titanic['title'].replace(each, 'Rare_f')
for each in Rare_m:
    titanic['title'] = titanic['title'].replace(each, 'Rare_m')
titanic['title'].unique()
# Create a gender column
from sklearn.preprocessing import LabelEncoder
le_sex = LabelEncoder()
le_sex.fit(titanic['sex'])
titanic['gender'] = le_sex.transform(titanic['sex'])
# Create a grade column (the titles have to be converted to numbers)
from sklearn.preprocessing import LabelEncoder
le_grade = LabelEncoder()
le_grade.fit(titanic['title'])
titanic['grade'] = le_grade.transform(titanic['title'])
le_grade.classes_
# Organize the data
titanic.head()
# Keep only the rows without nulls
titanic = titanic[titanic['age'].notnull()]
titanic = titanic[titanic['fare'].notnull()]
titanic.info()
# Split the data
from sklearn.model_selection import train_test_split
X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender', 'grade']].astype('float')
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# Apply PCA
from sklearn.decomposition import PCA
def get_pca_data(ss_data, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    return pca.transform(ss_data), pca
def get_pd_from_pca(pca_data, col_num):
    cols = ['pca_'+str(n) for n in range(col_num)]
    return pd.DataFrame(pca_data, columns=cols)
import numpy as np
def print_variance_ratio(pca, only_sum=False):
    if only_sum == False:
        print('variance_ratio : ', pca.explained_variance_ratio_)
    print('sum of variance_ratio : ', np.sum(pca.explained_variance_ratio_))
# Transform onto two axes
pca_data, pca = get_pca_data(X_train, n_components=2)
print_variance_ratio(pca)
# Plot
import seaborn as sns
pca_columns = ['pca_1', 'pca_2']
pca_pd = pd.DataFrame(pca_data, columns=pca_columns)
pca_pd['survived'] = y_train
sns.pairplot(pca_pd, hue = 'survived', height=5, x_vars=['pca_1'], y_vars=['pca_2'])
# Again with three components
# transform onto three axes: PCA has to be re-fitted with n_components=3 first
pca_data, pca = get_pca_data(X_train, n_components=3)
pca_pd = get_pd_from_pca(pca_data, 3)
pca_pd['survived'] = y_train.values
pca_pd.head()
# Plot
# plotly.express
import plotly.express as px
fig = px.scatter_3d(pca_pd,
                    x='pca_0', y='pca_1', z='pca_2',
                    color='survived', symbol='survived',
                    opacity=0.4)
fig.update_layout(margin = dict(l=0, r=0, b=0, t=0))
fig.show()
# Build the pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
estimators = [('scaler', StandardScaler()),
              ('pca', PCA(n_components=3)),
              ('clf', KNeighborsClassifier(n_neighbors=20))]
pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
print(accuracy_score(y_test, pred))
# One last check : DiCaprio and Winslet
dicaprio = np.array([[3, 18, 0, 0, 5, 1, 1]])
print('Dicaprio : ', pipe.predict_proba(dicaprio)[0, 1])
winslet = np.array([[1, 16, 1, 1, 100, 0, 3]])
print('Winslet : ', pipe.predict_proba(winslet)[0, 1])
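# (Note, my own addition) the [0, 1] index picks the probability of class 1 (survived);
# the column order follows the classifier's classes_
pipe.classes_ # array([0, 1])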
That was really hard... :(
💻 Source : Zerobase Data Job School