😢 Study Notes (Machine Learning 13)

zoe · May 26, 2023

Principal Component Analysis

  • PCA : a method for finding the components that best capture the differences between the data points in a dataset. Used in many areas, e.g. statistical data analysis (finding principal components), data compression (dimensionality reduction), and noise removal
    - Principal component analysis is widely used as a dimensionality-reduction and feature-extraction technique
    - PCA finds new orthogonal bases (axes) that preserve as much of the data's variance as possible, transforming samples from a high-dimensional space into a linearly uncorrelated lower-dimensional space (see the sketch below)
    - Feature extraction builds new variables by combining existing ones (distinguish this from feature selection)
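A minimal sketch (my addition, not from the original notes) of the idea behind this: the principal axes are the eigenvectors of the data's covariance matrix, ordered by eigenvalue (the variance along each axis).

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(13)
X = rng.randn(200, 2) @ rng.rand(2, 2)         # toy 2-D data
eigvals, eigvecs = np.linalg.eigh(np.cov(X - X.mean(axis=0), rowvar=False))
order = np.argsort(eigvals)[::-1]              # sort by descending variance
print(eigvecs[:, order].T)                     # principal axes (rows), up to sign
print(eigvals[order])                          # variance along each axis
print(PCA(n_components=2).fit(X).components_)  # same directions (signs may flip)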
# Generate simple data
import numpy as np
import seaborn as sns

sns.set_style('whitegrid')

rng = np.random.RandomState(13) # numpy.random.RandomState is a class (a seeded random-number generator)
X = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
X.shape
rng.rand(2, 2)  # rand : uniform random numbers in [0, 1)
rng.randn(2, 200) # randn : numbers drawn from the standard normal distribution (mean 0, std 1)
import matplotlib.pyplot as plt

plt.scatter(X[:,0], X[:, 1])
plt.axis('equal')
# fit

from sklearn.decomposition import PCA

pca = PCA(n_components = 2, random_state = 13)
# n_components = 2 : express the data with 2 principal components
pca.fit(X)
# Component vectors and explained variance

pca.components_ # 2 rows -> 2 component vectors
pca.explained_variance_ # how much variance each vector explains
# Prepare to draw the principal component vectors

def draw_vector(v0, v1, ax=None):
    ax = ax or plt.gca() 
    # if ax is None (no value given), use plt.gca(); otherwise use the ax that was passed in
    arrowprops = dict(arrowstyle='->', # arrow style
                      linewidth=2, color = 'black', shrinkA=0, shrinkB=0) 
    
    ax.annotate('', v1, v0, arrowprops=arrowprops)
# Draw

plt.scatter(X[:, 0], X[:, 1], alpha=0.4) # alpha : transparency
for length, vector in zip(pca.explained_variance_, pca.components_):
    v = vector * 3 * np.sqrt(length) # 3 is an arbitrary factor, just to scale the arrows to a sensible size
    draw_vector(pca.mean_, pca.mean_ + v)
plt.axis('equal')
plt.show()
pca.mean_ # the center of the data (the per-feature mean); the arrows above start from this point
  • ๋ฐ์ดํ„ฐ์˜ ์ฃผ์„ฑ๋ถ„์„ ์ฐพ์€ ๋‹ค์Œ ์ฃผ์ถ•์„ ๋ณ€๊ฒฝํ•˜๋Š” ๊ฒƒ๋„ ๊ฐ€๋Šฅํ•˜๋‹ค
# Set n_components = 1

pca = PCA(n_components=1, random_state=13)
pca.fit(X)
X_pca = pca.transform(X)
X_pca
print(pca.components_)
print(pca.explained_variance_)
pca.mean_
pca.explained_variance_ratio_ # a single component captures about 93% of the total variance
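A small check (my addition, not in the original notes): transform is nothing more than centering the data and projecting it onto the component vectors.

print(np.allclose((X - pca.mean_) @ pca.components_.T, X_pca))  # True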
# the result may look much like a linear regression fit

X_new = pca.inverse_transform(X_pca) # map back to the original space, as 2-D points on the principal axis
plt.scatter(X[:, 0], X[:, 1], alpha=0.3)
plt.scatter(X_new[:,0], X_new[:, 1], alpha=0.9)
plt.axis('equal')
plt.show()


Principal Component Analysis - iris

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()

iris_pd = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_pd['species'] = iris.target
iris_pd.head()
# Hard to inspect all 4 features at once

sns.pairplot(iris_pd, hue='species', height=3,
             x_vars=['sepal length (cm)', 'sepal width (cm)'],
             y_vars=['petal length (cm)', 'petal width (cm)'])
# Apply a scaler

from sklearn.preprocessing import StandardScaler

iris_ss = StandardScaler().fit_transform(iris.data)
iris_ss[:3]
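A quick sketch (my addition) of what StandardScaler just did: subtract each column's mean and divide by its standard deviation.

import numpy as np

manual = (iris.data - iris.data.mean(axis=0)) / iris.data.std(axis=0)
print(np.allclose(manual, iris_ss))  # True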
# Create a function that returns the PCA result

from sklearn.decomposition import PCA

def get_pca_data(ss_data, n_components = 2): # n_components = 2 : look at all 4 features and extract 2 component vectors
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    
    return pca.transform(ss_data), pca
iris_pca, pca = get_pca_data(iris_ss, 2)
iris_pca.shape
pca.mean_
pca.components_ 
# 4๊ฐœ์˜ ์š”์†Œ๋ฅผ ๊ฐ’์€ ๋‚˜์˜ค์ง€๋งŒ ์‹ค์ œ๋กœ๋Š” 2๊ฐœ์˜ ๋ฒกํ„ฐ๊ฐ€ ๋‚˜์˜จ๋‹ค
# ์ด๋ฆ„์€ ์•Œ ์ˆ˜ ์—†์ง€๋งŒ 4๊ฐœ์˜ ํฌ๊ธฐ์— pca 2๊ฐœ์˜ ๋ฒกํ„ฐ๋กœ ์ค„์–ด๋“ ๋‹ค
# Organize the PCA result into pandas

def get_pd_from_pca(pca_data, cols=['pca_component_1', 'pca_component_2']):
    return pd.DataFrame(pca_data, columns=cols)
# 4๊ฐœ์˜ ํŠน์„ฑ์„ ๋‘ ๊ฐœ์˜ ํŠน์„ฑ์œผ๋กœ ์ •๋ฆฌ

iris_pd_pca = get_pd_from_pca(iris_pca)
iris_pd_pca['species'] = iris.target
iris_pd_pca.head()
# ๋‘ ๊ฐœ์˜ ํŠน์„ฑ ๊ทธ๋ฆฌ๊ธฐ

sns.pairplot(iris_pd_pca, hue='species', height=5, 
             x_vars=['pca_component_1'], y_vars=['pca_component_2'])
# 4๊ฐœ์˜ ํŠน์„ฑ์„ ๋ชจ๋‘ ์‚ฌ์šฉํ•ด์„œ randomforest์— ์ ์šฉ

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def rf_scores(X ,y, cv=5):
    rf = RandomForestClassifier(random_state=13, n_estimators=100) 
    # n_estimators=100 : number of decision trees
    scores_rf = cross_val_score(rf, X, y, scoring = 'accuracy', cv = cv)
    
    print('Score : ', np.mean(scores_rf))

rf_scores(iris_ss, iris.target)
# ๋‘ ๊ฐœ์˜ ํŠน์„ฑ๋งŒ ์ ์šฉํ–ˆ์„ ๋•Œ

pca_X = iris_pd_pca[['pca_component_1', 'pca_component_2']]

rf_scores(pca_X, iris.target)


Principal Component Analysis - wine

wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'

wine = pd.read_csv(wine_url, sep=',', index_col=0)
wine.head()
# ์™€์ธ ์ƒ‰์ƒ ๋ถ„๋ฅ˜(red/white)

wine_y = wine['color']
wine_X = wine.drop(['color'], axis=1)
wine_X.head()
# Apply StandardScaler

wine_ss = StandardScaler().fit_transform(wine_X)
wine_ss[:3]
def print_variance_ratio(pca):
    print('variance_ratio : ' , pca.explained_variance_ratio_)
    print('sum of variance_ratio : ' , np.sum(pca.explained_variance_ratio_))
# ๋‘ ๊ฐœ์˜ ์ฃผ์„ฑ๋ถ„์œผ๋กœ ์ค„์ด๋Š” ๊ฒƒ์€ ๋ฐ์ดํ„ฐ์˜ 50%๊ฐ€ ์•ˆ๋œ๋‹ค

pca_wine, pca = get_pca_data(wine_ss, n_components=2)
print_variance_ratio(pca)
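To decide how many components are worth keeping, one option (a sketch, my addition) is to fit a full PCA and look at the cumulative explained variance.

pca_full = PCA().fit(wine_ss)
print(np.cumsum(pca_full.explained_variance_ratio_))  # cumulative variance per added component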
# Plot

pca_columns = ['pca_component_1','pca_component_2']
pca_wine_pd = pd.DataFrame(pca_wine, columns=pca_columns)
pca_wine_pd['color'] = wine_y.values


pca_wine_pd
sns.pairplot(pca_wine_pd, hue='color', height=5, 
             x_vars=['pca_component_1'], y_vars=['pca_component_2'])
# Applied to a Random Forest, there is little difference from the original data.

rf_scores(wine_ss, wine_y)
pca_X = pca_wine_pd[['pca_component_1', 'pca_component_2']]
rf_scores(pca_X, wine_y)
# With three principal components, over 98% can still be reached

pca_wine, pca = get_pca_data(wine_ss, n_components=3)
print_variance_ratio(pca)
cols = ['pca_1', 'pca_2', 'pca_3']
pca_wine_pd = get_pd_from_pca(pca_wine, cols=cols)

pca_X = pca_wine_pd[cols]
pca_X
rf_scores(pca_X, wine_y)
# Organize the three-component representation

pca_wine_plot = pca_X
pca_wine_plot['color'] = wine_y.values

pca_wine_plot.head()
# Draw in 3D

from mpl_toolkits.mplot3d import Axes3D

markers = ['^', 'o']

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

for i, marker in enumerate(markers):
    x_axis_data = pca_wine_plot[pca_wine_plot['color']==i]['pca_1']
    y_axis_data = pca_wine_plot[pca_wine_plot['color']==i]['pca_2']
    z_axis_data = pca_wine_plot[pca_wine_plot['color']==i]['pca_3']
    
    ax.scatter(x_axis_data, y_axis_data, z_axis_data, s = 20, alpha = 0.5, marker = marker)

ax.view_init(30, 80)
plt.show()
    
    
# plotly

import plotly.express as px

fig = px.scatter_3d(pca_wine_plot, x = 'pca_1', y='pca_2', z='pca_3',
                    color = 'color', symbol='color', opacity = 0.4)
fig.update_layout(margin = dict(l=0, r=0, b= 0, t= 0))
fig.show()


Principal Component Analysis - PCA eigenface

  • Olivetti data : usable for face recognition, but here only one person's data (10 photos) is used for PCA practice
# ๋ฐ์ดํ„ฐ ์ฝ๊ธฐ

from sklearn.datasets import fetch_olivetti_faces

faces_all = fetch_olivetti_faces()
print(faces_all.DESCR)
# Select only one subject's samples

K = 20
faces = faces_all.images[faces_all.target == K]
faces
import matplotlib.pyplot as plt

N = 2
M = 5

fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(faces[n], cmap=plt.cm.bone)
    ax.grid(False)
    ax.xaxis.set_ticks([]) # remove the axis ticks
    ax.yaxis.set_ticks([])
    
plt.suptitle('Olivetti')
plt.tight_layout()
plt.show()
    
# ๋‘ ๊ฐœ์˜ ์„ฑ๋ถ„์œผ๋กœ ๋ถ„์„

from sklearn.decomposition import PCA

pca = PCA(n_components=2)

X = faces_all.data[faces_all.target == K]
W = pca.fit_transform(X) # 2-D vectors representing the 10 photos
W
X.shape
import numpy as np

np.sqrt(4096) # each image is 64*64 = 4096 pixels, and there are 10 of them
X_inv = pca.inverse_transform(W)
X_inv
# Check the analyzed result

N = 2
M = 5

fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(X_inv[n].reshape(64, 64), cmap=plt.cm.bone) # reshape : restructure the flat 4096-vector back into a 64x64 image
    ax.grid(False)
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
    
plt.suptitle('PCA result')
plt.tight_layout()
plt.show()
# ์›์ ๊ณผ ๋‘ ๊ฐœ์˜ eigen face
# 10์žฅ์˜ ์‚ฌ์ง„์„ ์ด ์„ธ์žฅ์œผ๋กœ ๋ชจ๋‘ ํ‘œํ˜„ ๊ฐ€๋Šฅ

face_mean = pca.mean_.reshape(64, 64)
face_p1 = pca.components_[0].reshape(64, 64)
face_p2 = pca.components_[1].reshape(64, 64)

plt.figure(figsize=(12,7))
plt.subplot(131)
plt.imshow(face_mean, cmap=plt.cm.bone)
plt.grid(False) ; plt.xticks([]); plt.yticks([]); plt.title('mean')
plt.subplot(132)
plt.imshow(face_p1, cmap=plt.cm.bone)
plt.grid(False) ; plt.xticks([]); plt.yticks([]); plt.title('face_p1')
plt.subplot(133)
plt.imshow(face_p2, cmap=plt.cm.bone)
plt.grid(False) ; plt.xticks([]); plt.yticks([]); plt.title('face_p2')

plt.show()
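That claim can be checked directly (a sketch, my addition): each photo is approximately the mean face plus its two weights times the eigenfaces, which is exactly what inverse_transform computed above.

approx = pca.mean_ + W @ pca.components_  # mean + w1 * p1 + w2 * p2, per photo
print(np.allclose(approx, X_inv))         # True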
# Choose weights

import numpy as np

N = 2
M = 5
w = np.linspace(-5, 10, N*M)
w
# Variation along the first component (using mean and face_p1)

fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)

for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(face_mean + w[n] * face_p1, cmap=plt.cm.bone) # the weight times face_p1, added to the mean (the origin)
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.title('Weight : ' + str(round(w[n])))

plt.tight_layout()
plt.show()
# Variation along the second component (using mean and face_p2)

fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)

for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(face_mean + w[n] * face_p2, cmap=plt.cm.bone) # the weight times face_p2, added to the mean
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.title('Weight : ' + str(round(w[n])))

plt.tight_layout()
plt.show()
# ๋‘ ๊ฐœ์˜ ์„ฑ๋ถ„ ๋‹ค ํ‘œํ˜„

nx, ny = (5, 5)
x = np.linspace(-5, 8, nx)
y = np.linspace(-5, 8, ny)
w1, w2 = np.meshgrid(x, y)
w1, w2
w1.shape
w1 = w1.reshape(-1, ) # reshape(-1,) : flatten to 1-D; -1 tells numpy to infer the length
w2 = w2.reshape(-1, )
w1, w2
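A tiny example (my addition) of what meshgrid plus reshape(-1,) produce:

a, b = np.meshgrid([0, 1], [10, 20])
print(a.reshape(-1, ))  # [0 1 0 1]
print(b.reshape(-1, ))  # [10 10 20 20]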
# Recompose
fig = plt.figure(figsize=(12, 10))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)

N = 5
M = 5

for n in range(N*M):
    ax = fig.add_subplot(N, M, n+1)
    ax.imshow(face_mean + w1[n] * face_p1 + w2[n] * face_p2, cmap=plt.cm.bone) # mean plus the weighted combination of face_p1 and face_p2
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.title('Weight : ' + str(round(w1[n],1)) + ', ' + str(round(w2[n],1)))

plt.tight_layout()
plt.show()


Principal Component Analysis - HAR data

# Load the HAR data

import pandas as pd

url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt'

feature_name_df = pd.read_csv(url, sep=r'\s+', header=None, names=['column_index', 'column_name'])
feature_name = feature_name_df.iloc[:, 1].values.tolist()
X_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
                'master/dataset/HAR_dataset/train/X_train.txt'
X_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
                'master/dataset/HAR_dataset/test/X_test.txt'

X_train = pd.read_csv(X_train_url, sep='\s+', header=None)
X_test = pd.read_csv(X_test_url, sep='\s+', header=None)

X_train.columns = feature_name
X_test.columns = feature_name
y_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
                'master/dataset/HAR_dataset/train/y_train.txt'
y_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/' +\
                'master/dataset/HAR_dataset/test/y_test.txt'

y_train = pd.read_csv(y_train_url, sep='\s+', header=None)
y_test = pd.read_csv(y_test_url, sep='\s+', header=None)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# Create functions for reuse

from sklearn.decomposition import PCA

def get_pca_data(ss_data, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    
    return pca.transform(ss_data), pca
# PCA fit

HAR_pca, pca = get_pca_data(X_train, n_components=2)
HAR_pca.shape
pca.mean_.shape, pca.components_.shape
cols = ['pca_'+str(n) for n in range(pca.components_.shape[0])]
cols
# Function that stores the PCA result as a DataFrame

def get_pd_from_pca(pca_data, col_num):
    cols = ['pca_' + str(n) for n in range(col_num)]
    return pd.DataFrame(pca_data, columns=cols)
# 2 components

HAR_pca, pca = get_pca_data(X_train, n_components=2)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train
HAR_pd_pca.head()
import seaborn as  sns

sns.pairplot(HAR_pd_pca, hue='action', height=5, x_vars=['pca_0'], y_vars=['pca_1'])
# This is what reducing more than 500 features to just two looks like

import numpy as np

def print_variance_ratio(pca):
    print('variance_ratio : ', pca.explained_variance_ratio_)
    print('sum of variance_ratio : ', np.sum(pca.explained_variance_ratio_))

print_variance_ratio(pca)
# ์ปดํฌ๋„ŒํŠธ 3๊ฐœ ์ง„ํ–‰

HAR_pca, pca = get_pca_data(X_train, n_components=3)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train

print_variance_ratio(pca)
# ์ปดํฌ๋„ŒํŠธ 10๊ฐœ

HAR_pca, pca = get_pca_data(X_train, n_components=10)
HAR_pd_pca = get_pd_from_pca(HAR_pca, pca.components_.shape[0])
HAR_pd_pca['action'] = y_train

print_variance_ratio(pca)
%%time

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {
    'max_depth' : [6, 8, 10],
    'n_estimators' : [50, 100, 200],
    'min_samples_leaf' : [8, 12],
    'min_samples_split' : [8, 12]
}

rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(HAR_pca, y_train.values.reshape(-1,))
# Performance is slightly worse

cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_results_df.columns
target_col = ['rank_test_score', 'mean_test_score', 'param_n_estimators', 'param_max_depth']
cv_results_df[target_col].sort_values('rank_test_score').head()
# Best parameters

grid_cv.best_params_
grid_cv.best_score_
# Apply to the test data

from sklearn.metrics import accuracy_score

rf_clf_best = grid_cv.best_estimator_
rf_clf_best
rf_clf_best.fit(HAR_pca, y_train.values.reshape(-1,))

pred1 = rf_clf_best.predict(pca.transform(X_test))
# (pca.transform(X_test)) : fitting a separate PCA on the test set would only tangle things up;
# X_test must be transformed with the PCA parameters already fitted on the training data ★★★

accuracy_score(y_test, pred1)
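One way to make the starred rule above hard to violate (a sketch, my addition): bundle the PCA and the classifier into a Pipeline, so fit only ever sees the training data and the same transform is applied to the test data automatically.

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

har_pipe = Pipeline([('pca', PCA(n_components=10)),
                     ('rf', RandomForestClassifier(random_state=13, n_jobs=-1))])
har_pipe.fit(X_train, y_train.values.reshape(-1,))       # PCA is fit on the train set only
print(accuracy_score(y_test, har_pipe.predict(X_test)))  # transform happens inside the pipe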
# If xgboost fails, downgrade and reinstall
# https://stackoverflow.com/questions/71996617/invalid-classes-inferred-from-unique-values-of-y-expected-0-1-2-3-4-5-got

#!pip uninstall xgboost 
#!pip install xgboost==1.5.0
# xgboost

import time
from xgboost import XGBClassifier

evals = [(pca.transform(X_test), y_test)]

start_time = time.time()
xgb = XGBClassifier(n_estimators = 400, learning_rate=0.1, max_depth = 3)
xgb.fit(HAR_pca, y_train.values.reshape(-1,), 
        early_stopping_rounds=10, eval_set=evals)
print('Fit time : ', time.time() - start_time)
# accuracy

accuracy_score(y_test, xgb.predict(pca.transform(X_test)))




Principal Component Analysis - PCA and kNN

  • MNIST : a set of 28*28-pixel images of the digits 0-9 with labels, made up of 60,000 training samples and 10,000 test samples

  • https://www.kaggle.com/datasets/oddrationale/mnist-in-csv

# ๋ฐ์ดํ„ฐ ์ฝ๊ธฐ

import pandas as pd

df_train = pd.read_csv('./MNIST/mnist_train.csv')
df_test = pd.read_csv('./MNIST/mnist_test.csv')

df_train.shape, df_test.shape
# Train data

df_train.head()
# Test data

df_test
# ๋ฐ์ดํ„ฐ ์ •๋ฆฌ

import numpy as np

X_train = np.array(df_train.iloc[:, 1:])
y_train = np.array(df_train['label'])

X_test = np.array(df_test.iloc[:, 1:])
y_test = np.array(df_test['label'])

X_train.shape, y_train.shape, X_test.shape, y_test.shape
# ๋ฐ์ดํ„ฐ ํ™•์ธ

import random

samples = random.choices(population=range(0, 60000), k=16)
samples
# Check just 16 random samples

import matplotlib.pyplot as plt

plt.figure(figsize=(14, 12))

for idx, n in enumerate(samples):
    plt.subplot(4, 4, idx+1)
    plt.imshow(X_train[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.title(y_train[n])

plt.show()
# fit

from sklearn.neighbors import KNeighborsClassifier
import time

start_time = time.time()
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
print('Fit time : ', time.time() - start_time)
# Predict on the test data
# kNN has to compute distances to every training point... prediction takes a long time
# let's reduce the dimensionality with PCA

from sklearn.metrics import accuracy_score

start_time = time.time()
pred = clf.predict(X_test)
print('Predict time : ', time.time() - start_time)
print(accuracy_score(y_test, pred))
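Before the pipeline below, a direct sketch (my addition) of the speed-up: project both sets onto a handful of principal components first, then run kNN on the compressed data.

import time
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

pca10 = PCA(n_components=10).fit(X_train)        # fit the projection on the train set only
clf10 = KNeighborsClassifier(n_neighbors=5)
clf10.fit(pca10.transform(X_train), y_train)

start_time = time.time()
pred10 = clf10.predict(pca10.transform(X_test))  # distances in 10-D instead of 784-D
print('Predict time : ', time.time() - start_time)
print(accuracy_score(y_test, pred10))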
# Reduce the dimensionality with PCA, this time systematically

from sklearn.pipeline import Pipeline
# Pipeline : chains everything from preprocessing to training into one estimator
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# GridSearchCV : checks for optimal model parameter values via cross-validation, i.e. hyperparameter tuning

# Register the steps ('<step name>__<param name>' targets a parameter inside a pipeline step)
pipe = Pipeline([
    ('pca', PCA()), ('clf', KNeighborsClassifier())])

parameters = {
    'pca__n_components':[2, 5, 10], 'clf__n_neighbors':[5, 10, 15]}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
grid = GridSearchCV(pipe, parameters, cv=kf, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
# best score

print('Best score : %0.3f' % grid.best_score_)
print('Best parameters set : ')
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print('\t%s : %r' %(param_name, best_parameters[param_name]))
# About 93% accuracy is secured

accuracy_score(y_test, grid.best_estimator_.predict(X_test))
# Check the results
# e.g. class 9 : precision 0.90 -> of the samples predicted as 9, 90% really are 9
def results(y_pred, y_test):
    from sklearn.metrics import classification_report, confusion_matrix
    print(classification_report(y_test, y_pred))

results(grid.predict(X_train), y_train)
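For reference, a sketch (my addition) of where those precision numbers come from: with rows as true labels and columns as predictions, precision for class k is the diagonal entry of the confusion matrix divided by the sum of column k.

from sklearn.metrics import confusion_matrix
import numpy as np

cm = confusion_matrix(y_train, grid.predict(X_train))
print(cm.diagonal() / cm.sum(axis=0))  # per-class precision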
# To look at a digit again

n = 1000
plt.imshow(X_test[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
plt.show()

print('Answer is : ', grid.best_estimator_.predict(X_test[n].reshape(1,784)))
print('Real Label is : ', y_test[n])
# Check the misclassified data

preds = grid.best_estimator_.predict(X_test)
preds
y_test
# Collect the misclassified samples

wrong_results = X_test[y_test != preds]
wrong_results
wrong_results.shape[0]
samples = random.choices(population=range(0, wrong_results.shape[0]), k=16)

plt.figure(figsize=(14, 12))

for idx, n in enumerate(samples):
    plt.subplot(4, 4, idx+1)
    plt.imshow(wrong_results[n].reshape(28, 28), cmap='Greys', interpolation='nearest')
    # interpolation : controls how pixel values are smoothed between grid positions when the image is scaled;
    # 'nearest' does no smoothing and just uses the nearest pixel's value, so each pixel shows as a sharp square
    # source : https://bentist.tistory.com/23
    
    plt.title(grid.best_estimator_.predict(wrong_results[n].reshape(1, 784))[0])
    
plt.show()


titanic data using PCA and kNN

# ํƒ€์ดํƒ€๋‹‰ ๋ฐ์ดํ„ฐ

import pandas as pd

titanic_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/titanic.xls'

titanic = pd.read_excel(titanic_url)
titanic.head()
# Split the name out to build a title column (social status is reflected in the name)

import re

title = []

for idx, dataset in titanic.iterrows():
    title.append(re.search(r'\,\s\w+(\s\w+)?\.', dataset['name']).group()[2:-1])

titanic['title'] = title
titanic.head()
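To see what the regex actually grabs, a one-line sketch (my addition) on a hypothetical name of the same form:

# ', Miss.' is matched; the [2:-1] slice strips the leading ', ' and the trailing '.'
print(re.search(r'\,\s\w+(\s\w+)?\.', 'Allen, Miss. Elisabeth Walton').group()[2:-1])  # Miss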
# Distinguish noble and commoner ranks

titanic['title'] = titanic['title'].replace('Mlle', 'Miss')
titanic['title'] = titanic['title'].replace('Ms', 'Miss')
titanic['title'] = titanic['title'].replace('Mme', 'Mrs')

Rare_f = ['Dona', 'Dr', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Master']

for each in Rare_f:
    titanic['title'] = titanic['title'].replace(each, 'Rare_f')
    
for each in Rare_m:
    titanic['title'] = titanic['title'].replace(each, 'Rare_m')

titanic['title'].unique()
# Create the gender column

from sklearn.preprocessing import LabelEncoder

le_sex = LabelEncoder()
le_sex.fit(titanic['sex'])
titanic['gender'] = le_sex.transform(titanic['sex'])
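A quick look (my addition) at what LabelEncoder did: it maps the sorted unique labels to 0..n-1.

print(le_sex.classes_)                       # ['female' 'male']
print(le_sex.transform(['male', 'female']))  # [1 0]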
# Create the grade column (the titles must be converted to numbers)

from sklearn.preprocessing import LabelEncoder

le_grade = LabelEncoder()
le_grade.fit(titanic['title'])
titanic['grade'] = le_grade.transform(titanic['title'])

le_grade.classes_
# ๋ฐ์ดํ„ฐ ์ •๋ฆฌ

titanic.head()
# Keep only non-null rows

titanic = titanic[titanic['age'].notnull()]
titanic = titanic[titanic['fare'].notnull()]

titanic.info()
# ๋ฐ์ดํ„ฐ ๋‚˜๋ˆ„๊ธฐ

from sklearn.model_selection import train_test_split

X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare', 'gender', 'grade']].astype('float')
y = titanic['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
# Apply PCA

from sklearn.decomposition import PCA

def get_pca_data(ss_data, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    
    return pca.transform(ss_data), pca
def get_pd_from_pca(pca_data, col_num):
    cols = ['pca_'+str(n) for n in range(col_num)]
    return pd.DataFrame(pca_data, columns=cols)
import numpy as np

def print_variance_ratio(pca, only_sum=False):
    if only_sum == False:
        print('variance_ratio : ', pca.explained_variance_ratio_)
    print('sum of variance_ratio : ', np.sum(pca.explained_variance_ratio_))
    
# ๋‘ ๊ฐœ์˜ ์ถ•์œผ๋กœ ๋ณ€ํ™˜

pca_data, pca = get_pca_data(X_train, n_components=2)
print_variance_ratio(pca)
# Plot

import seaborn as sns

pca_columns = ['pca_1', 'pca_2']
pca_pd = pd.DataFrame(pca_data, columns=pca_columns)
pca_pd['survived'] = y_train

sns.pairplot(pca_pd, hue = 'survived', height=5, x_vars=['pca_1'], y_vars=['pca_2'])
# Transform to three axes this time

pca_data, pca = get_pca_data(X_train, n_components=3) # re-fit; without this, pca_data would still have only 2 columns
pca_pd = get_pd_from_pca(pca_data, 3)

pca_pd['survived'] = y_train.values
pca_pd.head()
# Plot with plotly.express

import plotly.express as px

fig = px.scatter_3d(pca_pd, 
                    x='pca_0', y='pca_1', z='pca_2',
                    color='survived', symbol='survived',
                    opacity=0.4)
fig.update_layout(margin = dict(l=0, r=0, b=0, t=0))
fig.show()
# Build the pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

estimators = [('scaler', StandardScaler()),
              ('pca', PCA(n_components=3)),
              ('clf', KNeighborsClassifier(n_neighbors=20))]

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
print(accuracy_score(y_test, pred))
# Check again with two hypothetical passengers

dicaprio = np.array([[3, 18, 0, 0, 5, 1, 1]]) # [pclass, age, sibsp, parch, fare, gender, grade]
print('Dicaprio : ', pipe.predict_proba(dicaprio)[0, 1]) # probability of survival


winslet = np.array([[1, 16, 1, 1, 100, 0, 3]])
print('Winslet : ', pipe.predict_proba(winslet)[0, 1])

๋งŽ์ด ์–ด๋ ต..ใ… ใ… 

💻 Source : Zerobase Data Job School
