# Hands-on practice with the iris dataset
from sklearn.preprocessing import scale
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# Feature names as provided - the trailing "(cm)" on each one is awkward.
iris.feature_names

# Trim the last five characters (the unit suffix) from every feature name.
cols = [name[:-5] for name in iris.feature_names]
cols

# Collect the measurements in a DataFrame labelled with the trimmed names.
iris_df = pd.DataFrame(iris.data, columns=cols)
iris_df.head()

# For convenience, work with only the two petal features.
feature = iris_df.loc[:, ['petal length', 'petal width']]
feature.head()
# Clustering with KMeans. Constructor parameters worth knowing:
#   n_clusters : number of clusters to form, i.e. the number of centroids
#   init       : how the initial centroid coordinates are chosen
#   max_iter   : maximum number of iterations; fitting ends earlier once the
#                centroids stop moving
# random_state is pinned (as in the other KMeans calls in this file) so the
# labels and centroids are reproducible from run to run.
model = KMeans(n_clusters=3, random_state=13)
model.fit(feature)

# Resulting labels. Unlike supervised-learning labels these only tell the
# clusters apart; the numbers carry no order or class meaning.
model.labels_

# Centroid coordinates.
model.cluster_centers_

# Reassemble features + assigned cluster into one frame for plotting.
# The index is passed explicitly so the column-wise concat lines up row by row.
predict = pd.DataFrame(model.predict(feature), columns=['cluster'],
                       index=feature.index)
feature = pd.concat([feature, predict], axis=1)
feature.head()

# Check the result: points coloured by cluster, centroids as red diamonds.
centers = pd.DataFrame(model.cluster_centers_,
                       columns=['petal length', 'petal width'])
center_x = centers['petal length']
center_y = centers['petal width']

plt.figure(figsize=(12, 8))
plt.scatter(feature['petal length'], feature['petal width'],
            c=feature['cluster'], alpha=0.5)
plt.scatter(center_x, center_y, s=50, marker='D', c='r')
plt.show()
# make_blobs - a synthetic data generator for practising clustering
from sklearn.datasets import make_blobs

X, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=3,
    cluster_std=0.8,
    random_state=0,
)
print(X.shape, y.shape)

# Distinct target values and how many samples carry each of them.
unique, counts = np.unique(y, return_counts=True)
print(unique, counts)

# Organise the samples and their generating label in one DataFrame.
cluster_df = pd.DataFrame(X, columns=['ftr1', 'ftr2']).assign(target=y)
cluster_df.head()
# Clustering
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, random_state=13)
cluster_labels = kmeans.fit_predict(X)
cluster_df['kmeans-label'] = cluster_labels

# Visualise the result: one marker style per cluster, with each centroid
# drawn as a large white marker that has the cluster number printed on it.
centers = kmeans.cluster_centers_
unique_labels = np.unique(cluster_labels)
markers = ['o', 's', '^', 'P', 'D', 'H', 'x']

for label in unique_labels:
    label_cluster = cluster_df[cluster_df['kmeans-label'] == label]
    center_x_y = centers[label]

    # Samples assigned to this cluster.
    plt.scatter(x=label_cluster['ftr1'], y=label_cluster['ftr2'],
                edgecolors='k', marker=markers[label])
    # Centroid background marker.
    plt.scatter(x=center_x_y[0], y=center_x_y[1], s=200, color='white',
                alpha=0.9, edgecolors='k', marker=markers[label])
    # Cluster number rendered via mathtext on top of the centroid.
    plt.scatter(x=center_x_y[0], y=center_x_y[1], s=70, color='k',
                edgecolors='k', marker='$%d$' % label)
# Show once, after every cluster has been drawn on the same axes.
plt.show()

# Check the result: how the generated targets map onto the KMeans labels.
print(cluster_df.groupby('target')['kmeans-label'].value_counts())
# Evaluating clustering results: a classifier has an evaluation criterion (the true labels), but clustering does not. Silhouette analysis is widely used to evaluate clustering results.
# Silhouette analysis: shows how effectively the clusters are separated from one another. It checks that other clusters lie far away while the data inside the same cluster are packed closely together. The better the clustering, the more evenly each cluster keeps a similar amount of margin around it.
# Silhouette coefficient: the clustering-quality indicator computed for each individual sample.
# Load the data
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import pandas as pd

iris = load_iris()
# Column names for the four iris measurements ('petal_length' was misspelled
# as 'patal_length').
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris_df = pd.DataFrame(data=iris.data, columns=feature_names)
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0).fit(iris_df)

# Record the clustering result next to the features.
iris_df['cluster'] = kmeans.labels_
iris_df.head()

# Work needed for evaluating the clustering: the mean silhouette score over
# all samples, and the per-sample silhouette values. The raw iris.data array
# is used so the 'cluster' column itself is not treated as a feature.
from sklearn.metrics import silhouette_samples, silhouette_score

avg_value = silhouette_score(iris.data, iris_df['cluster'])
score_values = silhouette_samples(iris.data, iris_df['cluster'])

print('avg_value', avg_value)
# The label text was mis-encoded in the file; restored to the intended Korean
# ("shape of the value returned by silhouette_samples()").
print('silhouette_samples() return 값의 shape', score_values.shape)
# Install yellowbrick (uncomment when it is not available yet)
#!pip install yellowbrick

# Silhouette plot of the result (clusters that come out as clean,
# rectangle-like bands indicate a good clustering)
from yellowbrick.cluster import silhouette_visualizer

silhouette_visualizer(
    kmeans,
    iris.data,
    colors='yellowbrick',
)
# Read the image
from matplotlib.image import imread

image = imread('./ladybug.png')
image.shape
plt.imshow(image)

# Cluster the pixels by colour
from sklearn.cluster import KMeans

# One row per pixel. The channel count is taken from the image itself instead
# of being hard-coded to 3, so a PNG with an alpha channel (RGBA) is handled
# too. NOTE(review): assumes the image has a trailing channel axis (H, W, C).
X = image.reshape(-1, image.shape[-1])
kmeans = KMeans(n_clusters=8, random_state=13).fit(X)  # try to split the colours into 8 groups

# Replace every pixel by the centroid colour of its cluster, then restore the
# original image layout.
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)

# Result - the variety of colours becomes simpler
plt.imshow(segmented_img)
# This time compare several numbers of clusters.
segmented_imgs = []
n_colors = (10, 8, 6, 4, 2)
for n_clusters in n_colors:
    kmeans = KMeans(n_clusters=n_clusters, random_state=13).fit(X)
    # Paint each pixel with its cluster's centroid colour.
    segmented_img = kmeans.cluster_centers_[kmeans.labels_]
    segmented_imgs.append(segmented_img.reshape(image.shape))

# Visualise the results in a 2x3 grid: the original image in the first cell,
# followed by one cell per cluster count.
plt.figure(figsize=(10, 5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)

plt.subplot(231)
plt.imshow(image)
plt.title('Original image')
plt.axis('off')

for idx, (n_clusters, img) in enumerate(zip(n_colors, segmented_imgs)):
    # Cells 2..6 of the grid (cell 1 holds the original).
    plt.subplot(232 + idx)
    plt.imshow(img)
    plt.title('{} colors'.format(n_clusters))
    plt.axis('off')
# Show once, after all panels have been drawn.
plt.show()
# MNIST-style digits data (loaded through scikit-learn's load_digits)
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X_digits,
    y_digits,
    random_state=13,
)

# Logistic regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(
    multi_class='ovr',
    solver='lbfgs',
    max_iter=5000,
    random_state=13,
)
log_reg.fit(X_train, y_train)

# Result: score on the held-out split
log_reg.score(X_test, y_test)
# Pipeline: run the data through KMeans first, as a kind of preprocessing
# step, before it reaches the logistic regression.
from sklearn.pipeline import Pipeline

pipline = Pipeline(steps=[
    ('kmeans', KMeans(n_clusters=50, random_state=13)),
    (
        'log_reg',
        LogisticRegression(
            multi_class='ovr',
            solver='lbfgs',
            max_iter=5000,
            random_state=13,
        ),
    ),
])
pipline.fit(X_train, y_train)
pipline.score(X_test, y_test)
# Grid search over the number of KMeans clusters used inside the pipeline
from sklearn.model_selection import GridSearchCV

# "<step name>__<parameter>" addresses a parameter of a pipeline step.
param_grid = {'kmeans__n_clusters': range(2, 100)}
grid_clf = GridSearchCV(pipline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)

# Best cluster count found, and the refit model's score on the test split.
grid_clf.best_params_
grid_clf.score(X_test, y_test)
# 🔻 Source: Zerobase Data Job School (제로베이스 데이터 취업 스쿨)