0. Key Concepts and scikit-learn Algorithm API Links
sklearn.cluster.KMeans
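For reference, a minimal sketch of the basic KMeans API on toy data (the points below are illustrative only, not from the dataset used later):

import numpy as np
from sklearn.cluster import KMeans

pts = np.array([[1, 1], [1, 2], [2, 1], [8, 8], [8, 9], [9, 8]])  # two obvious groups
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(pts)
print(km.labels_)            # cluster index assigned to each point
print(km.cluster_centers_)   # centroid coordinates, shape (n_clusters, n_features)
print(km.inertia_)           # WCSS: sum of squared distances to the nearest centroid
print(km.predict([[0, 0]]))  # assign a new point to its nearest centroid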
1. Understanding Clustering
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')   # renamed 'seaborn-v0_8-deep' in matplotlib >= 3.6
import matplotlib.cm
cmap = matplotlib.cm.get_cmap('plasma')  # cluster colors; matplotlib.colormaps['plasma'] in >= 3.7
from sklearn.cluster import KMeans
data = pd.read_csv('Mall_Customers.csv')
X = data.iloc[:, [3, 4]]   # columns 3 and 4: annual income and spending score
X.head()
   Income  Spend
0      15     39
1      15     81
2      16      6
3      16     77
4      17     40
# Compute WCSS (within-cluster sum of squares) for k = 1..20
wcss = []
for i in range(1, 21):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
wcss
[269981.28000000014,
181665.82312925166,
106348.37306211119,
73679.78903948837,
44448.45544793369,
37239.83554245604,
30259.657207285458,
25018.576334776328,
21818.11458845217,
19664.68519600554,
17595.28888108518,
16286.850886958873,
14300.044641632878,
13167.778522689903,
12283.892784992784,
10853.593442084231,
10156.765398731703,
9410.284737974447,
8708.406236275801,
8039.671850613157]
plt.figure()
plt.plot(range(1,21), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
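The elbow is normally read off the plot by eye. As a rough programmatic check, the sharpest bend can be approximated with second differences of the WCSS curve (a heuristic sketch, not a scikit-learn feature, and it may not agree with the visual elbow):

wcss_arr = np.array(wcss)
# Second difference ~ curvature; its maximum marks the sharpest bend
second_diff = wcss_arr[:-2] - 2 * wcss_arr[1:-1] + wcss_arr[2:]
print('Heuristic elbow at k =', int(np.argmax(second_diff)) + 2)  # +2: first entry is k=2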

k = 5   # number of clusters chosen from the elbow in the plot above
kmeans = KMeans(n_clusters=k)
y_kmeans = kmeans.fit_predict(X)
y_kmeans
array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3,
1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 4, 3, 4, 0, 4, 0, 4,
3, 4, 0, 4, 0, 4, 0, 4, 0, 4, 3, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4,
0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4,
0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4,
0, 4])
# Attach each customer's cluster assignment to the original table
Group_cluster = pd.DataFrame(y_kmeans)
Group_cluster.columns = ['Group']
full_data = pd.concat([data, Group_cluster], axis=1)
full_data
      ID  Gender  Age  Income  Spend  Group
0      1    Male   19      15     39      1
1      2    Male   21      15     81      2
2      3  Female   20      16      6      1
3      4  Female   23      16     77      2
4      5  Female   31      17     40      1
..   ...     ...  ...     ...    ...    ...
195  196  Female   35     120     79      4
196  197  Female   45     126     28      0
197  198    Male   32     126     74      4
198  199    Male   32     137     18      0
199  200    Male   30     137     83      4

200 rows × 6 columns
kmeans_pred = KMeans(n_clusters=k, random_state=42).fit(X)  # fixed seed for reproducible centroids
kmeans_pred.cluster_centers_
array([[55.2962963 , 49.51851852],
[88.2 , 17.11428571],
[26.30434783, 20.91304348],
[25.72727273, 79.36363636],
[86.53846154, 82.12820513]])
kmeans_pred.predict([[100, 50], [30, 80]])
array([4, 3])
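kmeans_pred.predict assigns each new point to its nearest centroid. The same labels can be reproduced by hand with NumPy broadcasting (a small sketch):

new_points = np.array([[100, 50], [30, 80]])
# Distances from each new point to every centroid, shape (2, k)
dists = np.linalg.norm(new_points[:, None, :] - kmeans_pred.cluster_centers_[None, :, :], axis=2)
print(dists.argmin(axis=1))   # matches kmeans_pred.predict(new_points)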
labels = [('Cluster ' + str(i+1)) for i in range(k)]
labels
['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5']
X = np.array(X)   # convert to a NumPy array so boolean-mask indexing works below
plt.figure()
for i in range(k):
    plt.scatter(X[y_kmeans == i, 0], X[y_kmeans == i, 1], s=20,
                c=cmap(i / k), label=labels[i])

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=100, c='black', label='Centroids', marker='X')
plt.xlabel('Income')
plt.ylabel('Spend')
plt.title('K-Means cluster plot')
plt.legend()
plt.show()

2. K-Means Clustering
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
iris = pd.read_csv("iris.csv")
print(iris.head())
sepal_length sepal_width petal_length petal_width class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
x_iris = iris.drop(['class'], axis=1)   # four numeric features
y_iris = iris["class"]                  # species labels (used only for evaluation)
x_iris.head()
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
y_iris.head()
0 Iris-setosa
1 Iris-setosa
2 Iris-setosa
3 Iris-setosa
4 Iris-setosa
Name: class, dtype: object
x_iris.describe()
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
from sklearn.preprocessing import StandardScaler

# Standardize features to zero mean and unit variance before clustering
scale = StandardScaler()
scale.fit(x_iris)
X_scale = scale.transform(x_iris)
pd.DataFrame(X_scale).head()
0 1 2 3
0 -0.900681 1.032057 -1.341272 -1.312977
1 -1.143017 -0.124958 -1.341272 -1.312977
2 -1.385353 0.337848 -1.398138 -1.312977
3 -1.506521 0.106445 -1.284407 -1.312977
4 -1.021849 1.263460 -1.341272 -1.312977
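A quick sanity check (sketch): after StandardScaler every column should have mean ≈ 0 and standard deviation ≈ 1.

print(X_scale.mean(axis=0).round(6))  # ~0 for each feature
print(X_scale.std(axis=0).round(6))   # ~1 for each feature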
# Elbow analysis: fit K-Means for k = 1..9 on the scaled data
K = range(1, 10)
KM = [KMeans(n_clusters=k).fit(X_scale) for k in K]
centroids = [k.cluster_centers_ for k in KM]
# Distance from every scaled point to every centroid, for each k
D_k = [cdist(X_scale, centrds, 'euclidean') for centrds in centroids]
cIdx = [np.argmin(D, axis=1) for D in D_k]        # index of the nearest centroid
dist = [np.min(D, axis=1) for D in D_k]           # distance to the nearest centroid
avgWithinSS = [sum(d) / X_scale.shape[0] for d in dist]
wcss = [sum(d**2) for d in dist]                  # within-cluster sum of squares
tss = sum(pdist(X_scale)**2) / X_scale.shape[0]   # total sum of squares
bss = tss - wcss                                  # between-cluster sum of squares
# Elbow curve: average within-cluster sum of squares
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, avgWithinSS, 'b*-')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.show()

# Percentage of variance explained (BSS / TSS) as k grows
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, bss / tss * 100, 'b*-')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained')
plt.show()

# Eigen decomposition of a symmetric 2x2 matrix
w, v = np.linalg.eig(np.array([[0.91335, 0.75969], [0.75969, 0.69702]]))
print("\nEigen Values\n", w)
print("\nEigen Vectors\n", v)
Eigen Values
 [1.57253666 0.03783334]

Eigen Vectors
 [[ 0.75530088 -0.6553782 ]
  [ 0.6553782   0.75530088]]
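As in PCA, the eigenvalues can be converted into a variance-explained ratio (a small worked sketch using the values above):

explained = w / w.sum()
print(explained)   # -> approximately [0.977, 0.023]: the first component dominates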
k_means_fit = KMeans(n_clusters=4, max_iter=300)
k_means_fit.fit(X_scale)
KMeans(n_clusters=4)
k_means_fit.cluster_centers_
array([[-1.34320731, 0.12656736, -1.31407576, -1.30726051],
[-0.01139555, -0.87288504, 0.37688422, 0.31165355],
[-0.73463631, 1.45201075, -1.29704352, -1.21071997],
[ 1.16743407, 0.15377779, 1.00314548, 1.02963256]])
print("\nK-Means Clustering - Confusion Matrix\n\n",
      pd.crosstab(y_iris, k_means_fit.labels_, rownames=["Actual"], colnames=["Predicted"]))
K-Means Clustering - Confusion Matrix

Predicted         0   1   2   3
Actual
Iris-setosa      23   0  27   0
Iris-versicolor   0  39   0  11
Iris-virginica    0  17   0  33
# Note: the labels come from the fit on scaled data, while the score below
# is evaluated on the unscaled features, which lowers it somewhat
print("\nSilhouette-score: %0.3f" % silhouette_score(x_iris, k_means_fit.labels_, metric='euclidean'))
Silhouette-score: 0.356
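The overall silhouette score is the mean of per-sample values; scikit-learn's silhouette_samples can break it down by cluster to show which clusters are weakest (a sketch, scored on the same unscaled features as above):

from sklearn.metrics import silhouette_samples

sample_sil = silhouette_samples(x_iris, k_means_fit.labels_, metric='euclidean')
for c in np.unique(k_means_fit.labels_):
    print('Cluster %d mean silhouette: %0.3f' % (c, sample_sil[k_means_fit.labels_ == c].mean()))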
# Silhouette score across k (here both fit and scored on the unscaled features)
for k in range(2, 10):
    k_means_fitk = KMeans(n_clusters=k, max_iter=300)
    k_means_fitk.fit(x_iris)
    print("For K value", k, ", Silhouette-score: %0.3f"
          % silhouette_score(x_iris, k_means_fitk.labels_, metric='euclidean'))
For K value 2 ,Silhouette-score: 0.681
For K value 3 ,Silhouette-score: 0.553
For K value 4 ,Silhouette-score: 0.498
For K value 5 ,Silhouette-score: 0.489
For K value 6 ,Silhouette-score: 0.367
For K value 7 ,Silhouette-score: 0.358
For K value 8 ,Silhouette-score: 0.348
For K value 9 ,Silhouette-score: 0.345
3. Hierarchical Clustering
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')
import matplotlib.cm
cmap = matplotlib.cm.get_cmap('plasma')
data = pd.read_csv('Mall_Customers.csv')
X = data.iloc[:, [3,4]].values
import scipy.cluster.hierarchy as sch

# Build the Ward linkage tree and draw its dendrogram
plt.figure(1)
z = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(z)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Ward distance')
plt.show()
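The same linkage matrix z can also be cut into a flat clustering with scipy alone, without scikit-learn (a sketch; criterion='maxclust' requests a fixed number of clusters, and the returned labels are 1-based):

from scipy.cluster.hierarchy import fcluster

flat_labels = fcluster(z, t=5, criterion='maxclust')
print(np.unique(flat_labels, return_counts=True))  # cluster sizes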

k = 5
from sklearn.cluster import AgglomerativeClustering
# Ward linkage implies Euclidean distance; the 'affinity' keyword was
# renamed 'metric' in scikit-learn 1.2, so it is omitted here
hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
y_hc = hc.fit_predict(X)
labels = [('Cluster ' + str(i+1)) for i in range(k)]
plt.figure(2)
for i in range(k):
    plt.scatter(X[y_hc == i, 0], X[y_hc == i, 1], s=20,
                c=cmap(i / k), label=labels[i])
plt.xlabel('Income')
plt.ylabel('Spending score')
plt.title('HC cluster plot')
plt.legend()
plt.show()
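As a final check (sketch), the agreement between these hierarchical labels and the K-Means labels from section 1 can be quantified with the adjusted Rand index, which ignores the arbitrary numbering of clusters; this assumes y_kmeans is still in scope from above:

from sklearn.metrics import adjusted_rand_score

print('ARI (K-Means vs. hierarchical): %0.3f' % adjusted_rand_score(y_kmeans, y_hc))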
