import sys
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import tensorflow as tf

from sklearn.datasets import make_blobs
from sklearn.cluster import MiniBatchKMeans, KMeans

Basics of K-means Clustering

The k-means algorithm searches for a pre-determined number of clusters within an unlabeled multidimensional dataset. It accomplishes this using a simple conception of what the optimal clustering looks like (a minimal sketch of one iteration follows this list):

  • The "cluster center" is the arithmetic mean of all the points belonging to the cluster.
  • Each point is closer to its own cluster center than to other cluster centers.
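
To make these two ideas concrete, here is a minimal NumPy sketch of a single k-means iteration (an assignment step followed by a mean-update step). The function name and variables are illustrative, not part of scikit-learn:

import numpy as np

def kmeans_step(X, centers):
    """One iteration of k-means: assign each point to its nearest center,
    then move each center to the mean of the points assigned to it.
    Assumes every cluster keeps at least one point."""
    # squared Euclidean distance from every point to every center
    dists = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
    labels = dists.argmin(axis=1)                         # assignment step
    new_centers = np.array([X[labels == k].mean(axis=0)   # update step
                            for k in range(len(centers))])
    return labels, new_centers

Repeating these two steps until the centers stop moving is essentially what KMeans does under the hood, plus a smarter k-means++ initialization.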

First exercise

First, let's generate a two-dimensional dataset containing four distinct blobs. To emphasize that this is an unsupervised algorithm, we will leave the labels out of the visualization.

xs,ys = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
plt.scatter(xs[:,0], xs[:,1], s=20);

By eye, it is relatively easy to pick out the four clusters. The k-means algorithm does this automatically, and in Scikit-Learn uses the typical estimator API:

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(xs)
y_cls = kmeans.predict(xs)
plt.scatter(xs[:,0], xs[:,1], c=y_cls, s=20, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:,0], centers[:,1], c='black', s=100, alpha=0.5);
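
As a side note, the separate fit and predict calls above can be collapsed into a single fit_predict call, which returns the cluster assignment of each training point directly:

kmeans = KMeans(n_clusters=4)
y_cls = kmeans.fit_predict(xs)  # equivalent to kmeans.fit(xs) followed by kmeans.predict(xs)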

Finding the optimal k

  • How to find the optimal k?

    • There is no one-click method to find the optimal k in k-means clustering
    • Elbow method
  • Inertia (distortion)

    • Sum of squared distances of samples to their closest cluster center
    • Decreases with an increasing number of clusters
    • Becomes zero when the number of clusters equals the number of points
  • Elbow method

    • The elbow plot helps indicate the number of clusters present in the data
    • It only gives an indication of the optimal k
    • It does not always pinpoint the exact k
    • Other methods: average silhouette (see the sketch after this list), gap statistic
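
The average silhouette mentioned in the last bullet is available directly in scikit-learn. As a rough sketch (reusing the xs blob data from the first exercise), we can compute it for a few values of k and prefer the one with the highest score:

from sklearn.metrics import silhouette_score

for k in range(2, 7):
    labels = KMeans(n_clusters=k).fit_predict(xs)
    print(k, silhouette_score(xs, labels))  # higher means better-separated clusters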

Elbow method

Let's see how the elbow plot looks on a data set with distinct, well-defined clusters.

inertias = []
n_clusters = range(1,10)
for i in n_clusters:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(xs)
    inertias.append(kmeans.inertia_)
    
elbow_plot = pd.DataFrame({'n_clusters': n_clusters, 'inertias': inertias })
sns.lineplot(x='n_clusters', y='inertias', data=elbow_plot);
plt.xticks(n_clusters);

K-Means Clustering for Images

Fashion MNIST

To keep things quick, I will load Fashion MNIST from the TensorFlow Keras library; it is often used as the "Hello World" of machine learning programs for computer vision.

fashion_mnist = tf.keras.datasets.fashion_mnist
(train_x, train_y), (test_x, test_y) = fashion_mnist.load_data()
CLASSES = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
NCLASSES = len(CLASSES)
class2label = {i:c for i,c in enumerate(CLASSES)}
print(f"training set: {train_x.shape}")
print(f"test set: {test_x.shape}")
training set: (60000, 28, 28)
test set: (10000, 28, 28)

As you can see, the dataset contains 28x28 grayscale images. Let's print some out and see what they look like.

fig,axs = plt.subplots(3,3,figsize=(12,12))
plt.gray()

for i,ax in enumerate(axs.flat):
    ax.imshow(train_x[i])
    ax.axis('off')
    ax.set_title(f'{class2label[train_y[i]]}')

Preprocessing

Images are formatted as 2-dimensional numpy arrays. However, the k-means clustering implementation provided by scikit-learn expects each sample to be a 1-dimensional array; as a result, we will need to reshape each image, or more precisely, flatten the data.

Scikit-learn's clustering algorithms expect each sample to be a 1-dimensional feature vector. For example, if you were clustering a set of X, Y coordinates, each point would be passed to the clustering algorithm as a 1-dimensional array of length two (for example, [2, 4] or [-1, 4]). If you were using 3-dimensional data, each array would have a length of 3 (for example, [2, 4, 1] or [-1, 4, 5]).

Fashion MNIST contains images that are 28 by 28 pixels; as a result, they will have a length of 784 once we reshape them into a 1-dimensional array.

train_x = train_x.reshape(len(train_x), -1)
print(train_x.shape)
(60000, 784)
train_x = train_x.astype(np.float32) / 255.0

Applying k-means clustering

Since the Fashion MNIST dataset is quite large, we will use the mini-batch implementation of k-means clustering (MiniBatchKMeans) provided by scikit-learn. This dramatically reduces the amount of time it takes to fit the algorithm to the data.
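
To get a feel for the speed difference, a rough and purely illustrative timing comparison on a subset of the (already flattened) training images might look like the sketch below; the actual numbers depend on your machine:

import time

subset = train_x[:10000]  # a subset keeps the full-batch KMeans fit manageable

t0 = time.time()
KMeans(n_clusters=10, n_init=3).fit(subset)
print(f"KMeans          : {time.time() - t0:.1f}s")

t0 = time.time()
MiniBatchKMeans(n_clusters=10, n_init=3).fit(subset)
print(f"MiniBatchKMeans : {time.time() - t0:.1f}s")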

Here, we simply set the n_clusters argument to the number of unique labels (10 in our case) and keep the default parameters of MiniBatchKMeans.

As you know, k-means clustering is an unsupervised learning algorithm, which means it does not require any labels for training.

kmeans = MiniBatchKMeans(n_clusters=NCLASSES)
kmeans.fit(train_x)
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=10, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)
kmeans.labels_
array([7, 4, 6, ..., 8, 6, 5], dtype=int32)

Important: these are not the real labels of each image; they are just the group (cluster) ids that the clustering model assigns to each input.

To match group ids with actual labels, we can tackle this with the following steps:

  • Group together all images assigned to the same cluster
  • Check the frequency distribution of their actual labels (using np.bincount)
  • Find the most frequent label with np.argmax and use it as the cluster's label
kmeans = MiniBatchKMeans(n_clusters=32)
kmeans.fit(train_x)
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=32, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)
# A first look: for each cluster, collect the true labels of its members
for i in range(kmeans.n_clusters):
    indices = np.where(kmeans.labels_ == i)
    cluster_labels = train_y[indices]
    #print(i, len(cluster_labels))
def get_label2cluster_mapper(kmeans, ground_trues):
    """Map each true label to the list of clusters whose members most often carry that label."""
    label2cluster = {}
    for i in range(kmeans.n_clusters):
        # true labels of all samples assigned to cluster i
        indices = np.where(kmeans.labels_ == i)
        labels = ground_trues[indices]
        if len(labels) == 0:
            continue  # skip empty clusters
        # most frequent true label within this cluster
        hist = np.bincount(labels)
        majority = np.argmax(hist)
        label2cluster.setdefault(majority, []).append(i)
    return label2cluster
def convert_cluster_to_label(clusters, mapper):
    """Translate cluster ids into predicted class labels using the mapper."""
    preds = np.zeros(len(clusters)).astype(np.uint8)
    for i, cl in enumerate(clusters):
        for k, v in mapper.items():
            if cl in v: preds[i] = k
    return preds
label2cluster = get_label2cluster_mapper(kmeans, train_y)
train_cl = kmeans.predict(train_x)
preds = convert_cluster_to_label(train_cl, label2cluster)
print(preds[:20])
print(train_y[:20])
[9 0 0 0 3 2 7 2 5 5 0 9 5 5 7 9 1 0 2 6]
[9 0 0 3 0 2 7 2 5 5 0 9 5 5 7 9 1 0 6 4]
print(f"Accuracy : {sum(preds == train_y)/len(preds) * 100}%")
Accuracy : 67.42%

As a result, some predicted labels are mismatched, but in general k-means clustering does an acceptable job of predicting labels.
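
To see which classes the clusters confuse most often, one option (an illustrative sketch reusing preds and train_y from above) is to plot a confusion matrix:

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(train_y, preds)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=CLASSES, yticklabels=CLASSES)
plt.xlabel('predicted label')
plt.ylabel('true label');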

Searching for the optimal k

With the functions defined above, we can now determine the accuracy of our algorithm. Since we are using this clustering algorithm for classification, accuracy is ultimately the most important metric; however, there are other metrics that can be applied directly to the clusters themselves, regardless of the associated labels. Two of these metrics that we will use are inertia and homogeneity. (See the detailed description of homogeneity_score.)
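
For reference, homogeneity is 1.0 when every cluster contains members of only a single class, regardless of how the clusters are numbered; a tiny sanity check with made-up label arrays illustrates this:

from sklearn.metrics import homogeneity_score

print(homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0: each cluster holds a single class
print(homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))  # 0.0: each cluster mixes both classes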

Furthermore, earlier we made the assumption that K = 10 was the appropriate number of clusters; however, this might not be the case. Let's fit the k-means clustering algorithm with several different values of K, then evaluate the performance using our metrics.

from sklearn.metrics import accuracy_score
from sklearn.metrics import homogeneity_score
from fastprogress.fastprogress import master_bar, progress_bar
def calc_metrics(estimator, data, labels):
    inertia = estimator.inertia_
    homogeneity = homogeneity_score(labels, estimator.labels_)
    return inertia, homogeneity
clusters = [10, 16, 32, 64, 128, 256]

mbar = master_bar(clusters)
mbar.write(['clusters', 'inertia', 'homogeneity', 'acc'], table=True)

for n_clusters in mbar:
    for j in progress_bar(range(1), parent=mbar):
        estimator = MiniBatchKMeans(n_clusters=n_clusters)
        estimator.fit(train_x)

        inertia,homo = calc_metrics(estimator, train_x, train_y)

        label2cluster = get_label2cluster_mapper(estimator, train_y)
        preds = convert_cluster_to_label(estimator.labels_, label2cluster)
        acc = accuracy_score(train_y, preds)
        mbar.write([f'{n_clusters}', f'{inertia:.2f}', f'{homo:.6f}', f'{acc:.6f}'], table=True)
clusters    inertia       homogeneity    acc
10          2025777.12    0.488483       0.553883
16          1745072.75    0.586858       0.647417
32          1562862.75    0.614943       0.660467
64          1326805.25    0.676044       0.712350
128         1390716.25    0.672226       0.710233
256         1089197.25    0.740135       0.771533

As a result, we found that as k increases, both the accuracy and the homogeneity also increase.

Evaluating the test set

test_x = test_x.reshape(len(test_x), -1)
test_x = test_x.astype(np.float32) / 255.0
model = MiniBatchKMeans(n_clusters=36)
model.fit(train_x);
label2cluster = get_label2cluster_mapper(model, train_y)
test_cl = model.predict(test_x)
preds = convert_cluster_to_label(test_cl, label2cluster)
print(preds[:20])
print(test_y[:20])
[5 2 1 1 2 1 0 6 5 7 2 5 7 3 4 1 2 2 8 0]
[9 2 1 1 6 1 4 6 5 7 4 5 7 3 4 1 2 4 8 0]
print(f"Accuracy : {sum(preds == test_y)/len(preds) * 100}%")
Accuracy : 65.48%
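
Before wrapping up, it is instructive to look at what the learned cluster centers represent. Each center lives in the same 784-dimensional space as the flattened images, so we can reshape it back to 28x28 and plot it; this is an illustrative sketch using the model fitted above, whose 36 centers fill a 6x6 grid:

fig, axs = plt.subplots(6, 6, figsize=(10, 10))
plt.gray()

# each cluster center is a 784-dimensional vector; reshape back to a 28x28 image
for ax, center in zip(axs.flat, model.cluster_centers_):
    ax.imshow(center.reshape(28, 28))
    ax.axis('off')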

Summary

Through this post, I built two k-means clustering models: one for a randomly generated dataset and one for the Fashion MNIST dataset. The data was first preprocessed by flattening and rescaling to the range 0.0 ~ 1.0, and then sklearn.cluster.KMeans and sklearn.cluster.MiniBatchKMeans were used to train and evaluate the models.