In this post, I’d like to discuss about one of basic unsupervised machine learning algorithms, K-means clustering.

Basic idea of kmeans clustering is to find positions which minimize the variance of each cluster. Mathematical description is

here math

Kmeans clustering algorithm steps:

Generate K initial centroids at random positions
Assign each data point with a cluster-label by finding the centroid which is the closest to the data
Calculate new centroids by finding the first moment of each cluster over total number of data within the cluster
Repeat 2 and 3 until centroids converge

Implementation

In order to run this python code, we first need to import python libraries.

1
2
3
4
5
6


from random import random
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm

Generating Clusters

To generate k number of clusters, I started with k random points within xmin < x < xmax and ymin < y < ymax.
Then gernate total n data points randomly within the radius r centering at each initial point.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


# cluster generater((-xlen,-ylen) < (x,y) < (xlen,ylen) ). k: the number of clusters, r: radius of the cluster, 
#  n: the number of data points in each cluster

def cluster_gen(k,r,n,xlen,ylen): 
    data = []
    
    #create the center of clusters [x|y] coordinates
    kx = np.random.rand(k,1)*xlen*2 - xlen
    ky = np.random.rand(k,1)*ylen*2 - ylen
    kcent = np.append(kx,ky,1)
    #generate n data points around the center of cluster within radius r
    for cluster in range(k):
        for points in range(n):
            x = kcent[cluster,0] + r*(random())*math.cos(random()*np.pi)
            y = kcent[cluster,1] + r*(random())*math.cos(random()*np.pi)
            data.append([x,y])
            
    return np.array(data)

Test cluster generator

Let’s test the cluster generator. We defind k=3 clusters, n=50 points within r=5 radius, and initial bourdary as -10 < x < 10 and -10 < y < 10.

1
2
3
4


# test cluster generater
data = cluster_gen(k=3,r=5,n=50,xlen=20,ylen=20)
plt.scatter(data[:,0],data[:,1])
a = data[:,0]

Kmenas clustering algorithm

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58


# calculate distance between two points
def dist(a,b):
    d = (a[0]-b[0])**2 + (a[1]-b[1])**2
    d = math.sqrt(d)
    return d

# find xy coordinates of the centroid of a cluster
def cent_mass(data):
    l = len(data)
    x = sum(row[0] for row in data)
    y = sum(row[1] for row in data)
    cent = [x/l,y/l]
    return cent

def k_means(k,data):
    # generate initial center coordinates of k cluster within data range
    xrange = max(data[:,0]) - min(data[:,0])
    yrange = max(data[:,1]) - min(data[:,1])
    kx = (np.random.rand(k,1)*2 - 1)*xrange/2 + (max(data[:,0]) - xrange/2)
    ky = (np.random.rand(k,1)*2 - 1)*yrange/2 + (max(data[:,1]) - yrange/2)
    k_init = np.append(kx,ky,1)

    breaker = True
    
    while breaker:
        # label each data point with k-cluster
        cluster = []
        for i in range(len(data)):
            ksort = []
            cur_p = data[i]
            for j in range(k):
                k_cent = k_init[j]
                ksort.append(dist(cur_p,k_cent))
            ind_min = np.argmin(ksort)
            cluster.append(ind_min)
        cluster = np.array(cluster)
        cluster = cluster.reshape(len(cluster),1)

        # find a new coordinates for each k-means
        df = pd.DataFrame(cluster,columns = ['k'])
        k_new = []
        for i in range(k):
            k_index = df.index[df['k'] == i].tolist()
            data_in_kgroup = data[k_index]
            data_in_kgroup = np.append(data_in_kgroup,[k_init[i]],0)
            new_cm = cent_mass(data_in_kgroup)
            k_new.append(new_cm)
        k_init = np.array(k_init)
        k_new = np.array(k_new)
        k_diff = k_init - k_new
        
        if np.count_nonzero(k_diff == 0) == k_diff.size:
            breaker = False
            
        else:
            breaker = True
        k_init = k_new
    return k_new, cluster

Test Kmeans clustering algorithm

The test result is shown as color coded scatter plot. Each color represents

1
2
3


k_cord, k_data = k_means(k=3, data=data)
plt.scatter(data[:,0],data[:,1],c=k_data)
plt.scatter(k_cord[:,0],k_cord[:,1],c='red')

here images

This is basic steps for kmeans clustering algorithm. However, the result can be affected by initial centroids. Some initial centroids can be converged into the same cluster or can be converged without any cluster-label. In order to resolve the issue we can add two more steps

Additional steps

Repeat 1-4 steps multiple times and store average variance of
Find the centroid which has the minimum value of the total sum of sample variance of each cluster

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


#add additional steps to kmeans clustering algorithm
# num_inter: the number of iteration
def k_means_adapt(k, data, num_iter):
    kx = np.zeros([k,num_iter])
    ky = np.zeros([k,num_iter])
    kcluster = np.zeros([len(data),num_iter])
    cost = np.zeros([1,num_iter])
    k_coord = np.zeros([k,2])
    for i in range(num_iter):
        cost_temp = np.zeros([k,1])
        k_new, cluster = k_means(k, data)
        kcluster[:,i] = cluster.T
        ky[:,i] = k_new[:,1].T
        kx[:,i] = k_new[:,0].T
        
        df = pd.DataFrame(cluster, columns = ['k'])
        for j in range(k):
            index = df.index[df['k'] == j].tolist()
            kdata = data[index]
            l = len(kdata)
            if l == 0
                l = 10**-8
            dist_cost = np.sum(np.sqrt((kdata[:,0]-kx[j,0])**2+(kdata[:,1] - ky[j,0])**2))/l
            cost_temp[j,0] = dist_cost
        cost[0,i] = np.sum(cost_temp)
   
    index_min = np.argmin(cost)

    k_coord[:,0] = kx[:,index_min].T
    k_coord[:,1] = ky[:,index_min].T
    data_label = kcluster[:,index_min]
    
    return k_coord, data_label