K-means Clustering of Unsupervised Learning in CUDA

Materials available at: https://forejune.co/cuda/

K-means Clustering

  Groups data points into clusters based on their similarity.

  Suppose we have customer data with features such as:

      Age, Income, Shopping frequency

  K-Means Clustering can group these customers into, say, 3 clusters: 

       Cluster 1: Young and budget-conscious

       Cluster 2: Middle-aged professionals

       Cluster 3: High-income frequent shoppers
	
Algorithm:

  1. Randomly pick K central points (centroids or means)

  2. Each data point is assigned to the closest centroid, thus obtain K clusters

  3. Update the centroids by finding the average position of each cluster

  4. Iterate the process for a number of times until the clusters become stable

A CUDA Implementation of K-means

	         A block contains one or more threads,
             and a grid contains one or more blocks.
        
        
                threadIdx.x = 1; threadIdx.y = 2; threadIdx.z = 0;
        
        one thread ~ one point

    Thread index:
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	
 
#ifndef __KMEANSCU_H__
#define __KMEANSCU_H__
#include <vector>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <stdio.h>

using namespace std;

struct Point {
    float x, y;
    int cluster;
};

class Centroid {
public:
    float x, y;
    Centroid() {
        x = 0;
        y = 0;
    }
};
void kmeans(vector<Point>& h_points, vector<Centroid>&h_centroids, int iterations);
#endif
 
                                
// kmeans.cu
// https://forejune.co/cuda

#include "kmeanscu.h"

#define THREADS_PER_BLOCK 256

__device__ float distance(const Point& p, const Centroid& c)
{
    return sqrtf((p.x - c.x) * (p.x - c.x) + (p.y - c.y) * (p.y - c.y));
}

__global__ void clusters(Point* points, Centroid* centroids, int npoints, int k)
{
    // one thread works on one point
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= npoints) return;

    float minDist = 99999;
    int bestCluster = 0;

    for (int cl = 0; cl < k; cl++) {
        float dist = distance(points[i], centroids[cl]);
        if (dist < minDist) {
            minDist = dist;
            bestCluster = cl;
        }
    }

    points[i].cluster = bestCluster;
}

void kmeans(vector<Point>& h_points, vector< Centroid>& h_centroids,int iterations){
    int npoints = h_points.size();
    int k = h_centroids.size();

    Point* d_points;
    Centroid* d_centroids;

    cudaMalloc(&d_points, npoints * sizeof(Point));
    cudaMemcpy(d_points, h_points.data(), npoints * sizeof(Point), 
    					cudaMemcpyHostToDevice);
    cudaMalloc(&d_centroids, k * sizeof(Centroid));

    for (int it = 0; it < iterations; it++) {
        cudaMemcpy(d_centroids, h_centroids.data(), k * sizeof(Centroid), 
				      cudaMemcpyHostToDevice);

        int nBlocks = ceil( (float) npoints / THREADS_PER_BLOCK );
        clusters<<< nBlocks, THREADS_PER_BLOCK>>>(d_points, d_centroids,npoints,k);
        cudaDeviceSynchronize();

        cudaMemcpy(h_points.data(), d_points, npoints * sizeof(Point), 
					cudaMemcpyDeviceToHost);

        int counts[k];
        fill(counts, counts+k, 0);      //initialize cluster counts to 0
        Centroid new_centroids[k];      //default constructor set (x, y) to (0, 0)
        for (int j = 0; j < npoints; j++) {
            int cl = h_points[j].cluster;
            new_centroids[cl].x += h_points[j].x;
            new_centroids[cl].y += h_points[j].y;
            ++counts[cl];
        }

        for (int c = 0; c < k; c++) {
            if (counts[c] > 0) {
                h_centroids[c].x = new_centroids[c].x / counts[c];
                h_centroids[c].y = new_centroids[c].y / counts[c];
            }
        }
    }

    cudaFree(d_points);
    cudaFree(d_centroids);
}

Testing K-means routine
 
                                
// testKmeans.cu
// https://forejune.co/cuda

#include "kmeanscu.h"

const int screenWidth = 800;
const int screenHeight = 600;
const int Npoints = 1000;

vector< Point>pointVec;
vector< Centroid>centroids;
int kvalue = 3;

int doKmeans(vector < Point> &points, vector< Centroid> & centroids)
{
    srand(time(0));
    int npoints = Npoints;
    int k = kvalue;
    int iterations = 20;

    pointVec.resize(npoints);
    for(int i = 0; i < npoints; i++) {
        pointVec[i].x = rand() % screenWidth;
        pointVec[i].y = rand() % screenHeight ;
        pointVec[i].cluster = rand() % k;
    }

    centroids.resize( k );
    for (int i = 0; i < k; ++i) {
        centroids[i].x = pointVec[i].x;
        centroids[i].y = pointVec[i].y;
    }

    kmeans(pointVec, centroids, iterations);

    return 0;
}

int main()
{
   doKmeans(pointVec, centroids);

   //print out every twenty points       
   int newRow = 0;
   for(int i = 0; i < Npoints; i+=20 ) {
      printf("(%3.0f, %3.0f): %d\t", pointVec[i].x, pointVec[i].y, pointVec[i].cluster);
      newRow++;
      if (newRow % 4 == 0) printf("\n");
   }

   printf("\nCentroids: \n");
   for(int i = 0; i < centroids.size(); i++)
       printf("%5.2f, %5.2f\n", centroids[i].x, centroids[i].y);


   return 0;
}

Sample Outputs:

	(237,  93): 0	(322,  97): 0	(118, 278): 0	(211, 516): 0
	(165, 166): 0	(440, 566): 1	(  2, 139): 0	(285, 296): 0
	( 44, 383): 0	(336,  79): 0	( 58,  68): 0	(233, 171): 0
	(299, 114): 0	(394, 391): 1	(524, 169): 2	(463, 183): 2
	(741,  25): 2	(335,  88): 0	( 27, 246): 0	(637, 183): 2
	(424, 448): 1	(689, 253): 2	(284, 303): 0	(194, 537): 0
	(769, 199): 2	(772,  25): 2	(190, 171): 0	(605,  33): 2
	(242, 532): 0	(292, 422): 0	(652, 555): 1	(409,  26): 2
	(749, 409): 1	(539, 578): 1	(314, 221): 0	(  1, 388): 0
	(380, 487): 1	(301, 464): 0	( 58, 161): 0	(120, 288): 0
	(525, 111): 2	(291,  82): 0	(634, 580): 1	(446, 363): 1
	(124,  62): 0	(719, 103): 2	(424, 517): 1	(654, 415): 1
	(123, 598): 0	(338, 222): 0
	Centroids:
	180.82, 272.67
	533.72, 462.85
	605.09, 148.98
	

Profiles:

	$nvprof ./testKmeans
	==5151== Profiling application: ./testKmeans
	==5151== Profiling result:
	Time(%)      Time     Calls       Avg       Min       Max  Name
 	42.35%  74.718us        20  3.7350us  3.5830us  5.4080us  clusters(Point*, Centroid*, int, int)
 	40.24%  71.002us        20  3.5500us  3.4560us  4.1920us  [CUDA memcpy DtoH]
 	17.41%  30.715us        21  1.4620us  1.3120us  3.6470us  [CUDA memcpy HtoD]

	==5151== API calls:
	....
	....
	
	==5368== Profiling application: ./testKmeans1
	==5368== Profiling result:
	Time(%)      Time     Calls       Avg       Min       Max  Name
	99.69%  32.198ms        20  1.6099ms  1.6097ms  1.6126ms  clusters(Point*, Centroid*, int, int)
	0.22%   70.878us        20  3.5430us  3.4560us  4.1920us  [CUDA memcpy DtoH]
	0.09%   28.093us        21  1.3370us     672ns  3.1030us  [CUDA memcpy HtoD]
	....
	....
	

Using a Simple Graphical Interface
 
				

// testKmeansg.cu
// https://forejune.co/cuda

#include <GL/glut.h>
#include "kmeanscu.h"

const int screenWidth = 800;
const int screenHeight = 600;
const int Npoints = 1000;

vector< Point>pointVec;
vector< Centroid>centroids;
int kvalue = 3;

int doKmeans(vector < Point> &points, vector< Centroid> & centroids)
{
    srand(time(0));
    int k = kvalue;
    int iterations = 20;

    pointVec.resize(Npoints);
    for(int i = 0; i < Npoints; i++) {
        pointVec[i].x = rand() % screenWidth;
        pointVec[i].y = rand() % screenHeight ;
        pointVec[i].cluster = 0;
    }

    centroids.resize( k );
    for (int i = 0; i < k; ++i) {
        centroids[i].x = pointVec[i].x;
        centroids[i].y = pointVec[i].y;
    }

    kmeans(pointVec, centroids, iterations);

   //print out every twenty points       
   int newRow = 0;
   for(int i = 0; i < Npoints; i+=20 ) {
      printf("(%3.0f, %3.0f): %d\t", pointVec[i].x, pointVec[i].y, pointVec[i].cluster);
      newRow++;
      if (newRow % 4 == 0) printf("\n");
   }

   printf("\nCentroids: \n");
   for(int i = 0; i < centroids.size(); i++)
       printf("%5.2f, %5.2f\n", centroids[i].x, centroids[i].y);


    return 0;
}

void init()
{
  glClearColor(1, 1, 1, 1);     //clear color buffer with white color
  glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
  //define coordinate system
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  gluOrtho2D(0, screenWidth, 0, screenHeight);
}

void display()
{
  glColor3f(0.0, 0.0, 0.0);             //draw with black color
   glLineWidth( 1 );
  glBegin(GL_LINES);
  glEnd();

  //flush out data to color buffer 
  glFlush();            
}

void drawDot(int x, int y)
{
  glBegin(GL_POINTS);
    glVertex2i(x, y);
  glEnd();
  glFlush();
}

void drawDots()
{
  glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
  int n = pointVec.size();
  glPointSize( 5 );
  glColor3f(0, 0, 0);
  glBegin(GL_POINTS);
  for(int i = 0; i < n; i++) 
    glVertex2f(pointVec[i].x, pointVec[i].y);
  glEnd();
  glFlush();
  glutPostRedisplay();
}

void displayClusters( vector< Point> &points, vector< Centroid> & centroids )
{
  glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
  //display data points
  int n = points.size();
  int k = centroids.size();

  glPointSize( 5 );
  for (int i = 0; i < n; i++) {
    int c = points[i].cluster;
    float color[3] = {0};       //display color (R, G, B)
    if ( c < 3 )
      color[c] = 1;    //Red, green or blue
    else if ( c == 4 ){ //fourth color is black 
      color[0] = 1.0;  //yellow 
      color[1] = 1.0;  
    } else if ( c == 5) {
      color[1] = 1.0;  //cyan
      color[2] = 1.0;  
    }
    glColor3fv(color);
    drawDot(points[i].x, points[i].y);
  }
  //display centroids
  glPointSize( 15 );
  glColor3f(1, 0, 1);   //magenta       
  for(int j = 0; j < k; j++)
    drawDot(centroids[j].x, centroids[j].y);
  glColor3f(0, 0, 0);   //restore black color
  glPointSize( 3 );
  glutPostRedisplay();
}

void keyboard(unsigned char key, int x, int y)
{
  switch ( key ) {
    case 27:            //escape
        exit( -1 );
    case 'c':
       // displayClusters();
	displayClusters(pointVec, centroids);
	break;
    case 'd':
	drawDots();
        break;
    case '3':
	kvalue = 3;
	break;
    case '4':
	kvalue = 4;
	break;
    case '5':
	kvalue = 5;
	break;
    case '6':
	kvalue = 6;
	break;
    case 'k':
	doKmeans(pointVec, centroids);
	break;
  }
}

int main(int argc, char *argv[])
{
  glutInit(&argc, argv);        //Initialization
  glutInitDisplayMode(GLUT_SINGLE | GLUT_RGB );
  glutInitWindowSize(screenWidth, screenHeight);
  glutInitWindowPosition(400, 200);
  glutCreateWindow( argv[0] );
  init();
  glutKeyboardFunc( keyboard );
  glutDisplayFunc( display );   //specifies callback function for dispaly
  glutMainLoop();               //go into perpetual loop
  return 0;
}

Sample graphics output: