Materials available at: https://forejune.co/cuda/
K-means Clustering
Groups data points into clusters based on their similarity.
Suppose we have customer data with features such as:
Age, Income, Shopping frequency
K-Means Clustering can group these customers into, say, 3 clusters:
Cluster 1: Young and budget-conscious
Cluster 2: Middle-aged professionals
Cluster 3: High-income frequent shoppers
A CUDA Implementation of K-means
A block contains one or more threads,
and a grid contains one or more blocks.
threadIdx.x = 1; threadIdx.y = 2; threadIdx.z = 0;
#ifndef __KMEANSCU_H__
#define __KMEANSCU_H__
#include <vector>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <stdio.h>
using namespace std;
struct Point {
float x, y;
int cluster;
};
class Centroid {
public:
float x, y;
Centroid() {
x = 0;
y = 0;
}
};
void kmeans(vector<Point>& h_points, vector<Centroid>&h_centroids, int iterations);
#endif
|
// kmeans.cu
// https://forejune.co/cuda
#include "kmeanscu.h"
#define THREADS_PER_BLOCK 256
__device__ float distance(const Point& p, const Centroid& c)
{
return sqrtf((p.x - c.x) * (p.x - c.x) + (p.y - c.y) * (p.y - c.y));
}
__global__ void clusters(Point* points, Centroid* centroids, int npoints, int k)
{
// one thread works on one point
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= npoints) return;
float minDist = 99999;
int bestCluster = 0;
for (int cl = 0; cl < k; cl++) {
float dist = distance(points[i], centroids[cl]);
if (dist < minDist) {
minDist = dist;
bestCluster = cl;
}
}
points[i].cluster = bestCluster;
}
void kmeans(vector<Point>& h_points, vector< Centroid>& h_centroids,int iterations){
int npoints = h_points.size();
int k = h_centroids.size();
Point* d_points;
Centroid* d_centroids;
cudaMalloc(&d_points, npoints * sizeof(Point));
cudaMemcpy(d_points, h_points.data(), npoints * sizeof(Point),
cudaMemcpyHostToDevice);
cudaMalloc(&d_centroids, k * sizeof(Centroid));
for (int it = 0; it < iterations; it++) {
cudaMemcpy(d_centroids, h_centroids.data(), k * sizeof(Centroid),
cudaMemcpyHostToDevice);
int nBlocks = ceil( (float) npoints / THREADS_PER_BLOCK );
clusters<<< nBlocks, THREADS_PER_BLOCK>>>(d_points, d_centroids,npoints,k);
cudaDeviceSynchronize();
cudaMemcpy(h_points.data(), d_points, npoints * sizeof(Point),
cudaMemcpyDeviceToHost);
int counts[k];
fill(counts, counts+k, 0); //initialize cluster counts to 0
Centroid new_centroids[k]; //default constructor set (x, y) to (0, 0)
for (int j = 0; j < npoints; j++) {
int cl = h_points[j].cluster;
new_centroids[cl].x += h_points[j].x;
new_centroids[cl].y += h_points[j].y;
++counts[cl];
}
for (int c = 0; c < k; c++) {
if (counts[c] > 0) {
h_centroids[c].x = new_centroids[c].x / counts[c];
h_centroids[c].y = new_centroids[c].y / counts[c];
}
}
}
cudaFree(d_points);
cudaFree(d_centroids);
}
|
Testing K-means routine
// testKmeans.cu
// https://forejune.co/cuda
#include "kmeanscu.h"
const int screenWidth = 800;
const int screenHeight = 600;
const int Npoints = 1000;
vector< Point>pointVec;
vector< Centroid>centroids;
int kvalue = 3;
int doKmeans(vector < Point> &points, vector< Centroid> & centroids)
{
srand(time(0));
int npoints = Npoints;
int k = kvalue;
int iterations = 20;
pointVec.resize(npoints);
for(int i = 0; i < npoints; i++) {
pointVec[i].x = rand() % screenWidth;
pointVec[i].y = rand() % screenHeight ;
pointVec[i].cluster = rand() % k;
}
centroids.resize( k );
for (int i = 0; i < k; ++i) {
centroids[i].x = pointVec[i].x;
centroids[i].y = pointVec[i].y;
}
kmeans(pointVec, centroids, iterations);
return 0;
}
int main()
{
doKmeans(pointVec, centroids);
//print out every twenty points
int newRow = 0;
for(int i = 0; i < Npoints; i+=20 ) {
printf("(%3.0f, %3.0f): %d\t", pointVec[i].x, pointVec[i].y, pointVec[i].cluster);
newRow++;
if (newRow % 4 == 0) printf("\n");
}
printf("\nCentroids: \n");
for(int i = 0; i < centroids.size(); i++)
printf("%5.2f, %5.2f\n", centroids[i].x, centroids[i].y);
return 0;
}
|
Sample Outputs:
(237, 93): 0 (322, 97): 0 (118, 278): 0 (211, 516): 0
(165, 166): 0 (440, 566): 1 ( 2, 139): 0 (285, 296): 0
( 44, 383): 0 (336, 79): 0 ( 58, 68): 0 (233, 171): 0
(299, 114): 0 (394, 391): 1 (524, 169): 2 (463, 183): 2
(741, 25): 2 (335, 88): 0 ( 27, 246): 0 (637, 183): 2
(424, 448): 1 (689, 253): 2 (284, 303): 0 (194, 537): 0
(769, 199): 2 (772, 25): 2 (190, 171): 0 (605, 33): 2
(242, 532): 0 (292, 422): 0 (652, 555): 1 (409, 26): 2
(749, 409): 1 (539, 578): 1 (314, 221): 0 ( 1, 388): 0
(380, 487): 1 (301, 464): 0 ( 58, 161): 0 (120, 288): 0
(525, 111): 2 (291, 82): 0 (634, 580): 1 (446, 363): 1
(124, 62): 0 (719, 103): 2 (424, 517): 1 (654, 415): 1
(123, 598): 0 (338, 222): 0
Centroids:
180.82, 272.67
533.72, 462.85
605.09, 148.98
Profiles:
$nvprof ./testKmeans
==5151== Profiling application: ./testKmeans
==5151== Profiling result:
Time(%) Time Calls Avg Min Max Name
42.35% 74.718us 20 3.7350us 3.5830us 5.4080us clusters(Point*, Centroid*, int, int)
40.24% 71.002us 20 3.5500us 3.4560us 4.1920us [CUDA memcpy DtoH]
17.41% 30.715us 21 1.4620us 1.3120us 3.6470us [CUDA memcpy HtoD]
==5151== API calls:
....
....
Using one thread
==5368== Profiling application: ./testKmeans1
==5368== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.69% 32.198ms 20 1.6099ms 1.6097ms 1.6126ms clusters(Point*, Centroid*, int, int)
0.22% 70.878us 20 3.5430us 3.4560us 4.1920us [CUDA memcpy DtoH]
0.09% 28.093us 21 1.3370us 672ns 3.1030us [CUDA memcpy HtoD]
....
....
Using a Simple Graphical Interface
// testKmeansg.cu
// https://forejune.co/cuda
#include <GL/glut.h>
#include "kmeanscu.h"
const int screenWidth = 800;
const int screenHeight = 600;
const int Npoints = 1000;
vector< Point>pointVec;
vector< Centroid>centroids;
int kvalue = 3;
int doKmeans(vector < Point> &points, vector< Centroid> & centroids)
{
srand(time(0));
int k = kvalue;
int iterations = 20;
pointVec.resize(Npoints);
for(int i = 0; i < Npoints; i++) {
pointVec[i].x = rand() % screenWidth;
pointVec[i].y = rand() % screenHeight ;
pointVec[i].cluster = 0;
}
centroids.resize( k );
for (int i = 0; i < k; ++i) {
centroids[i].x = pointVec[i].x;
centroids[i].y = pointVec[i].y;
}
kmeans(pointVec, centroids, iterations);
//print out every twenty points
int newRow = 0;
for(int i = 0; i < Npoints; i+=20 ) {
printf("(%3.0f, %3.0f): %d\t", pointVec[i].x, pointVec[i].y, pointVec[i].cluster);
newRow++;
if (newRow % 4 == 0) printf("\n");
}
printf("\nCentroids: \n");
for(int i = 0; i < centroids.size(); i++)
printf("%5.2f, %5.2f\n", centroids[i].x, centroids[i].y);
return 0;
}
void init()
{
glClearColor(1, 1, 1, 1); //clear color buffer with white color
glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
//define coordinate system
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0, screenWidth, 0, screenHeight);
}
void display()
{
glColor3f(0.0, 0.0, 0.0); //draw with black color
glLineWidth( 1 );
glBegin(GL_LINES);
glEnd();
//flush out data to color buffer
glFlush();
}
void drawDot(int x, int y)
{
glBegin(GL_POINTS);
glVertex2i(x, y);
glEnd();
glFlush();
}
void drawDots()
{
glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
int n = pointVec.size();
glPointSize( 5 );
glColor3f(0, 0, 0);
glBegin(GL_POINTS);
for(int i = 0; i < n; i++)
glVertex2f(pointVec[i].x, pointVec[i].y);
glEnd();
glFlush();
glutPostRedisplay();
}
void displayClusters( vector< Point> &points, vector< Centroid> & centroids )
{
glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
//display data points
int n = points.size();
int k = centroids.size();
glPointSize( 5 );
for (int i = 0; i < n; i++) {
int c = points[i].cluster;
float color[3] = {0}; //display color (R, G, B)
if ( c < 3 )
color[c] = 1; //Red, green or blue
else if ( c == 4 ){ //fourth color is black
color[0] = 1.0; //yellow
color[1] = 1.0;
} else if ( c == 5) {
color[1] = 1.0; //cyan
color[2] = 1.0;
}
glColor3fv(color);
drawDot(points[i].x, points[i].y);
}
//display centroids
glPointSize( 15 );
glColor3f(1, 0, 1); //magenta
for(int j = 0; j < k; j++)
drawDot(centroids[j].x, centroids[j].y);
glColor3f(0, 0, 0); //restore black color
glPointSize( 3 );
glutPostRedisplay();
}
void keyboard(unsigned char key, int x, int y)
{
switch ( key ) {
case 27: //escape
exit( -1 );
case 'c':
// displayClusters();
displayClusters(pointVec, centroids);
break;
case 'd':
drawDots();
break;
case '3':
kvalue = 3;
break;
case '4':
kvalue = 4;
break;
case '5':
kvalue = 5;
break;
case '6':
kvalue = 6;
break;
case 'k':
doKmeans(pointVec, centroids);
break;
}
}
int main(int argc, char *argv[])
{
glutInit(&argc, argv); //Initialization
glutInitDisplayMode(GLUT_SINGLE | GLUT_RGB );
glutInitWindowSize(screenWidth, screenHeight);
glutInitWindowPosition(400, 200);
glutCreateWindow( argv[0] );
init();
glutKeyboardFunc( keyboard );
glutDisplayFunc( display ); //specifies callback function for dispaly
glutMainLoop(); //go into perpetual loop
return 0;
}
|
Sample graphics output: