K-means Clustering Using a Multilayer Perceptron (MLP) in CUDA

Materials available at: https://forejune.co/cuda/

Overview

Review

    K-means Clustering

       see https://forejune.co/cuda/ai/unsupervised/unsupercu.html

    Multi-layer Perceptron (MLP)

       see https://forejune.co/cuda/ai/mlp/mlpcu.html
	
 
                                
// mlpcu.h
// https://forejune.co/cuda

#ifndef __MLPCU_H__
#define __MLPCU_H__

// NOTE(review): `using namespace std;` in a header leaks into every file that
// includes it; consider qualifying std:: names instead.
using namespace std;

#define n1 3 	// number of input nodes: bias + the (x, y) point coordinates
#define m1 7	// number of hidden nodes; h[0] is the fixed bias node
#define K  4	// number of output nodes = number of clusters
#define numSamples 600	// number of training samples
#define eta 0.5	// learning rate

// Simple MLP class
// Simple MLP class: a 1-hidden-layer perceptron (n1 inputs, m1 hidden nodes,
// K outputs) trained on the GPU (train) and evaluated on the host (predicted).
// NOTE(review): relies on <vector> and Point being visible via a prior
// include (kmeanscu.h in mlp.cu) — confirm include order in all users.
class MLP
{
private:
    double a[m1];  //linear sum of products of inputs and weights
    double h[m1];  //hidden nodes h[j] = g(a[j]); h[0] is the bias (1)
    double y[K];   //predicted output 
    double z[K];   //linear sum of products of weights and hidden nodes
    // Device-side buffers, allocated in the constructor, freed in destructor.
    double *d_inputs;
    double *d_labels;
    double *d_a;
    double *d_h;        //hidden layer
    double *d_y;        //output
    double *d_yd;       //desired output
    double *d_z;
    double *d_w;        //input-to-hidden weights (n1 x m1, row-major)
    double *d_wo;       //hidden-to-output weights (m1 x K, row-major)
public:
    MLP();
    //Sigmoid activation function
    double g(double x);
    //Derivative of sigmoid function
    double gd(double x);
    //Forward propagation; returns pointer to the internal output array y[K]
    double* predicted(double x[]);
    //Train on the clustered points for the given number of epochs (on GPU)
    void train(const vector< Point> &points, int epochs);
    ~MLP();
};
#endif
 
                                
// mlp.cu
// https://forejune.co/cuda

#include <iostream>
#include <iomanip>
#include <cmath>
#include <stdio.h>
#include "kmeanscu.h"
#include "mlpcu.h"

using namespace std;

double w[n1][m1];	//input-to-hidden weights; w[i][j] connects input i to hidden j
double wo[m1][K];	//hidden-to-output weights; wo[j][l] connects hidden j to output l

//Sigmoid activation function
__device__ double d_g(double x) 
{
      return 1.0 / (1.0 + exp(-x));
}
    
//Derivative of sigmoid function
__device__ double d_gd(double x) 
{
    double s = d_g(x);
     
    return s * (1 - s);
}

//y[] is the output
__global__ void d_forward(int k, double *w, double *wo, double *h, double *a, 
                               double *z,  double *y, double *d_inputs)
{
  __shared__ double x[n1];
  int ii = threadIdx.x;		//indexing input nodes
  int jj, ll;
  if (ii < n1) 	//inputs
    x[ii] = d_inputs[k*n1+ii];
  __syncthreads();

  //hidden layer activation, all hidden layer threads work in parallel
  if ( ii >= n1 && ii < n1+m1) {
    jj = ii - n1;
    a[jj] = 0;
    for (int i = 0; i < n1; i++) 
       a[jj] += w[i*m1+jj] * x[i]; 
    if ( jj > 0 )
       h[jj] = d_g(a[jj]); 	//sigmoid activation function
  }
  __syncthreads();

   //output layer activation, all output layer threads work in parallel
   if (ii >= n1 + m1){
     ll = ii - n1 - m1;
     z[ll] = 0; 
     for(int j = 0; j < m1; j++) 
       z[ll] += wo[j*K+ll] * h[j];  
      y[ll] = d_g( z[ll] );
   }
   __syncthreads();
}
  
//Backward propagation, yd is the desired output
__global__  void d_backward(int k, double *w, double *wo,  double *h, double *a, 
		            double *z, double *y, double *d_inputs, double *d_labels)
{
  __shared__ double delta[m1];
  __shared__ double deltao[K];
  __shared__ double w_share[n1][m1];	//just for easier readability, not necessarily needed
  __shared__ double wo_share[m1][K];
  __shared__ double x[n1];	//inputs
  __shared__ double yd[K];	//desired outputs
  int ii = threadIdx.x;		//indexing input nodes
  int jj = threadIdx.y;		//indexing hidden nodes
  int ll  = threadIdx.z;	//indexing output nodes
  if (jj == 0 && ll == 0 )
    x[ii] = d_inputs[k*n1+ii];
  __syncthreads();
  if ( ii == 0 && jj == 0 )
    yd[ll] = d_labels[k*K+ll];
  __syncthreads();
  if (ll == 0 )
    w_share[ii][jj] = w[ii*m1+jj]; 
  __syncthreads();
  if (ii == 0 )
    wo_share[jj][ll] = wo[jj*K+ll]; 
  __syncthreads();
  //output threads work in parallel 
  if (ii == 0 && jj == 0 ) {
  double e = yd[ll] - y[ll];  //output layer error
  deltao[ll] = e * d_gd(z[ll]);
  }
  __syncthreads();

   //Compute hidden layer error, hidden threads work in parallel
  if (ii == 0 && ll == 0 ) {
   delta[jj] = 0;
   for(int l = 0; l < K; l++)
     delta[jj] += deltao[l] * wo_share[jj][l] * d_gd(a[jj]);
  }
     //delta[jj] += deltao[l] * wo[jj*K+l] * d_gd(a[jj]);
    __syncthreads();
    
    //Update weights (hidden to output), hidden and output threads work in parallel 
  if (ii == 0 ) {
    wo_share[jj][ll] += eta * deltao[ll] * h[jj];
  }
    __syncthreads();
     
    //Update weights (input to hidden), input and hidden threads work in parallel
    if ( ll==0 ) {
      w_share[ii][jj] += eta * delta[jj] * x[ii];
    } 
    syncthreads();
   if ( ll == 0 ) 
    w[ii*m1+jj] =  w_share[ii][jj]; 
    __syncthreads();

   if ( ii == 0 ) 
    wo[jj*K+ll] = wo_share[jj][ll]; 
    __syncthreads();
} 

// Constructor: allocate all device buffers, draw initial weights uniformly
// from [0, 1), and copy the initial weights and hidden-node vector to the
// device. NOTE(review): cudaMalloc/cudaMemcpy return codes are not checked,
// matching the original; consider a CUDA_CHECK macro.
MLP::MLP()
{
	int wSize = n1 * m1 * sizeof(double);
	int woSize = m1 * K * sizeof(double);
	cudaMalloc((void**)&d_inputs, numSamples * n1 * sizeof(double));
	cudaMalloc((void**)&d_labels, numSamples * K * sizeof(double));
	cudaMalloc((void**)&d_a, m1 * sizeof(double));
	cudaMalloc((void**)&d_h, m1 * sizeof(double));
	cudaMalloc((void**)&d_z, K * sizeof(double));
	cudaMalloc((void**)&d_y, K * sizeof(double));
	cudaMalloc((void**)&d_yd, K * sizeof(double));
	cudaMalloc((void**)&d_w, wSize);
	cudaMalloc((void**)&d_wo, woSize);

	//random initial weights in [0, 1)
	for (int i = 0; i < n1; i++)
	  for (int j = 0; j < m1; j++)
	    w[i][j] = rand() % 1000 / 1000.0;

	for (int j = 0; j < m1; j++)
	  for (int l = 0; l < K; l++)
	    wo[j][l] = rand() % 1000 / 1000.0;

	cudaMemcpy(d_w, w, wSize, cudaMemcpyHostToDevice);
	cudaMemcpy(d_wo, wo, woSize, cudaMemcpyHostToDevice);

	// BUG FIX: initialize every hidden node before copying to the device.
	// Previously only h[0] was set, so h[1..m1-1] were uninitialized host
	// memory when memcpy'd (the kernels overwrite them, but copying
	// indeterminate values is still a defect).
	for (int j = 1; j < m1; j++)
	  h[j] = 0;
	h[0] = 1;	//for bias
	cudaMemcpy(d_h, h, m1 * sizeof(double), cudaMemcpyHostToDevice);
}

//Sigmoid activation function
// Logistic sigmoid on the host: maps any real x into (0, 1).
double MLP::g(double x)
{
    const double decay = exp(-x);
    return 1.0 / (1.0 + decay);
}

//Derivative of sigmoid function
// Derivative of the sigmoid: g'(x) = g(x) * (1 - g(x)).
double MLP::gd(double x)
{
    const double sig = g(x);
    return sig * (1.0 - sig);
}

//Forward propagation
// Forward propagation on the host using the trained weights.
// x[] has n1 entries (x[0] is the bias, set by the caller); returns a pointer
// to the internal output array y[K].
double* MLP::predicted(double x[])
{
	// Hidden layer: a[j] = sum_i w[i][j] * x[i], h[j] = g(a[j]).
	// h[0] is the bias node and is never overwritten.
	for (int jj = 0; jj < m1; jj++) {
	  double sum = 0;
	  for (int ii = 0; ii < n1; ii++)
	    sum += w[ii][jj] * x[ii];
	  a[jj] = sum;
	  if (jj != 0)
	    h[jj] = g(sum);
	}
	// Output layer: z[l] = sum_j wo[j][l] * h[j], y[l] = g(z[l]).
	for (int ll = 0; ll < K; ll++) {
	  double acc = 0;
	  for (int jj = 0; jj < m1; jj++)
	    acc += wo[jj][ll] * h[jj];
	  z[ll] = acc;
	  y[ll] = g(acc);
	}
	return y;  //return output
}

// Train the perceptron
// eta = learning rate
// epochs = number of times of training 
// numSamples = number of different input sets (x_data)
// yd is target output
// Train the perceptron on the GPU.
// points  = clustered samples; (x, y) are the inputs, .cluster the target
// epochs  = number of passes over the full data set
// Targets are one-hot encoded over the K clusters; learning rate is eta.
void MLP::train(const vector< Point> &points, int epochs)
{
	double inputs[numSamples][n1];
	double labels[numSamples][K];
	dim3 threads(n1, m1, K);   //input, hidden, output layers threads

	for (int i = 0; i < numSamples; i++) {
	  inputs[i][0] = 1;		//bias
	  inputs[i][1] = points[i].x;
	  inputs[i][2] = points[i].y;
	}

	//one-hot encode each point's cluster id
	for (int i = 0; i < numSamples; i++)
	  for (int k = 0; k < K; k++)
	    labels[i][k] = (points[i].cluster == k) ? 1 : 0;

	cudaMemcpy(d_inputs, inputs, numSamples*n1*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_labels, labels, numSamples*K*sizeof(double), cudaMemcpyHostToDevice);
	for (int epoch = 0; epoch < epochs; epoch++){
	  for (int i = 0; i < numSamples; i++) {
	    //for simplicity, use more threads than needed
	    d_forward<<<1,n1+m1+K>>>(i, d_w, d_wo, d_h, d_a, d_z, d_y, d_inputs);
	    d_backward<<<1, threads>>>(i, d_w, d_wo, d_h, d_a, d_z, d_y, d_inputs, d_labels);
	    // PERF FIX: no per-launch cudaDeviceSynchronize — kernels issued to
	    // the default stream execute in order, and the blocking cudaMemcpy
	    // below synchronizes before the results are read on the host.
	  }
	}
	//save the weights (blocking cudaMemcpy waits for all kernels to finish)
	cudaMemcpy(&w[0][0], d_w, sizeof(w), cudaMemcpyDeviceToHost);
	cudaMemcpy(&wo[0][0], d_wo, sizeof(wo), cudaMemcpyDeviceToHost);
}

// Destructor: release every device buffer allocated in the constructor.
// BUG FIX: d_yd, d_w and d_wo were allocated but never freed (memory leak).
MLP::~MLP()
{
	cudaFree(d_inputs);
	cudaFree(d_labels);
	cudaFree(d_a);
	cudaFree(d_h);
	cudaFree(d_z);
	cudaFree(d_y);
	cudaFree(d_yd);
	cudaFree(d_w);
	cudaFree(d_wo);
}


Testing K-means MLP routines
 
                                
// testKmlp.cu
// https://forejune.co/cuda

#include <iostream>
#include <iomanip>
#include <cmath>
#include <stdio.h>
#include "kmeanscu.h"
#include "mlpcu.h"

using namespace std;

const int screenWidth  = 800;	//display window width in pixels
const int screenHeight = 700;	//display window height in pixels

void graphics(int argc, char *argv[]);	//defined below: sets up GLUT and enters the main loop

// Return the index of the largest of the K output activations (argmax),
// i.e. the predicted cluster id.
int classifier( double y[])
{
  int best = 0;
  for (int i = 1; i < K; i++)
    if (y[i] > y[best])
      best = i;

  return best;
}

// True if any two centroids lie within 0.01 of each other in both x and y
// (used to reject degenerate random initializations).
bool overlap(const vector<Centroid> & centroids)
{
   const int n = centroids.size();
   for (int i = 0; i < n; i++) {
     for (int j = i + 1; j < n; j++) {
       double dx = abs(centroids[i].x - centroids[j].x);
       double dy = abs(centroids[i].y - centroids[j].y);
       if (dx < 0.01 && dy < 0.01)
         return true;
     }
   }
   return false;
}

// cluster the points using MLP
void useMlp(MLP &mlp, vector<Point> &points, int delta)
{
  int n = points.size();
  double x[3];
  x[0] = 1;             //bias

  int j = 0;
  for (int i = 0; i < n; i+=delta) {
      x[1] = points[i].x;
      x[2] = points[i].y;
      double *output = mlp.predicted(x);
      cout << fixed << setprecision(2) << "  " << x[1] << " " <<  x[2] << " : " << 
      classifier(output) << "" << " (" << setprecision(1);
      for(int k = 0; k < K; k++) {
        cout <<  output[k];
        if (k < K - 1)
          cout << " ";
        else
          cout << ")" << "\t";
      }
      points[i].cluster = (int) classifier(output);
      if (++j % 2 == 0) cout << endl;
    }

}

MLP mlp;	//global so the GLUT keyboard callback can reuse the trained network
vector<Centroid> centroids(K);	//cluster centers shared with the display routines

// Driver: generate random 2D points, cluster them with K-means, train the
// MLP on the resulting labels, print both labelings, then start the GLUT UI.
int main(int argc, char *argv[]) 
{

    srand(time(0));
    int iterations = 50;  // Number of K-means iterations

    // Generate data: random points normalized to [0, 1) in each dimension
    vector<Point> points;
    for (int i = 0; i < numSamples; ++i) {
        Point p;
        p.x = (float) (rand() % screenWidth) /screenWidth;
        p.y = (float) (rand() % screenHeight) / screenHeight;
        points.push_back(p);
    }
    int c[K];
    // Randomly initialize centroids from points, retrying until no two
    // centroids coincide (see overlap())
    do {
      for (int i = 0; i < K; i++) {
        c[i] = rand() % points.size();
        centroids[i].x = points[c[i]].x;
	centroids[i].y =  points[c[i]].y;
      }
    } while (overlap(centroids));
 
    kmeans(points, centroids, iterations);

    int delta = 20;	//print every 20th point only
    //output results of the K-means labeling
    int k = 0;
    for (int j = 0; j < points.size(); j+=delta ) {
        cout << fixed << setprecision(2) << "(" << points[j].x << ", " << points[j].y
                  << ") :  " << points[j].cluster << "\t";
        if ( ++k % 2 == 0 ) cout << endl;

    }
    cout << endl;
    cout << "Training the MLP ..." << endl;

 
    mlp.train(points, 10000);

        // Test the MLP: reclassify the same sample of points
    cout << "Testing MLP:" << endl;
    useMlp(mlp, points, delta);

    getchar();	//wait for a keypress before opening the graphics window
    graphics(argc, argv);	//does not return (enters glutMainLoop)

    return 0;
}

// graphics routines
#include <GL/glut.h>

//vector to store points added interactively with the mouse (see mouse())
vector<Point>pointVec;

// One-time GL setup: white background, 2D orthographic projection matching
// the window size, default point size and black draw color.
void init()
{
  glClearColor(1, 1, 1, 1);     //clear color buffer with white color
  glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
  //define coordinate system
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  gluOrtho2D(0, screenWidth, 0, screenHeight);
  glPointSize( 3 );
  glColor3f(0.0, 0.0, 0.0);             //draw with black color
}

// GLUT display callback; drawing happens elsewhere, so just flush.
void display()
{
  //flush out data to color buffer
  glFlush();
}

//draw one point at window coordinates (x, y) and flush immediately
void drawDot(int x, int y)
{
  glBegin(GL_POINTS);
    glVertex2i(x, y);
  glEnd();
  glFlush();
}
//display cluster points with different colors and centroids
// Points are stored normalized to [0, 1]; scale back to window pixels here.
void displayClusters( vector<Point> &points)
{
  glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
  //display data points
  int n = points.size();
  int k = centroids.size();

  glPointSize( 5 );
  for (int i = 0; i < n; i++) {
    int c = points[i].cluster;
    float color[3] = {0};       //display color (R, G, B)
    if ( c < 3 )
      color[c] = 1;    //Red, green or blue
    //fourth color is black
    glColor3fv(color);
    drawDot(screenWidth*points[i].x, screenHeight*points[i].y);
  }
  //display centroids
  glPointSize( 9 );
  glColor3f(1, 0, 1);   //magenta
  for(int j = 0; j < k; j++)
    drawDot(screenWidth*centroids[j].x, screenHeight*centroids[j].y);
  glColor3f(0, 0, 0);   //restore black color
  glPointSize( 3 );
  glutPostRedisplay();
}
// GLUT keyboard callback:
//   Esc — quit; 'd' — redraw clusters; 'm' — classify mouse-added points.
void keyboard(unsigned char key, int x, int y)
{
  switch ( key ) {
    case 27:            //escape
        exit( -1 );     //exit() never returns, so no break needed
    case 'd':
        displayClusters(pointVec);
        break;
    case 'm':
        useMlp(mlp, pointVec, 1);
        break;
  }
}

// GLUT mouse callback: on left click, draw the point (flipping y since GLUT
// reports window coords top-down) and store it normalized to [0, 1].
void mouse(int button, int state, int mx, int my)
{
   int x = mx, y = screenHeight - my;
   if ( button == GLUT_LEFT_BUTTON && state == GLUT_DOWN ){
     drawDot(x, y);
     Point pt = {(double) x / screenWidth, (double) y / screenHeight};
     pointVec.push_back( pt );
   }
   glFlush();
}

// Create the GLUT window, register callbacks and enter the event loop.
// glutMainLoop() never returns.
void graphics(int argc, char *argv[])
{
  glutInit(&argc, argv);        //Initialization
  glutInitDisplayMode(GLUT_SINGLE | GLUT_RGB );
  glutInitWindowSize(screenWidth, screenHeight);
  glutInitWindowPosition(59, 50);
  glutCreateWindow( argv[0] );
  init();
  // register callback functions
  glutKeyboardFunc( keyboard );
  glutMouseFunc( mouse );
  glutDisplayFunc( display );
  glutMainLoop();               //go into perpetual loop
}

Using a Makefile:

CC=nvcc

testKmlp: testKmlp.o mlp.o kmeans.o
	$(CC) -o testKmlp testKmlp.o mlp.o kmeans.o -lGL -lGLU -lglut

# FIX: the object rules had no prerequisites, so editing a source or header
# never triggered a rebuild. (kmeans.cu's exact includes not shown here —
# confirm kmeanscu.h is the right header dependency.)
mlp.o: mlp.cu mlpcu.h kmeanscu.h
	$(CC) -c mlp.cu

kmeans.o: kmeans.cu kmeanscu.h
	$(CC) -c kmeans.cu

testKmlp.o: testKmlp.cu mlpcu.h kmeanscu.h
	$(CC) -c testKmlp.cu

# FIX: clean previously missed kmeans.o and the binary; -f avoids errors
# when the files do not exist.
clean:
	rm -f testKmlp mlp.o kmeans.o testKmlp.o
	

Sample Outputs:

(0.28, 0.27) :  3	(0.51, 0.80) :  0
(0.50, 0.11) :  1	(0.20, 0.36) :  3
(0.62, 0.67) :  0	(0.63, 0.69) :  0
(0.03, 0.45) :  3	(0.53, 0.01) :  1
(0.42, 0.69) :  2	(0.43, 0.91) :  2
(0.70, 0.18) :  1	(0.54, 0.62) :  0
(0.29, 0.47) :  3	(0.05, 0.80) :  2
(0.11, 0.20) :  3	(0.49, 0.55) :  0
(0.21, 0.89) :  2	(0.82, 0.39) :  1
(0.92, 0.73) :  0	(0.06, 0.55) :  2
(0.61, 0.33) :  1	(0.93, 0.25) :  1
(0.23, 0.88) :  2	(0.82, 0.83) :  0
(0.68, 0.18) :  1	(0.50, 0.04) :  1
(0.46, 0.25) :  3	(0.18, 0.95) :  2
(0.67, 0.34) :  1	(0.01, 0.41) :  3

Training the MLP ...
Testing MLP:
  0.28 0.27 : 3 (0.0 0.0 0.0 1.0)	  0.51 0.80 : 0 (0.9 0.0 0.0 0.0)
  0.50 0.11 : 1 (0.0 0.8 0.0 0.2)	  0.20 0.36 : 3 (0.0 0.0 0.0 1.0)
  0.62 0.67 : 0 (1.0 0.0 0.0 0.0)	  0.63 0.69 : 0 (1.0 0.0 0.0 0.0)
  0.03 0.45 : 3 (0.0 0.0 0.0 1.0)	  0.53 0.01 : 1 (0.0 1.0 0.0 0.0)
  0.42 0.69 : 2 (0.0 0.0 1.0 0.0)	  0.43 0.91 : 2 (0.0 0.0 1.0 0.0)
  0.70 0.18 : 1 (0.0 1.0 0.0 0.0)	  0.54 0.62 : 0 (1.0 0.0 0.0 0.0)
  0.29 0.47 : 3 (0.0 0.0 0.0 1.0)	  0.05 0.80 : 2 (0.0 0.0 1.0 0.0)
  0.11 0.20 : 3 (0.0 0.0 0.0 1.0)	  0.49 0.55 : 0 (0.3 0.0 0.0 0.0)
  0.21 0.89 : 2 (0.0 0.0 1.0 0.0)	  0.82 0.39 : 1 (0.0 1.0 0.0 0.0)
  0.92 0.73 : 0 (1.0 0.0 0.0 0.0)	  0.06 0.55 : 2 (0.0 0.0 1.0 0.0)
  0.61 0.33 : 1 (0.0 1.0 0.0 0.0)	  0.93 0.25 : 1 (0.0 1.0 0.0 0.0)
  0.23 0.88 : 2 (0.0 0.0 1.0 0.0)	  0.82 0.83 : 0 (1.0 0.0 0.0 0.0)
  0.68 0.18 : 1 (0.0 1.0 0.0 0.0)	  0.50 0.04 : 1 (0.0 0.9 0.0 0.0)
  0.46 0.25 : 3 (0.0 0.0 0.0 1.0)	  0.18 0.95 : 2 (0.0 0.0 1.0 0.0)
  0.67 0.34 : 1 (0.0 1.0 0.0 0.0)	  0.01 0.41 : 3 (0.0 0.0 0.0 1.0)