Materials available at: https://forejune.co/cuda/
Overview
Review
K-means Clustering
see https://forejune.co/cuda/ai/unsupervised/unsupercu.html
Multi-layer Perceptron (MLP)
see https://forejune.co/cuda/ai/mlp/mlpcu.html

// mlpcu.h
// https://forejune.co/cuda
#ifndef __MLPCU_H__
#define __MLPCU_H__
using namespace std;
#define n1 3
#define m1 7
#define K 4
#define numSamples 600
#define eta 0.5
// Simple MLP class
class MLP
{
private:
double a[m1]; //linear sum of products of inputs and weights
double h[m1]; //hidden nodes h[j] = g(a[j])
double y[K]; //predicted output
double z[K]; //linear sum of products of weights and hidden nodes
double *d_inputs;
double *d_labels;
double *d_a;
double *d_h; //hidden layer
double *d_y; //output
double *d_yd; //desired output
double *d_z;
double *d_w;
double *d_wo;
public:
MLP();
//Sigmoid activation function
double g(double x);
//Derivative of sigmoid function
double gd(double x);
//Forward propagation
double* predicted(double x[]);
void train(const vector< Point> &points, int epochs);
~MLP();
};
#endif
|
// mlp.cu
// https://forejune.co/cuda
#include <iostream>
#include <iomanip>
#include <cmath>
#include <stdio.h>
#include "kmeanscu.h"
#include "mlpcu.h"
using namespace std;
double w[n1][m1];
double wo[m1][K];
//Sigmoid activation function
__device__ double d_g(double x)
{
return 1.0 / (1.0 + exp(-x));
}
//Derivative of sigmoid function
__device__ double d_gd(double x)
{
double s = d_g(x);
return s * (1 - s);
}
//y[] is the output
__global__ void d_forward(int k, double *w, double *wo, double *h, double *a,
double *z, double *y, double *d_inputs)
{
__shared__ double x[n1];
int ii = threadIdx.x; //indexing input nodes
int jj, ll;
if (ii < n1) //inputs
x[ii] = d_inputs[k*n1+ii];
__syncthreads();
//hidden layer activation, all hidden layer threads work in parallel
if ( ii >= n1 && ii < n1+m1) {
jj = ii - n1;
a[jj] = 0;
for (int i = 0; i < n1; i++)
a[jj] += w[i*m1+jj] * x[i];
if ( jj > 0 )
h[jj] = d_g(a[jj]); //sigmoid activation function
}
__syncthreads();
//output layer activation, all output layer threads work in parallel
if (ii >= n1 + m1){
ll = ii - n1 - m1;
z[ll] = 0;
for(int j = 0; j < m1; j++)
z[ll] += wo[j*K+ll] * h[j];
y[ll] = d_g( z[ll] );
}
__syncthreads();
}
//Backward propagation, yd is the desired output
__global__ void d_backward(int k, double *w, double *wo, double *h, double *a,
double *z, double *y, double *d_inputs, double *d_labels)
{
__shared__ double delta[m1];
__shared__ double deltao[K];
__shared__ double w_share[n1][m1]; //just for easier readability, not necessarily needed
__shared__ double wo_share[m1][K];
__shared__ double x[n1]; //inputs
__shared__ double yd[K]; //desired outputs
int ii = threadIdx.x; //indexing input nodes
int jj = threadIdx.y; //indexing hidden nodes
int ll = threadIdx.z; //indexing output nodes
if (jj == 0 && ll == 0 )
x[ii] = d_inputs[k*n1+ii];
__syncthreads();
if ( ii == 0 && jj == 0 )
yd[ll] = d_labels[k*K+ll];
__syncthreads();
if (ll == 0 )
w_share[ii][jj] = w[ii*m1+jj];
__syncthreads();
if (ii == 0 )
wo_share[jj][ll] = wo[jj*K+ll];
__syncthreads();
//output threads work in parallel
if (ii == 0 && jj == 0 ) {
double e = yd[ll] - y[ll]; //output layer error
deltao[ll] = e * d_gd(z[ll]);
}
__syncthreads();
//Compute hidden layer error, hidden threads work in parallel
if (ii == 0 && ll == 0 ) {
delta[jj] = 0;
for(int l = 0; l < K; l++)
delta[jj] += deltao[l] * wo_share[jj][l] * d_gd(a[jj]);
}
//delta[jj] += deltao[l] * wo[jj*K+l] * d_gd(a[jj]);
__syncthreads();
//Update weights (hidden to output), hidden and output threads work in parallel
if (ii == 0 ) {
wo_share[jj][ll] += eta * deltao[ll] * h[jj];
}
__syncthreads();
//Update weights (input to hidden), input and hidden threads work in parallel
if ( ll==0 ) {
w_share[ii][jj] += eta * delta[jj] * x[ii];
}
syncthreads();
if ( ll == 0 )
w[ii*m1+jj] = w_share[ii][jj];
__syncthreads();
if ( ii == 0 )
wo[jj*K+ll] = wo_share[jj][ll];
__syncthreads();
}
MLP::MLP()
{
int wSize = n1 * m1 * sizeof(double);
int woSize = m1 * K * sizeof(double);
cudaMalloc((void**)&d_inputs, numSamples * n1 * sizeof(double));
cudaMalloc((void**)&d_labels, numSamples * K * sizeof(double));
cudaMalloc((void**)&d_a, m1 * sizeof(double));
cudaMalloc((void**)&d_h, m1 * sizeof(double));
cudaMalloc((void**)&d_z, K * sizeof(double));
cudaMalloc((void**)&d_y, K * sizeof(double));
cudaMalloc((void**)&d_yd, K * sizeof(double));
cudaMalloc((void**)&d_w, wSize);
cudaMalloc((void**)&d_wo, woSize);
//random weights
for (int i = 0; i < n1; i++)
for (int j = 0; j < m1; j++)
w[i][j] = rand() % 1000 / 1000.0;
for (int j = 0; j < m1; j++)
for (int l = 0; l < K; l++)
wo[j][l] = rand() % 1000 / 1000.0;
cudaMemcpy(d_w, w, wSize, cudaMemcpyHostToDevice);
cudaMemcpy(d_wo, wo, woSize, cudaMemcpyHostToDevice);
h[0] = 1; //for bias
cudaMemcpy(d_h, h, m1 * sizeof(double), cudaMemcpyHostToDevice);
}
//Sigmoid activation function
double MLP::g(double x)
{
return 1.0 / (1.0 + exp(-x));
}
//Derivative of sigmoid function
double MLP::gd(double x)
{
double s = g(x);
return s * (1 - s);
}
//Forward propagation
double* MLP::predicted(double x[])
{
//hidden layer activation
for (int j = 0; j < m1; j++) {
a[j] = 0;
for (int i = 0; i < n1; i++)
a[j] += w[i][j] * x[i];
if ( j > 0 )
h[j] = g( a[j] );
}
//output layer activation
for (int l = 0; l < K; l++){
z[l] = 0;
for(int j = 0; j < m1; j++)
z[l] += wo[j][l] * h[j];
y[l] = g( z[l] );
}
return y; //return output
}
// Train the perceptron
// eta = learning rate
// epochs = number of times of training
// numSamples = number of different input sets (x_data)
// yd is target output
void MLP::train(const vector< Point> &points, int epochs)
{
double inputs[numSamples][n1];
double labels[numSamples][K];
dim3 threads(n1, m1, K); //input, hidden, output layers threads
for (int i = 0; i < numSamples; i++) {
inputs[i][0] = 1; //bias
inputs[i][1] = points[i].x;
inputs[i][2] = points[i].y;
}
for (int i = 0; i < numSamples; i++) {
for (int k = 0; k < K; k++)
if (points[i].cluster == k)
labels[i][k] = 1;
else
labels[i][k] = 0;
}
cudaMemcpy(d_inputs, inputs, numSamples*n1*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_labels, labels, numSamples*K*sizeof(double), cudaMemcpyHostToDevice);
for (int epoch = 0; epoch < epochs; epoch++){
for (int i = 0; i < numSamples; i++) {
//for simplicity, use more threads than needed
d_forward<<<1,n1+m1+K>>>(i, d_w, d_wo, d_h, d_a, d_z, d_y, d_inputs);
cudaDeviceSynchronize();
//for simplicity, use more threads than needed
d_backward<<<1, threads>>>(i, d_w, d_wo, d_h, d_a, d_z, d_y, d_inputs, d_labels);
cudaDeviceSynchronize();
}
}
//save the weights
cudaMemcpy(&w[0][0], d_w, sizeof(w), cudaMemcpyDeviceToHost);
cudaMemcpy(&wo[0][0], d_wo, sizeof(wo), cudaMemcpyDeviceToHost);
}
MLP::~MLP()
{
cudaFree(d_inputs);
cudaFree(d_labels);
cudaFree(d_a);
cudaFree(d_h);
cudaFree(d_z);
cudaFree(d_y);
}
|
Testing K-means MLP routines
// testKmlp.cu
// https://forejune.co/cuda
#include <iostream>
#include <iomanip>
#include <cmath>
#include <stdio.h>
#include "kmeanscu.h"
#include "mlpcu.h"
using namespace std;
const int screenWidth = 800;
const int screenHeight = 700;
void graphics(int argc, char *argv[]);
int classifier( double y[])
{
int value = 0;
double max = y[0];
for (int i = 1; i < K; i++) {
if (y[i] > max) {
max = y[i];
value = i;
}
}
return value;
}
bool overlap(const vector<Centroid> & centroids)
{
int n = centroids.size();
for (int i = 0; i < n; i++)
for (int j=i+1; j < n; j++)
if ( abs(centroids[i].x - centroids[j].x) < 0.01 &&
abs(centroids[i].y - centroids[j].y) < 0.01 )
return true;
return false;
}
// cluster the points using MLP
void useMlp(MLP &mlp, vector<Point> &points, int delta)
{
int n = points.size();
double x[3];
x[0] = 1; //bias
int j = 0;
for (int i = 0; i < n; i+=delta) {
x[1] = points[i].x;
x[2] = points[i].y;
double *output = mlp.predicted(x);
cout << fixed << setprecision(2) << " " << x[1] << " " << x[2] << " : " <<
classifier(output) << "" << " (" << setprecision(1);
for(int k = 0; k < K; k++) {
cout << output[k];
if (k < K - 1)
cout << " ";
else
cout << ")" << "\t";
}
points[i].cluster = (int) classifier(output);
if (++j % 2 == 0) cout << endl;
}
}
MLP mlp;
vector<Centroid> centroids(K);
int main(int argc, char *argv[])
{
srand(time(0));
int iterations = 50; // Number of K-means iterations
// Generate data
vector<Point> points;
for (int i = 0; i < numSamples; ++i) {
Point p;
p.x = (float) (rand() % screenWidth) /screenWidth;
p.y = (float) (rand() % screenHeight) / screenHeight;
points.push_back(p);
}
int c[K];
// Randomly initialize centroids from points
do {
for (int i = 0; i < K; i++) {
c[i] = rand() % points.size();
centroids[i].x = points[c[i]].x;
centroids[i].y = points[c[i]].y;
}
} while (overlap(centroids));
kmeans(points, centroids, iterations);
int delta = 20;
//output results
int k = 0;
for (int j = 0; j < points.size(); j+=delta ) {
cout << fixed << setprecision(2) << "(" << points[j].x << ", " << points[j].y
<< ") : " << points[j].cluster << "\t";
if ( ++k % 2 == 0 ) cout << endl;
}
cout << endl;
cout << "Training the MLP ..." << endl;
mlp.train(points, 10000);
// Test the MLP
cout << "Testing MLP:" << endl;
useMlp(mlp, points, delta);
getchar();
graphics(argc, argv);
return 0;
}
// graphics routines
#include <GL/glut.h>
//vectors to store points
vector<Point>pointVec;
void init()
{
glClearColor(1, 1, 1, 1); //clear color buffer with white color
glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
//define coordinate system
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0, screenWidth, 0, screenHeight);
glPointSize( 3 );
glColor3f(0.0, 0.0, 0.0); //draw with black color
}
void display()
{
//flush out data to color buffer
glFlush();
}
//draw one point
void drawDot(int x, int y)
{
glBegin(GL_POINTS);
glVertex2i(x, y);
glEnd();
glFlush();
}
//display cluster points with different colors and centroids
void displayClusters( vector<Point> &points)
{
glClear(GL_COLOR_BUFFER_BIT); //clear color buffer
//display data points
int n = points.size();
int k = centroids.size();
glPointSize( 5 );
for (int i = 0; i < n; i++) {
int c = points[i].cluster;
float color[3] = {0}; //display color (R, G, B)
if ( c < 3 )
color[c] = 1; //Red, green or blue
//fourth color is black
glColor3fv(color);
drawDot(screenWidth*points[i].x, screenHeight*points[i].y);
}
//display centroids
glPointSize( 9 );
glColor3f(1, 0, 1); //magenta
for(int j = 0; j < k; j++)
drawDot(screenWidth*centroids[j].x, screenHeight*centroids[j].y);
glColor3f(0, 0, 0); //restore black color
glPointSize( 3 );
glutPostRedisplay();
}
void keyboard(unsigned char key, int x, int y)
{
switch ( key ) {
case 27: //escape
exit( -1 );
case 'd':
displayClusters(pointVec);
break;
case 'm':
useMlp(mlp, pointVec, 1);
break;
}
}
void mouse(int button, int state, int mx, int my)
{
int x = mx, y = screenHeight - my;
if ( button == GLUT_LEFT_BUTTON && state == GLUT_DOWN ){
drawDot(x, y);
Point pt = {(double) x / screenWidth, (double) y / screenHeight};
pointVec.push_back( pt );
}
glFlush();
}
void graphics(int argc, char *argv[])
{
glutInit(&argc, argv); //Initialization
glutInitDisplayMode(GLUT_SINGLE | GLUT_RGB );
glutInitWindowSize(screenWidth, screenHeight);
glutInitWindowPosition(59, 50);
glutCreateWindow( argv[0] );
init();
// specifies calback functions
glutKeyboardFunc( keyboard );
glutMouseFunc( mouse );
glutDisplayFunc( display );
glutMainLoop(); //go into perpetual loop
}
|
Using a Makefile:
CC=nvcc
testKmlp: testKmlp.o mlp.o kmeans.o
$(CC) -o testKmlp testKmlp.o mlp.o kmeans.o -lGL -lGLU -lglut
mlp.o:
$(CC) -c mlp.cu
kmeans.o:
$(CC) -c kmeans.cu
testKmlp.o:
$(CC) -c testKmlp.cu
clean:
rm mlp.o testKmlp.o
Sample Outputs:
(0.28, 0.27) : 3 (0.51, 0.80) : 0 (0.50, 0.11) : 1 (0.20, 0.36) : 3 (0.62, 0.67) : 0 (0.63, 0.69) : 0 (0.03, 0.45) : 3 (0.53, 0.01) : 1 (0.42, 0.69) : 2 (0.43, 0.91) : 2 (0.70, 0.18) : 1 (0.54, 0.62) : 0 (0.29, 0.47) : 3 (0.05, 0.80) : 2 (0.11, 0.20) : 3 (0.49, 0.55) : 0 (0.21, 0.89) : 2 (0.82, 0.39) : 1 (0.92, 0.73) : 0 (0.06, 0.55) : 2 (0.61, 0.33) : 1 (0.93, 0.25) : 1 (0.23, 0.88) : 2 (0.82, 0.83) : 0 (0.68, 0.18) : 1 (0.50, 0.04) : 1 (0.46, 0.25) : 3 (0.18, 0.95) : 2 (0.67, 0.34) : 1 (0.01, 0.41) : 3 Training the MLP ... Testing MLP: 0.28 0.27 : 3 (0.0 0.0 0.0 1.0) 0.51 0.80 : 0 (0.9 0.0 0.0 0.0) 0.50 0.11 : 1 (0.0 0.8 0.0 0.2) 0.20 0.36 : 3 (0.0 0.0 0.0 1.0) 0.62 0.67 : 0 (1.0 0.0 0.0 0.0) 0.63 0.69 : 0 (1.0 0.0 0.0 0.0) 0.03 0.45 : 3 (0.0 0.0 0.0 1.0) 0.53 0.01 : 1 (0.0 1.0 0.0 0.0) 0.42 0.69 : 2 (0.0 0.0 1.0 0.0) 0.43 0.91 : 2 (0.0 0.0 1.0 0.0) 0.70 0.18 : 1 (0.0 1.0 0.0 0.0) 0.54 0.62 : 0 (1.0 0.0 0.0 0.0) 0.29 0.47 : 3 (0.0 0.0 0.0 1.0) 0.05 0.80 : 2 (0.0 0.0 1.0 0.0) 0.11 0.20 : 3 (0.0 0.0 0.0 1.0) 0.49 0.55 : 0 (0.3 0.0 0.0 0.0) 0.21 0.89 : 2 (0.0 0.0 1.0 0.0) 0.82 0.39 : 1 (0.0 1.0 0.0 0.0) 0.92 0.73 : 0 (1.0 0.0 0.0 0.0) 0.06 0.55 : 2 (0.0 0.0 1.0 0.0) 0.61 0.33 : 1 (0.0 1.0 0.0 0.0) 0.93 0.25 : 1 (0.0 1.0 0.0 0.0) 0.23 0.88 : 2 (0.0 0.0 1.0 0.0) 0.82 0.83 : 0 (1.0 0.0 0.0 0.0) 0.68 0.18 : 1 (0.0 1.0 0.0 0.0) 0.50 0.04 : 1 (0.0 0.9 0.0 0.0) 0.46 0.25 : 3 (0.0 0.0 0.0 1.0) 0.18 0.95 : 2 (0.0 0.0 1.0 0.0) 0.67 0.34 : 1 (0.0 1.0 0.0 0.0) 0.01 0.41 : 3 (0.0 0.0 0.0 1.0)