A ``Hello World" Example of Supervised Learning in
Artificial Intelligence Using Multilayer Perceptron and
Backpropagation in CUDA

Materials available at: https://forejune.co/cuda/

Reviews

Perceptron

  • Perceptron is the simplest artificial neural network.
  • It is a supervised learning algorithm.
  • 
    
    Activation function:
  • Learning in a perceptron: Initialization: Start with random weights and bias. Prediction: For each input, compute the output using the current weights and bias. with x0 = 1 Update: Update the weights and bias.
  •    Can only learn linearly separable patterns. 
    
    
  • Multilayer Perceptron (MLP)
    	
    	Able to distinguish data that are not linearly separable
    	
     
  • Supervised Learning
    General Idea
      
    
      1. Feed the MLP an input pattern x = (x1, x2, ...., xn) 
         from the training set.
    
      2. MLP generates an output pattern 
              y = (y1, y2, ....,  yk)     
      
      3. Compare y with the desired output or target yd, 
         to get the error quantity.
    
      4. Adjust the weights using the error quantity so that 
         in the next iteration,y will be closer to yd.
    
      5. Repeat with another input pattern x from the 
         training set.
    Error Function: Mean Squared Error (MSE)
    
      An error function represents the difference over a set of inputs.
    	
      Adjust the weights so that E decreases.
    
      A common technique used: Gradient descent
      Adjust the weights so that E decreases along the gradient.
    
    	
      
     
    After performing a series of straightforward mathematical derivations, one can arrive at the following algorithm that trains an MLP.
    Multilayer Perceptron (MLP) with Backpropagation
    
    1. Notations
    
        Inputs: x1, x2, ..., xn
    
        Hidden Nodes: h1, h2, ..., hm
    
        Outputs: y1, y2, ..., yk
    
        Weights: wij (weight from node i to node j)
    
    
    Activation Function: Sigmoid function
         
    Derivative of Sigmoid function:
         
    2. Structure of MLP The MLP consists of three layers: Input Layer, Hidden Layer, Output Layer
    Weights: wij: Weight from input xi to hidden node hj. wojl: Weight from hidden node hj to output node yl 3. Forward Propagation Computes the output of the network from inputs: Step 1: Compute Hidden Layer Activations Each nuron hj computes     g is the activation function σ w0j is the bias, and x0 = 1 Step 2: Compute Output Layer Activations Each output node yl computes the weighted sum of hidden layer activations and apply the activation function:     wo0l is the bias, and h0 = 1 4. Backward Propagation    
    Implementations in C/C++
    
       XOR 
    	x1 x2 | x1 XOR x2 
    	------|-----------
    	 0  0 |     0
    	 0  1 |     1
    	 1  0 |     1
    	 1  1 |     0
        
    
    
    //mlp.cpp
    
    #include <iostream>
    #include <iomanip>
    #include <cmath>
    
    using namespace std;
    //MLP training for an XOR gate
    const int n1 = 3;  //number of inputs and bias
    const int m1 = 3;  //number of hidden nodes and bias
    const int K = 1;   //number of outputs
    const int numSamples = 4;
    double inputs[numSamples][n1] = 
    {
            1, 0, 0,	//x0=1, x1=0, x2=0
            1, 0, 1,	//x0=1, x1=0, x2=1
            1, 1, 0,	//x0=1, x1=1, x2=0
            1, 1, 1		//x0=1, x1=1, x2=1
    };
    
    double labels[numSamples][K] = {0, 1, 1, 0};//target outputs (labels)
    double w[n1][m1] = {0.97, 0.2, 0.7,//random weights (input to hidden)
        		  0.73, 0.1, 0.9,		  
        		  0.2, 0.7, 0.3};		  
    double wo[m1][K] = {0.76, 0.6, 0.1};//random weights (hidden to output)
    
    // Simple MLP class
    class MLP 
    {
    private:
        double a[m1];  //linear sum of products of inputs and weights
        double h[m1];  //hidden nodes h[j] = g(a[j])
        double y[K];   //predicted output 
        double z[K];   //linear sum of products of weights and hidden nodes
        double eta; //learning rate
    public:
        MLP(double learning_rate) 
        {
    	eta = learning_rate;
    	h[0] = 1;	//for bias 
        }
    
        //Sigmoid activation function
        double g(double x) 
        {
          return 1.0 / (1.0 + exp(-x));
        }
    
        //Derivative of sigmoid function
        double gd(double x) 
        {
           double s = g(x);
            
           return s * (1 - s);
         }
    
    
      //Forward propagation
      double* forward(double x[]) 
      {
    	//hidden layer activation
    	for (int j = 0; j < m1; j++) {
              a[j] = 0;
              for (int i = 0; i < n1; i++) 
                a[j] += w[i][j] * x[i]; 
    	   if ( j > 0 )
    	     h[j] = g( a[j] );
             }
    	 //output layer activation
    	 for (int l = 0; l < K; l++){
    	   z[l] = 0; 
    	   for(int j = 0; j < m1; j++) 
    	     z[l] += wo[j][l] * h[j];  
    	   y[l] = g( z[l] );
    	  }
    
              return y;  //return output
      }
    
      //Backward propagation, yd is the desired output
      void backward(double yd[], double x[])
      {
        double delta[m1];
        double deltao[K];
    
       for (int l = 0; l < K; l++) { 
          double e = yd[l] - y[l];  //output layer error
         deltao[l] = e * gd(z[l]);
       }
         
       //Compute hidden layer error
       for(int j = 0; j < m1; j++){
          delta[j] = 0;
          for(int l = 0; l < K; l++)
            delta[j] += deltao[l] * wo[j][l] * gd(a[j]);
        }
    
        //Update weights (hidden to output) 
        for(int j = 0; j < m1; j++)
          for(int l = 0; l < K; l++) 
           wo[j][l] += eta * deltao[l] * h[j];
         
         //Update weights (input to hidden)
         for(int i = 0; i < n1; i++)
    	for(int j = 0; j < m1; j++)
    	  w[i][j] += eta * delta[j] * x[i];
      } 
    
      void get_input_data(int k, double x[])
      {
         for(int i=0; i < n1; i++)
           x[i]=inputs[k][i];
      }
    
      // Train the MLP
      // epochs = number of times of training 
      // numSamples = number of different input sets 
      void train(int numSamples, int epochs)
      {
        double x[n1];	//inputs
        double *yd = &labels[0][0];
    
        for (int epoch = 0; epoch < epochs; epoch++){
          for (int k = 0; k < numSamples; k++) {
    	get_input_data(k, x);
            forward( x );
            backward(yd+k*K, x);
          }
        }
      }
    
        void printWeights()
        {
    	cout << "\nMLP input to hidden weights: ";
    	for(int i = 0; i < n1; i++){
    	   cout << endl;
    	  for(int j = 0; j < m1; j++)
    	    printf("w[%d][%d]: %5.3f\t", i, j, w[i][j]);
    	}
    	cout << "\nMLP hidden to output weights:";
    	for(int i = 0; i < m1; i++) {
    	  cout << endl;
    	  for(int j = 0; j < K; j++)
    	    printf("wo[%d][%d]: %5.3f\t", i, j, wo[i][j]);
    	}
        }
    
        ~MLP()
        {
    
        }
    };
    
    int classifier( double x)
    {
      if ( x < 0.2 && x > -0.1)
        return 0;
      if ( x > 0.8 && x < 1.1)
        return 1;
    
      return -1;
    }
    
    int main() 
    {
        string gates = " XOR ";
        MLP mlp(0.5); 
        
        mlp.train(4, 10000);
    
        // Test the MLP
        double x[3];
        cout << "\nTesting MLP:" << endl;
        x[0] = 1;
        for (int i = 0; i < 4; i++) {
          x[1] = i & 1;
          x[2] = i >> 1;
          double *output = mlp.forward(x);
          cout << fixed << setprecision(0) << "  " <<  x[2] 
           << gates << x[1] << " = " << classifier(*output) <<
    	     setprecision(2) << " (" << output[0]<<")" << endl;   
        }
    
        mlp.printWeights();
        cout << endl << "Hello, AI World!" << endl;
        return 0;
    }
    
    

    CUDA Implementation

      3 groups of threads
      1 group -- input nodes   n1
      1 group -- hidden nodes  m1
      1 group -- output nodes  K
    
      dim3 threads(n1, m1, K);
      Thread indexes:
         threadIdx.x  -- indexing input nodes
         threadIdx.y  -- indexing hidden nodes
          threadIdx.z -- indexing output nodes
          
    
    //xorMLP.cu
    
    #include <iostream>
    #include <iomanip>
    #include <cmath>
    #include <stdio.h>
    
    using namespace std;
    #define n1 3
    #define m1 3
    #define K  1
    #define numSamples 4
    #define eta 0.5
    
    double w[n1][m1] = {0.99, 0.2, 0.7,	//random weights (input to hidden)
        		  0.73, 0.1, 0.9,		  
        		  0.2, 0.7, 0.3};		  
    double wo[m1][K] = {0.88, 0.6, 0.1};	//random weights (hidden to output)
    
    __device__ const double d_inputs[numSamples][n1] =
    {
            1, 0, 0,	//x0=1, x1=0, x2=1
            1, 0, 1,	//x0=1, x1=0, x2=1
            1, 1, 0,	//x0=1, x1=1, x2=0
            1, 1, 1		//x0=1, x1=1, x2=1
    };
    
    __device__ const double d_labels[numSamples][K] = {0, 1, 1, 0};//target outputs (labels)
    
    //Sigmoid activation function
    __device__ double d_g(double x) 
    {
          return 1.0 / (1.0 + exp(-x));
    }
        
    //Derivative of sigmoid function
    __device__ double d_gd(double x) 
    {
        double s = d_g(x);
         
        return s * (1 - s);
    }
    
    //y[] is the output
    __global__ void d_forward(int k, double *w, double *wo, double *h, double *a, double *z,  double *y)
    {
      __shared__ double x[n1];
      int ii = threadIdx.x;		//indexing input nodes
      int jj = threadIdx.y;		//indexing hidden nodes
      int ll = threadIdx.z;		//indexing output nodes
    
      x[ii]= d_inputs[k][ii];	//get input data, input threads work in parallel
      __syncthreads();
    
      //hidden layer activation, all hidden layer threads work in parallel
      a[jj] = 0;
      for (int i = 0; i < n1; i++) 
         a[jj] += w[i*m1+jj] * x[i]; 
      if ( jj > 0 )
         h[jj] = d_g(a[jj]); 	//sigmoid activation function
      __syncthreads();
    
       //output layer activation, all output layer threads work in parallel
       z[ll] = 0; 
       for(int j = 0; j < m1; j++) 
         z[ll] += wo[j*K+ll] * h[j];  
       y[ll] = d_g( z[ll] );
    }
      
    //Backward propagation, yd is the desired output
    __global__  void d_backward(int k, double *w, double *wo,  double *h, double *a, double *z, double *y)
    {
      double delta[m1];
      double deltao[K];
      __shared__ double x[n1];	//inputs
      __shared__ double yd[K];	//desired outputs
      int ii = threadIdx.x;		//indexing input nodes
      int jj = threadIdx.y;		//indexing hidden nodes
      int ll  = threadIdx.z;	//indexing output nodes
      x[ii] = d_inputs[k][ii];	//get inputs, input threads work in parallel	
      yd[ll]= d_labels[k][ll];	//get target ouputs, output threads work in parallel
      __syncthreads();
     
      //output threads work in parallel 
      double e = yd[ll] - y[ll];  //output layer error
      deltao[ll] = e * d_gd(z[ll]);
      __syncthreads();
    
       //Compute hidden layer error, hidden threads work in parallel
       delta[jj] = 0;
       for(int l = 0; l < K; l++)
         delta[jj] += deltao[l] * wo[jj*K+l] * d_gd(a[jj]);
        __syncthreads();
        
        //Update weights (hidden to output), hidden and output threads work in parallel 
        wo[jj*K+ll] += eta * deltao[ll] * h[jj];
        __syncthreads();
         
        //Update weights (input to hidden), input and hidden threads work in parallel
    	  w[ii*m1+jj] += eta * delta[jj] * x[ii];
        __syncthreads();
    } 
    
    // Simple MLP class
    class MLP 
    {
    private:
        double a[m1];  //linear sum of products of inputs and weights
        double h[m1];  //hidden nodes h[j] = g(a[j])
        double y[K];   //predicted output 
        double z[K];   //linear sum of products of weights and hidden nodes
        double *d_a;
        double *d_h;	//hidden layer
        double *d_y;	//output
        double *d_yd;	//desired output
        double *d_z;
        double *d_w;
        double *d_wo;
    public:
        MLP()
        {
    	int wSize = n1 * m1 * sizeof(double);
    	int woSize = m1 * K * sizeof(double);
    	cudaMalloc((void**)&d_a, m1 * sizeof(double));
    	cudaMalloc((void**)&d_h, m1 * sizeof(double));
    	cudaMalloc((void**)&d_z, K * sizeof(double));
    	cudaMalloc((void**)&d_y, K * sizeof(double));
    	cudaMalloc((void**)&d_yd, K * sizeof(double));
    	cudaMalloc((void**)&d_w, wSize);
    	cudaMalloc((void**)&d_wo, woSize);
            cudaMemcpy(d_w, w, wSize, cudaMemcpyHostToDevice);
            cudaMemcpy(d_wo, wo, woSize, cudaMemcpyHostToDevice);
    	h[0] = 1;	//for bias 
            cudaMemcpy(d_h, h, m1 * sizeof(double), cudaMemcpyHostToDevice);
        }
    
        //Sigmoid activation function
        double g(double x) 
        {
          return 1.0 / (1.0 + exp(-x));
        }
    
        //Derivative of sigmoid function
        double gd(double x) 
        {
           double s = g(x);
            
           return s * (1 - s);
         }
    
         //Forward propagation
         double* predicted(double x[]) 
         {
    	//hidden layer activation
    	for (int j = 0; j < m1; j++) {
              a[j] = 0;
              for (int i = 0; i < n1; i++) 
                a[j] += w[i][j] * x[i]; 
    	   if ( j > 0 )
    	     h[j] = g( a[j] );
             }
    	 //output layer activation
    	 for (int l = 0; l < K; l++){
    	   z[l] = 0; 
    	   for(int j = 0; j < m1; j++) 
    	     z[l] += wo[j][l] * h[j];  
    	   y[l] = g( z[l] );
    	  }
    
              return y;  //return output
         } 
    
         // Train the perceptron
         // eta = learning rate
         // epochs = number of times of training 
         // numSamples = number of different input sets (x_data)
         // yd is target output
         void train(int epochs)
         {
            dim3 threads(n1, m1, K);   //input, hidden, output layers threads
            for (int epoch = 0; epoch < epochs; epoch++){
              for (int k = 0; k < numSamples; k++) {
    	    d_forward<<<1,threads>>>(k, d_w, d_wo, d_h, d_a, d_z, d_y);
      	    cudaDeviceSynchronize();
    	    d_backward<<<1, threads>>>(k, d_w, d_wo, d_h, d_a, d_z, d_y);
      	    cudaDeviceSynchronize();
          }
        }
        //save the weights
        cudaMemcpy(&w[0][0], d_w, sizeof(w), cudaMemcpyDeviceToHost);
        cudaMemcpy(&wo[0][0], d_wo, sizeof(wo), cudaMemcpyDeviceToHost);
      }
    
        void printWeights()
        {
    	cout << "\nMLP input to hidden weights: ";
    	for(int i = 0; i < n1; i++){
    	   cout << endl;
    	  for(int j = 0; j < m1; j++)
    	    printf("w[%d][%d]: %5.3f\t", i, j, w[i][j]);
    	  //  cout << " w[i" << i*n1+j <<": " << w[i*m1+j];
    	}
    	cout << "\nMLP hidden to output weights:";
    	for(int i = 0; i < m1; i++) {
    	  cout << endl;
    	  for(int j = 0; j < K; j++)
    	    printf("wo[%d][%d]: %5.3f\t", i, j, wo[i][j]);
    	    //cout << " wo" << i <<": " << wo[i*K+j];
    	}
        }
    
        ~MLP()
        {
    	cudaFree(d_a);
    	cudaFree(d_h);
    	cudaFree(d_z);
    	cudaFree(d_y);
        }
    };
    
    int classifier( double x)
    {
      if ( x < 0.2 && x > -0.1)
        return 0;
      if ( x > 0.8 && x < 1.1)
        return 1;
    
      return -1;
    }
    
    int main() 
    {
        string gates = " XOR ";
        MLP mlp; 
        
        mlp.train(10000);
    
        // Test the MLP
        double x[3];
        cout << "Testing MLP:" << endl;
        x[0] = 1;
        for (int i = 0; i < 4; i++) {
          x[1] = i & 1;
          x[2] = i >> 1;
          double *output = mlp.predicted(x);
          cout << fixed << setprecision(0) << "  " <<  x[2] << gates << x[1] << " = " 
          	<< classifier(*output) << setprecision(2) << " (" << output[0]<<")" << endl;   
        }
    
        mlp.printWeights();
        cout << endl << "Hello, AI World!" << endl;
        return 0;
    }
    
    
        Binary Full Adder
    	x1
    	x2
          +	x3
        -----------------
          C  S
    	
    	x3 x2 x1 | C  S 
    	---------|-----------
    	0  0  0  | 0  0
    	0  0  1  | 0  1
    	0  1  0  | 0  1
    	0  1  1  | 1  0
    	1  0  0  | 0  1
    	1  0  1  | 1  0
    	1  1  0  | 1  0
    	1  1  1  | 1  1
    
    	3 inputs, 2 outputs, i.e. n1 = 4, K = 2
    	use 4 hidden nodes, so m1 = 5