//mlp.cpp
#include <cmath>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <string>
using namespace std;
//MLP training for an XOR gate
const int n1 = 3; //inputs per sample, including the bias input x0
const int m1 = 3; //hidden nodes, including the hidden bias node h[0]
const int K = 1; //number of output nodes
const int numSamples = 4; //training patterns: every 2-bit input combination
//Training inputs; column 0 is the bias input and is always 1.
double inputs[numSamples][n1] =
{
1, 0, 0, //x0=1, x1=0, x2=0
1, 0, 1, //x0=1, x1=0, x2=1
1, 1, 0, //x0=1, x1=1, x2=0
1, 1, 1 //x0=1, x1=1, x2=1
};
//Target XOR outputs, one row of K values per training sample.
double labels[numSamples][K] = {0, 1, 1, 0};
//Initial input-to-hidden weights w[i][j]; hard-coded (not actually
//randomly generated) so every run is reproducible.
double w[n1][m1] = {0.97, 0.2, 0.7,
0.73, 0.1, 0.9,
0.2, 0.7, 0.3};
//Initial hidden-to-output weights wo[j][l]; hard-coded as above.
double wo[m1][K] = {0.76, 0.6, 0.1};
// Simple MLP class
class MLP
{
private:
double a[m1]; //linear sum of products of inputs and weights
double h[m1]; //hidden nodes h[j] = g(a[j])
double y[K]; //predicted output
double z[K]; //linear sum of products of weights and hidden nodes
double eta; //learning rate
public:
MLP(double learning_rate)
{
eta = learning_rate;
h[0] = 1; //for bias
}
//Sigmoid activation function
double g(double x)
{
return 1.0 / (1.0 + exp(-x));
}
//Derivative of sigmoid function
double gd(double x)
{
double s = g(x);
return s * (1 - s);
}
//Forward propagation
double* forward(double x[])
{
//hidden layer activation
for (int j = 0; j < m1; j++) {
a[j] = 0;
for (int i = 0; i < n1; i++)
a[j] += w[i][j] * x[i];
if ( j > 0 )
h[j] = g( a[j] );
}
//output layer activation
for (int l = 0; l < K; l++){
z[l] = 0;
for(int j = 0; j < m1; j++)
z[l] += wo[j][l] * h[j];
y[l] = g( z[l] );
}
return y; //return output
}
//Backward propagation, yd is the desired output
void backward(double yd[], double x[])
{
double delta[m1];
double deltao[K];
for (int l = 0; l < K; l++) {
double e = yd[l] - y[l]; //output layer error
deltao[l] = e * gd(z[l]);
}
//Compute hidden layer error
for(int j = 0; j < m1; j++){
delta[j] = 0;
for(int l = 0; l < K; l++)
delta[j] += deltao[l] * wo[j][l] * gd(a[j]);
}
//Update weights (hidden to output)
for(int j = 0; j < m1; j++)
for(int l = 0; l < K; l++)
wo[j][l] += eta * deltao[l] * h[j];
//Update weights (input to hidden)
for(int i = 0; i < n1; i++)
for(int j = 0; j < m1; j++)
w[i][j] += eta * delta[j] * x[i];
}
void get_input_data(int k, double x[])
{
for(int i=0; i < n1; i++)
x[i]=inputs[k][i];
}
// Train the MLP
// epochs = number of times of training
// numSamples = number of different input sets
void train(int numSamples, int epochs)
{
double x[n1]; //inputs
double *yd = &labels[0][0];
for (int epoch = 0; epoch < epochs; epoch++){
for (int k = 0; k < numSamples; k++) {
get_input_data(k, x);
forward( x );
backward(yd+k*K, x);
}
}
}
void printWeights()
{
cout << "\nMLP input to hidden weights: ";
for(int i = 0; i < n1; i++){
cout << endl;
for(int j = 0; j < m1; j++)
printf("w[%d][%d]: %5.3f\t", i, j, w[i][j]);
}
cout << "\nMLP hidden to output weights:";
for(int i = 0; i < m1; i++) {
cout << endl;
for(int j = 0; j < K; j++)
printf("wo[%d][%d]: %5.3f\t", i, j, wo[i][j]);
}
}
~MLP()
{
}
};
//Map a sigmoid output to a class label: values near 0 (in (-0.1, 0.2))
//give 0, values near 1 (in (0.8, 1.1)) give 1, and anything else is
//treated as indeterminate and reported as -1.
int classifier( double x)
{
    const bool nearZero = (x > -0.1) && (x < 0.2);
    if (nearZero)
        return 0;
    const bool nearOne = (x > 0.8) && (x < 1.1);
    return nearOne ? 1 : -1;
}
//Entry point: train the MLP on the four XOR patterns, then evaluate
//every input combination and print the learned weights.
int main()
{
    string gates = " XOR ";
    MLP mlp(0.5);
    mlp.train(4, 10000);

    // Test the MLP on all four 2-bit input combinations.
    double x[3];
    x[0] = 1; //bias input is always 1
    cout << "\nTesting MLP:" << endl;
    for (int pattern = 0; pattern < 4; pattern++) {
        x[1] = pattern & 1;  //low bit  -> first operand
        x[2] = pattern >> 1; //high bit -> second operand
        double *output = mlp.forward(x);
        cout << fixed << setprecision(0) << " " << x[2]
             << gates << x[1] << " = " << classifier(*output) <<
             setprecision(2) << " (" << output[0] << ")" << endl;
    }
    mlp.printWeights();
    cout << endl << "Hello, AI World!" << endl;
    return 0;
}