Using Softmax Function in Machine Learning

Materials available at: https://forejune.co/cuda/

The Softmax Function

  • Has roots in statistical mechanics, especially in the Boltzman distribution.
  • Can be used as an activation function in multi-class classification
  • Sigmoid, and Step functions -- for binary-class classification

      Sigmoid Function
      Sigmoid Function
      Softmax Function
  • An activation function Converts logits into interpretable probabilities
    	In neural networks, logits are the raw, unnormalized scores 
    	produced at a model's output layer before the  activation 
    	function is applied.
    			
       

    	zl are logits
    	g() is an activation function
    
    	wo0l is the bias, 
    	and h0 = 1
    
    	
  • Example:
    	
    
    // softmaxExamp.cpp
    // forejune.co/cuda
    
    #include <iostream>
    #include <math.h>
    #include <stdio.h>
    
    using namespace std;
    
    int main()
    {
      const int K = 5;
      double sum = 0;
      double z[K] = {1.3, 5.1, 2.2, 0.7, 1.1};
      double p[K];
    
      for (int l = 0; l < K; l++)
        sum += exp(z[l]);
    
       printf("sum = %f\n", sum);
      
       for (int l = 0; l < K; l++) {
         p[l] = exp(z[l]) / sum;
         printf("%1.1f: %1.2f\n", z[l], p[l]);
       }
    
       return 0;
    }
    	
  • Full Adder MLP Using Sigmoid Function

        Binary Full Adder
    	x1
    	x2
          +	x3
        -----------------
          C  S		(carry, sum)
    	
    	x3 x2 x1 | C  S 
    	---------|-----------
    	0  0  0  | 0  0
    	0  0  1  | 0  1
    	0  1  0  | 0  1
    	0  1  1  | 1  0
    	1  0  0  | 0  1
    	1  0  1  | 1  0
    	1  1  0  | 1  0
    	1  1  1  | 1  1
    
    	3 inputs, 2 outputs, i.e. n1 = 4, K = 2
    	use 4 hidden nodes, so m1 = 5
    
    	See https://forejune.co/cuda/ai/mlp/mlp.html
    	
    
    // fullAdder.cpp
    // https://forejune.co/cuda
    
    #include <iostream>
    #include <iomanip>
    #include <cmath>
    
    using namespace std;
    
    const int n1 = 4;  //number of inputs and bias
    const int m1 = 5;  //number of hidden nodes and bias
    const int K = 2;   //number of outputs
    const int numSamples = 8;
    double inputs[numSamples][n1] = 
    {
     1, 0, 0, 0,     //x0=1, x1=0, x2=0, x3 = 0
     1, 0, 0, 1,             
     1, 0, 1, 0,             
     1, 0, 1, 1,             
     1, 1, 0, 0,             
     1, 1, 0, 1,             
     1, 1, 1, 0,     //x0=1, x1=1, x2=1, x3=0
     1, 1, 1, 1      //x0=1, x1=1, x2=1, x2=1
    };
    
    double labels[numSamples][K] = {
                    0, 0,
                 	0, 1,
                    0, 1,
                    1, 0,
                    0, 1,
                    1, 0,
                    1, 0,
                    1, 1};	//target outputs (labels)
    double w[n1][m1] = {
                    0.97, 0.2, 0.7, 0.5, 0.65,    
                    0.73, 0.1, 0.9, 0.3, 0.23,
                    0.37, 0.3, 0.8, 0.3, 0.19,
                    0.2, 0.7, 0.3,  0.6, 0.87}; //random weights (input to hidden)
    double wo[m1][K] = {
               0.76, 0.6, 0.1, 0.2, 0.7,
               0.67, 0.45, 0.3, 0.15, 0.4}; //random weights (hidden to output)
    
    // Simple MLP class
    class MLP 
    {
    private:
        double a[m1];  //linear sum of products of inputs and weights
        double h[m1];  //hidden nodes h[j] = g(a[j])
        double y[K];   //predicted output 
        double z[K];   //linear sum of products of weights and hidden nodes (logits)
        double eta; //learning rate
    public:
        MLP(double learning_rate) 
        {
    	eta = learning_rate;
    	h[0] = 1;	//for bias 
        }
    
        //Sigmoid activation function
        double g(double x) 
        {
          return 1.0 / (1.0 + exp(-x));
        }
    
        //Derivative of sigmoid function
        double gd(double x) 
        {
           double s = g(x);
         
           return s * (1 - s);
         }
    
    
      //Forward propagation
      double* forward(double x[]) 
      {
    	//hidden layer activation
    	for (int j = 0; j < m1; j++) {
              a[j] = 0;
              for (int i = 0; i < n1; i++) 
                a[j] += w[i][j] * x[i]; 
    	   if ( j > 0 )
    	     h[j] = g( a[j] );
             }
    	 //output layer activation
    	 for (int l = 0; l < K; l++){
    	   z[l] = 0; 
    	   for(int j = 0; j < m1; j++) 
    	     z[l] += wo[j][l] * h[j];  
    	   y[l] = g( z[l] );
    	  }
    
              return y;  //return output
        }
    
      //Backward propagation, yd is the desired output
      void backward(double yd[], double x[])
      {
        double delta[m1];
        double deltao[K];
    
       for (int l = 0; l < K; l++) { 
          double e = yd[l] - y[l];  //output layer error
         deltao[l] = e * gd(z[l]);
       }
        //Compute hidden layer error
        for(int j = 0; j < m1; j++){
          delta[j] = 0;
          for(int l = 0; l < K; l++)
            delta[j] += deltao[l] * wo[j][l] * gd(a[j]);
        }
    
        //Update weights (hidden to output) 
        for(int j = 0; j < m1; j++)
          for(int l = 0; l < K; l++) 
           wo[j][l] += eta * deltao[l] * h[j];
         
         //Update weights (input to hidden)
         for(int i = 0; i < n1; i++)
    	for(int j = 0; j < m1; j++)
    	  w[i][j] += eta * delta[j] * x[i];
      } 
    
      void get_input_data(int k, double x[])
      {
         for(int i=0; i < n1; i++)
           x[i]=inputs[k][i];
      }
      // Train the perceptron
      // eta = learning rate
      // epochs = number of times of training 
      // numSamples = number of different input sets (x_data)
      // yd is target output
      void train(int numSamples, int epochs)
      {
        double x[n1];	//inputs
        double *yd = &labels[0][0];
    
        for (int epoch = 0; epoch < epochs; epoch++){
          for (int k = 0; k < numSamples; k++) {
    	get_input_data(k, x);
            forward( x );
            backward(yd+k*K, x);
          }
        }
      }
    
        void printWeights()
        {
    	cout << "\nMLP input to hidden weights: ";
    	for(int i = 0; i < n1; i++){
    	   cout << endl;
    	  for(int j = 0; j < m1; j++)
    	    printf("w[%d][%d]: %6.3f\t", i, j, w[i][j]);
    	}
    	cout << "\nMLP hidden to output weights:";
    	for(int i = 0; i < m1; i++) {
    	  cout << endl;
    	  for(int j = 0; j < K; j++)
    	    printf("wo[%d][%d]: %6.3f\t", i, j, wo[i][j]);
    	}
        }
    
        ~MLP()
        {
    
        }
    };
    
    int classifier( double x)
    {
      if ( x < 0.2 && x > -0.1)
        return 0;
      if ( x > 0.8 && x < 1.1)
        return 1;
    
      return -1;
    }
    
    int main() 
    {
        MLP mlp(0.5); 
        
        mlp.train(8, 10000);
    
        // Test the MLP
        double *x;
        x = &inputs[0][0];
        cout << "Testing MLP:" << endl;
        for (int i = 0; i < 8; i++) {
          double *output = mlp.forward(x);
          cout << fixed << setprecision(0) << "  " << x[1] << " " <<  
          x[2] << " "  << x[3] << " : " << classifier(output[0]) << " " 
          << classifier (output[1]) << " (" << setprecision(2) <<  output[0] 
          <<  " " << output[1]<<")" << endl;
          x += 4;
        }
    
        mlp.printWeights();
        cout << endl << "Hello, AI World!" << endl;
        return 0;
    }
    
    
    	
    Full Adder MLP Using Softmax Function
    	x3 x2 x1 | Class	0 1 2 3 
    	---------|-----------------------
    	0  0  0  |  0		1 0 0 0
    	0  0  1  |  1		0 1 0 0
    	0  1  0  |  1		0 1 0 0
    	0  1  1  |  2		0 0 1 0
    	1  0  0  |  1		0 1 0 0
    	1  0  1  |  2		0 0 1 0
    	1  1  0  |  2		0 0 1 0
    	1  1  1  |  3		0 0 0 1
    	
    
    
    const int n1 = 4;  //number of inputs and bias
    const int m1 = 7;  //number of hidden nodes and bias
    const int K = 4;   //number of outputs
    const int numSamples = 8;
    double inputs[numSamples][n1] =
    {
     1, 0, 0, 0,     //x0=1, x1=0, x2=0, x3 = 0
     1, 0, 0, 1,
     1, 0, 1, 0,
     1, 0, 1, 1,
     1, 1, 0, 0,
     1, 1, 0, 1,
     1, 1, 1, 0,     //x0=1, x1=1, x2=1, x3=0
     1, 1, 1, 1      //x0=1, x1=1, x2=1, x2=1
    };
    
    double labels[numSamples][K] = {
     1, 0, 0, 0,
     0, 1, 0, 0,
     0, 1, 0, 0,
     0, 0, 1, 0,
     0, 1, 0, 0,
     0, 0, 1, 0,
     0, 0, 1, 0,
     0, 0, 0, 1};
    
    double w[n1][m1] = {
                    0.97, 0.2, 0.7, 0.5, 0.65,
                    0.73, 0.1, 0.9, 0.3, 0.23,
                    0.37, 0.3, 0.8, 0.3, 0.19,
                    0.2, 0.7, 0.3,  0.6, 0.87}; //random weights (input to hidden)
    
    double wo[m1][K] = {
                    0.76, 0.6, 0.1, 0.2, 0.7,
                    0.67, 0.45, 0.3, 0.15, 0.4,
                    0.12, 0.87, 0.9, 0.6, 0.2,
                    0.8, 0.5, 0.44, 0.73, 0.52
    }; //random weights (hidden to output)
    
    class MLP
    {
    .....
    
      //denominator of softmax
      double sumExp()
      {
            double sum = 0;
            for (int l = 0; l < K; l++)
              sum += exp( z[l] );
    
            return sum;
      }
    
      //Forward propagation
      double* forward(double x[])
      {
            //hidden layer activation
            for (int j = 0; j < m1; j++) {
              a[j] = 0;
              for (int i = 0; i < n1; i++)
                a[j] += w[i][j] * x[i];
               if ( j > 0 )
                 h[j] = g( a[j] );
             }
             //output layer activation
             for (int l = 0; l < K; l++){
               z[l] = 0;
               for(int j = 0; j < m1; j++)
                 z[l] += wo[j][l] * h[j];
              }
    
              //softmax
              double denom = sumExp();
              for (int l = 0; l < K; l++)
                y[l] = exp(z[l]) / denom;
    
              return y;  //return output
        }
    
       void printProbabilities( double *output )
       {
         for (int l = 0; l < K; l++)
             printf("%4.2f,  ", output[l] );
         printf("\t");
       }
    
       int predict(double x[])
       {
            double *output = forward( x );
            printProbabilities( output );
            int label = 0;
            double maxProb = output[0];
            for (int l = 0; l < K; l++) {
                if (output[l] > maxProb) {
                    maxProb = output[l];
                    label = l;
                }
            }
            return label;
        }
    };
    
    int main()
    {
        MLP mlp(0.5);
    
        mlp.train(8, 10000);
    
        // Test the MLP
        double *x;
        x = &inputs[0][0];
        cout << "Testing MLP:" << endl;
        for (int i = 0; i < 8; i++) {
          int label = mlp.predict( x );
          cout << fixed << setprecision(0) << "  " << x[1] << " " <<  
          x[2] << " "  << x[3] << " : " << label << " "  << endl;
          x += 4;
        }
    
        cout << endl << "Hello, AI World!" << endl;
        return 0;
    }
    
    
    Outputs: Testing MLP: 0.99, 0.01, 0.00, 0.00, 0 0 0 : 0 0.00, 0.99, 0.01, 0.00, 0 0 1 : 1 0.00, 0.99, 0.01, 0.00, 0 1 0 : 1 0.00, 0.01, 0.98, 0.00, 0 1 1 : 2 0.00, 0.99, 0.01, 0.00, 1 0 0 : 1 0.00, 0.01, 0.99, 0.00, 1 0 1 : 2 0.00, 0.01, 0.99, 0.00, 1 1 0 : 2 0.00, 0.00, 0.02, 0.98, 1 1 1 : 3 Hello, AI World!
    Example 2: Classifying Points
    	A simplified version of K-means clustering using MLP
    	
    
    // softmaxDemo.cpp 
    // https://forejune.co/cuda
    
    #include <iostream>
    #include <iomanip>
    #include <cmath>
    
    using namespace std;
    
    const int n1 = 3;  //number of inputs and bias
    const int m1 = 7;  //number of hidden nodes and bias
    const int K  = 3;  //3 classes, number of outputs
    const int numSamples = 6;
    double w1[n1][m1];
    
    // inputs could be normalized (x, y) coordinates
    double inputs[numSamples][n1] = 
    {
     1, 0.1, 0.2,     
     1, 0.8, 0.9,             
     1, 0.3, 0.6,              
     1, 0.5, 0.4,             
     1, 0.2, 0.3,              
     1, 0.7, 0.8};      
    
    double labels[numSamples][K] = {
     1, 0, 0,	//class 0	(cluster 0) 
     0, 0, 1,	//class 2
     0, 1, 0,	//class 1
     0, 1, 0,	//class 1
     1, 0, 0, 	//class 0
     0, 0, 1};	//class 2
    	
    double w[n1][m1] = {
    	0.97, 0.2, 0.7, 0.5, 0.65, 0.1, 0.9,    
            0.73, 0.1, 0.9, 0.3, 0.23, 0.2, 0.8,
            0.2, 0.7, 0.3,  0.6, 0.87, 0.4, 0.7}; //random weights (input to hidden)
    
    double wo[m1][K] = {
    	0.76, 0.6, 0.1, 
    	0.2, 0.7, 0.67, 
    	0.45, 0.3, 0.15, 
    	0.4, 0.8, 0.5, 
    	0.44, 0.73, 0.52
    }; //random weights (hidden to output)
    
    // Simple MLP class
    class MLP 
    {
    private:
        double a[m1];  //linear sum of products of inputs and weights
        double h[m1];  //hidden nodes h[j] = g(a[j])
        double y[K];   //predicted output 
        double z[K];   //linear sum of products of weights and hidden nodes
        double eta; //learning rate
    public:
        MLP(double learning_rate) 
        {
    	eta = learning_rate;
    	h[0] = 1;	//for bias 
        }
    
        //Sigmoid activation function
        double g(double x) 
        {
          return 1.0 / (1.0 + exp(-x));
        }
    
        //Derivative of sigmoid function
        double gd(double x) 
        {
           double s = g(x);
            
           return s * (1 - s);
         }
        
        //denominator of softmax
        double sumExp() 
        {
            double sum = 0;
    	for (int l = 0; l < K; l++)
    	  sum += exp( z[l] );
    
            return sum;
        }
    
        
    
      //Forward propagation
      double* forward(double x[]) 
      {
    	//hidden layer activation
    	for (int j = 0; j < m1; j++) {
              a[j] = 0;
              for (int i = 0; i < n1; i++) 
                a[j] += w[i][j] * x[i]; 
    	   if ( j > 0 )
    	     h[j] = g( a[j] );
             }
    	 //output layer activation
    	 for (int l = 0; l < K; l++){
    	   z[l] = 0; 
    	   for(int j = 0; j < m1; j++) 
    	     z[l] += wo[j][l] * h[j];   //logits
    	  }
    
    	  //softmax
    	  double denom = sumExp();
    	  for (int l = 0; l < K; l++)
    	    y[l] = exp(z[l]) / denom;
    
              return y;  //return output
      }
    
      //Backward propagation, yd is the desired output
      void backward(double yd[], double x[])
      {
        double delta[m1];
        double deltao[K];
    
       for (int l = 0; l < K; l++) { 
          double e = yd[l] - y[l];  //output layer error
          deltao[l] = e * gd(z[l]);
       }
       
       for(int j = 0; j < m1; j++){
          delta[j] = 0;
          for(int l = 0; l < K; l++)
            delta[j] += deltao[l] * wo[j][l] * gd(a[j]);
       }
    
       //Update weights (hidden to output) 
       for(int j = 0; j < m1; j++)
         for(int l = 0; l < K; l++) 
           wo[j][l] += eta * deltao[l] * h[j];
         
         //Update weights (input to hidden)
         for(int i = 0; i < n1; i++)
    	for(int j = 0; j < m1; j++)
    	  w[i][j] += eta * delta[j] * x[i];
      } 
    
      void get_input_data(int k, double x[])
      {
         for(int i=0; i < n1; i++)
           x[i]=inputs[k][i];
      }
    
      // Train the perceptron
      // eta = learning rate
      // epochs = number of times of training 
      // numSamples = number of different input sets (x_data)
      // yd is target output
      void train(int numSamples, int epochs)
      {
        double x[n1];	//inputs
        double *yd = &labels[0][0];
    
        for (int epoch = 0; epoch < epochs; epoch++){
          for (int k = 0; k < numSamples; k++) {
    	get_input_data(k, x);
            forward( x );
            backward(yd+k*K, x);
          }
        }
      }
    
       void printProbabilities( double *output )
       {
         for (int l = 0; l < K; l++)
    	 printf("%4.2f,  ", output[l] );
         printf("\t");
       }
    
       int predict(double x[]) 
       {
    	double *output = forward( x );
    	printProbabilities( output );
            int label = 0;
            double maxProb = output[0];
            for (int l = 0; l < K; l++) {
                if (output[l] > maxProb) {
                    maxProb = output[l];
                    label = l;
                }
            }
            return label;
        }
    
        void printWeights()
        {
    	cout << "\nMLP input to hidden weights: ";
    	for(int i = 0; i < n1; i++){
    	   cout << endl;
    	  for(int j = 0; j < m1; j++)
    	    printf("w[%d][%d]: %5.3f\t", i, j, w[i][j]);
    	}
    	cout << "\nMLP hidden to output weights:";
    	for(int i = 0; i < m1; i++) {
    	  cout << endl;
    	  for(int j = 0; j < K; j++)
    	    printf("wo[%d][%d]: %5.3f\t", i, j, wo[i][j]);
    	}
        }
    
        ~MLP()
        {
    
        }
    };
    
    int main() 
    {
        MLP mlp(0.5); 
        mlp.train(6, 10000);
    
        // Test the MLP
        double *x;
        const int ntest = 9;
        double test_inputs[ntest][n1] = 
        {
     	1, 0.1, 0.2,     
     	1, 0.8, 0.9,             
     	1, 0.3, 0.6,              
     	1, 0.5, 0.4,             
     	1, 0.2, 0.3,              
     	1, 0.7, 0.8,
    	1, 0.1, 0.1,
    	1, 0.5, 0.5,
    	1, 0.9, 0.9
        };      
        x = &test_inputs[0][0];
        cout << "Testing MLP:" << endl;
        for (int i = 0; i < ntest; i++) {
          int label = mlp.predict( x );
          cout << fixed << setprecision(2) << "  " <<  x[1] << " "  << x[2] 
          << " : " "Class " << label <<  " "  << endl;
          x += n1;
        }
    
        return 0;
    }
    
    		
    Outputs:
    
    Testing MLP:
    1.00,  0.00,  0.00,  	  0.10 0.20 : Class 0
    0.00,  0.00,  1.00,  	  0.80 0.90 : Class 2
    0.00,  0.99,  0.00,  	  0.30 0.60 : Class 1
    0.00,  0.99,  0.00,  	  0.50 0.40 : Class 1
    0.99,  0.01,  0.00,  	  0.20 0.30 : Class 0
    0.00,  0.00,  1.00,  	  0.70 0.80 : Class 2
    1.00,  0.00,  0.00,  	  0.10 0.10 : Class 0
    0.00,  0.99,  0.01,  	  0.50 0.50 : Class 1
    0.00,  0.00,  1.00,  	  0.90 0.90 : Class 2