A Simple GPT-style AI Transformer in C/C++

Materials available at: http://forejune.co/cuda/

Review


Decoder-Only (GPT-style) Transformer

The Model:

  • has decoder only
  • no encoder, no cross-attention
  • masked self-attention only
  • predicts next token given previous tokens
  • Advantages:

    1. Simpler architecture

    2. Unified framework (very flexible)
      text --> text continuation
      e.g.
      • translation
      • summarization
      • code generation
      just by formating input.

    3. Efficient inference (with caching)

    4. Scales very well
        more data + bigger model --> better performance

    5. No alignment requirement
        Encoder-decoder needs: (source, target) pairs

        GPT can use: any raw text

    Disadvantages:

    1. Not naturally suited for input-output tasks
      Translation ideally wants:
        source --> target (explicit separation)
      But GPT sees:
        source <SEP> target
        separation is implicit, not structural

    2. Requires masking tricks
      One has to do:
        loss masking: only target contributes
      Encoder-decoder:
        clean separation: no masking needed

    3. Less efficient for long inputs

    4. Weaker alignment for translation

    5. Exposure bias in generation

    6. Needs more data

    When Decoder-Only Works Best:

      Ideal for:
    1. language modeling
    2. chat systems
    3. code generation
    4. general-purpose AI
    5. Less ideal for:

    6. strict translation
    7. structured input-output tasks
    8. low-data scenarios
    9. Summary:

        
         Feature                  Decoder-Only (GPT)  Encoder-Decoder
        
         Architecture             Simple              More complex    
        
         Flexibility              Very high           Medium   
        
         Translation quality      Good                Better 
        
         Training data            Unstructured OK     Needs pairs   
        
         Efficiency (long input)  Lower               Better   
        
         Implementation           Easier              Harder          
        

      Full Implementation in C/C++

      Helper Functions:

      util.h:

      #ifndef __UTIL_H__
      #define __UTIL_H__
      #include <vector>
      #include <algorithm>
      #include <math.h>
      #include <iostream>
      #include <iomanip>
      #include <random>
      
      using namespace std;
      
      typedef vector<vector<double>>  matrixd;
      
      // an activation function
      double f(double z);
      
      // Relu activation function with matrix input
      matrixd relu(const matrixd &X);
      
      // Derivative of Relu
      matrixd relu_deriv (const matrixd &X);
      
      // calculates entropy loss
      double crossEntropy(const matrixd &P, const vector<int> &target);
      
      // add two matrices
      matrixd addMat(const matrixd &A, const matrixd &B);
      
      // Transpose matrix n x m, B = A^T : m x n
      matrixd transpose(const matrixd& A);
      
      // Matrix multiplication   C = A X B   A: nxm, B: mxr, C: nxr
      matrixd matmul(const matrixd& A, const matrixd& B);
      
      // multiply a matrix by a scalar
      matrixd mulMats (const matrixd &A, const double s);
      
      // scale a matrix
      void scaleMat(matrixd &A, const double s);
      
      // apply masking to S
      matrixd causalMask(matrixd S);
      
      // update weight matrix W -= dW * eta
      void updateWeight(matrixd &W, const matrixd &dW, const double eta);
      
      // initialize an input matrix with certain random values
      void initMatrix(matrixd& M, const int n, const int m);
      
      void initMatrix(matrixd& M, const int n, const int m, double dModel);
      
      // print a token dictionary
      void printDictionary(unordered_map<string, int> &d);
      
      // print a matrix
      void printMatrix(const matrixd &y);
      
      // Add & Norm
      matrixd addNnorm(const matrixd &A, const matrixd &B);
      
      void clip(matrixd &A, double max_val);
      
      // softmax, 1D vector input
      vector<double> softmax(const vector<double> &v);
      
      // softmax activation function, 2D input vector
      matrixd softmax(const matrixd& input);
      
      // derivative function of softmax
      matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P);
      #endif
      						

      util.cpp:

      // util.cpp: helper functions
      // http://forejune.co/cuda/
      #include "util.h"
      
      #include <random>
      
      using namespace std;
      
      // Activation function: using ReLU (Rectified Linear Unit)
      double f(double z)
      {
        return max(0.0, z);
      }
      
      // Relu activation function with matrix input
      matrixd relu(const matrixd &X)
      {
          int rows = X.size();
          int cols = X[0].size();
          matrixd Y = X;	// same dimension of X
         
          for(int i = 0; i < rows; i++)
              for(int j = 0; j < cols; j++)
                  Y[i][j] = max(0.0, X[i][j]);
      
          return Y;
      }
      
      // Derivative of Relu
      matrixd relu_deriv(const matrixd &X)
      {
          int rows = X.size();
          int cols = X[0].size();
          matrixd Y = X;	// same dimension as X
         
          for(int i = 0; i < rows; i++)
              for(int j = 0; j < cols; j++)
      	  if ( X[i][j] > 0 )
                  Y[i][j] = 1;
      	  else
                  Y[i][j] = 0;
      
          return Y;
      }
      
      // calculates entropy loss 
      double crossEntropy(const matrixd &P, const vector<int> &target)
      {
        double L = 0;
        int nt = P.size();
        for (int i = 0; i < nt; i++){
          int k = target[i];
          L -= log( P[i][k] + 1e-12 );
        }
      
        return L / nt;
      }
      
      // C = A + B
      matrixd addMat(const matrixd &A, const matrixd &B)
      {
           int n = A.size();          // rows
           int m = A[0].size();       // cols
      
           matrixd C = A;       // same dimension as A
           for (int i = 0; i < n; i++)
             for(int j = 0; j < m; j++)
               C[i][j] = A[i][j] + B[i][j];
      
           return C;
      }
      
      // Transpose matrix n x m, B = A^T : m x n
      matrixd transpose(const matrixd& A)
      {
         int n = A.size();    //number of rows
         int m = A[0].size(); //number of columns
      
         matrixd B(m, vector<double>(n));
         for (int i = 0; i < n; i++)
           for (int j = 0; j < m; j++)
             B[j][i] = A[i][j];
      
         return B;
      }
      
      // Matrix multiplication   C = A X B   A: nxm, B: mxr, C: nxr
      matrixd matmul(const matrixd& A, const matrixd& B)
      {
        int n  = A.size();
        int m = A[0].size();
        int r = B[0].size();
      
        matrixd C(n, vector<double>(r, 0));
      
        for (int i = 0; i < n; i++)
          for (int j = 0; j < r; j++)
            for (int k = 0; k < m; k++)
              C[i][j] += A[i][k] * B[k][j];
      
         return C;
      }
      
      // scale a matrix
      void scaleMat(matrixd &A, const double s)
      {
         int m = A.size();	// number of rows
         int n = A[0].size();    // number of columns
      
         for (int i = 0; i < m; i++)
           for (int j = 0; j < n; j++)
             A[i][j] *= s;
      }
      
      // maultiply a matrix by a scalar s
      matrixd mulMats (const matrixd &A, const double s)
      {
         int m = A.size();	// number of rows
         int n = A[0].size();    // number of columns
        
         matrixd B = A;
      
         for (int i = 0; i < m; i++)
           for (int j = 0; j < n; j++)
             B[i][j] *= s;
      
         return B;
      }
      
      void initMatrix(matrixd& M, const int n, const int m)
      {
         // resizing 2D vector to n x m with initial values 0
         M.assign(n, vector<double>(m, 0));
         random_device rd;
         mt19937 gen(rd());      //psuedo random number generator
         double sigma = 1.0/16.0;
         normal_distribution<double> dist(0.0, sigma);  
         for (int i = 0; i < n; ++i) 
           for (int j = 0; j < m; ++j)
              M[i][j] = dist(gen);
      }
      
      void printDictionary(unordered_map<string, int> &d)
      {
        unordered_map<string, int>::iterator it = d.begin();
        int k = 0;
        while ( it != d.end() ){
          cout << right << setw(12) << it->first << ": " << setw(3) << it->second;
          it++;
          k++;
          if ( k % 5 == 0 )
            cout << endl;
        }
      }
      
      void printMatrix(const matrixd &y)
      {
          int cols = y.size();
          int rows = y[0].size();
      
          for (int i = 0; i < cols; ++i) {
              cout << "Token " << i << ": [";
              for (int j = 0; j < rows; ++j) {
                  cout << fixed << setw(7) << setprecision(3) << y[i][j];
                  if (j < 4) cout << ", ";
              }
              cout << " ]\n";
          }
      }
      
      matrixd addNnorm(const matrixd &A, const matrixd &B)
      {
           auto x = addMat(A, B);     // add two matrices
           int n = x.size();          // rows
           int m = x[0].size();       // cols
           matrixd y = x;             // output same size as x 
      
           // normalize the result
           double epsilon = 1e-5;
      
           for (int i = 0; i < n; i++) {
             double sum = 0;          // sum of one row
             double std = 0;          //sigma_i square
             for (int j = 0; j < m; j++)
               sum += x[i][j];
              double mean = sum / m;  // mu_i, mean of i-th row
      
              sum = 0;
              for (int j = 0; j < m; j++) {
                double diff = x[i][j] - mean;
                sum += diff * diff;
              }
              std = sum / m;
              for (int j = 0; j < m; j++) {
                double xd = (x[i][j] - mean) / sqrt(std + epsilon);
                y[i][j] = xd;         // gamma = 1, beta = 0;
              }
           }
      
           return y;  // final output of Add & norm
      }
      
      
      matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V,
              bool mask)
      {
        double d_k, scale;
        matrixd KT, A;	// K transpose, attention
        
        d_k = Q[0].size();
        scale = 1.0 / sqrt( d_k );
        KT = transpose ( K );
        A = matmul(Q, KT);
        scaleMat(A, scale); 
      
        if ( mask ) {
           for (int i = 0; i < A.size(); i++)
             for (int j = i+1; j < A[0].size(); j++) 
               A[i][j] = -1e9;	// causal mask
        }
      
        A = softmax( A );
        
        matrixd output = matmul(A, V);
      
        return output;
      }
      
      matrixd causalMask(matrixd S) 
      {
          int n = S.size();
          for (int i = 0; i < n; i++) {
              for (int j = i+1; j < n; j++) {
                  S[i][j] = -1e9;  // effectively -infinity
              }
          }
          return S;
      }
      
      void updateWeight(matrixd &W, const matrixd &dW, const double eta)
      {
         for (int i = 0; i < W.size(); i++)
           for (int j = 0; j < W[0].size(); j++)
               W[i][j] -= eta * dW[i][j];
      }
      
      void clip(matrixd &A, double max_val = 5.0)
      {
          for (auto &row : A)
              for (auto &x : row)
                  if (x > max_val) x = max_val;
                  else if (x < -max_val) x = -max_val;
      }
      
      // softmax activation function, 1D input vector
      vector<double> softmax(const vector<double> &z)
      {
        int m = z.size();
        vector<double> y(m, 0);
      
         //maximum value of vector
        double maxVal = *max_element(z.begin(), z.end());
      
        double sum = 0;
        // Subtract max for numerical stability
        for (int j = 0; j < m; j++) {
          y[j] = exp(z[j] - maxVal);
          sum += y[j];
        }
      
        // Normalize
        for (int j = 0; j < m; j++)
          y[j] /= sum;
      
        return y;
      }
      
      // softmax activation function, 2D input vector
      matrixd softmax(const matrixd& input)
      {
        int n = input.size(), m = input[0].size();    // n x m matrix 
      
        // 2D output vector n x m y[n][m]
        vector<vector<double>> y(n, vector<double>(m));
      
        for (int i = 0; i < n; i++) {
          //maximum value of the row
          double maxVal = *max_element(input[i].begin(), input[i].end());
      
          double sum = 0;
          // Subtract max for numerical stability
          for (int j = 0; j < m; j++) {
            y[i][j] = exp(input[i][j] - maxVal);
            sum += y[i][j];
          }
      
          // Normalize
          for (int j = 0; j < input[i].size(); j++)
            y[i][j] /= sum;
        }
      
        return y;
      }
      
      // derivative function of softmax
      matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P)
      {
          int nt = P.size(), ns = P[0].size();
          matrixd dL_dS(nt, vector<double>(ns,0));
      
          for (int i = 0; i < nt; i++)
          {
                double dot = 0.0;
                for (int j = 0; j < ns; j++)
                  dot += P[i][j] * dL_dP[i][j];  // dot product of two row-vectors
      
                for (int j = 0; j < ns; j++)  // dot used by all columns of dL/dA
                   dL_dS[i][j] = P[i][j] * (dL_dP[i][j] - dot);
          }
      
          return dL_dS;
      }

      Transformer Classes:

      transGpt.h:

      #ifndef TRANSGPT_H
      #define TRANSGPT_H
      class Tokenizer
      {
      private:
          unordered_map<string, int> words;
          unordered_map<int, string> wordIndex;
          int nTokens;        // number of words
      
          // special tokens
          const string PAD = "<PAD>";
          const string UNK = "<UNK>";
          const string BOS = "<BOS>";
          const string EOS = "<EOS>";
          const string SEP = "<SEP>";
      
          void addWord(const string &w) 
          {
              if (words.find(w) == words.end()) {
                  int id = nTokens;
                  words[w] = id;
                  wordIndex[id] = w;
                  nTokens++;
              }
          }
      
          vector<string> split(const string &str) 
          {
              vector<string> tokens;
              stringstream ss(str);
              string word;
              while (ss >> word) {
                  tokens.push_back(word);
              }
              return tokens;
          }
      
      public:
          Tokenizer();
          Tokenizer(ifstream &fs);
          unordered_map<string, int>  getTokensWords();
          vector<int> tokenize(const string& str);
          string detokenize(const vector<int>& tokenIDs);
          int get_nTokens() const;
          unordered_map<int, string> get_wordIndex();
      };
      
      class Embedding
      {
      private:
          int nTokens;        // number of words
          int dim;            // embedding dimension
          matrixd em_matrix;  // embedding matrix;
      public:
          Embedding(int sequence_length, int embeddingDimension);
          // create embedding matrix with tokenIDs
          matrixd embed(const vector<int>& tokenIDs);
          int get_embedDim() const;
          int get_nTokens() const;
      };
      
      // Positional Encoding
      class PositionalEncoding
      {
      private:
          int maxTokens; // maximum sequence length
          int dModel;    // embedding dimension
          matrixd PE;    // positional_encodings matrix;
      public:
          PositionalEncoding(int max_seq_length, int embedding_dimension);
          matrixd addPE(const matrixd& embeddings);
          matrixd getPE();
      };
      
      // Add & Norm
      class AddnNorm
      {
      private:
          vector<double> gamma;   // size dModel
          vector<double> beta;    // size dModel
      
          matrixd Zhat;           // normalized values (nt x d)
          vector<double> mean;    // per row (nt)
          vector<double> var;     // per row (nt)
      
          double eps;             // small value
      
      public:
         // constructor
         AddnNorm(int dModel);
      
         // forward computation
         matrixd  forward (const matrixd &A, const matrixd &B);
      
         // Assume forward already filled: Zhat, mean, var
         matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta);
      
         // Optional setters (to plug in forward results)
         void setCache(const matrixd& zhat, const vector<double>& m,
                        const vector<double>& v);
      
          // Accessors (optional)
          const vector<double>& getGamma();
          const vector<double>& getBeta();  
      };
      
      // Feed-forward Network
      class FeedForward
      {
      private:
          matrixd W1, W2;
          int dModel, d_ff;
      
          // cache
          matrixd F1;   // after activation function ReLU
          matrixd Z;    // before ReLU (needed for derivative)
      
      public:
          // constructor
          FeedForward(int dModel, int d_ff_);
          // ------------------------------
          // Forward (for cache)
          // ------------------------------
          matrixd FFoutput(const matrixd &X);
       // ------------------------------
          // Backprop
          // ------------------------------
          matrixd backProp(const matrixd &X, const matrixd &dL_dY,  double eta);
      };
      
      // ----------- MultiHeadAttention -----------
      class MultiHeadAttention {
      private:
          int dModel, nHeads, d_k;
      
          vector<matrixd> WQ, WK, WV; // per head
          matrixd WO;
      
          // cache
          vector<matrixd> Qs, Ks, Vs, Ps, As;
      public:
          MultiHeadAttention(int dModel, int nHeads);
          matrixd computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,
                                   bool mask);
           matrixd backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,
              const matrixd &dL_dH, double eta, bool mask);
      };
      
      // ----------------- OutputLayer ---------------
      class OutputLayer {
      private:
          matrixd W;   // dModel x vocab_size
          int dModel, vocab_size;
      
      public:
          OutputLayer(int dModel, int vocab_size);
          matrixd forward(const matrixd &X);
          matrixd backward(const matrixd &X, const matrixd &P,
                           const vector<int> &targets, double eta);
      };
      
      // ------------------- GPT-style Transformer ---------------
      class GPTBlock {
      private:
          MultiHeadAttention mha;
          AddnNorm norm1;
          FeedForward ffn;
          AddnNorm norm2;
      
      public:
          GPTBlock(int dModel, int nHeads, int d_ff);
          matrixd forward(const matrixd &X);
          matrixd backward(const matrixd &X, const matrixd &dL_dY, double eta);
      };
      
      class MiniGPT {
      private:
          Embedding embed;
          PositionalEncoding pe;
          vector<GPTBlock> layers;
          OutputLayer output;
      
          int seq_len, vocab_size;
      
      public:
          MiniGPT(int vocab_size, int seq_len, int dModel, 
      		    int nHeads, int d_ff, int nLayers);
          matrixd forward(const vector<int> &tokens);
          double trainStep(const vector<int> &tokens, double eta);
      };
      #endif
      			 

      transGpt.cpp:

      // http://forejune.co/cuda
      // transGpt.cpp : classes of GPT-style Transformer
      #include <iostream>
      #include <fstream>
      #include <vector>
      #include <string>
      #include <unordered_map>
      #include <cmath>
      #include <random>
      #include <algorithm>
      #include <iomanip>
      #include "util.h"
      #include "transGpt.h"
      
      using namespace std;
      
      // ------------------------ Tokenizer class --------------------------
      
      // -------------------------------
      // Default constructor
      // -------------------------------
      Tokenizer::Tokenizer() 
      {
          nTokens = 0;
      
          // Initialize special tokens first
          addWord(PAD); // 0
          addWord(UNK); // 1
          addWord(BOS); // 2
          addWord(EOS); // 3
          addWord(SEP); // 4
      }
      
      // -------------------------------
      // Build vocab from file
      // -------------------------------
      Tokenizer::Tokenizer(ifstream &fs) : Tokenizer() 
      {
          string line;
      
          while (getline(fs, line)) {
              vector<string> tokens = split(line);	//split(line);
      	for (int i = 0; i < tokens.size(); i++) 
                 addWord(tokens[i]);
          }
      }
      
      unordered_map<string, int> Tokenizer::getTokensWords() 
      {
          return words;
      }
      
      vector<int> Tokenizer::tokenize(const string& str) 
      {
        vector<int> ids;
      
        vector<string> tokens = split(str);	
      
        for (int i = 0; i < tokens.size(); i++) {
          string w = tokens[i];
          if (words.find(w) != words.end())
            ids.push_back(words[w]);
          else
            ids.push_back(words[UNK]);
        }
      
        return ids;
      }
      
      string Tokenizer::detokenize(const vector<int>& tokenIDs) 
      {
          string words;
      
          int n = tokenIDs.size();
          for (int i = 0; i < n; i++) {
        	int id = tokenIDs[i];
              if (wordIndex.find(id) == wordIndex.end())
              continue;
      
              string w = wordIndex[id];
      
              // Skip special tokens (optional behavior)
              if (w == "<PAD>" || w == "<BOS>" || w == "<EOS>")
              continue;
      
              words += w + " ";
          }
      
          return words;
      }
      
      int Tokenizer::get_nTokens() const 
      {
          return nTokens;
      }
      
      unordered_map<int, string>Tokenizer::get_wordIndex()
      {
              return wordIndex;
      }
      
      //------------------   Embedding Class ------------------
      Embedding::Embedding(int sequence_length, int embeddingDimension)
      { 
        nTokens = sequence_length;
        dim = embeddingDimension;
        // initialize embeddings as an nTokensxdim matrix with certain random values 
        initMatrix(em_matrix, nTokens, dim);	
      }
          
      // create embedding matrix with tokenIDs 
      matrixd Embedding::embed(const vector<int>& tokenIDs) 
      {
        matrixd embeddings;
        embeddings.reserve(tokenIDs.size()); // set capacity of embeddings vector
        for(int i = 0; i < tokenIDs.size(); i++) {
          if (tokenIDs[i] >= 0 && tokenIDs[i] < nTokens)
             embeddings.push_back(em_matrix[tokenIDs[i]]);
          else 
            // Out-of-dictionary tokens
            embeddings.push_back(vector<double>(dim, 0.0));
         }
          
         return embeddings;
      }
      
      int Embedding::get_embedDim() const 
      { 
             return dim; 
      }
      
      int Embedding::get_nTokens() const 
      { 
             return nTokens; 
      }
      
      // --------------------- Positional Encoding class ---------------
      PositionalEncoding::PositionalEncoding(int max_seq_length, int embed_dimension) 
      {
            maxTokens = max_seq_length;
            dModel = embed_dimension;
      
            PE.resize(maxTokens, vector<double>(dModel));
            for (int pos = 0; pos < maxTokens; pos++) {
          for (int i = 0; i < dModel/2; i++) {
          		PE[pos][2*i] = sin( pos / pow(10000, 2.0*i/dModel) );
      	    	PE[pos][2*i+1] = cos( pos / pow(10000, 2.0*i/dModel) );
      	}
            }
      }
      
      // add positional encodings to embeddings
      matrixd PositionalEncoding::addPE(const matrixd& embeddings) 
      {
          matrixd x = embeddings;
          int nTokens = embeddings.size();
      
          for (int pos = 0; pos < nTokens; pos++) 
            for (int i = 0; i < dModel; i++) 
               x[pos][i] += PE[pos][i];
      
           return x;
      }
      
      matrixd PositionalEncoding::getPE() 
      {
          return PE;
      }
      
      // -------------------- Add & Norm Class---------------------------------
      // constructor
      AddnNorm::AddnNorm(int dModel)
      {
          gamma.assign(dModel, 1.0);
          beta.assign(dModel, 0.0),
          eps = 1e-5;
      }
      
      matrixd AddnNorm::forward (const matrixd &A, const matrixd &B)
      {
          matrixd x = addMat(A, B); // add two matrices
          int n = x.size();         // rows
          int m = x[0].size();      // cols
          mean.resize( n );
          var.resize( n );
          Zhat.resize(n, vector<double>(m, 0));
          matrixd Y(n, vector(m));
      
          // normalize the result
          for (int i = 0; i < n; i++) {
             double sum = 0;      // sum of one row
             var[i] = 0;
             for (int j = 0; j < m; j++)
                sum += x[i][j];
             mean[i] = sum / m;  // mu_i, mean of i-th row
      
             sum = 0;
             for (int j = 0; j < m; j++) {
                double diff = x[i][j] - mean[i];
                sum += diff * diff;
             }
             var[i] = sum / m;
             for (int j = 0; j < m; j++) {
                double xd = (x[i][j] - mean[i]) / sqrt(var[i] + eps); 
      	  Zhat[i][j] = xd;
                Y[i][j] =  gamma[j] * xd + beta[j];
             }
          }
      
          return Y;  // final output of Add & norm
      }
      
      // Assume forward already filled: Zhat, mean, var
      matrixd AddnNorm::backProp(const matrixd &X, const matrixd &dL_dY, double eta)
      {
          int nt = X.size();
          int m  = X[0].size();
      
          matrixd dL_dZ(nt, vector<double>(m, 0.0));
      
          vector<double> dL_dGamma(m, 0.0);	// gradient wrt gamma
          vector<double> dL_dBeta(m, 0.0);	// gradient wrt beta
      
          // 1. Compute dL_dGamma and dL_dBeta for each token
          for (int i = 0; i < nt; i++)
          {
              for (int j = 0; j < m; j++){
                 dL_dBeta[j]  += dL_dY[i][j];
                 dL_dGamma[j] += dL_dY[i][j] * Zhat[i][j];
              }
          }
      
          // 2. Update parameters
          for (int j = 0; j < m; j++)
          {
              gamma[j] -= eta * dL_dGamma[j];
              beta[j]  -= eta * dL_dBeta[j];
          }
      
          // 3. Backprop through LayerNorm (row-wise)
          for (int i = 0; i < nt; i++)
          {
              double inv_sigma = 1.0 / sqrt(var[i] + eps);
      
              // Step A: compute intermediate values
              vector<double> dL_dZhat(m);
              for (int j = 0; j < m; j++)
              {
                  dL_dZhat[j] = dL_dY[i][j] * gamma[j];
              }
      
              double sum_dL_dZhat = 0.0;
              double sum_dL_dZhat_zhat = 0.0;
      
              for (int j = 0; j < m; j++)
              {
                  sum_dL_dZhat += dL_dZhat[j];
                  sum_dL_dZhat_zhat += dL_dZhat[j] * Zhat[i][j];
              }
      
              // Final gradient
              for (int j = 0; j < m; j++)
              {
                  dL_dZ[i][j] = inv_sigma * (dL_dZhat[j] - sum_dL_dZhat / m
                       - Zhat[i][j] * sum_dL_dZhat_zhat / m);
              }
          }
      
          return dL_dZ;
      }
      
      // Optional setters (to plug in forward results)
      void AddnNorm::setCache(const matrixd& zhat, const vector<double>& m, 
      		                 const vector<double>& v)
      {
          Zhat = zhat;
          mean = m;
          var  = v;
      }
      
      // ---------------------------  Feed Forward -----------------------------------
      FeedForward::FeedForward(int dModel0, int d_ff0)
      {
          dModel = dModel0;
          d_ff = d_ff0;
      
          W1.resize(dModel, vector<double>(d_ff));
          W2.resize(d_ff, vector<double>(dModel));
      
          for (int i = 0; i < dModel; i++)
              for (int j = 0; j < d_ff; j++)
              W1[i][j] = ((double)rand() / RAND_MAX - 0.5);
      
          for (int i = 0; i < d_ff; i++)
              for (int j = 0; j < dModel; j++)
              W2[i][j] = ((double)rand() / RAND_MAX - 0.5);
      }
      
      // ------------------------------
      // Forward (for cache)
      // ------------------------------
      matrixd FeedForward::FFoutput(const matrixd &X)
      {
          Z  = matmul(X, W1);   // pre-activation
          F1 = relu( Z );  //f(Z);        // F1 = f(XW1)
          matrixd F2 = matmul(F1, W2); // F2 = F1 W2
      
          return F2;
      }
      
      // ------------------------------
      // Backprop
      // ------------------------------
      matrixd FeedForward::backProp(const matrixd &X, const matrixd &dL_dY,  // dL/dF2
                   double eta)
      {
          // ---- W2 ----
          matrixd dL_dF2 = dL_dY;
      
          // dL/dW2 = F1^T * dL_dF2
          matrixd F1T = transpose(F1);
          matrixd dL_dW2 = matmul(F1T, dL_dF2);
      
          // dL_dF1 = dL_dF2 * W2^T
          matrixd W2T = transpose(W2);
          matrixd dL_dF1 = matmul(dL_dF2, W2T);
      
          // ---- find dL_dZ1 ----
          matrixd fd_Z  = relu_deriv(Z); // f_deriv(Z);
          int rows = dL_dF1.size();
          int cols = dL_dF1[0].size();
      
          matrixd dL_dZ1 = dL_dF1;	  // same dimension
          for (int i = 0; i < rows; i++)
             for (int j = 0; j < cols; j++)
                dL_dZ1[i][j] = dL_dF1[i][j] * fd_Z[i][j];
      
          // ---- W1 ----
          // dL/dW1 = X^T * dL/dZ1
          matrixd XT = transpose(X);
          matrixd dL_dW1 = matmul(XT, dL_dZ1);
      
          // dL/dX = dL/dZ1 * W1^T
          matrixd W1T = transpose(W1);
          matrixd dL_dX = matmul(dL_dZ1, W1T);
      
          // ---- Update weights ----
          for (int i = 0; i < W2.size(); i++)
             for (int j = 0; j < W2[0].size(); j++)
                W2[i][j] -= eta * dL_dW2[i][j];
      
          for (int i = 0; i < W1.size(); i++)
             for (int j = 0; j < W1[0].size(); j++)
                W1[i][j] -= eta * dL_dW1[i][j];
      
          return dL_dX;	// This becomes the dL_dY of next stage
      }
      
      // ------------------ MultiHeadAttention --------------------
      MultiHeadAttention::MultiHeadAttention(int dModel0, int nHeads0)
      {
          dModel = dModel0;
          nHeads = nHeads0;
          d_k = dModel / nHeads;
      
          WQ.resize(nHeads);
          WK.resize(nHeads);
          WV.resize(nHeads);
      
          for (int h = 0; h < nHeads; h++) {
              initMatrix(WQ[h], dModel, d_k);
              initMatrix(WK[h], dModel, d_k);
              initMatrix(WV[h], dModel, d_k);
          }
      
          initMatrix(WO, dModel, dModel);
      }
      
      matrixd MultiHeadAttention::computeAttention(const matrixd &X_Q,
           const matrixd &X_K, const matrixd &X_V, bool mask)
      {
          Qs.clear(); Ks.clear(); Vs.clear();
          Ps.clear(); As.clear();
      
          vector<matrixd> heads;
      
          for (int h = 0; h < nHeads; h++) {
              matrixd Q = matmul(X_Q, WQ[h]);
              matrixd K = matmul(X_K, WK[h]);
              matrixd V = matmul(X_V, WV[h]);
      
              matrixd S = matmul(Q, transpose(K));
              scaleMat(S, 1.0 / sqrt(d_k));
      
      
              if (mask)
                S = causalMask(S);
      
              matrixd P = softmax(S);  
              matrixd A = matmul(P, V);
      
              Qs.push_back(Q);
              Ks.push_back(K);
              Vs.push_back(V);
              Ps.push_back(P);
              As.push_back(A);
      
              heads.push_back(A);
          }
      
          // Concatenate heads
          int n = heads[0].size();
          matrixd concat(n, vector<double>(dModel, 0.0));
      
          for (int h = 0; h < nHeads; h++) 
             for (int i = 0; i < n; i++) 
                for (int j = 0; j < d_k; j++) 
                   concat[i][h * d_k + j] = heads[h][i][j];
      
          return matmul(concat, WO);
      }
      
      // backpropagation
      matrixd MultiHeadAttention::backProp( const matrixd &X_Q, const matrixd &X_K,
          const matrixd &X_V, const matrixd &dL_dH, double eta, bool mask)
      {
          // ---- WO gradients ----
          matrixd A_cat; // rebuild concat from cached As
          int nt = As[0].size();
          double s = 1.0 / sqrt(d_k);   // for scaling 
      
          A_cat = matrixd(nt, vector<double>(dModel, 0.0));
          for (int h = 0; h < nHeads; h++) 
             for (int i = 0; i < nt; i++) 
                for (int j = 0; j < d_k; j++) 
                   A_cat[i][h*d_k + j] = As[h][i][j];
      
          matrixd dL_dWO = matmul(transpose(A_cat), dL_dH);
          matrixd dL_dAcat = matmul(dL_dH, transpose(WO));
      
          // update WO
          updateWeight(WO, dL_dWO, eta);
       
          // ---- initialize accumulators ----
          matrixd dL_dXQ(X_Q.size(), vector<double>(dModel, 0.0));
          matrixd dL_dXK(X_K.size(), vector<double>(dModel, 0.0));
          matrixd dL_dXV(X_V.size(), vector<double>(dModel, 0.0));
      
          // ---- per head ----
          for (int h = 0; h < nHeads; h++) {
             // slice gradient
             matrixd dL_dA(nt, vector<double>(d_k));
             for (int i = 0; i < nt; i++)
                for (int j = 0; j < d_k; j++)
                   dL_dA[i][j] = dL_dAcat[i][h*d_k + j];
      
             matrixd P = Ps[h];
             matrixd V = Vs[h];
             matrixd Q = Qs[h];
             matrixd K = Ks[h];
      
             // ---- A = P V ----
             matrixd dL_dP = matmul(dL_dA, transpose(V));
             matrixd dL_dV = matmul(transpose(P), dL_dA);
      
             // ---- softmax ----
             matrixd dL_dS = softmax_derivative(dL_dP, P);
      
      
             // ---- S = QK^T ----
             matrixd dL_dQ = matmul(dL_dS, K);
             matrixd dL_dK = matmul(transpose(dL_dS), Q);
      
             scaleMat(dL_dQ, s);
             scaleMat(dL_dK, s);
      
             // ---- Weight gradients ----
             matrixd dL_dWQ = matmul(transpose(X_Q), dL_dQ);
             matrixd dL_dWK = matmul(transpose(X_K), dL_dK);
             matrixd dL_dWV = matmul(transpose(X_V), dL_dV);
      
             // update weights
             updateWeight(WQ[h], dL_dWQ, eta);
      
             for (int i = 0; i < WK[h].size(); i++)
                for (int j = 0; j < WK[h][0].size(); j++)
                   WK[h][i][j] -= eta * dL_dWK[i][j];
      
             for (int i = 0; i < WV[h].size(); i++)
                for (int j = 0; j < WV[h][0].size(); j++)
                   WV[h][i][j] -= eta * dL_dWV[i][j];
      
             // ---- input gradients ----
             matrixd dXQ = matmul(dL_dQ, transpose(WQ[h]));
             matrixd dXK = matmul(dL_dK, transpose(WK[h]));
             matrixd dXV = matmul(dL_dV, transpose(WV[h]));
      
             // accumulate
             dL_dXQ = addMat(dL_dXQ, dXQ);
             dL_dXK = addMat(dL_dXK, dXK);
             dL_dXV = addMat(dL_dXV, dXV);
          }  // for h
      
          // For self-attention, these are the same
          return dL_dXQ;
      }
      
      // ---------- OutputLayer ----------------------
      OutputLayer::OutputLayer(int dModel0, int vocab_size0)
      {
          dModel = dModel0;
          vocab_size = vocab_size0;
      
          initMatrix(W, dModel, vocab_size);
      }
      
      // Forward: logits --> probabilities
      matrixd OutputLayer::forward(const matrixd &X) {
          // X: (seq_len x dModel)
          // output: (seq_len x vocab_size)
          matrixd logits = matmul(X, W);
      
          return softmax(logits);
      }
      
      // Backward: returns dL/dX
      matrixd OutputLayer::backward(const matrixd &X, const matrixd &P,
                   const vector<int> &targets, double eta)
      {
          int n = P.size();
      
          // dL/dP = P - Y (cross-entropy + softmax)
          matrixd dL_dP = P;
      
          for (int i = 0; i < n; i++) 
              dL_dP[i][targets[i]] -= 1.0;
      
          // Gradient wrt weights
          matrixd dL_dW = matmul(transpose(X), dL_dP);
      
          //update weights
          updateWeight(W, dL_dW, eta);
      
          // Gradient wrt input (back to decoder)
          matrixd dL_dX = matmul(dL_dP, transpose(W));
      
          return dL_dX;
      }
      
      // ------------ GPT-style Transformer (Decoder-only) -------------------
      GPTBlock::GPTBlock(int dModel, int nHeads, int d_ff)   // Decoder
          : mha(dModel, nHeads), norm1(dModel), ffn(dModel, d_ff), norm2(dModel) 
      {
      }
      
      matrixd GPTBlock::forward(const matrixd &X) 
      {
          matrixd H = mha.computeAttention(X, X, X, true); // masked
          matrixd X1 = norm1.forward(X, H);
      
          matrixd F = ffn.FFoutput(X1);
          matrixd Y = norm2.forward(X1, F);
      
          return Y;
      }
      
      matrixd GPTBlock::backward(const matrixd &X, const matrixd &dL_dY, double eta) 
      {
      
          matrixd dF = norm2.backProp(X, dL_dY, eta);
          matrixd dX1 = ffn.backProp(X, dF, eta);
          matrixd dH = norm1.backProp(X, dX1, eta);
      
          return mha.backProp(X, X, X, dH, eta, true);
      }
      
      MiniGPT::MiniGPT(int vocab_size, int seq_len, int dModel, int nHeads, int d_ff, 
      	int nLayers) : embed(vocab_size, dModel), pe(seq_len, dModel),
            	output(dModel, vocab_size), seq_len(seq_len), vocab_size(vocab_size)
      {
          for (int i = 0; i < nLayers; i++) {
             GPTBlock gptBlock(dModel, nHeads, d_ff);
             layers.push_back(gptBlock);
          }
      }
      
      matrixd MiniGPT::forward(const vector<int> &tokens) 
      {
          matrixd X = embed.embed(tokens);
          X = pe.addPE(X);
      
          for(int i = 0; i < layers.size(); i++)
              X = layers[i].forward(X);
      
          return output.forward(X);
      }
      
      double MiniGPT::trainStep(const vector<int> &tokens, double eta) 
      {
          // create input/target (shifted)
          vector<int> input(tokens.begin(), tokens.end()-1);
          vector<int> target(tokens.begin()+1, tokens.end());
          /*
            e.g.
            input sequence: ["<SOS>", "one", "two"]
            target sequence: ["one", "two", "three"] 
          */
          matrixd X = embed.embed(input);
          X = pe.addPE(X);
      
          vector<matrixd> cache;
          for (int i = 0; i < layers.size(); i++) {
              X = layers[i].forward(X);
              cache.push_back(X);
          }
      
          matrixd P = output.forward(X);
          /*
            Position i: Model takes tokens 1..i-1 (from input) and outputs 
            probability distribution P[i] over the vocabulary
      
            target[i] = correct token at position i 
      
            P[i][target[i]] = probability model assigned to the correct token
          */
          // loss is for reference only, we do not need it in back propagation
          double loss = crossEntropy(P, target);
          
          // backward, gradient loss is computed from P and target
          matrixd dL_dX = output.backward(X, P, target, eta);
      
          for (int i = layers.size()-1; i >= 0; i--) 
              dL_dX = layers[i].backward(cache[i], dL_dX, eta);
      
          return loss;
      }
      
      						

      testMain.cpp:

      // http://forejune.co/cuda/
      // testMain.cpp
      
      #include "util.h"
      #include "transGpt.h"
      #include <fstream>
      
      using namespace std;
      
      vector<int> generate(MiniGPT &model, vector<int> prompt, int max_len)
      {
          vector<int> tokens = prompt;
      
          for (int step = 0; step < max_len; step++) {
      	vector<int> input(tokens.begin(), tokens.end());
              matrixd P = model.forward(input);
      
              vector<double> last = P.back();   //last row of matrix P
      	// last contains probability distribution over the vocabulary for the next token
      	// each index in last corresponds to a token ID in the vocabulary
      
      	// greedy decoding, - last.begin() converts interator to index
              int next = max_element(last.begin(), last.end()) - last.begin();
      
              tokens.push_back(next);
          }
      
          return tokens;
      }
      
      vector<vector<int>> buildDataset(Tokenizer &tokenizer)
      {
          vector<vector<int>> dataset;
      
          string str = "one two three  uno  dos  tres";
          vector<int> tokenIDs = tokenizer.tokenize(str);
          dataset.push_back( tokenIDs );
          str = "two three  four dos  tres  cuatro ";
          tokenIDs = tokenizer.tokenize(str);
          dataset.push_back( tokenIDs );
          str = "three  four  five tres  cuatro  cinco";
          tokenIDs = tokenizer.tokenize(str);
          dataset.push_back( tokenIDs );
          str = "one two three  uno  dos  tres";
          tokenIDs = tokenizer.tokenize(str);
          dataset.push_back( tokenIDs );
          str = "three  four  five tres  cuatro  cinco";
          tokenIDs = tokenizer.tokenize(str);
          dataset.push_back( tokenIDs );
      
          return dataset;
      }
      
      int main() 
      {
          // -------------------------
          // Tokenizer setup
          // -------------------------
          
          // Build vocabulary 
          ifstream ifs;
          char fname[] = "t.txt";
          ifs.open( fname );
          if (!ifs.is_open()) {
                cerr << "Unable to open file " << fname  << endl;
                exit( 1 );
          }
      
          Tokenizer tokenizer( ifs );
      
          int vocab_size = tokenizer.get_nTokens();
      
          cout << "Vocab size: " << vocab_size << endl;
      
          // -------------------------
          // Model
          // -------------------------
          int seq_len = 50;	// maximum sequence length model can handle
          int dModel = 32;
          int nHeads = 4;
          int d_ff = 128;
          int nLayers = 2;
      
          MiniGPT model(vocab_size, seq_len, dModel, nHeads, d_ff, nLayers);
      
          // -------------------------
          // Dataset 
          // -------------------------
          vector<vector<int>> dataset;
          dataset = buildDataset( tokenizer );
      
          // -------------------------
          // Training
          // -------------------------
          for (int epoch = 0; epoch < 201; epoch++) {
              double loss = 0;
      
              for (auto &seq : dataset)
                  loss += model.trainStep(seq, 0.002);
      
              if (epoch % 20 == 0)
                  cout << "Epoch " << epoch << " Loss: " << loss << endl;
          }
      
          // -------------------------
          // Inference
          // -------------------------
          string prompt = "one two three";
          vector<int> prompt_tokens = tokenizer.tokenize(prompt);
          cout << "\nPrompt: " << prompt << endl;
          vector<int> generated = generate(model, prompt_tokens, 3);
          string output = tokenizer.detokenize(generated);
          cout << "\nGenerated text:\n" << output << endl;
      
          prompt = "three four five";
          prompt_tokens = tokenizer.tokenize(prompt);
          cout << "\nPrompt: " << prompt << endl;
          generated = generate(model, prompt_tokens, 3);
          output = tokenizer.detokenize(generated);
          cout << "\nGenerated text:\n" << output << endl;
      
          return 0;
      }
      						

      Makefile:

      PROG	= testMain
      #source codes
      SRCS =  $(PROG).cpp
      #substitute .cpp by .o to obtain object filenames
      OBJS = $(SRCS:.cpp=.o) util.o  transGpt.o
      
      #$< evaluates to the target's dependencies, 
      #$@ evaluates to the target
      
      $(PROG): $(OBJS)
      	g++ -o $@ $(OBJS)  
      
      $(OBJS): 
      	g++ -c  -std=c++20 $*.cpp 
      
      clean:
      	rm $(OBJS) $(PROG)