Decoder of an AI Transformer in C/C++

Materials available at: http://forejune.co/cuda/

Transformer Decoder

  • Mask:
    An upper triangular matrix applied to the attention scores 
    Unmasked Attention:
    Token 0: [ -0.16,   0.57,   0.84,   1.78 ]  
    Token 1: [  0.95,  -0.49,  -0.82,   1.04 ] 
    Token 2: [  1.18,  -1.40,  -1.05,   1.64 ] 
    Token 3: [  0.44,  -1.55,  -0.36,   1.64 ] 
    Masked Attention:
    Token 0: [ -0.16,   -∞,    -∞       -∞ ]  
    Token 1: [  0.95,  -0.49,  -∞       -∞ ]  
    Token 2: [  1.18,  -1.40,  -1.05,   -∞ ] 
    Token 3: [  0.44,  -1.55,  -0.36,   1.64 ] 
    
    After the softmax is applied, each -∞ entry becomes 0, so a token
    cannot attend to positions after its own
  • Parameters:
      Batch size = 1 (one input sequence)
      Target sequence length (decoder) = nt
      Source sequence length (encoder) = ns
      Model dimension = dmodel
      Number of heads = nh
      Head dimension = dk = dmodel / nh
      Feed forward hidden size = dff
  • Inputs to Decoder Block:
      Decoder input:
      	X : nt x dmodel
      Encoder output:
      	E : ns x dmodel
      	
      
  • Masked Multi-Head Attention

    Algorithm:
    1. Linear projections:
          Q = X x WQ, K = X x WK, V = X x WV
          Dimensions: 
    	 WQ, WK, WV, WO : dmodel x dmodel
    	 Q, K, V : nt x dmodel
    
    2. Splitting into heads:
    	Q, K, V :  nh x nt x dk
    
    3. Attention and Output after projection:
     
        Q x KT : nt x nt
            Every token attends to every previous token
        Let MHA(X) = Output = concat(head_1, ..., head_nh) x WO
        (the concatenation is of the per-head attention outputs, not of Q, K, V)
    
    4. Add & Norm:
    	X1 = Normalize(X + MHA(X))
       	X1 : nt x dmodel
    
    			

    Cross Multi-Head Attention

    Inputs:
      Query from decoder:
        Q = X1 x WQ :  nt x dmodel
      
      Key from encoder:
        K = E x WK :  ns x dmodel 
    
      Value from encoder:
        V = E x WV :  ns x dmodel
    
    Attention dimensions now:
        Q x KT : nt x ns
        meaning that each decoder token attends over 
        all encoder tokens.
        
    Add & Norm :
        X2 = Normalize(X1 + Cross-MHA(X1))
    
    Feed-Forward Network (FFN) :
        FFN(X2) = f(X2 x W1) x W2   
    	f is an activation function like ReLU
    	We do not use any bias for simplicity
    	Note: Each token is processed independently
    	X2 : nt x dmodel
    	W1 : dmodel x dff
    	W2 : dff x dmodel
    	FFN(X2) : nt x dmodel
    
    Final Add & Norm :
          Output Y = Normalize(X2 + FFN(X2))
    	Y : nt x dmodel
    	Y will be inputs to the next decoder layer (Y → X)
    	only the output of the last decoder layer goes to 
    	Linear then Softmax
    				
    Weights

    In general:

    C/C++ Implementation

    util.h:
    // util.h
    // http://forejune.co/cuda/
    #ifndef UTIL_H   // renamed: "__UTIL_H__" is a reserved identifier (leading __)
    #define UTIL_H
    #include <vector>
    #include <algorithm>
    #include <math.h>
    #include <iostream>
    #include <iomanip>
    #include <random>
    #include <string>          // added: printDictionary's declaration uses string
    #include <unordered_map>   // added: printDictionary's declaration uses
                               // unordered_map, which was never included here
    
    using namespace std;
    
    // 2D matrix of doubles used throughout the transformer code
    typedef vector<vector<double>>  matrixd;
    
    // an activation function (ReLU)
    double f(double z);
    
    // softmax, 1D input
    vector<double> softmax(const vector<double> &v);
    
    // softmax activation function, 2D input vector (applied row-wise)
    matrixd softmax(const matrixd& input);
    
    // add two matrices
    matrixd addMat(const matrixd &A, const matrixd &B);
    
    // Transpose matrix n x m, B = A^T : m x n
    matrixd transpose(const matrixd& A);
    
    // Matrix multiplication   C = A X B   A: nxm, B: mxr, C: nxr
    matrixd matmul(const matrixd& A, const matrixd& B);
    
    // scale a matrix (in place)
    void scaleMat(matrixd &A, const double s);
    
    // scaled dot-product attention; mask enables causal masking
    matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V,
    	bool mask);
    
    // initialize an input matrix with certain random values
    void initMatrix(matrixd& M, const int n, const int m);
    
    // print a token dictionary
    void printDictionary(unordered_map<string, int> &d);
    
    // print a matrix
    void printMatrix(const matrixd &y);
    
    // Add & Norm
    matrixd addNnorm(const matrixd &A, const matrixd &B);
    #endif  // UTIL_H
    

    util.cpp (Helper functions):
    #include "util.h"
    
    using namespace std;
    
    typedef vector<vector<double>>  matrixd;
    
    // ReLU activation: returns z for positive inputs, 0 otherwise.
    double f(double z)
    {
      return (z > 0.0) ? z : 0.0;
    }
    
    // Numerically-stable softmax over a 1D vector.
    // The maximum entry is subtracted before exponentiation so that
    // exp() cannot overflow; the result sums to 1.
    vector<double> softmax(const vector<double> &z)
    {
      const double maxVal = *max_element(z.begin(), z.end());
    
      vector<double> y;
      y.reserve(z.size());
    
      double total = 0.0;
      for (double v : z) {
        const double e = exp(v - maxVal);
        total += e;
        y.push_back(e);
      }
    
      for (double &v : y)
        v /= total;
    
      return y;
    }
    
    // Row-wise numerically-stable softmax for a 2D matrix:
    // each row of the result sums to 1.
    matrixd softmax(const matrixd& input)
    {
      const int n = input.size();       // rows
      const int m = input[0].size();    // cols
      matrixd y(n, vector<double>(m));
    
      for (int i = 0; i < n; i++) {
        // subtract the row maximum for numerical stability
        const double rowMax = *max_element(input[i].begin(), input[i].end());
    
        double rowSum = 0.0;
        for (int j = 0; j < m; j++) {
          y[i][j] = exp(input[i][j] - rowMax);
          rowSum += y[i][j];
        }
    
        for (int j = 0; j < m; j++)
          y[i][j] /= rowSum;
      }
    
      return y;
    }
    
    // Element-wise sum C = A + B; A and B must share dimensions.
    matrixd addMat(const matrixd &A, const matrixd &B)
    {
      const size_t rows = A.size();
      const size_t cols = A[0].size();
      matrixd C(rows, vector<double>(cols));
    
      for (size_t i = 0; i < rows; i++)
        for (size_t j = 0; j < cols; j++)
          C[i][j] = A[i][j] + B[i][j];
    
      return C;
    }
    
    // Transpose matrix n x m, B = A^T : m x n.
    // Fix: the original indexed A[0] unconditionally, which is undefined
    // behavior on an empty matrix; an empty input now yields an empty result.
    matrixd transpose(const matrixd& A)
    {
       if (A.empty())
         return matrixd{};
    
       int n = A.size();    // number of rows
       int m = A[0].size(); // number of columns
    
       matrixd B(m, vector<double>(n));
       for (int i = 0; i < n; i++)
         for (int j = 0; j < m; j++)
           B[j][i] = A[i][j];
    
       return B;
    }
    
    // Matrix multiplication C = A x B, A: n x m, B: m x r, C: n x r.
    // Fixes: guards against empty operands (the original indexed A[0]/B[0]
    // unconditionally — UB on empty input), and uses the i-k-j loop order,
    // which walks B row-wise (cache-friendly) while accumulating over k in
    // the same ascending order, so results are bit-identical.
    matrixd matmul(const matrixd& A, const matrixd& B)
    {
      if (A.empty() || B.empty())
        return matrixd{};
    
      int n = A.size();
      int m = A[0].size();
      int r = B[0].size();
    
      matrixd C(n, vector<double>(r, 0));
    
      for (int i = 0; i < n; i++)
        for (int k = 0; k < m; k++) {
          const double a = A[i][k];
          for (int j = 0; j < r; j++)
            C[i][j] += a * B[k][j];
        }
    
       return C;
    }
    
    // Resize M to n x m and fill it with Gaussian random values,
    // mean 0.0 and standard deviation 0.2.
    void initMatrix(matrixd& M, const int n, const int m)
    {
       M.assign(n, vector<double>(m, 0));
    
       random_device rd;
       mt19937 gen(rd());                           // pseudo-random number generator
       normal_distribution<double> dist(0.0, 0.2);  // mean 0.0, sigma 0.2
    
       for (auto &row : M)
         for (auto &cell : row)
           cell = dist(gen);
    }
    
    // Print a token dictionary, five "word: id" pairs per line.
    // Fix: the original never terminated a partial final row, so output
    // whose size is not a multiple of 5 ended without a newline.
    void printDictionary(unordered_map<string, int> &d)
    {
      int k = 0;
      for (auto it = d.begin(); it != d.end(); ++it) {
        cout << right << setw(12) << it->first << ": " << setw(3) << it->second;
        k++;
        if (k % 5 == 0)
          cout << endl;
      }
      if (k % 5 != 0)     // flush an incomplete final row
        cout << endl;
    }
    
    // Print a matrix row-by-row in "Token i: [ v, v, ... ]" format.
    // Fixes: the original swapped the names (cols was y.size(), the row
    // count) and hard-coded the comma separator as "if (j < 4)", which is
    // only correct for exactly 5 columns.
    void printMatrix(const matrixd &y)
    {
        int rows = y.size();
        int cols = y[0].size();
    
        for (int i = 0; i < rows; ++i) {
            cout << "Token " << i << ": [";
            for (int j = 0; j < cols; ++j) {
                cout << fixed << setw(7) << setprecision(3) << y[i][j];
                if (j < cols - 1) cout << ", ";
            }
            cout << " ]\n";
        }
    }
    
    // Add & Norm: layer-normalize (A + B) row by row.
    // gamma = 1 and beta = 0, so the output is just each row standardized
    // to zero mean and (approximately) unit variance.
    matrixd addNnorm(const matrixd &A, const matrixd &B)
    {
         const double epsilon = 1e-5;   // guards against division by zero
         matrixd x = addMat(A, B);
         const int n = x.size();        // rows
         const int m = x[0].size();     // cols
         matrixd y(n, vector<double>(m));
    
         for (int i = 0; i < n; i++) {
           // mean of row i
           double mean = 0;
           for (int j = 0; j < m; j++)
             mean += x[i][j];
           mean /= m;
    
           // (biased) variance of row i
           double var = 0;
           for (int j = 0; j < m; j++) {
             const double d = x[i][j] - mean;
             var += d * d;
           }
           var /= m;
    
           // standardize the row
           const double denom = sqrt(var + epsilon);
           for (int j = 0; j < m; j++)
             y[i][j] = (x[i][j] - mean) / denom;
         }
    
         return y;  // final output of Add & Norm
    }
    
    // Multiply every element of A by the scalar s (in place).
    void scaleMat(matrixd &A, const double s)
    {
       for (auto &row : A)
         for (auto &v : row)
           v *= s;
    }
    
    // Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.
    // When mask is true, score entries above the diagonal are set to a
    // large negative value so that softmax drives them to ~0 (causal mask:
    // position i may only attend to positions <= i).
    matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V,
            bool mask)
    {
      const double d_k = Q[0].size();
      const double scale = 1.0 / sqrt(d_k);
    
      matrixd A = matmul(Q, transpose(K));   // raw attention scores
      scaleMat(A, scale);
    
      if (mask) {
        for (int i = 0; i < A.size(); i++)
          for (int j = i + 1; j < A[0].size(); j++)
            A[i][j] = -1e9;                  // effectively -infinity
      }
    
      A = softmax(A);                        // row-wise attention weights
    
      return matmul(A, V);
    }
    

    transformer.h:
    #ifndef __TRANSFORMER_H__
    #define __TRANSFORMER_H__
    #include <fstream>
    #include <vector>
    #include <string>
    #include <unordered_map>
    
    using namespace std;
    
    class Tokenizer 
    {
    private:
        unordered_map<string, int> words;
        unordered_map<int, string> wordIndex;
        int nTokens;	// number of words
    
    public:
        Tokenizer();
        Tokenizer(ifstream &fs);
        unordered_map<string, int>  getTokensWords();
        vector<int> tokenize(const string& str);
        string detokenize(const vector<int>& tokenIDs); 
        int get_nTokens() const;
    }; 
    
    class Embedding
    {
    private:
        int nTokens;        // number of words
        int dim;            // embedding dimension
        matrixd em_matrix;  // embedding matrix;
    public:
        Embedding(int sequence_length, int embeddingDimension);
        // create embedding matrix with tokenIDs
        matrixd embed(const vector<int>& tokenIDs);
        int get_embedDim() const;
        int get_nTokens() const;
    };
    
    class PositionalEncoding
    {
    private:
        int maxTokens; // maximum sequence length
        int dModel;    // embedding dimension
        matrixd PE;    // positional_encodings matrix;
    public:
        PositionalEncoding(int max_seq_length, int embedding_dimension);
        matrixd addPE(const matrixd& embeddings);
        matrixd getPE();
    };
    
    class MultiHeadAttention {
    private:
        int dModel;
        int nHeads;
        int d_k; // dimension per head
    
        matrixd WQ, WK, WV, WO;
    public:
        MultiHeadAttention(int dm, int nh);
        // input nTokens x dModel
        matrixd computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,  bool mask);
    };
    
    class FeedForward
    {
    private:
      matrixd W1, W2;
      int dModel, d_ff;
    public:
      FeedForward(int dModel, int d_ff);
      matrixd FFoutput(const matrixd &x);
      void updateWeights(const matrixd &dW1, const matrixd &dW2);
    };
    
    class Encoder{
    private:
       int dModel;
       int nHeads;
       int d_ff;
       matrixd X;
       matrixd W1,W2;
       MultiHeadAttention *mha;
       matrixd output;
       FeedForward *ff;
    public:
        Encoder(const int dModel,  const int nHeads, const int d_ff, const matrixd &input);
        matrixd computeOutput();
        matrixd getOutput();
        ~Encoder(){
           delete mha;
           delete ff;
        }
    };
    
    class Decoder {
    private:
       int dModel;
       int d_ff;
       int nHeads;
       matrixd X;	//external input
       matrixd E;	//input from encoder
       matrixd output;
       MultiHeadAttention *mha;
       FeedForward *ff;
    public:
        Decoder(const int dModel, const int nHeads, const int d_ff, const matrixd &X, const matrixd &E);
        matrixd computeOutput();
        matrixd output2(const matrixd &E,  const matrixd &X1);
        matrixd getOutput();
        ~Decoder(){
          // delete maskedMHA;
          // delete crossMHA;
           delete mha;
           delete ff;
        }
    };
    #endif
    

    transformer.cpp (transformer classes):
    // transformer.cpp -- classes of a multi-head transformer 
    // http://forejune.co/cuda/
    
    #include <iostream>
    #include <fstream>
    #include <vector>
    #include <string>
    #include <unordered_map>
    #include <cmath>
    #include <random>
    #include <algorithm>
    #include <iomanip>
    #include "util.h"
    #include "transformer.h"
    
    using namespace std;
    
    // ------------------------ Tokenizer class --------------------------
    // Build the built-in demo dictionary and its reverse mapping.
    // Fix: the original listed the key "to" twice (IDs 11 and 13); an
    // unordered_map initializer list silently drops the duplicate, so
    // ID 13 was never mapped while IDs ran up to 16 with nTokens == 16.
    // The IDs are now contiguous 0..15.
    Tokenizer::Tokenizer()
    {
      // Create a simple dictionary
      words = {
          {"[UNK]", 0}, {"[PAD]", 1}, {"[CLS]", 2}, {"[SEP]", 3},
          {"strong", 4}, {"nation", 5}, {"recycles", 6}, {"its", 7},
          {"huge", 8}, {"trade", 9}, {"surplus", 10}, {"to", 11},
          {"america", 12}, {"infiltrate", 13}, {"it", 14}, {"!", 15}
       };
            
       // Create reverse mapping (token ID -> word)
       nTokens = words.size();
       for (auto it = words.begin(); it != words.end(); ++it)
         wordIndex[it->second] = it->first;
    }
        
    // Build the dictionary from a whitespace-delimited word file; each new
    // word gets the next integer ID. ID 0 is reserved for [UNK].
    // Fixes: the reverse mapping stored "UNK]" (missing '['), so detokenizing
    // ID 0 printed a corrupted token; removed the unused local `tokens`.
    Tokenizer::Tokenizer(ifstream &fs)
    {
            string token;
    
            int k = 0;
            words["[UNK]"] = k;        // unknown token
            wordIndex[k] = "[UNK]";
            k++;
            while (fs >> token) {
              if (!words.contains(token)) {   // C++20 contains()
                words[token] = k;
                wordIndex[k] = token;
                k++;
              }
            }
            nTokens = words.size();
    }
    
    // Accessor: returns a copy of the word -> token-ID dictionary.
    unordered_map<string, int> Tokenizer::getTokensWords()
    {
      return words;
    }
    
    // Tokenize a string: lower-cased alphanumeric runs are looked up in the
    // dictionary; unknown words map to [UNK]; '!' is emitted as its own token;
    // all other non-alphanumeric characters only delimit words and are dropped.
    // Fix: isalnum/tolower were called on plain char, which is undefined
    // behavior for bytes >= 128 (e.g. UTF-8 input such as "¡"); the argument
    // is now cast to unsigned char as <cctype> requires.
    vector<int> Tokenizer::tokenize(const string& str) 
    {
       vector<int> tokenIDs;
       string token;
    
       // Emit the ID for the accumulated word (or [UNK]) and reset it.
       auto flush = [&]() {
         if (token.empty())
           return;
         auto it = words.find(token);
         tokenIDs.push_back(it != words.end() ? it->second : words["[UNK]"]);
         token.clear();
       };
    
       for (char c : str) {
         unsigned char uc = static_cast<unsigned char>(c);
         if (isalnum(uc)) {
           token += static_cast<char>(tolower(uc));   // build the current word
         } else {
           flush();                   // word boundary
           if (c == '!')              // '!' is itself a dictionary token
             tokenIDs.push_back(words["!"]);
         }
       }
       flush();                       // possible trailing word
    
       return tokenIDs;
    }
       
    // Change a vector of token IDs back to text. Tokens are joined with a
    // space, except that no space is inserted before a "!" token; IDs not in
    // the vocabulary are skipped.
    // Fix: the original used wordIndex[...] (non-const operator[]) for the
    // look-ahead, which silently inserts an empty entry for any unknown ID;
    // lookups now use find() and leave the map untouched.
    string Tokenizer::detokenize(const vector<int>& tokenIDs) 
    {
            string str;
    
            for (size_t i = 0; i < tokenIDs.size(); i++) {
                auto cur = wordIndex.find(tokenIDs[i]);
                if (cur == wordIndex.end())
                    continue;                      // unknown ID: skip
                str += cur->second;
    
                // add a delimiter unless this is the last token or the next is "!"
                if (i + 1 < tokenIDs.size()) {
                    auto next = wordIndex.find(tokenIDs[i + 1]);
                    if (next == wordIndex.end() || next->second != "!")
                        str += " ";
                }
            }
    
            return str;
    }
    
    // Accessor: number of distinct tokens in the vocabulary.
    int Tokenizer::get_nTokens() const
    {
      return nTokens;
    }
    
    //------------------   Embedding Class ------------------
    // Build an nTokens x dim embedding table filled with random values.
    Embedding::Embedding(int sequence_length, int embeddingDimension)
        : nTokens(sequence_length), dim(embeddingDimension)
    {
      initMatrix(em_matrix, nTokens, dim);
    }
        
    // Look up the embedding row for each token ID; IDs outside
    // [0, nTokens) get an all-zero row.
    matrixd Embedding::embed(const vector<int>& tokenIDs) 
    {
          matrixd embeddings;
          embeddings.reserve(tokenIDs.size());
    
          for (int id : tokenIDs) {
            if (id >= 0 && id < nTokens)
              embeddings.push_back(em_matrix[id]);
            else
              embeddings.push_back(vector<double>(dim, 0.0));  // out-of-dictionary
          }
    
          return embeddings;
    }
    
    // Accessor: embedding dimension.
    int Embedding::get_embedDim() const
    {
      return dim;
    }
    
    // Accessor: number of rows in the embedding table.
    int Embedding::get_nTokens() const
    {
      return nTokens;
    }
    
    // --------------------- Positional Encoding class ---------------
    // Precompute the sinusoidal positional-encoding table PE (maxTokens x dModel):
    //   PE[pos][2i]   = sin(pos / 10000^(2i/dModel))
    //   PE[pos][2i+1] = cos(pos / 10000^(2i/dModel))
    // Fix: for an odd dModel the original loop (i < dModel/2) left the last
    // column unset; each column index is now guarded so any dModel is filled.
    PositionalEncoding::PositionalEncoding(int max_seq_length, int embed_dimension) 
    {
          maxTokens = max_seq_length;
          dModel = embed_dimension;
    
          PE.resize(maxTokens, vector<double>(dModel));
          for (int pos = 0; pos < maxTokens; pos++) {
            for (int i = 0; 2 * i < dModel; i++) {
              double angle = pos / pow(10000, 2.0 * i / dModel);
              PE[pos][2 * i] = sin(angle);
              if (2 * i + 1 < dModel)
                PE[pos][2 * i + 1] = cos(angle);
            }
          }
    }
    
    // Add positional encodings to the embeddings, element-wise per row.
    // Fix: the original indexed PE[pos] for every input row; a sequence
    // longer than maxTokens read past the table (undefined behavior).
    // Rows beyond the precomputed table are now left unchanged.
    matrixd PositionalEncoding::addPE(const matrixd& embeddings) 
    {
          matrixd x = embeddings;
          int nTokens = embeddings.size();
          int limit = min(nTokens, maxTokens);   // never index past PE
    
          for (int pos = 0; pos < limit; pos++) 
            for (int i = 0; i < dModel; i++) 
              x[pos][i] += PE[pos][i];
    
          return x;
    }
    
    // Accessor: a copy of the positional-encoding table.
    matrixd PositionalEncoding::getPE()
    {
      return PE;
    }
    
    // ------------------------------- Multi-Head Attention Class ---------------
    // Allocate the four projection matrices (dModel x dModel) with random values.
    // NOTE(review): assumes dm is divisible by nh, otherwise d_k truncates
    // and the last columns of each projection go unused — confirm callers.
    MultiHeadAttention::MultiHeadAttention(int dm, int nh)
    {
          dModel = dm;              // embedding dimension
          nHeads = nh;              // number of heads
          d_k = dModel / nHeads;    // per-head dimension
    
          for (matrixd *W : {&WQ, &WK, &WV, &WO})
            initMatrix(*W, dModel, dModel);
    }
    
    // Multi-head attention over inputs X_Q, X_K, X_V (rows x dModel).
    // Projects to Q/K/V, runs scaled attention independently per head,
    // concatenates the head outputs and applies the output projection WO.
    matrixd MultiHeadAttention::computeAttention(const matrixd &X_Q, const matrixd &X_K, 
                                                 const matrixd &X_V,  bool mask)
    {
        matrixd Q = matmul(X_Q, WQ);
        matrixd K = matmul(X_K, WK);
        matrixd V = matmul(X_V, WV);
    
        // Copy columns [h*d_k, (h+1)*d_k) of M into a per-head slice.
        auto slice = [this](const matrixd &M, int h) {
            matrixd S(M.size(), vector<double>(d_k));
            for (size_t i = 0; i < M.size(); i++)
                for (int j = 0; j < d_k; j++)
                    S[i][j] = M[i][h * d_k + j];
            return S;
        };
    
        // attention output of each head
        vector<matrixd> head_outputs;
        head_outputs.reserve(nHeads);
        for (int h = 0; h < nHeads; h++)
            head_outputs.push_back(
                scaledAttention(slice(Q, h), slice(K, h), slice(V, h), mask));
    
        // concatenate the heads back into a rows x dModel matrix
        matrixd concat(Q.size(), vector<double>(dModel));
        for (int h = 0; h < nHeads; h++)
            for (size_t i = 0; i < Q.size(); i++)
                for (int j = 0; j < d_k; j++)
                    concat[i][h * d_k + j] = head_outputs[h][i][j];
    
        // final projection by WO
        return matmul(concat, WO);
    }
    
    // ---------------------------  Feed Forward -----------------------------------
    // Allocate the expansion (dModel x d_ff) and projection (d_ff x dModel)
    // weight matrices with random values.
    FeedForward::FeedForward(int d_model, int dFF)
        : dModel(d_model), d_ff(dFF)
    {
        initMatrix(W1, dModel, d_ff);   // expansion weights
        initMatrix(W2, d_ff, dModel);   // projection weights
    }  
    
    // FFN(X) = f(X x W1) x W2, where f is the ReLU activation; each row
    // (token) is processed independently. No biases are used.
    // Fix: the original made a pointless full copy of X into a local z and
    // kept a second copy of A for the activation; the activation is now
    // applied in place.
    matrixd FeedForward::FFoutput(const matrixd &X)
    {
        matrixd A = matmul(X, W1);       // A = X x W1 : rows x d_ff
        for (auto &row : A)
            for (double &v : row)
                v = f(v);                // element-wise activation
    
        return matmul(A, W2);            // rows x dModel
    }
    
    // Apply additive weight updates (e.g. a gradient step) to W1 and W2.
    void FeedForward::updateWeights(const matrixd &dW1, const matrixd &dW2)
    {
        W1 = addMat(W1, dW1);   // W1 += dW1
        W2 = addMat(W2, dW2);   // W2 += dW2
    }
    // ------- Encoder Layer Class : one layer of encoder --------------
    // Store the input and build the attention and feed-forward sublayers.
    // Fix: the original also filled the members W1/W2 with random values,
    // but no Encoder method ever reads them (FeedForward owns its own
    // weights), so that allocation was dead work and has been removed.
    Encoder::Encoder(const int d_model, const int num_heads, 
                          const int dFF, const matrixd &input) 
    {
       dModel = d_model;
       nHeads = num_heads;
       d_ff = dFF;
       X = input;
       mha = new MultiHeadAttention(dModel, nHeads);
       ff = new FeedForward(dModel, d_ff);
    }
    
    // One encoder layer: unmasked self-attention followed by the
    // feed-forward network, each wrapped in Add & Norm. X must already
    // contain the embeddings plus positional encoding.
    matrixd Encoder::computeOutput()
    { 
      matrixd attn = mha->computeAttention(X, X, X, false);   // unmasked MHA
      matrixd normed = addNnorm(X, attn);
      output = addNnorm(normed, ff->FFoutput(normed));
    
      return output;
    }
    
    // Accessor: the most recently computed encoder output.
    matrixd Encoder::getOutput()
    {
      return output;
    }
    
    // ------------------- Decoder Layer -------------
    // Store the target input X and the encoder output E, and build the
    // attention and feed-forward sublayers.
    // NOTE(review): a single MultiHeadAttention serves both the masked
    // self-attention and the cross-attention, so both sublayers share one
    // set of projection weights; a standard transformer keeps them separate
    // (see the commented-out maskedMHA/crossMHA in the destructor).
    Decoder::Decoder (const int d_model, const int n_heads, 
                      const int dFF, const matrixd &input, 
                      const matrixd &input_from_encoder)
        : dModel(d_model), d_ff(dFF), nHeads(n_heads),
          X(input), E(input_from_encoder)
    {
       mha = new MultiHeadAttention(dModel, nHeads);
       ff = new FeedForward(dModel, d_ff);
    }
    
    // One decoder layer: masked self-attention, cross-attention over the
    // encoder output E, then the feed-forward network — each followed by
    // Add & Norm. NOTE(review): self- and cross-attention reuse the same
    // mha projection weights here; a textbook decoder uses separate ones.
    matrixd Decoder::computeOutput()
    {
       matrixd X1 = addNnorm(X, mha->computeAttention(X, X, X, true));     // masked
       matrixd X2 = addNnorm(X1, mha->computeAttention(X1, E, E, false));  // cross
       output = addNnorm(X2, ff->FFoutput(X2));
    
       return output;
    }
    
    // Accessor: the most recently computed decoder output.
    matrixd Decoder::getOutput()
    {
      return output;
    }
    

    testDecoder.cpp (testing routine):
    // testDecoder.cpp -- a main program for testing a transformer decoder
    // http://forejune.co/cuda/
    
    #include <iostream>
    #include <iomanip>
    #include "util.h"
    #include "transformer.h"
    
    using namespace std;
    
    // Demo driver: tokenize an English sentence, run it through one encoder
    // layer, then feed a Spanish target sentence plus the encoder output
    // through one decoder layer. getchar() calls pause between stages.
    // Requires vocabulary files "t.txt" (source) and "t1.txt" (target) in
    // the working directory — contents not shown here; verify they exist.
    int main() 
    {
        // NOTE(review): srand is never used below — all randomness comes from
        // mt19937 seeded by random_device inside initMatrix; candidate for removal.
        srand(time (0));
        int num_heads = 2;
        string sentence = "The waves crashed forcefully against the shore!";
    
        // Tokenization
        ifstream ifs;
        char fname[] = "t.txt";
        ifs.open( fname );
        if (!ifs.is_open()) {
              cerr << "Unable to open file " << fname  << endl;
              exit( 1 );
        }
     
        Tokenizer tokenizer( ifs );
        vector<int> tokens = tokenizer.tokenize(sentence);
        // Create embeddings
        const int dModel = 8;      // model (embedding) dimension
        int totalTokens = tokenizer.get_nTokens();
        cout << "\n=== Source sentence Tokenization ===" << endl;
        cout << "Original sentence:\n\t " << sentence << "\n\n";
        cout << "Tokenization Results:\n";
        cout << "Tokens: [";
        for (size_t i = 0; i < tokens.size(); ++i) {
            cout << tokens[i];
            if (i < tokens.size() - 1) cout << ", ";
        }
        cout << "]\n";
        getchar();   // pause so the user can inspect the output
    
    
        Embedding embedding_layer(totalTokens, dModel);
        matrixd embeddings = embedding_layer.embed(tokens);
    
        // Add positional encoding
        const int MAX_SEQ_LENGTH = 16;   // PE table size; input must not exceed this
        PositionalEncoding pos_encoding(MAX_SEQ_LENGTH, dModel);
        matrixd pe = pos_encoding.getPE();
    
        // add positional encoding to embeddings
        matrixd embeddings_with_pos = pos_encoding.addPE(embeddings);
        cout << "\nInput to Encoder:\n";
        int rows = embeddings_with_pos.size();
        int cols = embeddings_with_pos[0].size(); 
        for (int i = 0; i < rows; i++) {
            cout << "Token " << i << ": [";
            for (int j = 0; j < cols; ++j) {
                cout << fixed << setw(6) << setprecision(2) << embeddings_with_pos[i][j];
                if (j < cols-1) cout << ", ";
            }
            cout << "]\n";
        }
        getchar();
    
        // Pass through transformer encoder (one layer)
        int embedDimension = embedding_layer.get_embedDim();
        int d_ff = 128;   // feed-forward hidden size
        Encoder encoder(embedDimension, num_heads, d_ff, embeddings_with_pos);
        cout << "\nOutput of Encoder:\n";
        matrixd E = encoder.computeOutput(); // embeddings_with_pos );
        printMatrix( E );
        getchar();
    
       // ------------------- Decoder --------------------
       // The decoder consumes the target-language sentence and the encoder output E.
       string target_sentence = "¡Las olas rompieron con fuerza contra la orilla!";
       ifs.close();
       char fnamet[] = "t1.txt";
       ifs.open( fnamet );
        if (!ifs.is_open()) {
              cerr << "Unable to open file " << fnamet  << endl;
              exit( 1 );
        }
        Tokenizer target_tokenizer( ifs );
        unordered_map<string, int> dictionary;
        dictionary = target_tokenizer.getTokensWords();
        vector<int> target_tokens = target_tokenizer.tokenize(target_sentence);
        cout << "\n=== Target sentence Tokenization ===" << endl;
        cout << "Original sentence:\n\t " << target_sentence << "\n\n";
        cout << "Target Tokenization Results:\n";
        cout << "Tokens: [";
        for (int i = 0; i < target_tokens.size(); ++i) {
            cout << target_tokens[i];
            if (i < target_tokens.size() - 1) cout << ", ";
        }
        cout << "]\n";
        getchar();
        totalTokens = target_tokenizer.get_nTokens();
        cout << "\nTarget vocabulary size: " << totalTokens; 
        cout << "\nEmbedding dimension: " << dModel << "\n";
        Embedding target_embedding_layer(totalTokens, dModel);
        matrixd target_embeddings = target_embedding_layer.embed(target_tokens);
        cout <<"\nTarget Token Embeddings(" << target_embeddings.size() <<" x "<<dModel<<"):\n";
        printMatrix( target_embeddings );
        getchar();
        PositionalEncoding target_pos_encoding(MAX_SEQ_LENGTH, dModel);
    
        // add positional encoding to embeddings
        matrixd target_embeddings_with_pos = target_pos_encoding.addPE(target_embeddings);
        cout << "\nInput to Decoder:\n";
        rows = target_embeddings_with_pos.size();
        cols = target_embeddings_with_pos[0].size(); 
        for (int i = 0; i < rows; i++) {
            cout << "Token " << i << ": [";
            for (int j = 0; j < cols; ++j) {
                cout << fixed << setw(6) << setprecision(2) << target_embeddings_with_pos[i][j];
                if (j < cols-1) cout << ", ";
            }
            cout << "]\n";
        }
        getchar();
    
        // One decoder layer over the target input, attending to encoder output E
        Decoder decoder(dModel, num_heads, d_ff,  target_embeddings_with_pos, E);
        matrixd y = decoder.computeOutput(); 
        cout << "Decoder output:\n";
        printMatrix( y );
        getchar();
    
        return 0;
    }
    

    Makefile :
    PROG    = testDecoder
    #source codes
    SRCS =  $(PROG).cpp
    #substitute .cpp by .o to obtain object filenames
    OBJS = $(SRCS:.cpp=.o) util.o transformer.o
    #headers every object depends on
    HDRS = util.h transformer.h
    
    CXX      = g++
    CXXFLAGS = -std=c++20
    
    #$@ evaluates to the target
    $(PROG): $(OBJS)
    	$(CXX) -o $@ $(OBJS)
    
    #pattern rule: each object depends on its source and the headers, so a
    #header edit triggers a rebuild (the original rule had no dependencies)
    %.o: %.cpp $(HDRS)
    	$(CXX) $(CXXFLAGS) -c $<
    
    #clean is not a real file; -f keeps rm quiet when nothing is built
    .PHONY: clean
    clean:
    	rm -f $(OBJS) $(PROG)