Materials available at: http://forejune.co/cuda/
Transformer Decoder
An upper-triangular mask applied to the attention scores:

Unmasked Attention:
Token 0: [ -0.16,  0.57,  0.84,  1.78 ]
Token 1: [  0.95, -0.49, -0.82,  1.04 ]
Token 2: [  1.18, -1.40, -1.05,  1.64 ]
Token 3: [  0.44, -1.55, -0.36,  1.64 ]

Masked Attention:
Token 0: [ -0.16,    -∞,    -∞,    -∞ ]
Token 1: [  0.95, -0.49,    -∞,    -∞ ]
Token 2: [  1.18, -1.40, -1.05,    -∞ ]
Token 3: [  0.44, -1.55, -0.36,  1.64 ]

After the softmax activation, each -∞ entry becomes 0.

Notation (used throughout):
Batch size = 1 (one input sequence)
Target sequence length (decoder) = nt
Source sequence length (encoder) = ns
Model dimension = dmodel
Number of heads = nh
Head dimension = dk = dmodel / nh
Feed-forward hidden size = dff
Decoder input: X : nt x dmodel
Encoder output: E : ns x dmodel
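In code, the mask can be applied by overwriting the strictly upper-triangular entries of the nt x nt score matrix with -∞ before the softmax; since exp(-∞) = 0, those positions receive zero weight. A minimal standalone sketch (the function name applyCausalMask is ours, not from the materials):

#include <vector>
#include <limits>
using namespace std;

typedef vector<vector<double>> matrixd;

// Overwrite scores[i][j] for j > i with -infinity so that the softmax
// (where exp(-inf) == 0) assigns zero weight to future tokens.
void applyCausalMask(matrixd &scores) {
    const double NEG_INF = -numeric_limits<double>::infinity();
    for (size_t i = 0; i < scores.size(); ++i)
        for (size_t j = i + 1; j < scores[i].size(); ++j)
            scores[i][j] = NEG_INF;
}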
Masked Multi-Head Attention
Algorithm:
1. Linear projections:
Q = X x WQ, K = X x WK, V = X x WV
Dimensions:
WQ, WK, WV, WO : dmodel x dmodel
Q, K, V : nt x dmodel
2. Splitting into heads:
Q, K, V : nh x nt x dk
3. Attention and Output after projection:
For each head i (Qi, Ki, Vi : nt x dk):
head_i = softmax( (Qi x Ki^T) / sqrt(dk) + M ) x Vi : nt x dk
where M is the upper-triangular -∞ mask shown above.
Concatenate the heads and project:
Masked-MHA(X) = Concat(head_1, ..., head_nh) x WO : nt x dmodel
Add & Norm :
X1 = Normalize(X + Masked-MHA(X)) : nt x dmodel
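Putting the three steps together, here is a minimal sketch built on the helpers declared in util.h below (matmul, scaledAttention, initMatrix); the function name maskedMultiHeadAttention and the freshly randomized weights are our illustration, not the author's Decoder class:

#include "util.h"

// Sketch of masked multi-head attention: X (nt x dmodel) -> nt x dmodel.
// Weights are randomized here purely for illustration.
matrixd maskedMultiHeadAttention(const matrixd &X, int nh) {
    int nt = X.size(), dModel = X[0].size(), dk = dModel / nh;
    matrixd WQ, WK, WV, WO;
    initMatrix(WQ, dModel, dModel);   // step 1: projection weights
    initMatrix(WK, dModel, dModel);
    initMatrix(WV, dModel, dModel);
    initMatrix(WO, dModel, dModel);
    matrixd Q = matmul(X, WQ), K = matmul(X, WK), V = matmul(X, WV);

    matrixd concat(nt, vector<double>(dModel));
    for (int h = 0; h < nh; ++h) {               // step 2: split into heads
        matrixd Qh(nt, vector<double>(dk)), Kh(nt, vector<double>(dk)),
                Vh(nt, vector<double>(dk));
        for (int i = 0; i < nt; ++i)
            for (int j = 0; j < dk; ++j) {
                Qh[i][j] = Q[i][h * dk + j];
                Kh[i][j] = K[i][h * dk + j];
                Vh[i][j] = V[i][h * dk + j];
            }
        // step 3: causal-masked scaled dot-product attention per head
        matrixd head = scaledAttention(Qh, Kh, Vh, true);
        for (int i = 0; i < nt; ++i)             // concatenate heads
            for (int j = 0; j < dk; ++j)
                concat[i][h * dk + j] = head[i][j];
    }
    return matmul(concat, WO);                   // final projection WO
}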
Cross Multi-Head Attention
Inputs:
Query from decoder:
Q = X1 x WQ : nt x dmodel
Key from encoder:
K = E x WK : ns x dmodel
Value from encoder:
V = E x WV : ns x dmodel
Attention dimensions now:
Q x K^T : nt x ns
meaning that each decoder token attends over
all encoder tokens.
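Cross-attention only changes where K and V come from; a sketch under the same assumptions as above (random weights, helpers from util.h, the function name crossAttention is ours), with the per-head split omitted for brevity:

#include "util.h"

// Sketch of a cross-attention sublayer: queries from the decoder state
// X1 (nt x dmodel), keys and values from the encoder output E (ns x dmodel).
matrixd crossAttention(const matrixd &X1, const matrixd &E) {
    int dModel = X1[0].size();
    matrixd WQ, WK, WV;
    initMatrix(WQ, dModel, dModel);
    initMatrix(WK, dModel, dModel);
    initMatrix(WV, dModel, dModel);
    matrixd Q = matmul(X1, WQ);              // nt x dmodel
    matrixd K = matmul(E, WK);               // ns x dmodel
    matrixd V = matmul(E, WV);               // ns x dmodel
    // Q x K^T is nt x ns: every decoder token can see every encoder
    // token, so the causal mask is switched off.
    return scaledAttention(Q, K, V, false);
}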
Add & Norm :
X2 = Normalize(X1 + Cross-MHA(X1, E))
Feed-Forward Network (FFN) :
FFN(X2) = f(X2 x W1) x W2
f is an activation function such as ReLU.
We omit bias terms for simplicity.
Note: Each token is processed independently
X2 : nt x dmodel
W1 : dmodel x dff
W2 : dff x dmodel
FFN(X2) : nt x dmodel
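Since each token row is transformed independently, the FFN is just two matrix multiplications with an elementwise activation between them; a sketch using f and matmul from util.h (random weights for illustration, the function name feedForward is ours):

#include "util.h"

// Position-wise feed-forward network: FFN(X2) = f(X2 x W1) x W2.
// Each token (row of X2) is transformed independently.
matrixd feedForward(const matrixd &X2, int dff) {
    int dModel = X2[0].size();
    matrixd W1, W2;
    initMatrix(W1, dModel, dff);   // W1 : dmodel x dff
    initMatrix(W2, dff, dModel);   // W2 : dff x dmodel
    matrixd H = matmul(X2, W1);    // nt x dff
    for (auto &row : H)
        for (auto &z : row)
            z = f(z);              // elementwise activation, e.g. ReLU
    return matmul(H, W2);          // nt x dmodel
}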
Final Add & Norm :
Output Y = Normalize(X2 + FFN(X2))
Y : nt x dmodel
Y becomes the input to the next decoder layer (Y → X);
only the output of the last decoder layer goes to
the final Linear and Softmax layers.
C/C++ Implementation
// util.h
// http://forejune.co/cuda/
#ifndef __UTIL_H__
#define __UTIL_H__

#include <vector>
#include <algorithm>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <random>
#include <string>
#include <unordered_map>

using namespace std;

typedef vector<vector<double>> matrixd;

// an activation function
double f(double z);

// softmax, 1D input
vector<double> softmax(const vector<double> &v);

// softmax activation function, 2D input
matrixd softmax(const matrixd &input);

// add two matrices
matrixd addMat(const matrixd &A, const matrixd &B);

// transpose a matrix: A : n x m, B = A^T : m x n
matrixd transpose(const matrixd &A);

// matrix multiplication C = A x B, A : n x m, B : m x r, C : n x r
matrixd matmul(const matrixd &A, const matrixd &B);

// scale a matrix
void scaleMat(matrixd &A, const double s);

// scaled (optionally causal-masked) attention
matrixd scaledAttention(const matrixd &Q, const matrixd &K,
                        const matrixd &V, bool mask);

// initialize an n x m input matrix with random values
void initMatrix(matrixd &M, const int n, const int m);

// print a token dictionary
void printDictionary(unordered_map<string, int> &d);

// print a matrix
void printMatrix(const matrixd &y);

// Add & Norm
matrixd addNnorm(const matrixd &A, const matrixd &B);

#endif
util.cpp (Helper functions):
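util.cpp itself is part of the materials at http://forejune.co/cuda/; as a hedged sketch, here is how the two most instructive helpers, scaledAttention and addNnorm, can be written against the declarations in util.h. The -∞ mask value and the parameter-free row-wise normalization are our assumptions:

#include <limits>
#include "util.h"

// Scaled dot-product attention; mask == true applies the
// upper-triangular (causal) -inf mask before the softmax.
matrixd scaledAttention(const matrixd &Q, const matrixd &K,
                        const matrixd &V, bool mask) {
    int dk = K[0].size();
    matrixd scores = matmul(Q, transpose(K));        // nt x ns
    scaleMat(scores, 1.0 / sqrt((double)dk));
    if (mask) {
        const double NEG_INF = -numeric_limits<double>::infinity();
        for (size_t i = 0; i < scores.size(); ++i)
            for (size_t j = i + 1; j < scores[i].size(); ++j)
                scores[i][j] = NEG_INF;
    }
    return matmul(softmax(scores), V);
}

// Add & Norm: normalize each row of A + B to zero mean and unit
// variance (no learned scale/shift parameters, as in the text).
matrixd addNnorm(const matrixd &A, const matrixd &B) {
    matrixd S = addMat(A, B);
    for (auto &row : S) {
        double mean = 0.0, var = 0.0;
        for (double v : row) mean += v;
        mean /= row.size();
        for (double v : row) var += (v - mean) * (v - mean);
        var /= row.size();
        double sd = sqrt(var + 1e-9);   // epsilon guards against /0
        for (double &v : row) v = (v - mean) / sd;
    }
    return S;
}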
transformer.h:
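transformer.h is likewise in the downloadable materials; the interface sketch below is inferred from how testDecoder.cpp (further down) uses the classes, so the exact signatures are assumptions:

// transformer.h -- interface sketch inferred from testDecoder.cpp
#ifndef __TRANSFORMER_H__
#define __TRANSFORMER_H__

#include <fstream>
#include <string>
#include <unordered_map>
#include "util.h"

class Tokenizer {
public:
    Tokenizer(ifstream &ifs);                  // build vocabulary from a file
    vector<int> tokenize(const string &sentence);
    int get_nTokens();                         // vocabulary size
    unordered_map<string, int> getTokensWords();
};

class Embedding {
public:
    Embedding(int nTokens, int dModel);
    matrixd embed(const vector<int> &tokens);  // tokens -> nt x dmodel
    int get_embedDim();
};

class PositionalEncoding {
public:
    PositionalEncoding(int maxSeqLength, int dModel);
    matrixd getPE();                           // the PE table itself
    matrixd addPE(const matrixd &embeddings);  // embeddings + PE
};

class Encoder {
public:
    Encoder(int dModel, int numHeads, int dff, const matrixd &input);
    matrixd computeOutput();                   // E : ns x dmodel
};

class Decoder {
public:
    Decoder(int dModel, int numHeads, int dff,
            const matrixd &input, const matrixd &encoderOutput);
    matrixd computeOutput();                   // Y : nt x dmodel
};

#endif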
transformer.cpp (transformer classes):
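transformer.cpp (also in the materials) implements these classes; to show the data flow inside Decoder::computeOutput, here is a hypothetical free-function version of one decoder layer's forward pass, composing the sketches above with addNnorm from util.h:

#include "util.h"

// The sketches shown earlier in this article:
matrixd maskedMultiHeadAttention(const matrixd &X, int nh);
matrixd crossAttention(const matrixd &X1, const matrixd &E);
matrixd feedForward(const matrixd &X2, int dff);

// One decoder layer's forward pass: masked self-attention,
// cross-attention over the encoder output E, then the FFN,
// each sublayer wrapped in Add & Norm.
matrixd decoderLayer(const matrixd &X, const matrixd &E, int nh, int dff) {
    matrixd X1 = addNnorm(X, maskedMultiHeadAttention(X, nh));
    matrixd X2 = addNnorm(X1, crossAttention(X1, E));
    return addNnorm(X2, feedForward(X2, dff));   // Y : nt x dmodel
}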
testDecoder.cpp (testing routine):
// testDecoder.cpp -- a main program for testing a transformer decoder
// http://forejune.co/cuda/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cstdlib>
#include <ctime>
#include "util.h"
#include "transformer.h"
using namespace std;

int main()
{
    srand(time(0));
    int num_heads = 2;
    string sentence = "The waves crashed forcefully against the shore!";

    // Tokenization
    ifstream ifs;
    char fname[] = "t.txt";
    ifs.open(fname);
    if (!ifs.is_open()) {
        cerr << "Unable to open file " << fname << endl;
        exit(1);
    }
    Tokenizer tokenizer(ifs);
    vector<int> tokens = tokenizer.tokenize(sentence);

    // Create embeddings
    const int dModel = 8;
    int totalTokens = tokenizer.get_nTokens();
    cout << "\n=== Source sentence Tokenization ===" << endl;
    cout << "Original sentence:\n\t " << sentence << "\n\n";
    cout << "Tokenization Results:\n";
    cout << "Tokens: [";
    for (size_t i = 0; i < tokens.size(); ++i) {
        cout << tokens[i];
        if (i < tokens.size() - 1) cout << ", ";
    }
    cout << "]\n";
    getchar();

    Embedding embedding_layer(totalTokens, dModel);
    matrixd embeddings = embedding_layer.embed(tokens);

    // Add positional encoding
    const int MAX_SEQ_LENGTH = 16;
    PositionalEncoding pos_encoding(MAX_SEQ_LENGTH, dModel);
    matrixd pe = pos_encoding.getPE();

    // add positional encoding to embeddings
    matrixd embeddings_with_pos = pos_encoding.addPE(embeddings);
    cout << "\nInput to Encoder:\n";
    int rows = embeddings_with_pos.size();
    int cols = embeddings_with_pos[0].size();
    for (int i = 0; i < rows; i++) {
        cout << "Token " << i << ": [";
        for (int j = 0; j < cols; ++j) {
            cout << fixed << setw(6) << setprecision(2)
                 << embeddings_with_pos[i][j];
            if (j < cols - 1) cout << ", ";
        }
        cout << "]\n";
    }
    getchar();

    // Pass through transformer encoder
    int embedDimension = embedding_layer.get_embedDim();
    int d_ff = 128;
    Encoder encoder(embedDimension, num_heads, d_ff, embeddings_with_pos);
    cout << "\nOutput of Encoder:\n";
    matrixd E = encoder.computeOutput();
    printMatrix(E);
    getchar();

    // ------------------- Decoder --------------------
    string target_sentence = "¡Las olas rompieron con fuerza contra la orilla!";
    ifs.close();
    char fnamet[] = "t1.txt";
    ifs.open(fnamet);
    if (!ifs.is_open()) {
        cerr << "Unable to open file " << fnamet << endl;
        exit(1);
    }
    Tokenizer target_tokenizer(ifs);
    unordered_map<string, int> dictionary = target_tokenizer.getTokensWords();
    vector<int> target_tokens = target_tokenizer.tokenize(target_sentence);
    cout << "\n=== Target sentence Tokenization ===" << endl;
    cout << "Original sentence:\n\t " << target_sentence << "\n\n";
    cout << "Target Tokenization Results:\n";
    cout << "Tokens: [";
    for (size_t i = 0; i < target_tokens.size(); ++i) {
        cout << target_tokens[i];
        if (i < target_tokens.size() - 1) cout << ", ";
    }
    cout << "]\n";
    getchar();

    totalTokens = target_tokenizer.get_nTokens();
    cout << "\nTarget vocabulary size: " << totalTokens;
    cout << "\nEmbedding dimension: " << dModel << "\n";
    Embedding target_embedding_layer(totalTokens, dModel);
    matrixd target_embeddings = target_embedding_layer.embed(target_tokens);
    cout << "\nTarget Token Embeddings(" << target_embeddings.size()
         << " x " << dModel << "):\n";
    printMatrix(target_embeddings);
    getchar();

    PositionalEncoding target_pos_encoding(MAX_SEQ_LENGTH, dModel);
    // add positional encoding to embeddings
    matrixd target_embeddings_with_pos = target_pos_encoding.addPE(target_embeddings);
    cout << "\nInput to Decoder:\n";
    rows = target_embeddings_with_pos.size();
    cols = target_embeddings_with_pos[0].size();
    for (int i = 0; i < rows; i++) {
        cout << "Token " << i << ": [";
        for (int j = 0; j < cols; ++j) {
            cout << fixed << setw(6) << setprecision(2)
                 << target_embeddings_with_pos[i][j];
            if (j < cols - 1) cout << ", ";
        }
        cout << "]\n";
    }
    getchar();

    Decoder decoder(dModel, num_heads, d_ff, target_embeddings_with_pos, E);
    matrixd y = decoder.computeOutput();
    cout << "Decoder output:\n";
    printMatrix(y);
    getchar();
    return 0;
}
Makefile:
PROG = testDecoder

#source codes
SRCS = $(PROG).cpp

#substitute .cpp by .o to obtain object filenames
OBJS = $(SRCS:.cpp=.o) util.o transformer.o

#$@ evaluates to the target
$(PROG): $(OBJS)
	g++ -o $@ $(OBJS)

$(OBJS):
	g++ -c -std=c++20 $*.cpp

clean:
	rm $(OBJS) $(PROG)
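Running make builds testDecoder. The program expects the vocabulary files t.txt (source) and t1.txt (target) in the working directory, and pauses at each getchar() so the intermediate matrices can be inspected step by step.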