A Full Encoder-Decoder AI Transformer in C/C++

Materials available at: http://forejune.co/cuda/

Review

Back Propagation of Output Layer

General Idea of Back Propagation:

Ouput Layer:

class OutputLayer { private: matrixd W; // dModel x vocab_size int dModel, vocab_size; public: OutputLayer(int dim, int nv) { dModel = dim; vocab_size = nv; initMatrix(W, dModel, vocab_size); } // Forward: logits --> probabilities matrixd forward(const matrixd &X) { // X: (seq_len x dModel) // output: (seq_len x vocab_size) matrixd logits = matmul(X, W); return softmax(logits); } // Backward: returns dL/dX matrixd backward(const matrixd &X, const matrixd &P, const vector &targets, double eta) { int nt = P.size(); // dL/dY = P - T (cross-entropy + softmax) matrixd dL_dY = P; for (int i = 0; i < nt; i++) dL_dY[i][targets[i]] -= 1.0; // Gradient wrt weights matrixd dL_dW = matmul(transpose(X), dL_dY); // Gradient wrt input (back to decoder) matrixd dL_dX = matmul(dL_dY, transpose(W)); //update weights after gradient computation updateWeight(W, dL_dW, eta); return dL_dX; } };

Back Propagation of Encoder Layers and Decoder Layers

Decoder Layer:

struct DecoderGrad {
  matrixd dL_dX;   // gradient to decoder input (dL/dX)
  matrixd dL_dE;   // gradient to encoder output
};

    // obtain dL/dX from Output Layer
    matrixd dL_dX = output.backward(dec_out, P, target_out, eta);

    matrixd dL_dE;   // accumulate gradients to encoder
    bool first = true;

    // accumulate encoder gradients dL_dE
    for (int i = dLayers.size() - 1; i >= 0; i--) {
        DecoderGrad g = dLayers[i].backProp(dL_dX, eta);

        dL_dX = g.dL_dX;

        if (first) {
            dL_dE = g.dL_dE;
            first = false;
        } else {
            dL_dE = addMat(dL_dE, g.dL_dE);
        }
    }

Encoder Layers:

Forward computation:

  matrixd E = source_pe;	    // embeddings + positional encoding
  for (int i = 0; i < eLayers.size(); i++)
      E = eLayers[i].forward(E);

In coding

		dL/dY ~ dL/dX_l+1	(output gradient)
		dL/dX ~ dL/dX_l	(input gradient)

	 // at the last encoder layer, dL_dE is from decoder layers 
	 for (int i = eLayers.size() - 1; i >= 0; i--)
	   dL_dE = eLayers[i].backProp(dL_dE, eta);

Full Implementation in C/C++

Helper Functions:

util.h:

#ifndef __UTIL_H__ #define __UTIL_H__ #include <vector> #include <algorithm> #include <math.h> #include <iostream> #include <iomanip> #include <random> using namespace std; typedef vector<vector<double>> matrixd; // an activation function double f(double z); // Relu activation function with matrix input matrixd relu(const matrixd &X); // Derivative of Relu matrixd relu_deriv (const matrixd &X); // calculates entropy loss double crossEntropy(const matrixd &P, const vector<int> &target); // add two matrices matrixd addMat(const matrixd &A, const matrixd &B); // Transpose matrix n x m, B = A^T : m x n matrixd transpose(const matrixd& A); // Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr matrixd matmul(const matrixd& A, const matrixd& B); // multiply a matrix by a scalar matrixd mulMats (const matrixd &A, const double s); // scale a matrix void scaleMat(matrixd &A, const double s); // apply masking to S matrixd causalMask(matrixd S); // update weight matrix W -= dW * eta void updateWeight(matrixd &W, const matrixd &dW, const double eta); // initialize an input matrix with certain random values void initMatrix(matrixd& M, const int n, const int m); void initMatrix(matrixd& M, const int n, const int m, double dModel); // print a token dictionary void printDictionary(unordered_map<string, int> &d); void printVec (const vector<int> &v); // print a matrix void printMatrix(const matrixd &y); // Add & Norm matrixd addNnorm(const matrixd &A, const matrixd &B); void clip(matrixd &A, double max_val); // softmax, 1D vector input vector<double> softmax(const vector<double> &v); // softmax activation function, 2D input vector matrixd softmax(const matrixd& input); // derivative function of softmax matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P); #endif

util.cpp:

// util.cpp: helper functions // http://forejune.co/cuda/ #include "util.h" #include <random> using namespace std; // Activation function: using ReLU (Rectified Linear Unit) double f(double z) { return max(0.0, z); } // Relu activation function with matrix input matrixd relu(const matrixd &X) { int rows = X.size(); int cols = X[0].size(); matrixd Y = X; // same dimension of X for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) Y[i][j] = max(0.0, X[i][j]); return Y; } // Derivative of Relu matrixd relu_deriv(const matrixd &X) { int rows = X.size(); int cols = X[0].size(); matrixd Y = X; // same dimension as X for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) if ( X[i][j] > 0 ) Y[i][j] = 1; else Y[i][j] = 0; return Y; } // calculates entropy loss double crossEntropy(const matrixd &P, const vector<int> &target) { double L = 0; int nt = P.size(); for (int i = 0; i < nt; i++){ int k = target[i]; L -= log( P[i][k] + 1e-12 ); } return L / nt; } // C = A + B matrixd addMat(const matrixd &A, const matrixd &B) { int n = A.size(); // rows int m = A[0].size(); // cols matrixd C = A; // same dimension as A for (int i = 0; i < n; i++) for(int j = 0; j < m; j++) C[i][j] = A[i][j] + B[i][j]; return C; } // Transpose matrix n x m, B = A^T : m x n matrixd transpose(const matrixd& A) { int n = A.size(); //number of rows int m = A[0].size(); //number of columns matrixd B(m, vector<double>(n)); for (int i = 0; i < n; i++) for (int j = 0; j < m; j++) B[j][i] = A[i][j]; return B; } // Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr matrixd matmul(const matrixd& A, const matrixd& B) { int n = A.size(); int m = A[0].size(); int r = B[0].size(); matrixd C(n, vector<double>(r, 0)); for (int i = 0; i < n; i++) for (int j = 0; j < r; j++) for (int k = 0; k < m; k++) C[i][j] += A[i][k] * B[k][j]; return C; } // scale a matrix void scaleMat(matrixd &A, const double s) { int m = A.size(); // number of rows int n = A[0].size(); // number of columns for (int i = 0; i < m; i++) for (int j = 0; j < n; j++) A[i][j] *= s; } // maultiply a matrix by a scalar s matrixd mulMats (const matrixd &A, const double s) { int m = A.size(); // number of rows int n = A[0].size(); // number of columns matrixd B = A; for (int i = 0; i < m; i++) for (int j = 0; j < n; j++) B[i][j] *= s; return B; } void initMatrix(matrixd& M, const int n, const int m) { // resizing 2D vector to n x m with initial values 0 M.assign(n, vector<double>(m, 0)); random_device rd; mt19937 gen(rd()); //psuedo random number generator double sigma = 1.0 / n; normal_distribution<double> dist(0.0, sigma); for (int i = 0; i < n; ++i) for (int j = 0; j < m; ++j) M[i][j] = dist(gen); for (int i = 0; i < n; ++i) for (int j = 0; j < m; ++j) M[i][j] = rand() % 10000 / 100000.0; } void initMatrix(matrixd& M, const int n, const int m, double sigma) { // resizing 2D vector to n x m with initial values 0 M.assign(n, vector<double>(m, 0)); random_device rd; mt19937 gen(rd()); //psuedo random number generator double d = n; normal_distribution<double> dist(0.0, sigma); for (int i = 0; i < n; ++i) for (int j = 0; j < m; ++j) M[i][j] = dist(gen); } void printDictionary(unordered_map<string, int> &d) { unordered_map<string, int>::iterator it = d.begin(); int k = 0; while ( it != d.end() ){ cout << right << setw(12) << it->first << ": " << setw(3) << it->second; it++; k++; if ( k % 5 == 0 ) cout << endl; } } void printVec (const vector<int> &v) { int n = v.size(); for (int i = 0; i < n; i++){ cout << v[i]; if ( i < n - 1) cout << ", "; } cout << endl; } void printMatrix(const matrixd &y) { int cols = y.size(); int rows = y[0].size(); for (int i = 0; i < cols; ++i) { cout << "Token " << i << ": ["; for (int j = 0; j < rows; ++j) { cout << fixed << setw(7) << setprecision(3) << y[i][j]; if (j < 4) cout << ", "; } cout << " ]\n"; } } matrixd addNnorm(const matrixd &A, const matrixd &B) { auto x = addMat(A, B); // add two matrices int n = x.size(); // rows int m = x[0].size(); // cols matrixd y = x; // output same size as x // normalize the result double epsilon = 1e-5; for (int i = 0; i < n; i++) { double sum = 0; // sum of one row double std = 0; //sigma_i square for (int j = 0; j < m; j++) sum += x[i][j]; double mean = sum / m; // mu_i, mean of i-th row sum = 0; for (int j = 0; j < m; j++) { double diff = x[i][j] - mean; sum += diff * diff; } std = sum / m; for (int j = 0; j < m; j++) { double xd = (x[i][j] - mean) / sqrt(std + epsilon); y[i][j] = xd; // gamma = 1, beta = 0; } } return y; // final output of Add & norm } matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V, bool mask) { double d_k, scale; matrixd KT, A; // K transpose, attention d_k = Q[0].size(); scale = 1.0 / sqrt( d_k ); KT = transpose ( K ); A = matmul(Q, KT); scaleMat(A, scale); if ( mask ) { for (int i = 0; i < A.size(); i++) for (int j = i+1; j < A[0].size(); j++) A[i][j] = -1e9; // causal mask } A = softmax( A ); matrixd output = matmul(A, V); return output; } matrixd causalMask(matrixd S) { int n = S.size(); for (int i = 0; i < n; i++) { for (int j = i+1; j < n; j++) { S[i][j] = -1e9; // effectively -∞ } } return S; } void updateWeight(matrixd &W, const matrixd &dW, const double eta) { for (int i = 0; i < W.size(); i++) for (int j = 0; j < W[0].size(); j++) W[i][j] -= eta * dW[i][j]; } void clip(matrixd &A, double max_val = 5.0) { for (auto &row : A) for (auto &x : row) if (x > max_val) x = max_val; else if (x < -max_val) x = -max_val; } // softmax activation function, 1D input vector vector<double> softmax(const vector<double> &z) { int m = z.size(); vector<double> y(m, 0); //maximum value of vector double maxVal = *max_element(z.begin(), z.end()); double sum = 0; // Subtract max for numerical stability for (int j = 0; j < m; j++) { y[j] = exp(z[j] - maxVal); sum += y[j]; } // Normalize for (int j = 0; j < m; j++) y[j] /= sum; return y; } // softmax activation function, 2D input vector matrixd softmax(const matrixd& input) { int n = input.size(), m = input[0].size(); // n x m matrix // 2D output vector n x m y[n][m] vector<vector<double>> y(n, vector<double>(m)); for (int i = 0; i < n; i++) { //maximum value of the row double maxVal = *max_element(input[i].begin(), input[i].end()); double sum = 0; // Subtract max for numerical stability for (int j = 0; j < m; j++) { y[i][j] = exp(input[i][j] - maxVal); sum += y[i][j]; } // Normalize for (int j = 0; j < input[i].size(); j++) y[i][j] /= sum; } return y; } // derivative function of softmax matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P) { int nt = P.size(), ns = P[0].size(); matrixd dL_dS(nt, vector<double>(ns,0)); for (int i = 0; i < nt; i++) { double dot = 0.0; for (int j = 0; j < ns; j++) dot += P[i][j] * dL_dP[i][j]; // dot product of two row-vectors for (int j = 0; j < ns; j++) // dot used by all columns of dL/dA dL_dS[i][j] = P[i][j] * (dL_dP[i][j] - dot); } return dL_dS; }

Classes:

transformer.h:

#ifndef TRANSFORMER_H #define TRANSFORMER_H class Tokenizer { private: unordered_map<string, int> words; unordered_map<int, string> wordIndex; int nTokens; // number of words // special tokens const string PAD = "<PAD>"; const string UNK = "<UNK>"; const string BOS = "<BOS>"; const string EOS = "<EOS>"; const string SEP = "<SEP>"; void addWord(const string &w) { if (words.find(w) == words.end()) { int id = nTokens; words[w] = id; wordIndex[id] = w; nTokens++; } } vector<string> split(const string &str) { vector<string> tokens; stringstream ss(str); string word; while (ss >> word) { tokens.push_back(word); } return tokens; } public: Tokenizer(); Tokenizer(ifstream &fs); unordered_map<string, int> getTokensWords(); vector<int> tokenize(const string& str); string detokenize(const vector<int>& tokenIDs); int get_nTokens() const; unordered_map<int, string> get_wordIndex(); }; class Embedding { private: int nTokens; // number of words int dim; // embedding dimension matrixd em_matrix; // embedding matrix; public: Embedding(int sequence_length, int embeddingDimension); // create embedding matrix with tokenIDs matrixd embed(const vector<int>& tokenIDs); void backProp(const vector<int>& tokens, const matrixd& dX, double eta); int get_embedDim() const; int get_nTokens() const; }; // Positional Encoding class PositionalEncoding { private: int maxTokens; // maximum sequence length int dModel; // embedding dimension matrixd PE; // positional_encodings matrix; public: PositionalEncoding(int max_seq_length, int embedding_dimension); void extendPE(int new_maxTokens); matrixd addPE(const matrixd& embeddings); matrixd getPE(); }; // Add & Norm struct AddNormGrad { matrixd dL_dA; // residual path matrixd dL_dB; // sublayer path }; class AddnNorm { private: vector<double> gamma; // size dModel vector<double> beta; // size dModel matrixd Zhat; // normalized values (nt x d) vector<double> mean; // per row (nt) vector<double> var; // per row (nt) double eps; // small value public: AddnNorm(int dModel); // forward computation matrixd forward (const matrixd &A, const matrixd &B); // Assume forward already filled: Zhat, mean, var AddNormGrad backProp(const matrixd &X, const matrixd &dL_dY, double eta); // Optional setters (to plug in forward results) void setCache(const matrixd& zhat, const vector<double>& m, const vector<double>& v); // Accessors (optional) const vector<double>& getGamma(); const vector<double>& getBeta(); }; // Feed-forward Network class FeedForward { private: matrixd W1, W2; int dModel, d_ff; // cache matrixd F1; // after activation function ReLU matrixd Z; // before ReLU (needed for derivative) public: // constructor FeedForward(int dModel, int d_ff_); // ------------------------------ // Forward (for cache) // ------------------------------ matrixd FFoutput(const matrixd &X); // ------------------------------ // Backprop // ------------------------------ matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta); }; // ----------- MultiHeadAttention ----------- struct MHA_Gradient { matrixd dL_dQ; matrixd dL_dK; matrixd dL_dV; }; class MultiHeadAttention { private: int dModel, nHeads, d_k; vector<matrixd> WQ, WK, WV; // per head matrixd WO; // cache vector<matrixd> Qs, Ks, Vs, Ps, As; public: MultiHeadAttention(int dModel, int nHeads); void clearCache(); matrixd computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, bool mask); MHA_Gradient backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, const matrixd &dL_dH, double eta, bool mask); }; // ----------------- OutputLayer --------------- class OutputLayer { private: matrixd W; // dModel x vocab_size int dModel, vocab_size; public: OutputLayer(int dModel, int vocab_size); matrixd forward(const matrixd &X); matrixd backward(const matrixd &X, const matrixd &P, const vector<int> &targets, double eta); }; // ---------- EncoderLayer -------------------- class EncoderLayer { private: MultiHeadAttention mha; FeedForward ffn; AddnNorm norm1, norm2; matrixd X, H1, H2; public: EncoderLayer(); //default constructor EncoderLayer(int d_model, int nHeads, int d_ff); matrixd forward(const matrixd& input); matrixd backProp(const matrixd& dL_dOut, double eta); }; // ------------ Decoder Layer ------------------- struct DecoderGrad { matrixd dL_dX; // gradient to decoder input matrixd dL_dE; // gradient to encoder output }; class DecoderLayer { private: FeedForward ffn; AddnNorm norm1, norm2, norm3; matrixd X, H1, H2, H3; matrixd E; // final output of Encoder public: MultiHeadAttention self_mha; MultiHeadAttention cross_mha; DecoderLayer(int d_model, int nHeads, int d_ff); matrixd forward(const matrixd& input, const matrixd& encoder_output); DecoderGrad backProp(const matrixd& dL_dOut, double eta); }; // ------------------- Transformer --------------- class MiniTrans { private: Embedding embed; PositionalEncoding pe; vector<EncoderLayer> eLayers; vector<DecoderLayer> dLayers; OutputLayer output; int seq_len, vocab_size; int dModel; matrixd X_embed; // cache matrixd X_pe; // cache matrixd X_final; matrixd dec_out; // cache decoder output public: MiniTrans(int vocab_size, int seq_len, int dModel, int nHeads, int d_ff, int nLayers); matrixd forward(const vector<int> &src_tokens, const vector<int> &target_tokens); double trainStep(const vector<int> &src, const vector<int> &target_input, const vector<int> &target_out, double eta); vector<int> generate(const vector<int> &src, int max_len, int start_token, int end_token); }; #endif

transformer.cpp:

// http://forejune.co/cuda // transformer.cpp: Encoder-Decoder Transformer #include <iostream> #include <fstream> #include <vector> #include <string> #include <unordered_map> #include <cmath> #include <random> #include <algorithm> #include <iomanip> #include <cassert> #include "util.h" #include "transformer.h" using namespace std; // ------------------------ Tokenizer class -------------------------- // ------------------------------- // Default constructor // ------------------------------- Tokenizer::Tokenizer() { nTokens = 0; // Initialize special tokens first addWord(PAD); // 0 addWord(UNK); // 1 addWord(BOS); // 2 addWord(EOS); // 3 addWord(SEP); // 4 } // ------------------------------- // Build vocab from file // ------------------------------- Tokenizer::Tokenizer(ifstream &fs) : Tokenizer() { string line; while (getline(fs, line)) { vector<string> tokens = split(line); //split(line); for (int i = 0; i < tokens.size(); i++) addWord(tokens[i]); } } unordered_map<string, int> Tokenizer::getTokensWords() { return words; } vector<int> Tokenizer::tokenize(const string& str) { vector<int> ids; vector<string> tokens = split(str); for (int i = 0; i < tokens.size(); i++) { string w = tokens[i]; if (words.find(w) != words.end()) ids.push_back(words[w]); else ids.push_back(words[UNK]); } return ids; } string Tokenizer::detokenize(const vector<int>& tokenIDs) { string words; int n = tokenIDs.size(); for (int i = 0; i < n; i++) { int id = tokenIDs[i]; if (wordIndex.find(id) == wordIndex.end()) continue; string w = wordIndex[id]; // Skip special tokens (optional behavior) if (w == "<PAD>" || w == "<BOS>" || w == "<EOS>") continue; words += w + " "; } return words; } int Tokenizer::get_nTokens() const { return nTokens; } unordered_map<int, string>Tokenizer::get_wordIndex() { return wordIndex; } //------------------ Embedding Class ------------------ Embedding::Embedding(int sequence_length, int embeddingDimension) { nTokens = sequence_length; dim = embeddingDimension; // initialize embeddings as an nTokensxdim matrix with certain random values initMatrix(em_matrix, nTokens, dim); } // create embedding matrix with tokenIDs matrixd Embedding::embed(const vector<int>& tokenIDs) { matrixd embeddings; embeddings.reserve(tokenIDs.size()); // set capacity of embeddings vector for(int i = 0; i < tokenIDs.size(); i++) { if (tokenIDs[i] >= 0 && tokenIDs[i] < nTokens) embeddings.push_back(em_matrix[tokenIDs[i]]); else // Out-of-dictionary tokens embeddings.push_back(vector<double>(dim, 0.0)); } return embeddings; } int Embedding::get_embedDim() const { return dim; } int Embedding::get_nTokens() const { return nTokens; } // optional void Embedding::backProp(const vector<int>& tokenIDs, const matrixd& dL_dX, double eta) { int nt = tokenIDs.size(); for (int i = 0; i < nt; i++) { int token = tokenIDs[i]; for (int j = 0; j < dim; j++) em_matrix[token][j] -= eta * dL_dX[i][j]; } } // --------------------- Positional Encoding class --------------- PositionalEncoding::PositionalEncoding(int max_seq_length, int embed_dimension) { maxTokens = max_seq_length; dModel = embed_dimension; PE.resize(maxTokens, vector<double>(dModel)); for (int pos = 0; pos < maxTokens; pos++) { for (int i = 0; i < dModel/2; i++) { PE[pos][2*i] = sin( pos / pow(10000, 2.0*i/dModel) ); PE[pos][2*i+1] = cos( pos / pow(10000, 2.0*i/dModel) ); } } } void PositionalEncoding::extendPE(int new_maxTokens) { if (new_maxTokens <= maxTokens) return; int oldMax = maxTokens; maxTokens = new_maxTokens; PE.resize(maxTokens, vector<double>(dModel, 0.0)); for (int pos = oldMax; pos < maxTokens; pos++) { for (int i = 0; i < dModel; i += 2) { double denom = pow(10000.0, (double)i / dModel); PE[pos][i] = sin(pos / denom); if (i + 1 < dModel) PE[pos][i + 1] = cos(pos / denom); } } } // add positional encodings to embeddings matrixd PositionalEncoding::addPE(const matrixd& embeddings) { int nTokens = embeddings.size(); int dim = embeddings[0].size(); assert(dim == dModel); if (nTokens > maxTokens) { extendPE(nTokens); } matrixd x = embeddings; // scale before adding PE (optional) double scale = sqrt(dModel); for (int i = 0; i < nTokens; i++) for (int j = 0; j < dModel; j++) x[i][j] = embeddings[i][j] * scale + PE[i][j]; return x; } matrixd PositionalEncoding::getPE() { return PE; } // -------------------- Add & Norm Class--------------------------------- // constructor AddnNorm::AddnNorm(int dModel) { gamma.assign(dModel, 1.0); beta.assign(dModel, 0.0), eps = 1e-5; } matrixd AddnNorm::forward (const matrixd &A, const matrixd &B) { matrixd x = addMat(A, B); // add two matrices int n = x.size(); // rows int m = x[0].size(); // cols mean.resize( n ); var.resize( n ); Zhat.resize(n, vector<double>(m, 0)); matrixd Y(n, vector<double>(m)); // normalize the result for (int i = 0; i < n; i++) { double sum = 0; // sum of one row var[i] = 0; for (int j = 0; j < m; j++) sum += x[i][j]; mean[i] = sum / m; // mu_i, mean of i-th row sum = 0; for (int j = 0; j < m; j++) { double diff = x[i][j] - mean[i]; sum += diff * diff; } var[i] = sum / m; for (int j = 0; j < m; j++) { double xd = (x[i][j] - mean[i]) / sqrt(var[i] + eps); //epsilon); Zhat[i][j] = xd; Y[i][j] = gamma[j] * xd + beta[j]; } } return Y; // final output of Add & norm } // Assume forward already filled: Zhat, mean, var AddNormGrad AddnNorm::backProp(const matrixd &X, const matrixd &dL_dY, double eta) { int nt = X.size(); int m = X[0].size(); matrixd dL_dZ(nt, vector<double>(m, 0.0)); matrixd dL_dZhat(nt, vector<double>(m, 0.0)); vector<double> dL_dGamma(m, 0.0); // gradient wrt gamma vector<double> dL_dBeta(m, 0.0); // gradient wrt beta // 1. Compute dL_dGamma and dL_dBeta for each token for (int i = 0; i < nt; i++) { for (int j = 0; j < m; j++){ dL_dBeta[j] += dL_dY[i][j]; dL_dGamma[j] += dL_dY[i][j] * Zhat[i][j]; } } // 2. Backprop through LayerNorm (row-wise) for (int i = 0; i < nt; i++) { double inv_sigma = 1.0 / sqrt(var[i] + eps); // Step A: compute intermediate values vector<double> dL_dZhat(m); for (int j = 0; j < m; j++) { dL_dZhat[j] = dL_dY[i][j] * gamma[j]; } double sum_dL_dZhat = 0.0; double sum_dL_dZhat_zhat = 0.0; for (int j = 0; j < m; j++) { sum_dL_dZhat += dL_dZhat[j]; sum_dL_dZhat_zhat += dL_dZhat[j] * Zhat[i][j]; } // Final gradient for (int j = 0; j < m; j++) { dL_dZ[i][j] = inv_sigma * (dL_dZhat[j] - sum_dL_dZhat / m - Zhat[i][j] * sum_dL_dZhat_zhat / m); } } // 3. Update parameters for (int j = 0; j < m; j++) { gamma[j] -= eta * dL_dGamma[j]; beta[j] -= eta * dL_dBeta[j]; } AddNormGrad grad; grad.dL_dA = dL_dZ; // residual path grad.dL_dB = dL_dZ; // sublayer path return grad; } // Optional setters (to plug in forward results) void AddnNorm::setCache(const matrixd& zhat, const vector<double>& m, const vector<double>& v) { Zhat = zhat; mean = m; var = v; } // --------------------------- Feed Forward ----------------------------------- FeedForward::FeedForward(int dModel0, int d_ff0) { dModel = dModel0; d_ff = d_ff0; W1.resize(dModel, vector<double>(d_ff)); W2.resize(d_ff, vector<double>(dModel)); initMatrix(W1, dModel, d_ff); initMatrix(W2, d_ff, dModel); } // ------------------------------ // Forward (for cache) // ------------------------------ matrixd FeedForward::FFoutput(const matrixd &X) { Z = matmul(X, W1); // pre-activation F1 = relu( Z ); // f(Z); F1 = f(XW1) matrixd F2 = matmul(F1, W2); // F2 = F1 W2 return F2; } // ------------------------------ // Backprop // ------------------------------ matrixd FeedForward::backProp(const matrixd &X, const matrixd &dL_dY, // dL/dF2 double eta) { // ---- W2 ---- matrixd dL_dF2 = dL_dY; // dL/dW2 = F1^T * dL_dF2 matrixd F1T = transpose(F1); matrixd dL_dW2 = matmul(F1T, dL_dF2); // dL_dF1 = dL_dF2 * W2^T matrixd W2T = transpose(W2); matrixd dL_dF1 = matmul(dL_dF2, W2T); // ---- find dL_dZ1 ---- matrixd fd_Z = relu_deriv(Z); // f_deriv(Z); int rows = dL_dF1.size(); int cols = dL_dF1[0].size(); matrixd dL_dZ1 = dL_dF1; // same dimension for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) dL_dZ1[i][j] = dL_dF1[i][j] * fd_Z[i][j]; // ---- W1 ---- // dL/dW1 = X^T * dL/dZ1 matrixd XT = transpose(X); matrixd dL_dW1 = matmul(XT, dL_dZ1); // dL/dX = dL/dZ1 * W1^T matrixd W1T = transpose(W1); matrixd dL_dX = matmul(dL_dZ1, W1T); // ---- Update weights ---- for (int i = 0; i < W2.size(); i++) for (int j = 0; j < W2[0].size(); j++) W2[i][j] -= eta * dL_dW2[i][j]; for (int i = 0; i < W1.size(); i++) for (int j = 0; j < W1[0].size(); j++) W1[i][j] -= eta * dL_dW1[i][j]; return dL_dX; // This becomes the dL_dY of next stage } // ------------------ MultiHeadAttention -------------------- MultiHeadAttention::MultiHeadAttention(int dim, int n) { dModel = dim; nHeads = n; d_k = dModel / nHeads; WQ.resize(nHeads); WK.resize(nHeads); WV.resize(nHeads); for (int h = 0; h < nHeads; h++) { initMatrix(WQ[h], dModel, d_k); initMatrix(WK[h], dModel, d_k); initMatrix(WV[h], dModel, d_k); } initMatrix(WO, dModel, dModel); } void MultiHeadAttention::clearCache() { Qs.clear(); Ks.clear(); Vs.clear(); Ps.clear(); As.clear(); } matrixd MultiHeadAttention::computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, bool mask) { Qs.clear(); Ks.clear(); Vs.clear(); Ps.clear(); As.clear(); vector<matrixd> heads; int nq = X_Q.size(); int nk = X_K.size(); for (int h = 0; h < nHeads; h++) { matrixd Qh = matmul(X_Q, WQ[h]); // nq x d_k matrixd Kh = matmul(X_K, WK[h]); // nk x d_k matrixd Vh = matmul(X_V, WV[h]); // nk x d_k matrixd S = matmul(Qh, transpose(Kh)); scaleMat(S, 1.0 / sqrt(d_k)); if (mask) S = causalMask(S); matrixd Ph = softmax(S); // row-wise matrixd Ah = matmul(Ph, Vh); // nq x d_k Qs.push_back(Qh); Ks.push_back(Kh); Vs.push_back(Vh); Ps.push_back(Ph); As.push_back(Ah); heads.push_back(Ah); } // Concatenate heads int n = heads[0].size(); matrixd concat(n, vector<double>(dModel, 0.0)); for (int h = 0; h < nHeads; h++) for (int i = 0; i < n; i++) for (int j = 0; j < d_k; j++) concat[i][h * d_k + j] = heads[h][i][j]; // Final projection matrixd H = matmul(concat, WO); return H; } // backpropagation MHA_Gradient MultiHeadAttention::backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, const matrixd &dL_dH, double eta, bool mask) { if (Ps.empty() || As.empty()) { cout << "ERROR: MHA cache empty in backProp!" << endl; exit(1); } int nt = X_Q.size(); double s = 1.0 / sqrt(d_k); // ========================= // Step 1: concatenate heads // ========================= matrixd A_cat(nt, vector<double>(dModel, 0.0)); for (int h = 0; h < nHeads; h++) for (int i = 0; i < nt; i++) for (int j = 0; j < d_k; j++) A_cat[i][h*d_k + j] = As[h][i][j]; // ========================= // Step 2: gradients for WO // ========================= matrixd dL_dWO = matmul(transpose(A_cat), dL_dH); // Compute before updating WO matrixd dL_dAcat = matmul(dL_dH, transpose(WO)); // ========================= // Step 3: initialize accumulators // ========================= matrixd dL_dXQ(X_Q.size(), vector<double>(dModel, 0.0)); matrixd dL_dXK(X_K.size(), vector<double>(dModel, 0.0)); matrixd dL_dXV(X_V.size(), vector<double>(dModel, 0.0)); // ========================= // Step 4: per-head backprop // ========================= for (int h = 0; h < nHeads; h++) { // ---- slice dA ---- matrixd dL_dA(nt, vector<double>(d_k)); for (int i = 0; i < nt; i++) for (int j = 0; j < d_k; j++) dL_dA[i][j] = dL_dAcat[i][h*d_k + j]; matrixd P = Ps[h]; matrixd V = Vs[h]; matrixd Q = Qs[h]; matrixd K = Ks[h]; // ===================== // A = P V // ===================== matrixd dL_dP = matmul(dL_dA, transpose(V)); matrixd dL_dV = matmul(transpose(P), dL_dA); // optional stabilization //scaleMat(dL_dP, 1.0 / nt); // ===================== // softmax backward // ===================== matrixd dL_dS = softmax_derivative(dL_dP, P); // apply mask if used if (mask) { for (int i = 0; i < nt; i++) for (int j = i+1; j < nt; j++) dL_dS[i][j] = 0.0; } // ===================== // S = (Q K^T) * s // ===================== matrixd dL_dQ = matmul(dL_dS, K); matrixd dL_dK = matmul(transpose(dL_dS), Q); scaleMat(dL_dQ, s); scaleMat(dL_dK, s); // ===================== // Weight gradients // ===================== matrixd dL_dWQ = matmul(transpose(X_Q), dL_dQ); matrixd dL_dWK = matmul(transpose(X_K), dL_dK); matrixd dL_dWV = matmul(transpose(X_V), dL_dV); // ===================== // Input gradients (USE OLD WEIGHTS!) // ===================== matrixd dXQ = matmul(dL_dQ, transpose(WQ[h])); matrixd dXK = matmul(dL_dK, transpose(WK[h])); matrixd dXV = matmul(dL_dV, transpose(WV[h])); // accumulate across heads dL_dXQ = addMat(dL_dXQ, dXQ); dL_dXK = addMat(dL_dXK, dXK); dL_dXV = addMat(dL_dXV, dXV); // ===================== // Update weights after gradients // ===================== updateWeight(WQ[h], dL_dWQ, eta); updateWeight(WK[h], dL_dWK, eta); updateWeight(WV[h], dL_dWV, eta); } // ========================= // Step 5: update WO // ========================= updateWeight(WO, dL_dWO, eta); // ========================= // return gradients // ========================= MHA_Gradient grad; grad.dL_dQ = dL_dXQ; grad.dL_dK = dL_dXK; grad.dL_dV = dL_dXV; return grad; } // ---------- EncoderLayer -------------------- EncoderLayer::EncoderLayer(int d_model, int nHeads, int d_ff) : mha(d_model, nHeads), ffn(d_model, d_ff), norm1(d_model), norm2(d_model) {} matrixd EncoderLayer::forward(const matrixd& input) { X = input; mha.clearCache(); matrixd A1 = mha.computeAttention(X, X, X, true); H1 = norm1.forward(X, A1); matrixd A2 = ffn.FFoutput(H1); H2 = norm2.forward(H1, A2); return H2; } matrixd EncoderLayer::backProp(const matrixd& dL_dY, double eta) { // ========================= // Step 1: FFN block // ========================= AddNormGrad g2 = norm2.backProp(H1, dL_dY, eta); matrixd dL_dH1_res = g2.dL_dA; matrixd dL_dFFN_in = g2.dL_dB; matrixd dL_dH1_ffn = ffn.backProp(H1, dL_dFFN_in, eta); matrixd dL_dH1 = addMat(dL_dH1_res, dL_dH1_ffn); // ========================= // Step 2: MHA block // ========================= AddNormGrad g1 = norm1.backProp(X, dL_dH1, eta); matrixd dX_res = g1.dL_dA; matrixd dL_dMHA = g1.dL_dB; MHA_Gradient g_mha = mha.backProp(X, X, X, dL_dMHA, eta, false); matrixd dL_dX_mha = addMat(g_mha.dL_dQ, addMat(g_mha.dL_dK, g_mha.dL_dV)); matrixd dL_dX = addMat(dX_res, dL_dX_mha); return dL_dX; } // ---------- OutputLayer ---------------------- OutputLayer::OutputLayer(int dim, int nv) { dModel = dim; vocab_size = nv; initMatrix(W, dModel, vocab_size); } // Forward: logits -> probabilities matrixd OutputLayer::forward(const matrixd &X) { // X: (seq_len x dModel) // output: (seq_len x vocab_size) matrixd logits = matmul(X, W); return softmax(logits); } // Backward: returns dL/dX matrixd OutputLayer::backward(const matrixd &X, const matrixd &P, const vector<int> &targets, double eta) { int nt = P.size(); // dL/dY = P - T (cross-entropy + softmax) matrixd dL_dY = P; for (int i = 0; i < nt; i++) dL_dY[i][targets[i]] -= 1.0; // optional but recommended: normalize for (int i = 0; i < nt; i++) for (int j = 0; j < P[0].size(); j++) dL_dY[i][j] /= nt; // Gradient wrt weights matrixd dL_dW = matmul(transpose(X), dL_dY); // Gradient wrt input (back to decoder) matrixd dL_dX = matmul(dL_dY, transpose(W)); //update weights after gradient computation updateWeight(W, dL_dW, eta); return dL_dX; } // ------------ Decoder Layer ------------------- DecoderLayer::DecoderLayer(int d_model, int nHeads, int d_ff) : self_mha(d_model, nHeads), cross_mha(d_model, nHeads), ffn(d_model, d_ff), norm1(d_model), norm2(d_model), norm3(d_model) {} matrixd DecoderLayer::forward(const matrixd& input, const matrixd& encoder_output) { X = input; E = encoder_output; self_mha.clearCache(); matrixd A1 = self_mha.computeAttention(X, X, X, true); H1 = norm1.forward(X, A1); cross_mha.clearCache(); matrixd A2 = cross_mha.computeAttention(H1, E, E, false); H2 = norm2.forward(H1, A2); matrixd A3 = ffn.FFoutput(H2); H3 = norm3.forward(H2, A3); return H3; } DecoderGrad DecoderLayer::backProp(const matrixd& dL_dY, double eta) { // ========================= // Step 1: FFN // ========================= AddNormGrad g3 = norm3.backProp(H2, dL_dY, eta); matrixd dL_dH2_res = g3.dL_dA; matrixd dL_dFFN_in = g3.dL_dB; matrixd dL_dH2_ffn = ffn.backProp(H2, dL_dFFN_in, eta); matrixd dL_dH2 = addMat(dL_dH2_res, dL_dH2_ffn); // ========================= // Step 2: Cross Attention // ========================= AddNormGrad g2 = norm2.backProp(H1, dL_dH2, eta); matrixd dL_dH1_res = g2.dL_dA; matrixd dL_dCross = g2.dL_dB; MHA_Gradient g_cross = cross_mha.backProp(H1, E, E, dL_dCross, eta, false); matrixd dL_dEnc = addMat(g_cross.dL_dK, g_cross.dL_dV); // encoder gradients matrixd dL_dH1 = addMat(dL_dH1_res, g_cross.dL_dQ); // decoder path // ========================= // Step 3: Self Attention // ========================= AddNormGrad g1 = norm1.backProp(X, dL_dH1, eta); matrixd dL_dX_res = g1.dL_dA; matrixd dL_dSelf = g1.dL_dB; MHA_Gradient g_self = self_mha.backProp(X, X, X, dL_dSelf, eta, true); matrixd dL_dSelf_mha = addMat(g_self.dL_dQ, addMat(g_self.dL_dK, g_self.dL_dV)); matrixd dL_dX = addMat(dL_dX_res, dL_dSelf_mha); return {dL_dX, dL_dEnc}; } // ------------ Transformer (Encoder-Decoder) ------------------- MiniTrans::MiniTrans(int vocab_size_, int seq_len_, int dModel_, int nHeads, int d_ff, int nLayers) : embed(vocab_size_, dModel_), pe(seq_len_, dModel_), output(dModel_, vocab_size_) { vocab_size = vocab_size_; seq_len = seq_len_; dModel = dModel_; for (int i = 0; i < nLayers; i++) { EncoderLayer e = EncoderLayer(dModel, nHeads, d_ff); eLayers.push_back( e ); DecoderLayer d = DecoderLayer(dModel, nHeads, d_ff); dLayers.push_back( d ); } } matrixd MiniTrans::forward(const vector<int> &source_tokens, const vector<int> &target_in) { // ========================= // Encoder // ========================= matrixd src_embed = embed.embed(source_tokens); matrixd src_pe = pe.addPE(src_embed); matrixd E = src_pe; // E will be final output of encoder for (int i = 0; i < eLayers.size(); i++) E = eLayers[i].forward(E); // ========================= // Decoder // ========================= matrixd tgt_embed = embed.embed(target_in); matrixd tgt_pe = pe.addPE(tgt_embed); matrixd dec = tgt_pe; for (int i = 0; i < dLayers.size(); i++) dec = dLayers[i].forward(dec, E); // cache final decoder output (IMPORTANT for backward) dec_out = dec; // ========================= // Output layer // ========================= matrixd P = output.forward(dec_out); return P; } double MiniTrans::trainStep(const vector<int> &source, const vector<int> &target_in, const vector<int> &target_out, double eta) { // ========================= // Step 1: Forward // ========================= matrixd P = forward(source, target_in); double loss = crossEntropy(P, target_out); // ========================= // Step 2: Output backward // ========================= matrixd dL_dX = output.backward(dec_out, P, target_out, eta); // ========================= // Step 3: Decoder backward // ========================= matrixd dL_dE; // accumulate gradients to encoder bool first = true; // accumulate encoder gradients dL_dE for (int i = dLayers.size() - 1; i >= 0; i--) { DecoderGrad g = dLayers[i].backProp(dL_dX, eta); dL_dX = g.dL_dX; if (first) { dL_dE = g.dL_dE; first = false; } else { dL_dE = addMat(dL_dE, g.dL_dE); } } // ========================= // Step 4: Encoder backward // ========================= for (int i = eLayers.size() - 1; i >= 0; i--) dL_dE = eLayers[i].backProp(dL_dE, eta); // ========================= // Step 5: Embedding update, optional // ========================= embed.backProp(source, dL_dE, eta); // encoder side embed.backProp(target_in, dL_dX, eta); // decoder side return loss; } vector<int> MiniTrans::generate(const vector<int> &source, int max_len, int start_token, int end_token) { // ========================= // Pass source through encoder // ========================= matrixd src_embed = embed.embed(source); matrixd src_pe = pe.addPE(src_embed); matrixd E = src_pe; for (int i = 0; i < eLayers.size(); i++) E = eLayers[i].forward(E); // ========================= // Initialize sequence // ========================= vector<int> generated; generated.push_back(start_token); // ========================= // Autoregressive loop // ========================= for (int step = 0; step < max_len; step++) { // embed current sequence matrixd target_embed = embed.embed(generated); matrixd target_pe = pe.addPE(target_embed); matrixd dec = target_pe; // pass through decoder for (int i = 0; i < dLayers.size(); i++) dec = dLayers[i].forward(dec, E); matrixd P = output.forward(dec); vector<double> last = P.back(); //last row of matrix P int best_token = max_element(last.begin(), last.end()) - last.begin(); generated.push_back(best_token); if (best_token == end_token) break; } return generated; }

Test Program:

// testMain.cpp : main() for testing a transformer
// http://forejune.co/cuda/
#include <cmath>
#include <algorithm>
#include <fstream>
#include "util.h"
#include "transformer.h"

using namespace std;

struct TrainingDS {
  vector<int> source;
  vector<int> target_in;
  vector<int> target_out;
};

vector <TrainingDS> buildDataset(Tokenizer &tokenizer)
{
  vector <TrainingDS> dataset;
  TrainingDS ds;	
  
  const int n = 3;	// 3 training examples

  string sources[n] = {
	"one two three four", 
        "one two three four reverse",
  	"one two three four digit"};
   string target_ins[n] = {
     	 "<BOS> uno dos tres cuatro",
  	 "<BOS> four three two one",
  	 "<BOS> 1 2 3 4"};
   string target_outs[n] = {
  	 "uno dos tres cuatro <EOS>",
  	"four three two one <EOS>",
  	"1 2 3 4 <EOS>"};

   for (int i = 0; i < n; i++) {
      vector<int> token_src = tokenizer.tokenize(sources[i]);
      vector<int> token_tin = tokenizer.tokenize(target_ins[i] );
      vector<int> token_tout =tokenizer.tokenize(target_outs[i]); 
      ds = TrainingDS (token_src, token_tin, token_tout);
      dataset.push_back( ds );
   }

   return dataset;
}


int main()
{
    // -------------------------
    // Tokenizer setup
    // -------------------------

    // Build vocabulary 
    ifstream ifs;
    char fname[] = "t.txt";
    ifs.open( fname );
    if (!ifs.is_open()) {
          cerr << "Unable to open file " << fname  << endl;
          exit( 1 );
    }

    Tokenizer tokenizer( ifs );

    int vocab_size = tokenizer.get_nTokens();

    cout << "Vocab size: " << vocab_size << endl;

    vector <TrainingDS> datasets;
    datasets = buildDataset( tokenizer );
    int n_ds = datasets.size(); 

    cout << "==== Encoder-Decoder Transformer Tests ====" << endl;
    // ----------------------------
    // Hyperparameters
    // ----------------------------
    int seq_len = 16;      
    int dModel = 16;
    int nHeads = 4;
    int d_ff = 16; 
    int nLayers = 4;

    double eta = 0.002;
    int nEpoch = 601;

    // ----------------------------
    // Model
    // ----------------------------
    MiniTrans model(vocab_size, seq_len, dModel, nHeads, d_ff, nLayers);
    int START = tokenizer.tokenize( "<BOS>" )[0];
    int EOS = tokenizer.tokenize( "<EOS>" )[0];

    for (int k = 0; k < n_ds; k++) {
       cout << "Example " << k + 1 << ":" << endl;
       
       // Training loop
       for (int i = 0; i < nEpoch; i++)
       {
          double loss;
          loss = model.trainStep(datasets[k].source, datasets[k].target_in, 
			  datasets[k].target_out, eta);
          if (i % 50 == 0)
             cout << "Epoch " << i << " Loss: " << loss << endl;
       }

       // Inference (greedy decoding)
       cout << "\n==== Inference ====" << endl;

       vector<int> predict = model.generate(datasets[k].source, 12, START, EOS);

       cout << "Source: ";
       string str = tokenizer.detokenize( datasets[k].source );
       cout << str << endl;

       cout << "Predicted: ";
       str = tokenizer.detokenize( predict );
       cout << str << endl; 

       cout << "Expected:  ";
       str = tokenizer.detokenize( datasets[k].target_out );
       cout << str << endl; 

       getchar(); 	// pause
    }  // for k

    return 0;
}

Makefile:

PROG	= testMain
#source codes
SRCS =  $(PROG).cpp
#substitute .cpp by .o to obtain object filenames
OBJS = $(SRCS:.cpp=.o) util.o  transformer.o

#$< evaluates to the target's dependencies, 
#$@ evaluates to the target

$(PROG): $(OBJS)
	g++ -o $@ $(OBJS)  

$(OBJS): 
	g++ -c  -std=c++20 $*.cpp 

clean:
	rm $(OBJS) $(PROG)