A Simple GPT-style AI Transformer in C/C++

Materials available at: http://forejune.co/cuda/

Review

Decoder-Only (GPT-style) Transformer

The Model:

has decoder only

no encoder, no cross-attention

masked self-attention only

predicts next token given previous tokens

Advantages:

Simpler architecture
Unified framework (very flexible)
text --> text continuation
e.g.
- translation
- summarization
- code generation
just by formating input.
Efficient inference (with caching)
Scales very well
No alignment requirement

Disadvantages:

Not naturally suited for input-output tasks
Translation ideally wants:
But GPT sees:
Requires masking tricks
One has to do:
Encoder-decoder:
Less efficient for long inputs
Weaker alignment for translation
Exposure bias in generation
Needs more data

When Decoder-Only Works Best:

language modeling

chat systems

code generation

general-purpose AI

Less ideal for:

strict translation

structured input-output tasks

low-data scenarios

Summary:


 Feature                  Decoder-Only (GPT)  Encoder-Decoder

 Architecture             Simple              More complex    

 Flexibility              Very high           Medium   

 Translation quality      Good                Better 

 Training data            Unstructured OK     Needs pairs   

 Efficiency (long input)  Lower               Better   

 Implementation           Easier              Harder

Full Implementation in C/C++

Helper Functions:

util.h:

#ifndef __UTIL_H__ #define __UTIL_H__ #include <vector> #include <algorithm> #include <math.h> #include <iostream> #include <iomanip> #include <random> using namespace std; typedef vector<vector<double>> matrixd; // an activation function double f(double z); // Relu activation function with matrix input matrixd relu(const matrixd &X); // Derivative of Relu matrixd relu_deriv (const matrixd &X); // calculates entropy loss double crossEntropy(const matrixd &P, const vector<int> &target); // add two matrices matrixd addMat(const matrixd &A, const matrixd &B); // Transpose matrix n x m, B = A^T : m x n matrixd transpose(const matrixd& A); // Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr matrixd matmul(const matrixd& A, const matrixd& B); // multiply a matrix by a scalar matrixd mulMats (const matrixd &A, const double s); // scale a matrix void scaleMat(matrixd &A, const double s); // apply masking to S matrixd causalMask(matrixd S); // update weight matrix W -= dW * eta void updateWeight(matrixd &W, const matrixd &dW, const double eta); // initialize an input matrix with certain random values void initMatrix(matrixd& M, const int n, const int m); void initMatrix(matrixd& M, const int n, const int m, double dModel); // print a token dictionary void printDictionary(unordered_map<string, int> &d); // print a matrix void printMatrix(const matrixd &y); // Add & Norm matrixd addNnorm(const matrixd &A, const matrixd &B); void clip(matrixd &A, double max_val); // softmax, 1D vector input vector<double> softmax(const vector<double> &v); // softmax activation function, 2D input vector matrixd softmax(const matrixd& input); // derivative function of softmax matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P); #endif

util.cpp:

// util.cpp: helper functions // http://forejune.co/cuda/ #include "util.h" #include <random> using namespace std; // Activation function: using ReLU (Rectified Linear Unit) double f(double z) { return max(0.0, z); } // Relu activation function with matrix input matrixd relu(const matrixd &X) { int rows = X.size(); int cols = X[0].size(); matrixd Y = X; // same dimension of X for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) Y[i][j] = max(0.0, X[i][j]); return Y; } // Derivative of Relu matrixd relu_deriv(const matrixd &X) { int rows = X.size(); int cols = X[0].size(); matrixd Y = X; // same dimension as X for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) if ( X[i][j] > 0 ) Y[i][j] = 1; else Y[i][j] = 0; return Y; } // calculates entropy loss double crossEntropy(const matrixd &P, const vector<int> &target) { double L = 0; int nt = P.size(); for (int i = 0; i < nt; i++){ int k = target[i]; L -= log( P[i][k] + 1e-12 ); } return L / nt; } // C = A + B matrixd addMat(const matrixd &A, const matrixd &B) { int n = A.size(); // rows int m = A[0].size(); // cols matrixd C = A; // same dimension as A for (int i = 0; i < n; i++) for(int j = 0; j < m; j++) C[i][j] = A[i][j] + B[i][j]; return C; } // Transpose matrix n x m, B = A^T : m x n matrixd transpose(const matrixd& A) { int n = A.size(); //number of rows int m = A[0].size(); //number of columns matrixd B(m, vector<double>(n)); for (int i = 0; i < n; i++) for (int j = 0; j < m; j++) B[j][i] = A[i][j]; return B; } // Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr matrixd matmul(const matrixd& A, const matrixd& B) { int n = A.size(); int m = A[0].size(); int r = B[0].size(); matrixd C(n, vector<double>(r, 0)); for (int i = 0; i < n; i++) for (int j = 0; j < r; j++) for (int k = 0; k < m; k++) C[i][j] += A[i][k] * B[k][j]; return C; } // scale a matrix void scaleMat(matrixd &A, const double s) { int m = A.size(); // number of rows int n = A[0].size(); // number of columns for (int i = 0; i < m; i++) for (int j = 0; j < n; j++) A[i][j] *= s; } // maultiply a matrix by a scalar s matrixd mulMats (const matrixd &A, const double s) { int m = A.size(); // number of rows int n = A[0].size(); // number of columns matrixd B = A; for (int i = 0; i < m; i++) for (int j = 0; j < n; j++) B[i][j] *= s; return B; } void initMatrix(matrixd& M, const int n, const int m) { // resizing 2D vector to n x m with initial values 0 M.assign(n, vector<double>(m, 0)); random_device rd; mt19937 gen(rd()); //psuedo random number generator double sigma = 1.0/16.0; normal_distribution<double> dist(0.0, sigma); for (int i = 0; i < n; ++i) for (int j = 0; j < m; ++j) M[i][j] = dist(gen); } void printDictionary(unordered_map<string, int> &d) { unordered_map<string, int>::iterator it = d.begin(); int k = 0; while ( it != d.end() ){ cout << right << setw(12) << it->first << ": " << setw(3) << it->second; it++; k++; if ( k % 5 == 0 ) cout << endl; } } void printMatrix(const matrixd &y) { int cols = y.size(); int rows = y[0].size(); for (int i = 0; i < cols; ++i) { cout << "Token " << i << ": ["; for (int j = 0; j < rows; ++j) { cout << fixed << setw(7) << setprecision(3) << y[i][j]; if (j < 4) cout << ", "; } cout << " ]\n"; } } matrixd addNnorm(const matrixd &A, const matrixd &B) { auto x = addMat(A, B); // add two matrices int n = x.size(); // rows int m = x[0].size(); // cols matrixd y = x; // output same size as x // normalize the result double epsilon = 1e-5; for (int i = 0; i < n; i++) { double sum = 0; // sum of one row double std = 0; //sigma_i square for (int j = 0; j < m; j++) sum += x[i][j]; double mean = sum / m; // mu_i, mean of i-th row sum = 0; for (int j = 0; j < m; j++) { double diff = x[i][j] - mean; sum += diff * diff; } std = sum / m; for (int j = 0; j < m; j++) { double xd = (x[i][j] - mean) / sqrt(std + epsilon); y[i][j] = xd; // gamma = 1, beta = 0; } } return y; // final output of Add & norm } matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V, bool mask) { double d_k, scale; matrixd KT, A; // K transpose, attention d_k = Q[0].size(); scale = 1.0 / sqrt( d_k ); KT = transpose ( K ); A = matmul(Q, KT); scaleMat(A, scale); if ( mask ) { for (int i = 0; i < A.size(); i++) for (int j = i+1; j < A[0].size(); j++) A[i][j] = -1e9; // causal mask } A = softmax( A ); matrixd output = matmul(A, V); return output; } matrixd causalMask(matrixd S) { int n = S.size(); for (int i = 0; i < n; i++) { for (int j = i+1; j < n; j++) { S[i][j] = -1e9; // effectively -infinity } } return S; } void updateWeight(matrixd &W, const matrixd &dW, const double eta) { for (int i = 0; i < W.size(); i++) for (int j = 0; j < W[0].size(); j++) W[i][j] -= eta * dW[i][j]; } void clip(matrixd &A, double max_val = 5.0) { for (auto &row : A) for (auto &x : row) if (x > max_val) x = max_val; else if (x < -max_val) x = -max_val; } // softmax activation function, 1D input vector vector<double> softmax(const vector<double> &z) { int m = z.size(); vector<double> y(m, 0); //maximum value of vector double maxVal = *max_element(z.begin(), z.end()); double sum = 0; // Subtract max for numerical stability for (int j = 0; j < m; j++) { y[j] = exp(z[j] - maxVal); sum += y[j]; } // Normalize for (int j = 0; j < m; j++) y[j] /= sum; return y; } // softmax activation function, 2D input vector matrixd softmax(const matrixd& input) { int n = input.size(), m = input[0].size(); // n x m matrix // 2D output vector n x m y[n][m] vector<vector<double>> y(n, vector<double>(m)); for (int i = 0; i < n; i++) { //maximum value of the row double maxVal = *max_element(input[i].begin(), input[i].end()); double sum = 0; // Subtract max for numerical stability for (int j = 0; j < m; j++) { y[i][j] = exp(input[i][j] - maxVal); sum += y[i][j]; } // Normalize for (int j = 0; j < input[i].size(); j++) y[i][j] /= sum; } return y; } // derivative function of softmax matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P) { int nt = P.size(), ns = P[0].size(); matrixd dL_dS(nt, vector<double>(ns,0)); for (int i = 0; i < nt; i++) { double dot = 0.0; for (int j = 0; j < ns; j++) dot += P[i][j] * dL_dP[i][j]; // dot product of two row-vectors for (int j = 0; j < ns; j++) // dot used by all columns of dL/dA dL_dS[i][j] = P[i][j] * (dL_dP[i][j] - dot); } return dL_dS; }

Transformer Classes:

transGpt.h:

#ifndef TRANSGPT_H #define TRANSGPT_H class Tokenizer { private: unordered_map<string, int> words; unordered_map<int, string> wordIndex; int nTokens; // number of words // special tokens const string PAD = "<PAD>"; const string UNK = "<UNK>"; const string BOS = "<BOS>"; const string EOS = "<EOS>"; const string SEP = "<SEP>"; void addWord(const string &w) { if (words.find(w) == words.end()) { int id = nTokens; words[w] = id; wordIndex[id] = w; nTokens++; } } vector<string> split(const string &str) { vector<string> tokens; stringstream ss(str); string word; while (ss >> word) { tokens.push_back(word); } return tokens; } public: Tokenizer(); Tokenizer(ifstream &fs); unordered_map<string, int> getTokensWords(); vector<int> tokenize(const string& str); string detokenize(const vector<int>& tokenIDs); int get_nTokens() const; unordered_map<int, string> get_wordIndex(); }; class Embedding { private: int nTokens; // number of words int dim; // embedding dimension matrixd em_matrix; // embedding matrix; public: Embedding(int sequence_length, int embeddingDimension); // create embedding matrix with tokenIDs matrixd embed(const vector<int>& tokenIDs); int get_embedDim() const; int get_nTokens() const; }; // Positional Encoding class PositionalEncoding { private: int maxTokens; // maximum sequence length int dModel; // embedding dimension matrixd PE; // positional_encodings matrix; public: PositionalEncoding(int max_seq_length, int embedding_dimension); matrixd addPE(const matrixd& embeddings); matrixd getPE(); }; // Add & Norm class AddnNorm { private: vector<double> gamma; // size dModel vector<double> beta; // size dModel matrixd Zhat; // normalized values (nt x d) vector<double> mean; // per row (nt) vector<double> var; // per row (nt) double eps; // small value public: // constructor AddnNorm(int dModel); // forward computation matrixd forward (const matrixd &A, const matrixd &B); // Assume forward already filled: Zhat, mean, var matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta); // Optional setters (to plug in forward results) void setCache(const matrixd& zhat, const vector<double>& m, const vector<double>& v); // Accessors (optional) const vector<double>& getGamma(); const vector<double>& getBeta(); }; // Feed-forward Network class FeedForward { private: matrixd W1, W2; int dModel, d_ff; // cache matrixd F1; // after activation function ReLU matrixd Z; // before ReLU (needed for derivative) public: // constructor FeedForward(int dModel, int d_ff_); // ------------------------------ // Forward (for cache) // ------------------------------ matrixd FFoutput(const matrixd &X); // ------------------------------ // Backprop // ------------------------------ matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta); }; // ----------- MultiHeadAttention ----------- class MultiHeadAttention { private: int dModel, nHeads, d_k; vector<matrixd> WQ, WK, WV; // per head matrixd WO; // cache vector<matrixd> Qs, Ks, Vs, Ps, As; public: MultiHeadAttention(int dModel, int nHeads); matrixd computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, bool mask); matrixd backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, const matrixd &dL_dH, double eta, bool mask); }; // ----------------- OutputLayer --------------- class OutputLayer { private: matrixd W; // dModel x vocab_size int dModel, vocab_size; public: OutputLayer(int dModel, int vocab_size); matrixd forward(const matrixd &X); matrixd backward(const matrixd &X, const matrixd &P, const vector<int> &targets, double eta); }; // ------------------- GPT-style Transformer --------------- class GPTBlock { private: MultiHeadAttention mha; AddnNorm norm1; FeedForward ffn; AddnNorm norm2; public: GPTBlock(int dModel, int nHeads, int d_ff); matrixd forward(const matrixd &X); matrixd backward(const matrixd &X, const matrixd &dL_dY, double eta); }; class MiniGPT { private: Embedding embed; PositionalEncoding pe; vector<GPTBlock> layers; OutputLayer output; int seq_len, vocab_size; public: MiniGPT(int vocab_size, int seq_len, int dModel, int nHeads, int d_ff, int nLayers); matrixd forward(const vector<int> &tokens); double trainStep(const vector<int> &tokens, double eta); }; #endif

transGpt.cpp:

// http://forejune.co/cuda // transGpt.cpp : classes of GPT-style Transformer #include <iostream> #include <fstream> #include <vector> #include <string> #include <unordered_map> #include <cmath> #include <random> #include <algorithm> #include <iomanip> #include "util.h" #include "transGpt.h" using namespace std; // ------------------------ Tokenizer class -------------------------- // ------------------------------- // Default constructor // ------------------------------- Tokenizer::Tokenizer() { nTokens = 0; // Initialize special tokens first addWord(PAD); // 0 addWord(UNK); // 1 addWord(BOS); // 2 addWord(EOS); // 3 addWord(SEP); // 4 } // ------------------------------- // Build vocab from file // ------------------------------- Tokenizer::Tokenizer(ifstream &fs) : Tokenizer() { string line; while (getline(fs, line)) { vector<string> tokens = split(line); //split(line); for (int i = 0; i < tokens.size(); i++) addWord(tokens[i]); } } unordered_map<string, int> Tokenizer::getTokensWords() { return words; } vector<int> Tokenizer::tokenize(const string& str) { vector<int> ids; vector<string> tokens = split(str); for (int i = 0; i < tokens.size(); i++) { string w = tokens[i]; if (words.find(w) != words.end()) ids.push_back(words[w]); else ids.push_back(words[UNK]); } return ids; } string Tokenizer::detokenize(const vector<int>& tokenIDs) { string words; int n = tokenIDs.size(); for (int i = 0; i < n; i++) { int id = tokenIDs[i]; if (wordIndex.find(id) == wordIndex.end()) continue; string w = wordIndex[id]; // Skip special tokens (optional behavior) if (w == "<PAD>" || w == "<BOS>" || w == "<EOS>") continue; words += w + " "; } return words; } int Tokenizer::get_nTokens() const { return nTokens; } unordered_map<int, string>Tokenizer::get_wordIndex() { return wordIndex; } //------------------ Embedding Class ------------------ Embedding::Embedding(int sequence_length, int embeddingDimension) { nTokens = sequence_length; dim = embeddingDimension; // initialize embeddings as an nTokensxdim matrix with certain random values initMatrix(em_matrix, nTokens, dim); } // create embedding matrix with tokenIDs matrixd Embedding::embed(const vector<int>& tokenIDs) { matrixd embeddings; embeddings.reserve(tokenIDs.size()); // set capacity of embeddings vector for(int i = 0; i < tokenIDs.size(); i++) { if (tokenIDs[i] >= 0 && tokenIDs[i] < nTokens) embeddings.push_back(em_matrix[tokenIDs[i]]); else // Out-of-dictionary tokens embeddings.push_back(vector<double>(dim, 0.0)); } return embeddings; } int Embedding::get_embedDim() const { return dim; } int Embedding::get_nTokens() const { return nTokens; } // --------------------- Positional Encoding class --------------- PositionalEncoding::PositionalEncoding(int max_seq_length, int embed_dimension) { maxTokens = max_seq_length; dModel = embed_dimension; PE.resize(maxTokens, vector<double>(dModel)); for (int pos = 0; pos < maxTokens; pos++) { for (int i = 0; i < dModel/2; i++) { PE[pos][2*i] = sin( pos / pow(10000, 2.0*i/dModel) ); PE[pos][2*i+1] = cos( pos / pow(10000, 2.0*i/dModel) ); } } } // add positional encodings to embeddings matrixd PositionalEncoding::addPE(const matrixd& embeddings) { matrixd x = embeddings; int nTokens = embeddings.size(); for (int pos = 0; pos < nTokens; pos++) for (int i = 0; i < dModel; i++) x[pos][i] += PE[pos][i]; return x; } matrixd PositionalEncoding::getPE() { return PE; } // -------------------- Add & Norm Class--------------------------------- // constructor AddnNorm::AddnNorm(int dModel) { gamma.assign(dModel, 1.0); beta.assign(dModel, 0.0), eps = 1e-5; } matrixd AddnNorm::forward (const matrixd &A, const matrixd &B) { matrixd x = addMat(A, B); // add two matrices int n = x.size(); // rows int m = x[0].size(); // cols mean.resize( n ); var.resize( n ); Zhat.resize(n, vector<double>(m, 0)); matrixd Y(n, vector(m)); // normalize the result for (int i = 0; i < n; i++) { double sum = 0; // sum of one row var[i] = 0; for (int j = 0; j < m; j++) sum += x[i][j]; mean[i] = sum / m; // mu_i, mean of i-th row sum = 0; for (int j = 0; j < m; j++) { double diff = x[i][j] - mean[i]; sum += diff * diff; } var[i] = sum / m; for (int j = 0; j < m; j++) { double xd = (x[i][j] - mean[i]) / sqrt(var[i] + eps); Zhat[i][j] = xd; Y[i][j] = gamma[j] * xd + beta[j]; } } return Y; // final output of Add & norm } // Assume forward already filled: Zhat, mean, var matrixd AddnNorm::backProp(const matrixd &X, const matrixd &dL_dY, double eta) { int nt = X.size(); int m = X[0].size(); matrixd dL_dZ(nt, vector<double>(m, 0.0)); vector<double> dL_dGamma(m, 0.0); // gradient wrt gamma vector<double> dL_dBeta(m, 0.0); // gradient wrt beta // 1. Compute dL_dGamma and dL_dBeta for each token for (int i = 0; i < nt; i++) { for (int j = 0; j < m; j++){ dL_dBeta[j] += dL_dY[i][j]; dL_dGamma[j] += dL_dY[i][j] * Zhat[i][j]; } } // 2. Update parameters for (int j = 0; j < m; j++) { gamma[j] -= eta * dL_dGamma[j]; beta[j] -= eta * dL_dBeta[j]; } // 3. Backprop through LayerNorm (row-wise) for (int i = 0; i < nt; i++) { double inv_sigma = 1.0 / sqrt(var[i] + eps); // Step A: compute intermediate values vector<double> dL_dZhat(m); for (int j = 0; j < m; j++) { dL_dZhat[j] = dL_dY[i][j] * gamma[j]; } double sum_dL_dZhat = 0.0; double sum_dL_dZhat_zhat = 0.0; for (int j = 0; j < m; j++) { sum_dL_dZhat += dL_dZhat[j]; sum_dL_dZhat_zhat += dL_dZhat[j] * Zhat[i][j]; } // Final gradient for (int j = 0; j < m; j++) { dL_dZ[i][j] = inv_sigma * (dL_dZhat[j] - sum_dL_dZhat / m - Zhat[i][j] * sum_dL_dZhat_zhat / m); } } return dL_dZ; } // Optional setters (to plug in forward results) void AddnNorm::setCache(const matrixd& zhat, const vector<double>& m, const vector<double>& v) { Zhat = zhat; mean = m; var = v; } // --------------------------- Feed Forward ----------------------------------- FeedForward::FeedForward(int dModel0, int d_ff0) { dModel = dModel0; d_ff = d_ff0; W1.resize(dModel, vector<double>(d_ff)); W2.resize(d_ff, vector<double>(dModel)); for (int i = 0; i < dModel; i++) for (int j = 0; j < d_ff; j++) W1[i][j] = ((double)rand() / RAND_MAX - 0.5); for (int i = 0; i < d_ff; i++) for (int j = 0; j < dModel; j++) W2[i][j] = ((double)rand() / RAND_MAX - 0.5); } // ------------------------------ // Forward (for cache) // ------------------------------ matrixd FeedForward::FFoutput(const matrixd &X) { Z = matmul(X, W1); // pre-activation F1 = relu( Z ); //f(Z); // F1 = f(XW1) matrixd F2 = matmul(F1, W2); // F2 = F1 W2 return F2; } // ------------------------------ // Backprop // ------------------------------ matrixd FeedForward::backProp(const matrixd &X, const matrixd &dL_dY, // dL/dF2 double eta) { // ---- W2 ---- matrixd dL_dF2 = dL_dY; // dL/dW2 = F1^T * dL_dF2 matrixd F1T = transpose(F1); matrixd dL_dW2 = matmul(F1T, dL_dF2); // dL_dF1 = dL_dF2 * W2^T matrixd W2T = transpose(W2); matrixd dL_dF1 = matmul(dL_dF2, W2T); // ---- find dL_dZ1 ---- matrixd fd_Z = relu_deriv(Z); // f_deriv(Z); int rows = dL_dF1.size(); int cols = dL_dF1[0].size(); matrixd dL_dZ1 = dL_dF1; // same dimension for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) dL_dZ1[i][j] = dL_dF1[i][j] * fd_Z[i][j]; // ---- W1 ---- // dL/dW1 = X^T * dL/dZ1 matrixd XT = transpose(X); matrixd dL_dW1 = matmul(XT, dL_dZ1); // dL/dX = dL/dZ1 * W1^T matrixd W1T = transpose(W1); matrixd dL_dX = matmul(dL_dZ1, W1T); // ---- Update weights ---- for (int i = 0; i < W2.size(); i++) for (int j = 0; j < W2[0].size(); j++) W2[i][j] -= eta * dL_dW2[i][j]; for (int i = 0; i < W1.size(); i++) for (int j = 0; j < W1[0].size(); j++) W1[i][j] -= eta * dL_dW1[i][j]; return dL_dX; // This becomes the dL_dY of next stage } // ------------------ MultiHeadAttention -------------------- MultiHeadAttention::MultiHeadAttention(int dModel0, int nHeads0) { dModel = dModel0; nHeads = nHeads0; d_k = dModel / nHeads; WQ.resize(nHeads); WK.resize(nHeads); WV.resize(nHeads); for (int h = 0; h < nHeads; h++) { initMatrix(WQ[h], dModel, d_k); initMatrix(WK[h], dModel, d_k); initMatrix(WV[h], dModel, d_k); } initMatrix(WO, dModel, dModel); } matrixd MultiHeadAttention::computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, bool mask) { Qs.clear(); Ks.clear(); Vs.clear(); Ps.clear(); As.clear(); vector<matrixd> heads; for (int h = 0; h < nHeads; h++) { matrixd Q = matmul(X_Q, WQ[h]); matrixd K = matmul(X_K, WK[h]); matrixd V = matmul(X_V, WV[h]); matrixd S = matmul(Q, transpose(K)); scaleMat(S, 1.0 / sqrt(d_k)); if (mask) S = causalMask(S); matrixd P = softmax(S); matrixd A = matmul(P, V); Qs.push_back(Q); Ks.push_back(K); Vs.push_back(V); Ps.push_back(P); As.push_back(A); heads.push_back(A); } // Concatenate heads int n = heads[0].size(); matrixd concat(n, vector<double>(dModel, 0.0)); for (int h = 0; h < nHeads; h++) for (int i = 0; i < n; i++) for (int j = 0; j < d_k; j++) concat[i][h * d_k + j] = heads[h][i][j]; return matmul(concat, WO); } // backpropagation matrixd MultiHeadAttention::backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V, const matrixd &dL_dH, double eta, bool mask) { // ---- WO gradients ---- matrixd A_cat; // rebuild concat from cached As int nt = As[0].size(); double s = 1.0 / sqrt(d_k); // for scaling A_cat = matrixd(nt, vector<double>(dModel, 0.0)); for (int h = 0; h < nHeads; h++) for (int i = 0; i < nt; i++) for (int j = 0; j < d_k; j++) A_cat[i][h*d_k + j] = As[h][i][j]; matrixd dL_dWO = matmul(transpose(A_cat), dL_dH); matrixd dL_dAcat = matmul(dL_dH, transpose(WO)); // update WO updateWeight(WO, dL_dWO, eta); // ---- initialize accumulators ---- matrixd dL_dXQ(X_Q.size(), vector<double>(dModel, 0.0)); matrixd dL_dXK(X_K.size(), vector<double>(dModel, 0.0)); matrixd dL_dXV(X_V.size(), vector<double>(dModel, 0.0)); // ---- per head ---- for (int h = 0; h < nHeads; h++) { // slice gradient matrixd dL_dA(nt, vector<double>(d_k)); for (int i = 0; i < nt; i++) for (int j = 0; j < d_k; j++) dL_dA[i][j] = dL_dAcat[i][h*d_k + j]; matrixd P = Ps[h]; matrixd V = Vs[h]; matrixd Q = Qs[h]; matrixd K = Ks[h]; // ---- A = P V ---- matrixd dL_dP = matmul(dL_dA, transpose(V)); matrixd dL_dV = matmul(transpose(P), dL_dA); // ---- softmax ---- matrixd dL_dS = softmax_derivative(dL_dP, P); // ---- S = QK^T ---- matrixd dL_dQ = matmul(dL_dS, K); matrixd dL_dK = matmul(transpose(dL_dS), Q); scaleMat(dL_dQ, s); scaleMat(dL_dK, s); // ---- Weight gradients ---- matrixd dL_dWQ = matmul(transpose(X_Q), dL_dQ); matrixd dL_dWK = matmul(transpose(X_K), dL_dK); matrixd dL_dWV = matmul(transpose(X_V), dL_dV); // update weights updateWeight(WQ[h], dL_dWQ, eta); for (int i = 0; i < WK[h].size(); i++) for (int j = 0; j < WK[h][0].size(); j++) WK[h][i][j] -= eta * dL_dWK[i][j]; for (int i = 0; i < WV[h].size(); i++) for (int j = 0; j < WV[h][0].size(); j++) WV[h][i][j] -= eta * dL_dWV[i][j]; // ---- input gradients ---- matrixd dXQ = matmul(dL_dQ, transpose(WQ[h])); matrixd dXK = matmul(dL_dK, transpose(WK[h])); matrixd dXV = matmul(dL_dV, transpose(WV[h])); // accumulate dL_dXQ = addMat(dL_dXQ, dXQ); dL_dXK = addMat(dL_dXK, dXK); dL_dXV = addMat(dL_dXV, dXV); } // for h // For self-attention, these are the same return dL_dXQ; } // ---------- OutputLayer ---------------------- OutputLayer::OutputLayer(int dModel0, int vocab_size0) { dModel = dModel0; vocab_size = vocab_size0; initMatrix(W, dModel, vocab_size); } // Forward: logits --> probabilities matrixd OutputLayer::forward(const matrixd &X) { // X: (seq_len x dModel) // output: (seq_len x vocab_size) matrixd logits = matmul(X, W); return softmax(logits); } // Backward: returns dL/dX matrixd OutputLayer::backward(const matrixd &X, const matrixd &P, const vector<int> &targets, double eta) { int n = P.size(); // dL/dP = P - Y (cross-entropy + softmax) matrixd dL_dP = P; for (int i = 0; i < n; i++) dL_dP[i][targets[i]] -= 1.0; // Gradient wrt weights matrixd dL_dW = matmul(transpose(X), dL_dP); //update weights updateWeight(W, dL_dW, eta); // Gradient wrt input (back to decoder) matrixd dL_dX = matmul(dL_dP, transpose(W)); return dL_dX; } // ------------ GPT-style Transformer (Decoder-only) ------------------- GPTBlock::GPTBlock(int dModel, int nHeads, int d_ff) // Decoder : mha(dModel, nHeads), norm1(dModel), ffn(dModel, d_ff), norm2(dModel) { } matrixd GPTBlock::forward(const matrixd &X) { matrixd H = mha.computeAttention(X, X, X, true); // masked matrixd X1 = norm1.forward(X, H); matrixd F = ffn.FFoutput(X1); matrixd Y = norm2.forward(X1, F); return Y; } matrixd GPTBlock::backward(const matrixd &X, const matrixd &dL_dY, double eta) { matrixd dF = norm2.backProp(X, dL_dY, eta); matrixd dX1 = ffn.backProp(X, dF, eta); matrixd dH = norm1.backProp(X, dX1, eta); return mha.backProp(X, X, X, dH, eta, true); } MiniGPT::MiniGPT(int vocab_size, int seq_len, int dModel, int nHeads, int d_ff, int nLayers) : embed(vocab_size, dModel), pe(seq_len, dModel), output(dModel, vocab_size), seq_len(seq_len), vocab_size(vocab_size) { for (int i = 0; i < nLayers; i++) { GPTBlock gptBlock(dModel, nHeads, d_ff); layers.push_back(gptBlock); } } matrixd MiniGPT::forward(const vector<int> &tokens) { matrixd X = embed.embed(tokens); X = pe.addPE(X); for(int i = 0; i < layers.size(); i++) X = layers[i].forward(X); return output.forward(X); } double MiniGPT::trainStep(const vector<int> &tokens, double eta) { // create input/target (shifted) vector<int> input(tokens.begin(), tokens.end()-1); vector<int> target(tokens.begin()+1, tokens.end()); /* e.g. input sequence: ["<SOS>", "one", "two"] target sequence: ["one", "two", "three"] */ matrixd X = embed.embed(input); X = pe.addPE(X); vector<matrixd> cache; for (int i = 0; i < layers.size(); i++) { X = layers[i].forward(X); cache.push_back(X); } matrixd P = output.forward(X); /* Position i: Model takes tokens 1..i-1 (from input) and outputs probability distribution P[i] over the vocabulary target[i] = correct token at position i P[i][target[i]] = probability model assigned to the correct token */ // loss is for reference only, we do not need it in back propagation double loss = crossEntropy(P, target); // backward, gradient loss is computed from P and target matrixd dL_dX = output.backward(X, P, target, eta); for (int i = layers.size()-1; i >= 0; i--) dL_dX = layers[i].backward(cache[i], dL_dX, eta); return loss; }

testMain.cpp:

// http://forejune.co/cuda/
// testMain.cpp

#include "util.h"
#include "transGpt.h"
#include <fstream>

using namespace std;

vector<int> generate(MiniGPT &model, vector<int> prompt, int max_len)
{
    vector<int> tokens = prompt;

    for (int step = 0; step < max_len; step++) {
	vector<int> input(tokens.begin(), tokens.end());
        matrixd P = model.forward(input);

        vector<double> last = P.back();   //last row of matrix P
	// last contains probability distribution over the vocabulary for the next token
	// each index in last corresponds to a token ID in the vocabulary

	// greedy decoding, - last.begin() converts interator to index
        int next = max_element(last.begin(), last.end()) - last.begin();

        tokens.push_back(next);
    }

    return tokens;
}

vector<vector<int>> buildDataset(Tokenizer &tokenizer)
{
    vector<vector<int>> dataset;

    string str = "one two three  uno  dos  tres";
    vector<int> tokenIDs = tokenizer.tokenize(str);
    dataset.push_back( tokenIDs );
    str = "two three  four dos  tres  cuatro ";
    tokenIDs = tokenizer.tokenize(str);
    dataset.push_back( tokenIDs );
    str = "three  four  five tres  cuatro  cinco";
    tokenIDs = tokenizer.tokenize(str);
    dataset.push_back( tokenIDs );
    str = "one two three  uno  dos  tres";
    tokenIDs = tokenizer.tokenize(str);
    dataset.push_back( tokenIDs );
    str = "three  four  five tres  cuatro  cinco";
    tokenIDs = tokenizer.tokenize(str);
    dataset.push_back( tokenIDs );

    return dataset;
}

int main() 
{
    // -------------------------
    // Tokenizer setup
    // -------------------------
    
    // Build vocabulary 
    ifstream ifs;
    char fname[] = "t.txt";
    ifs.open( fname );
    if (!ifs.is_open()) {
          cerr << "Unable to open file " << fname  << endl;
          exit( 1 );
    }

    Tokenizer tokenizer( ifs );

    int vocab_size = tokenizer.get_nTokens();

    cout << "Vocab size: " << vocab_size << endl;

    // -------------------------
    // Model
    // -------------------------
    int seq_len = 50;	// maximum sequence length model can handle
    int dModel = 32;
    int nHeads = 4;
    int d_ff = 128;
    int nLayers = 2;

    MiniGPT model(vocab_size, seq_len, dModel, nHeads, d_ff, nLayers);

    // -------------------------
    // Dataset 
    // -------------------------
    vector<vector<int>> dataset;
    dataset = buildDataset( tokenizer );

    // -------------------------
    // Training
    // -------------------------
    for (int epoch = 0; epoch < 201; epoch++) {
        double loss = 0;

        for (auto &seq : dataset)
            loss += model.trainStep(seq, 0.002);

        if (epoch % 20 == 0)
            cout << "Epoch " << epoch << " Loss: " << loss << endl;
    }

    // -------------------------
    // Inference
    // -------------------------
    string prompt = "one two three";
    vector<int> prompt_tokens = tokenizer.tokenize(prompt);
    cout << "\nPrompt: " << prompt << endl;
    vector<int> generated = generate(model, prompt_tokens, 3);
    string output = tokenizer.detokenize(generated);
    cout << "\nGenerated text:\n" << output << endl;

    prompt = "three four five";
    prompt_tokens = tokenizer.tokenize(prompt);
    cout << "\nPrompt: " << prompt << endl;
    generated = generate(model, prompt_tokens, 3);
    output = tokenizer.detokenize(generated);
    cout << "\nGenerated text:\n" << output << endl;

    return 0;
}

Makefile:

PROG	= testMain
#source codes
SRCS =  $(PROG).cpp
#substitute .cpp by .o to obtain object filenames
OBJS = $(SRCS:.cpp=.o) util.o  transGpt.o

#$< evaluates to the target's dependencies, 
#$@ evaluates to the target

$(PROG): $(OBJS)
	g++ -o $@ $(OBJS)  

$(OBJS): 
	g++ -c  -std=c++20 $*.cpp 

clean:
	rm $(OBJS) $(PROG)