Materials available at: http://forejune.co/cuda/
Review
|
|
Back Propagation of Output Layer
|
|
Ouput Layer:
|
class OutputLayer {
private:
matrixd W; // dModel x vocab_size
int dModel, vocab_size;
public:
OutputLayer(int dim, int nv)
{
dModel = dim;
vocab_size = nv;
initMatrix(W, dModel, vocab_size);
}
// Forward: logits --> probabilities
matrixd forward(const matrixd &X) {
// X: (seq_len x dModel)
// output: (seq_len x vocab_size)
matrixd logits = matmul(X, W);
return softmax(logits);
}
// Backward: returns dL/dX
matrixd backward(const matrixd &X, const matrixd &P,
const vector
|
Back Propagation of Encoder Layers and Decoder Layers
struct DecoderGrad {
matrixd dL_dX; // gradient to decoder input (dL/dX)
matrixd dL_dE; // gradient to encoder output
};
|
|
// obtain dL/dX from Output Layer
matrixd dL_dX = output.backward(dec_out, P, target_out, eta);
matrixd dL_dE; // accumulate gradients to encoder
bool first = true;
// accumulate encoder gradients dL_dE
for (int i = dLayers.size() - 1; i >= 0; i--) {
DecoderGrad g = dLayers[i].backProp(dL_dX, eta);
dL_dX = g.dL_dX;
if (first) {
dL_dE = g.dL_dE;
first = false;
} else {
dL_dE = addMat(dL_dE, g.dL_dE);
}
}
Forward computation:
matrixd E = source_pe; // embeddings + positional encoding
for (int i = 0; i < eLayers.size(); i++)
E = eLayers[i].forward(E);
In coding
dL/dY ~ dL/dXl+1 (output gradient)
dL/dX ~ dL/dXl (input gradient)
// at the last encoder layer, dL_dE is from decoder layers
for (int i = eLayers.size() - 1; i >= 0; i--)
dL_dE = eLayers[i].backProp(dL_dE, eta);
Full Implementation in C/C++
Helper Functions:
#ifndef __UTIL_H__
#define __UTIL_H__
#include <vector>
#include <algorithm>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <random>
using namespace std;
typedef vector<vector<double>> matrixd;
// an activation function
double f(double z);
// Relu activation function with matrix input
matrixd relu(const matrixd &X);
// Derivative of Relu
matrixd relu_deriv (const matrixd &X);
// calculates entropy loss
double crossEntropy(const matrixd &P, const vector<int> &target);
// add two matrices
matrixd addMat(const matrixd &A, const matrixd &B);
// Transpose matrix n x m, B = A^T : m x n
matrixd transpose(const matrixd& A);
// Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr
matrixd matmul(const matrixd& A, const matrixd& B);
// multiply a matrix by a scalar
matrixd mulMats (const matrixd &A, const double s);
// scale a matrix
void scaleMat(matrixd &A, const double s);
// apply masking to S
matrixd causalMask(matrixd S);
// update weight matrix W -= dW * eta
void updateWeight(matrixd &W, const matrixd &dW, const double eta);
// initialize an input matrix with certain random values
void initMatrix(matrixd& M, const int n, const int m);
void initMatrix(matrixd& M, const int n, const int m, double dModel);
// print a token dictionary
void printDictionary(unordered_map<string, int> &d);
void printVec (const vector<int> &v);
// print a matrix
void printMatrix(const matrixd &y);
// Add & Norm
matrixd addNnorm(const matrixd &A, const matrixd &B);
void clip(matrixd &A, double max_val);
// softmax, 1D vector input
vector<double> softmax(const vector<double> &v);
// softmax activation function, 2D input vector
matrixd softmax(const matrixd& input);
// derivative function of softmax
matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P);
#endif
|
util.cpp:
// util.cpp: helper functions
// http://forejune.co/cuda/
#include "util.h"
#include <random>
using namespace std;
// Activation function: using ReLU (Rectified Linear Unit)
double f(double z)
{
return max(0.0, z);
}
// Relu activation function with matrix input
matrixd relu(const matrixd &X)
{
int rows = X.size();
int cols = X[0].size();
matrixd Y = X; // same dimension of X
for(int i = 0; i < rows; i++)
for(int j = 0; j < cols; j++)
Y[i][j] = max(0.0, X[i][j]);
return Y;
}
// Derivative of Relu
matrixd relu_deriv(const matrixd &X)
{
int rows = X.size();
int cols = X[0].size();
matrixd Y = X; // same dimension as X
for(int i = 0; i < rows; i++)
for(int j = 0; j < cols; j++)
if ( X[i][j] > 0 )
Y[i][j] = 1;
else
Y[i][j] = 0;
return Y;
}
// calculates entropy loss
double crossEntropy(const matrixd &P, const vector<int> &target)
{
double L = 0;
int nt = P.size();
for (int i = 0; i < nt; i++){
int k = target[i];
L -= log( P[i][k] + 1e-12 );
}
return L / nt;
}
// C = A + B
matrixd addMat(const matrixd &A, const matrixd &B)
{
int n = A.size(); // rows
int m = A[0].size(); // cols
matrixd C = A; // same dimension as A
for (int i = 0; i < n; i++)
for(int j = 0; j < m; j++)
C[i][j] = A[i][j] + B[i][j];
return C;
}
// Transpose matrix n x m, B = A^T : m x n
matrixd transpose(const matrixd& A)
{
int n = A.size(); //number of rows
int m = A[0].size(); //number of columns
matrixd B(m, vector<double>(n));
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
B[j][i] = A[i][j];
return B;
}
// Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr
matrixd matmul(const matrixd& A, const matrixd& B)
{
int n = A.size();
int m = A[0].size();
int r = B[0].size();
matrixd C(n, vector<double>(r, 0));
for (int i = 0; i < n; i++)
for (int j = 0; j < r; j++)
for (int k = 0; k < m; k++)
C[i][j] += A[i][k] * B[k][j];
return C;
}
// scale a matrix
void scaleMat(matrixd &A, const double s)
{
int m = A.size(); // number of rows
int n = A[0].size(); // number of columns
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
A[i][j] *= s;
}
// maultiply a matrix by a scalar s
matrixd mulMats (const matrixd &A, const double s)
{
int m = A.size(); // number of rows
int n = A[0].size(); // number of columns
matrixd B = A;
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
B[i][j] *= s;
return B;
}
void initMatrix(matrixd& M, const int n, const int m)
{
// resizing 2D vector to n x m with initial values 0
M.assign(n, vector<double>(m, 0));
random_device rd;
mt19937 gen(rd()); //psuedo random number generator
double sigma = 1.0 / n;
normal_distribution<double> dist(0.0, sigma);
for (int i = 0; i < n; ++i)
for (int j = 0; j < m; ++j)
M[i][j] = dist(gen);
for (int i = 0; i < n; ++i)
for (int j = 0; j < m; ++j)
M[i][j] = rand() % 10000 / 100000.0;
}
void initMatrix(matrixd& M, const int n, const int m, double sigma)
{
// resizing 2D vector to n x m with initial values 0
M.assign(n, vector<double>(m, 0));
random_device rd;
mt19937 gen(rd()); //psuedo random number generator
double d = n;
normal_distribution<double> dist(0.0, sigma);
for (int i = 0; i < n; ++i)
for (int j = 0; j < m; ++j)
M[i][j] = dist(gen);
}
void printDictionary(unordered_map<string, int> &d)
{
unordered_map<string, int>::iterator it = d.begin();
int k = 0;
while ( it != d.end() ){
cout << right << setw(12) << it->first << ": " << setw(3) << it->second;
it++;
k++;
if ( k % 5 == 0 )
cout << endl;
}
}
void printVec (const vector<int> &v)
{
int n = v.size();
for (int i = 0; i < n; i++){
cout << v[i];
if ( i < n - 1)
cout << ", ";
}
cout << endl;
}
void printMatrix(const matrixd &y)
{
int cols = y.size();
int rows = y[0].size();
for (int i = 0; i < cols; ++i) {
cout << "Token " << i << ": [";
for (int j = 0; j < rows; ++j) {
cout << fixed << setw(7) << setprecision(3) << y[i][j];
if (j < 4) cout << ", ";
}
cout << " ]\n";
}
}
matrixd addNnorm(const matrixd &A, const matrixd &B)
{
auto x = addMat(A, B); // add two matrices
int n = x.size(); // rows
int m = x[0].size(); // cols
matrixd y = x; // output same size as x
// normalize the result
double epsilon = 1e-5;
for (int i = 0; i < n; i++) {
double sum = 0; // sum of one row
double std = 0; //sigma_i square
for (int j = 0; j < m; j++)
sum += x[i][j];
double mean = sum / m; // mu_i, mean of i-th row
sum = 0;
for (int j = 0; j < m; j++) {
double diff = x[i][j] - mean;
sum += diff * diff;
}
std = sum / m;
for (int j = 0; j < m; j++) {
double xd = (x[i][j] - mean) / sqrt(std + epsilon);
y[i][j] = xd; // gamma = 1, beta = 0;
}
}
return y; // final output of Add & norm
}
matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V,
bool mask)
{
double d_k, scale;
matrixd KT, A; // K transpose, attention
d_k = Q[0].size();
scale = 1.0 / sqrt( d_k );
KT = transpose ( K );
A = matmul(Q, KT);
scaleMat(A, scale);
if ( mask ) {
for (int i = 0; i < A.size(); i++)
for (int j = i+1; j < A[0].size(); j++)
A[i][j] = -1e9; // causal mask
}
A = softmax( A );
matrixd output = matmul(A, V);
return output;
}
matrixd causalMask(matrixd S)
{
int n = S.size();
for (int i = 0; i < n; i++) {
for (int j = i+1; j < n; j++) {
S[i][j] = -1e9; // effectively -∞
}
}
return S;
}
void updateWeight(matrixd &W, const matrixd &dW, const double eta)
{
for (int i = 0; i < W.size(); i++)
for (int j = 0; j < W[0].size(); j++)
W[i][j] -= eta * dW[i][j];
}
void clip(matrixd &A, double max_val = 5.0)
{
for (auto &row : A)
for (auto &x : row)
if (x > max_val) x = max_val;
else if (x < -max_val) x = -max_val;
}
// softmax activation function, 1D input vector
vector<double> softmax(const vector<double> &z)
{
int m = z.size();
vector<double> y(m, 0);
//maximum value of vector
double maxVal = *max_element(z.begin(), z.end());
double sum = 0;
// Subtract max for numerical stability
for (int j = 0; j < m; j++) {
y[j] = exp(z[j] - maxVal);
sum += y[j];
}
// Normalize
for (int j = 0; j < m; j++)
y[j] /= sum;
return y;
}
// softmax activation function, 2D input vector
matrixd softmax(const matrixd& input)
{
int n = input.size(), m = input[0].size(); // n x m matrix
// 2D output vector n x m y[n][m]
vector<vector<double>> y(n, vector<double>(m));
for (int i = 0; i < n; i++) {
//maximum value of the row
double maxVal = *max_element(input[i].begin(), input[i].end());
double sum = 0;
// Subtract max for numerical stability
for (int j = 0; j < m; j++) {
y[i][j] = exp(input[i][j] - maxVal);
sum += y[i][j];
}
// Normalize
for (int j = 0; j < input[i].size(); j++)
y[i][j] /= sum;
}
return y;
}
// derivative function of softmax
matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P)
{
int nt = P.size(), ns = P[0].size();
matrixd dL_dS(nt, vector<double>(ns,0));
for (int i = 0; i < nt; i++)
{
double dot = 0.0;
for (int j = 0; j < ns; j++)
dot += P[i][j] * dL_dP[i][j]; // dot product of two row-vectors
for (int j = 0; j < ns; j++) // dot used by all columns of dL/dA
dL_dS[i][j] = P[i][j] * (dL_dP[i][j] - dot);
}
return dL_dS;
}
|
#ifndef TRANSFORMER_H
#define TRANSFORMER_H
class Tokenizer
{
private:
unordered_map<string, int> words;
unordered_map<int, string> wordIndex;
int nTokens; // number of words
// special tokens
const string PAD = "<PAD>";
const string UNK = "<UNK>";
const string BOS = "<BOS>";
const string EOS = "<EOS>";
const string SEP = "<SEP>";
void addWord(const string &w)
{
if (words.find(w) == words.end()) {
int id = nTokens;
words[w] = id;
wordIndex[id] = w;
nTokens++;
}
}
vector<string> split(const string &str)
{
vector<string> tokens;
stringstream ss(str);
string word;
while (ss >> word) {
tokens.push_back(word);
}
return tokens;
}
public:
Tokenizer();
Tokenizer(ifstream &fs);
unordered_map<string, int> getTokensWords();
vector<int> tokenize(const string& str);
string detokenize(const vector<int>& tokenIDs);
int get_nTokens() const;
unordered_map<int, string> get_wordIndex();
};
class Embedding
{
private:
int nTokens; // number of words
int dim; // embedding dimension
matrixd em_matrix; // embedding matrix;
public:
Embedding(int sequence_length, int embeddingDimension);
// create embedding matrix with tokenIDs
matrixd embed(const vector<int>& tokenIDs);
void backProp(const vector<int>& tokens,
const matrixd& dX,
double eta);
int get_embedDim() const;
int get_nTokens() const;
};
// Positional Encoding
class PositionalEncoding
{
private:
int maxTokens; // maximum sequence length
int dModel; // embedding dimension
matrixd PE; // positional_encodings matrix;
public:
PositionalEncoding(int max_seq_length, int embedding_dimension);
void extendPE(int new_maxTokens);
matrixd addPE(const matrixd& embeddings);
matrixd getPE();
};
// Add & Norm
struct AddNormGrad {
matrixd dL_dA; // residual path
matrixd dL_dB; // sublayer path
};
class AddnNorm
{
private:
vector<double> gamma; // size dModel
vector<double> beta; // size dModel
matrixd Zhat; // normalized values (nt x d)
vector<double> mean; // per row (nt)
vector<double> var; // per row (nt)
double eps; // small value
public:
AddnNorm(int dModel);
// forward computation
matrixd forward (const matrixd &A, const matrixd &B);
// Assume forward already filled: Zhat, mean, var
AddNormGrad backProp(const matrixd &X, const matrixd &dL_dY, double eta);
// Optional setters (to plug in forward results)
void setCache(const matrixd& zhat, const vector<double>& m,
const vector<double>& v);
// Accessors (optional)
const vector<double>& getGamma();
const vector<double>& getBeta();
};
// Feed-forward Network
class FeedForward
{
private:
matrixd W1, W2;
int dModel, d_ff;
// cache
matrixd F1; // after activation function ReLU
matrixd Z; // before ReLU (needed for derivative)
public:
// constructor
FeedForward(int dModel, int d_ff_);
// ------------------------------
// Forward (for cache)
// ------------------------------
matrixd FFoutput(const matrixd &X);
// ------------------------------
// Backprop
// ------------------------------
matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta);
};
// ----------- MultiHeadAttention -----------
struct MHA_Gradient {
matrixd dL_dQ;
matrixd dL_dK;
matrixd dL_dV;
};
class MultiHeadAttention {
private:
int dModel, nHeads, d_k;
vector<matrixd> WQ, WK, WV; // per head
matrixd WO;
// cache
vector<matrixd> Qs, Ks, Vs, Ps, As;
public:
MultiHeadAttention(int dModel, int nHeads);
void clearCache();
matrixd computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,
bool mask);
MHA_Gradient backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,
const matrixd &dL_dH, double eta, bool mask);
};
// ----------------- OutputLayer ---------------
class OutputLayer {
private:
matrixd W; // dModel x vocab_size
int dModel, vocab_size;
public:
OutputLayer(int dModel, int vocab_size);
matrixd forward(const matrixd &X);
matrixd backward(const matrixd &X, const matrixd &P,
const vector<int> &targets, double eta);
};
// ---------- EncoderLayer --------------------
class EncoderLayer {
private:
MultiHeadAttention mha;
FeedForward ffn;
AddnNorm norm1, norm2;
matrixd X, H1, H2;
public:
EncoderLayer(); //default constructor
EncoderLayer(int d_model, int nHeads, int d_ff);
matrixd forward(const matrixd& input);
matrixd backProp(const matrixd& dL_dOut, double eta);
};
// ------------ Decoder Layer -------------------
struct DecoderGrad {
matrixd dL_dX; // gradient to decoder input
matrixd dL_dE; // gradient to encoder output
};
class DecoderLayer {
private:
FeedForward ffn;
AddnNorm norm1, norm2, norm3;
matrixd X, H1, H2, H3;
matrixd E; // final output of Encoder
public:
MultiHeadAttention self_mha;
MultiHeadAttention cross_mha;
DecoderLayer(int d_model, int nHeads, int d_ff);
matrixd forward(const matrixd& input,
const matrixd& encoder_output);
DecoderGrad backProp(const matrixd& dL_dOut, double eta);
};
// ------------------- Transformer ---------------
class MiniTrans {
private:
Embedding embed;
PositionalEncoding pe;
vector<EncoderLayer> eLayers;
vector<DecoderLayer> dLayers;
OutputLayer output;
int seq_len, vocab_size;
int dModel;
matrixd X_embed; // cache
matrixd X_pe; // cache
matrixd X_final;
matrixd dec_out; // cache decoder output
public:
MiniTrans(int vocab_size, int seq_len,
int dModel, int nHeads,
int d_ff, int nLayers);
matrixd forward(const vector<int> &src_tokens, const vector<int> &target_tokens);
double trainStep(const vector<int> &src,
const vector<int> &target_input,
const vector<int> &target_out,
double eta);
vector<int> generate(const vector<int> &src,
int max_len,
int start_token,
int end_token);
};
#endif
|
transformer.cpp:
// http://forejune.co/cuda
// transformer.cpp: Encoder-Decoder Transformer
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <unordered_map>
#include <cmath>
#include <random>
#include <algorithm>
#include <iomanip>
#include <cassert>
#include "util.h"
#include "transformer.h"
using namespace std;
// ------------------------ Tokenizer class --------------------------
// -------------------------------
// Default constructor
// -------------------------------
Tokenizer::Tokenizer()
{
nTokens = 0;
// Initialize special tokens first
addWord(PAD); // 0
addWord(UNK); // 1
addWord(BOS); // 2
addWord(EOS); // 3
addWord(SEP); // 4
}
// -------------------------------
// Build vocab from file
// -------------------------------
Tokenizer::Tokenizer(ifstream &fs) : Tokenizer()
{
string line;
while (getline(fs, line)) {
vector<string> tokens = split(line); //split(line);
for (int i = 0; i < tokens.size(); i++)
addWord(tokens[i]);
}
}
unordered_map<string, int> Tokenizer::getTokensWords()
{
return words;
}
vector<int> Tokenizer::tokenize(const string& str)
{
vector<int> ids;
vector<string> tokens = split(str);
for (int i = 0; i < tokens.size(); i++) {
string w = tokens[i];
if (words.find(w) != words.end())
ids.push_back(words[w]);
else
ids.push_back(words[UNK]);
}
return ids;
}
string Tokenizer::detokenize(const vector<int>& tokenIDs)
{
string words;
int n = tokenIDs.size();
for (int i = 0; i < n; i++) {
int id = tokenIDs[i];
if (wordIndex.find(id) == wordIndex.end())
continue;
string w = wordIndex[id];
// Skip special tokens (optional behavior)
if (w == "<PAD>" || w == "<BOS>" || w == "<EOS>")
continue;
words += w + " ";
}
return words;
}
int Tokenizer::get_nTokens() const
{
return nTokens;
}
unordered_map<int, string>Tokenizer::get_wordIndex()
{
return wordIndex;
}
//------------------ Embedding Class ------------------
Embedding::Embedding(int sequence_length, int embeddingDimension)
{
nTokens = sequence_length;
dim = embeddingDimension;
// initialize embeddings as an nTokensxdim matrix with certain random values
initMatrix(em_matrix, nTokens, dim);
}
// create embedding matrix with tokenIDs
matrixd Embedding::embed(const vector<int>& tokenIDs)
{
matrixd embeddings;
embeddings.reserve(tokenIDs.size()); // set capacity of embeddings vector
for(int i = 0; i < tokenIDs.size(); i++) {
if (tokenIDs[i] >= 0 && tokenIDs[i] < nTokens)
embeddings.push_back(em_matrix[tokenIDs[i]]);
else
// Out-of-dictionary tokens
embeddings.push_back(vector<double>(dim, 0.0));
}
return embeddings;
}
int Embedding::get_embedDim() const
{
return dim;
}
int Embedding::get_nTokens() const
{
return nTokens;
}
// optional
void Embedding::backProp(const vector<int>& tokenIDs,
const matrixd& dL_dX,
double eta)
{
int nt = tokenIDs.size();
for (int i = 0; i < nt; i++) {
int token = tokenIDs[i];
for (int j = 0; j < dim; j++)
em_matrix[token][j] -= eta * dL_dX[i][j];
}
}
// --------------------- Positional Encoding class ---------------
PositionalEncoding::PositionalEncoding(int max_seq_length, int embed_dimension)
{
maxTokens = max_seq_length;
dModel = embed_dimension;
PE.resize(maxTokens, vector<double>(dModel));
for (int pos = 0; pos < maxTokens; pos++) {
for (int i = 0; i < dModel/2; i++) {
PE[pos][2*i] = sin( pos / pow(10000, 2.0*i/dModel) );
PE[pos][2*i+1] = cos( pos / pow(10000, 2.0*i/dModel) );
}
}
}
void PositionalEncoding::extendPE(int new_maxTokens)
{
if (new_maxTokens <= maxTokens) return;
int oldMax = maxTokens;
maxTokens = new_maxTokens;
PE.resize(maxTokens, vector<double>(dModel, 0.0));
for (int pos = oldMax; pos < maxTokens; pos++) {
for (int i = 0; i < dModel; i += 2) {
double denom = pow(10000.0, (double)i / dModel);
PE[pos][i] = sin(pos / denom);
if (i + 1 < dModel)
PE[pos][i + 1] = cos(pos / denom);
}
}
}
// add positional encodings to embeddings
matrixd PositionalEncoding::addPE(const matrixd& embeddings)
{
int nTokens = embeddings.size();
int dim = embeddings[0].size();
assert(dim == dModel);
if (nTokens > maxTokens) {
extendPE(nTokens);
}
matrixd x = embeddings;
// scale before adding PE (optional)
double scale = sqrt(dModel);
for (int i = 0; i < nTokens; i++)
for (int j = 0; j < dModel; j++)
x[i][j] = embeddings[i][j] * scale + PE[i][j];
return x;
}
matrixd PositionalEncoding::getPE()
{
return PE;
}
// -------------------- Add & Norm Class---------------------------------
// constructor
AddnNorm::AddnNorm(int dModel)
{
gamma.assign(dModel, 1.0);
beta.assign(dModel, 0.0),
eps = 1e-5;
}
matrixd AddnNorm::forward (const matrixd &A, const matrixd &B)
{
matrixd x = addMat(A, B); // add two matrices
int n = x.size(); // rows
int m = x[0].size(); // cols
mean.resize( n );
var.resize( n );
Zhat.resize(n, vector<double>(m, 0));
matrixd Y(n, vector<double>(m));
// normalize the result
for (int i = 0; i < n; i++) {
double sum = 0; // sum of one row
var[i] = 0;
for (int j = 0; j < m; j++)
sum += x[i][j];
mean[i] = sum / m; // mu_i, mean of i-th row
sum = 0;
for (int j = 0; j < m; j++) {
double diff = x[i][j] - mean[i];
sum += diff * diff;
}
var[i] = sum / m;
for (int j = 0; j < m; j++) {
double xd = (x[i][j] - mean[i]) / sqrt(var[i] + eps); //epsilon);
Zhat[i][j] = xd;
Y[i][j] = gamma[j] * xd + beta[j];
}
}
return Y; // final output of Add & norm
}
// Assume forward already filled: Zhat, mean, var
AddNormGrad AddnNorm::backProp(const matrixd &X, const matrixd &dL_dY, double eta)
{
int nt = X.size();
int m = X[0].size();
matrixd dL_dZ(nt, vector<double>(m, 0.0));
matrixd dL_dZhat(nt, vector<double>(m, 0.0));
vector<double> dL_dGamma(m, 0.0); // gradient wrt gamma
vector<double> dL_dBeta(m, 0.0); // gradient wrt beta
// 1. Compute dL_dGamma and dL_dBeta for each token
for (int i = 0; i < nt; i++)
{
for (int j = 0; j < m; j++){
dL_dBeta[j] += dL_dY[i][j];
dL_dGamma[j] += dL_dY[i][j] * Zhat[i][j];
}
}
// 2. Backprop through LayerNorm (row-wise)
for (int i = 0; i < nt; i++)
{
double inv_sigma = 1.0 / sqrt(var[i] + eps);
// Step A: compute intermediate values
vector<double> dL_dZhat(m);
for (int j = 0; j < m; j++)
{
dL_dZhat[j] = dL_dY[i][j] * gamma[j];
}
double sum_dL_dZhat = 0.0;
double sum_dL_dZhat_zhat = 0.0;
for (int j = 0; j < m; j++)
{
sum_dL_dZhat += dL_dZhat[j];
sum_dL_dZhat_zhat += dL_dZhat[j] * Zhat[i][j];
}
// Final gradient
for (int j = 0; j < m; j++)
{
dL_dZ[i][j] = inv_sigma * (dL_dZhat[j] - sum_dL_dZhat / m
- Zhat[i][j] * sum_dL_dZhat_zhat / m);
}
}
// 3. Update parameters
for (int j = 0; j < m; j++)
{
gamma[j] -= eta * dL_dGamma[j];
beta[j] -= eta * dL_dBeta[j];
}
AddNormGrad grad;
grad.dL_dA = dL_dZ; // residual path
grad.dL_dB = dL_dZ; // sublayer path
return grad;
}
// Optional setters (to plug in forward results)
void AddnNorm::setCache(const matrixd& zhat, const vector<double>& m,
const vector<double>& v)
{
Zhat = zhat;
mean = m;
var = v;
}
// --------------------------- Feed Forward -----------------------------------
FeedForward::FeedForward(int dModel0, int d_ff0)
{
dModel = dModel0;
d_ff = d_ff0;
W1.resize(dModel, vector<double>(d_ff));
W2.resize(d_ff, vector<double>(dModel));
initMatrix(W1, dModel, d_ff);
initMatrix(W2, d_ff, dModel);
}
// ------------------------------
// Forward (for cache)
// ------------------------------
matrixd FeedForward::FFoutput(const matrixd &X)
{
Z = matmul(X, W1); // pre-activation
F1 = relu( Z ); // f(Z); F1 = f(XW1)
matrixd F2 = matmul(F1, W2); // F2 = F1 W2
return F2;
}
// ------------------------------
// Backprop
// ------------------------------
matrixd FeedForward::backProp(const matrixd &X, const matrixd &dL_dY, // dL/dF2
double eta)
{
// ---- W2 ----
matrixd dL_dF2 = dL_dY;
// dL/dW2 = F1^T * dL_dF2
matrixd F1T = transpose(F1);
matrixd dL_dW2 = matmul(F1T, dL_dF2);
// dL_dF1 = dL_dF2 * W2^T
matrixd W2T = transpose(W2);
matrixd dL_dF1 = matmul(dL_dF2, W2T);
// ---- find dL_dZ1 ----
matrixd fd_Z = relu_deriv(Z); // f_deriv(Z);
int rows = dL_dF1.size();
int cols = dL_dF1[0].size();
matrixd dL_dZ1 = dL_dF1; // same dimension
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
dL_dZ1[i][j] = dL_dF1[i][j] * fd_Z[i][j];
// ---- W1 ----
// dL/dW1 = X^T * dL/dZ1
matrixd XT = transpose(X);
matrixd dL_dW1 = matmul(XT, dL_dZ1);
// dL/dX = dL/dZ1 * W1^T
matrixd W1T = transpose(W1);
matrixd dL_dX = matmul(dL_dZ1, W1T);
// ---- Update weights ----
for (int i = 0; i < W2.size(); i++)
for (int j = 0; j < W2[0].size(); j++)
W2[i][j] -= eta * dL_dW2[i][j];
for (int i = 0; i < W1.size(); i++)
for (int j = 0; j < W1[0].size(); j++)
W1[i][j] -= eta * dL_dW1[i][j];
return dL_dX; // This becomes the dL_dY of next stage
}
// ------------------ MultiHeadAttention --------------------
MultiHeadAttention::MultiHeadAttention(int dim, int n)
{
dModel = dim;
nHeads = n;
d_k = dModel / nHeads;
WQ.resize(nHeads);
WK.resize(nHeads);
WV.resize(nHeads);
for (int h = 0; h < nHeads; h++) {
initMatrix(WQ[h], dModel, d_k);
initMatrix(WK[h], dModel, d_k);
initMatrix(WV[h], dModel, d_k);
}
initMatrix(WO, dModel, dModel);
}
void MultiHeadAttention::clearCache()
{
Qs.clear();
Ks.clear();
Vs.clear();
Ps.clear();
As.clear();
}
matrixd MultiHeadAttention::computeAttention(const matrixd &X_Q,
const matrixd &X_K, const matrixd &X_V, bool mask)
{
Qs.clear(); Ks.clear(); Vs.clear();
Ps.clear(); As.clear();
vector<matrixd> heads;
int nq = X_Q.size();
int nk = X_K.size();
for (int h = 0; h < nHeads; h++) {
matrixd Qh = matmul(X_Q, WQ[h]); // nq x d_k
matrixd Kh = matmul(X_K, WK[h]); // nk x d_k
matrixd Vh = matmul(X_V, WV[h]); // nk x d_k
matrixd S = matmul(Qh, transpose(Kh));
scaleMat(S, 1.0 / sqrt(d_k));
if (mask)
S = causalMask(S);
matrixd Ph = softmax(S); // row-wise
matrixd Ah = matmul(Ph, Vh); // nq x d_k
Qs.push_back(Qh);
Ks.push_back(Kh);
Vs.push_back(Vh);
Ps.push_back(Ph);
As.push_back(Ah);
heads.push_back(Ah);
}
// Concatenate heads
int n = heads[0].size();
matrixd concat(n, vector<double>(dModel, 0.0));
for (int h = 0; h < nHeads; h++)
for (int i = 0; i < n; i++)
for (int j = 0; j < d_k; j++)
concat[i][h * d_k + j] = heads[h][i][j];
// Final projection
matrixd H = matmul(concat, WO);
return H;
}
// backpropagation
MHA_Gradient MultiHeadAttention::backProp(
const matrixd &X_Q,
const matrixd &X_K,
const matrixd &X_V,
const matrixd &dL_dH,
double eta,
bool mask)
{
if (Ps.empty() || As.empty()) {
cout << "ERROR: MHA cache empty in backProp!" << endl;
exit(1);
}
int nt = X_Q.size();
double s = 1.0 / sqrt(d_k);
// =========================
// Step 1: concatenate heads
// =========================
matrixd A_cat(nt, vector<double>(dModel, 0.0));
for (int h = 0; h < nHeads; h++)
for (int i = 0; i < nt; i++)
for (int j = 0; j < d_k; j++)
A_cat[i][h*d_k + j] = As[h][i][j];
// =========================
// Step 2: gradients for WO
// =========================
matrixd dL_dWO = matmul(transpose(A_cat), dL_dH);
// Compute before updating WO
matrixd dL_dAcat = matmul(dL_dH, transpose(WO));
// =========================
// Step 3: initialize accumulators
// =========================
matrixd dL_dXQ(X_Q.size(), vector<double>(dModel, 0.0));
matrixd dL_dXK(X_K.size(), vector<double>(dModel, 0.0));
matrixd dL_dXV(X_V.size(), vector<double>(dModel, 0.0));
// =========================
// Step 4: per-head backprop
// =========================
for (int h = 0; h < nHeads; h++)
{
// ---- slice dA ----
matrixd dL_dA(nt, vector<double>(d_k));
for (int i = 0; i < nt; i++)
for (int j = 0; j < d_k; j++)
dL_dA[i][j] = dL_dAcat[i][h*d_k + j];
matrixd P = Ps[h];
matrixd V = Vs[h];
matrixd Q = Qs[h];
matrixd K = Ks[h];
// =====================
// A = P V
// =====================
matrixd dL_dP = matmul(dL_dA, transpose(V));
matrixd dL_dV = matmul(transpose(P), dL_dA);
// optional stabilization
//scaleMat(dL_dP, 1.0 / nt);
// =====================
// softmax backward
// =====================
matrixd dL_dS = softmax_derivative(dL_dP, P);
// apply mask if used
if (mask) {
for (int i = 0; i < nt; i++)
for (int j = i+1; j < nt; j++)
dL_dS[i][j] = 0.0;
}
// =====================
// S = (Q K^T) * s
// =====================
matrixd dL_dQ = matmul(dL_dS, K);
matrixd dL_dK = matmul(transpose(dL_dS), Q);
scaleMat(dL_dQ, s);
scaleMat(dL_dK, s);
// =====================
// Weight gradients
// =====================
matrixd dL_dWQ = matmul(transpose(X_Q), dL_dQ);
matrixd dL_dWK = matmul(transpose(X_K), dL_dK);
matrixd dL_dWV = matmul(transpose(X_V), dL_dV);
// =====================
// Input gradients (USE OLD WEIGHTS!)
// =====================
matrixd dXQ = matmul(dL_dQ, transpose(WQ[h]));
matrixd dXK = matmul(dL_dK, transpose(WK[h]));
matrixd dXV = matmul(dL_dV, transpose(WV[h]));
// accumulate across heads
dL_dXQ = addMat(dL_dXQ, dXQ);
dL_dXK = addMat(dL_dXK, dXK);
dL_dXV = addMat(dL_dXV, dXV);
// =====================
// Update weights after gradients
// =====================
updateWeight(WQ[h], dL_dWQ, eta);
updateWeight(WK[h], dL_dWK, eta);
updateWeight(WV[h], dL_dWV, eta);
}
// =========================
// Step 5: update WO
// =========================
updateWeight(WO, dL_dWO, eta);
// =========================
// return gradients
// =========================
MHA_Gradient grad;
grad.dL_dQ = dL_dXQ;
grad.dL_dK = dL_dXK;
grad.dL_dV = dL_dXV;
return grad;
}
// ---------- EncoderLayer --------------------
EncoderLayer::EncoderLayer(int d_model, int nHeads, int d_ff)
: mha(d_model, nHeads),
ffn(d_model, d_ff),
norm1(d_model),
norm2(d_model) {}
matrixd EncoderLayer::forward(const matrixd& input)
{
X = input;
mha.clearCache();
matrixd A1 = mha.computeAttention(X, X, X, true);
H1 = norm1.forward(X, A1);
matrixd A2 = ffn.FFoutput(H1);
H2 = norm2.forward(H1, A2);
return H2;
}
matrixd EncoderLayer::backProp(const matrixd& dL_dY, double eta)
{
// =========================
// Step 1: FFN block
// =========================
AddNormGrad g2 = norm2.backProp(H1, dL_dY, eta);
matrixd dL_dH1_res = g2.dL_dA;
matrixd dL_dFFN_in = g2.dL_dB;
matrixd dL_dH1_ffn = ffn.backProp(H1, dL_dFFN_in, eta);
matrixd dL_dH1 = addMat(dL_dH1_res, dL_dH1_ffn);
// =========================
// Step 2: MHA block
// =========================
AddNormGrad g1 = norm1.backProp(X, dL_dH1, eta);
matrixd dX_res = g1.dL_dA;
matrixd dL_dMHA = g1.dL_dB;
MHA_Gradient g_mha = mha.backProp(X, X, X, dL_dMHA, eta, false);
matrixd dL_dX_mha = addMat(g_mha.dL_dQ,
addMat(g_mha.dL_dK, g_mha.dL_dV));
matrixd dL_dX = addMat(dX_res, dL_dX_mha);
return dL_dX;
}
// ---------- OutputLayer ----------------------
OutputLayer::OutputLayer(int dim, int nv)
{
dModel = dim;
vocab_size = nv;
initMatrix(W, dModel, vocab_size);
}
// Forward: logits -> probabilities
matrixd OutputLayer::forward(const matrixd &X)
{
// X: (seq_len x dModel)
// output: (seq_len x vocab_size)
matrixd logits = matmul(X, W);
return softmax(logits);
}
// Backward: returns dL/dX
matrixd OutputLayer::backward(const matrixd &X, const matrixd &P,
const vector<int> &targets, double eta)
{
int nt = P.size();
// dL/dY = P - T (cross-entropy + softmax)
matrixd dL_dY = P;
for (int i = 0; i < nt; i++)
dL_dY[i][targets[i]] -= 1.0;
// optional but recommended: normalize
for (int i = 0; i < nt; i++)
for (int j = 0; j < P[0].size(); j++)
dL_dY[i][j] /= nt;
// Gradient wrt weights
matrixd dL_dW = matmul(transpose(X), dL_dY);
// Gradient wrt input (back to decoder)
matrixd dL_dX = matmul(dL_dY, transpose(W));
//update weights after gradient computation
updateWeight(W, dL_dW, eta);
return dL_dX;
}
// ------------ Decoder Layer -------------------
DecoderLayer::DecoderLayer(int d_model, int nHeads, int d_ff)
: self_mha(d_model, nHeads),
cross_mha(d_model, nHeads),
ffn(d_model, d_ff),
norm1(d_model),
norm2(d_model),
norm3(d_model) {}
matrixd DecoderLayer::forward(const matrixd& input,
const matrixd& encoder_output)
{
X = input;
E = encoder_output;
self_mha.clearCache();
matrixd A1 = self_mha.computeAttention(X, X, X, true);
H1 = norm1.forward(X, A1);
cross_mha.clearCache();
matrixd A2 = cross_mha.computeAttention(H1, E, E, false);
H2 = norm2.forward(H1, A2);
matrixd A3 = ffn.FFoutput(H2);
H3 = norm3.forward(H2, A3);
return H3;
}
DecoderGrad DecoderLayer::backProp(const matrixd& dL_dY, double eta)
{
// =========================
// Step 1: FFN
// =========================
AddNormGrad g3 = norm3.backProp(H2, dL_dY, eta);
matrixd dL_dH2_res = g3.dL_dA;
matrixd dL_dFFN_in = g3.dL_dB;
matrixd dL_dH2_ffn = ffn.backProp(H2, dL_dFFN_in, eta);
matrixd dL_dH2 = addMat(dL_dH2_res, dL_dH2_ffn);
// =========================
// Step 2: Cross Attention
// =========================
AddNormGrad g2 = norm2.backProp(H1, dL_dH2, eta);
matrixd dL_dH1_res = g2.dL_dA;
matrixd dL_dCross = g2.dL_dB;
MHA_Gradient g_cross =
cross_mha.backProp(H1, E, E, dL_dCross, eta, false);
matrixd dL_dEnc = addMat(g_cross.dL_dK, g_cross.dL_dV); // encoder gradients
matrixd dL_dH1 = addMat(dL_dH1_res, g_cross.dL_dQ); // decoder path
// =========================
// Step 3: Self Attention
// =========================
AddNormGrad g1 = norm1.backProp(X, dL_dH1, eta);
matrixd dL_dX_res = g1.dL_dA;
matrixd dL_dSelf = g1.dL_dB;
MHA_Gradient g_self =
self_mha.backProp(X, X, X, dL_dSelf, eta, true);
matrixd dL_dSelf_mha = addMat(g_self.dL_dQ,
addMat(g_self.dL_dK, g_self.dL_dV));
matrixd dL_dX = addMat(dL_dX_res, dL_dSelf_mha);
return {dL_dX, dL_dEnc};
}
// ------------ Transformer (Encoder-Decoder) -------------------
MiniTrans::MiniTrans(int vocab_size_, int seq_len_,
int dModel_, int nHeads,
int d_ff, int nLayers)
: embed(vocab_size_, dModel_),
pe(seq_len_, dModel_),
output(dModel_, vocab_size_)
{
vocab_size = vocab_size_;
seq_len = seq_len_;
dModel = dModel_;
for (int i = 0; i < nLayers; i++) {
EncoderLayer e = EncoderLayer(dModel, nHeads, d_ff);
eLayers.push_back( e );
DecoderLayer d = DecoderLayer(dModel, nHeads, d_ff);
dLayers.push_back( d );
}
}
matrixd MiniTrans::forward(const vector<int> &source_tokens,
const vector<int> &target_in)
{
// =========================
// Encoder
// =========================
matrixd src_embed = embed.embed(source_tokens);
matrixd src_pe = pe.addPE(src_embed);
matrixd E = src_pe; // E will be final output of encoder
for (int i = 0; i < eLayers.size(); i++)
E = eLayers[i].forward(E);
// =========================
// Decoder
// =========================
matrixd tgt_embed = embed.embed(target_in);
matrixd tgt_pe = pe.addPE(tgt_embed);
matrixd dec = tgt_pe;
for (int i = 0; i < dLayers.size(); i++)
dec = dLayers[i].forward(dec, E);
// cache final decoder output (IMPORTANT for backward)
dec_out = dec;
// =========================
// Output layer
// =========================
matrixd P = output.forward(dec_out);
return P;
}
double MiniTrans::trainStep(const vector<int> &source,
const vector<int> &target_in,
const vector<int> &target_out,
double eta)
{
// =========================
// Step 1: Forward
// =========================
matrixd P = forward(source, target_in);
double loss = crossEntropy(P, target_out);
// =========================
// Step 2: Output backward
// =========================
matrixd dL_dX = output.backward(dec_out, P, target_out, eta);
// =========================
// Step 3: Decoder backward
// =========================
matrixd dL_dE; // accumulate gradients to encoder
bool first = true;
// accumulate encoder gradients dL_dE
for (int i = dLayers.size() - 1; i >= 0; i--) {
DecoderGrad g = dLayers[i].backProp(dL_dX, eta);
dL_dX = g.dL_dX;
if (first) {
dL_dE = g.dL_dE;
first = false;
} else {
dL_dE = addMat(dL_dE, g.dL_dE);
}
}
// =========================
// Step 4: Encoder backward
// =========================
for (int i = eLayers.size() - 1; i >= 0; i--)
dL_dE = eLayers[i].backProp(dL_dE, eta);
// =========================
// Step 5: Embedding update, optional
// =========================
embed.backProp(source, dL_dE, eta); // encoder side
embed.backProp(target_in, dL_dX, eta); // decoder side
return loss;
}
vector<int> MiniTrans::generate(const vector<int> &source, int max_len,
int start_token, int end_token)
{
// =========================
// Pass source through encoder
// =========================
matrixd src_embed = embed.embed(source);
matrixd src_pe = pe.addPE(src_embed);
matrixd E = src_pe;
for (int i = 0; i < eLayers.size(); i++)
E = eLayers[i].forward(E);
// =========================
// Initialize sequence
// =========================
vector<int> generated;
generated.push_back(start_token);
// =========================
// Autoregressive loop
// =========================
for (int step = 0; step < max_len; step++)
{
// embed current sequence
matrixd target_embed = embed.embed(generated);
matrixd target_pe = pe.addPE(target_embed);
matrixd dec = target_pe;
// pass through decoder
for (int i = 0; i < dLayers.size(); i++)
dec = dLayers[i].forward(dec, E);
matrixd P = output.forward(dec);
vector<double> last = P.back(); //last row of matrix P
int best_token = max_element(last.begin(), last.end()) - last.begin();
generated.push_back(best_token);
if (best_token == end_token)
break;
}
return generated;
}
|
Test Program:
// testMain.cpp : main() for testing a transformer
// http://forejune.co/cuda/
#include <cmath>
#include <algorithm>
#include <fstream>
#include "util.h"
#include "transformer.h"
using namespace std;
struct TrainingDS {
vector<int> source;
vector<int> target_in;
vector<int> target_out;
};
vector <TrainingDS> buildDataset(Tokenizer &tokenizer)
{
vector <TrainingDS> dataset;
TrainingDS ds;
const int n = 3; // 3 training examples
string sources[n] = {
"one two three four",
"one two three four reverse",
"one two three four digit"};
string target_ins[n] = {
"<BOS> uno dos tres cuatro",
"<BOS> four three two one",
"<BOS> 1 2 3 4"};
string target_outs[n] = {
"uno dos tres cuatro <EOS>",
"four three two one <EOS>",
"1 2 3 4 <EOS>"};
for (int i = 0; i < n; i++) {
vector<int> token_src = tokenizer.tokenize(sources[i]);
vector<int> token_tin = tokenizer.tokenize(target_ins[i] );
vector<int> token_tout =tokenizer.tokenize(target_outs[i]);
ds = TrainingDS (token_src, token_tin, token_tout);
dataset.push_back( ds );
}
return dataset;
}
int main()
{
// -------------------------
// Tokenizer setup
// -------------------------
// Build vocabulary
ifstream ifs;
char fname[] = "t.txt";
ifs.open( fname );
if (!ifs.is_open()) {
cerr << "Unable to open file " << fname << endl;
exit( 1 );
}
Tokenizer tokenizer( ifs );
int vocab_size = tokenizer.get_nTokens();
cout << "Vocab size: " << vocab_size << endl;
vector <TrainingDS> datasets;
datasets = buildDataset( tokenizer );
int n_ds = datasets.size();
cout << "==== Encoder-Decoder Transformer Tests ====" << endl;
// ----------------------------
// Hyperparameters
// ----------------------------
int seq_len = 16;
int dModel = 16;
int nHeads = 4;
int d_ff = 16;
int nLayers = 4;
double eta = 0.002;
int nEpoch = 601;
// ----------------------------
// Model
// ----------------------------
MiniTrans model(vocab_size, seq_len, dModel, nHeads, d_ff, nLayers);
int START = tokenizer.tokenize( "<BOS>" )[0];
int EOS = tokenizer.tokenize( "<EOS>" )[0];
for (int k = 0; k < n_ds; k++) {
cout << "Example " << k + 1 << ":" << endl;
// Training loop
for (int i = 0; i < nEpoch; i++)
{
double loss;
loss = model.trainStep(datasets[k].source, datasets[k].target_in,
datasets[k].target_out, eta);
if (i % 50 == 0)
cout << "Epoch " << i << " Loss: " << loss << endl;
}
// Inference (greedy decoding)
cout << "\n==== Inference ====" << endl;
vector<int> predict = model.generate(datasets[k].source, 12, START, EOS);
cout << "Source: ";
string str = tokenizer.detokenize( datasets[k].source );
cout << str << endl;
cout << "Predicted: ";
str = tokenizer.detokenize( predict );
cout << str << endl;
cout << "Expected: ";
str = tokenizer.detokenize( datasets[k].target_out );
cout << str << endl;
getchar(); // pause
} // for k
return 0;
}
|
Makefile:
PROG = testMain #source codes SRCS = $(PROG).cpp #substitute .cpp by .o to obtain object filenames OBJS = $(SRCS:.cpp=.o) util.o transformer.o #$< evaluates to the target's dependencies, #$@ evaluates to the target $(PROG): $(OBJS) g++ -o $@ $(OBJS) $(OBJS): g++ -c -std=c++20 $*.cpp clean: rm $(OBJS) $(PROG) |