Materials available at: http://forejune.co/cuda/
Transformer Decoder
An upper-triangular mask applied to the attention scores:

Unmasked Attention:
Token 0: [ -0.16,  0.57,  0.84,  1.78 ]
Token 1: [  0.95, -0.49, -0.82,  1.04 ]
Token 2: [  1.18, -1.40, -1.05,  1.64 ]
Token 3: [  0.44, -1.55, -0.36,  1.64 ]

Masked Attention:
Token 0: [ -0.16,    -∞,    -∞,    -∞ ]
Token 1: [  0.95, -0.49,    -∞,    -∞ ]
Token 2: [  1.18, -1.40, -1.05,    -∞ ]
Token 3: [  0.44, -1.55, -0.36,  1.64 ]

After the softmax activation, each -∞ entry becomes 0.

Notation (used throughout):
Batch size = 1 (one input sequence)
Target sequence length (decoder) = nt
Source sequence length (encoder) = ns
Model dimension = dmodel
Number of heads = nh
Head dimension = dk = dmodel / nh
Feed-forward hidden size = dff
Decoder input: X : nt x dmodel
Encoder output: E : ns x dmodel
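In code, the mask can be applied by overwriting the strictly upper-triangular entries of the nt x nt score matrix with -∞ before the softmax; since exp(-∞) = 0, those positions receive zero weight. A minimal standalone sketch (the function name applyCausalMask is ours, not from the materials):

#include <vector>
#include <limits>
using namespace std;

typedef vector<vector<double>> matrixd;

// Overwrite scores[i][j] for j > i with -infinity so that the softmax
// (where exp(-inf) == 0) assigns zero weight to future tokens.
void applyCausalMask(matrixd &scores) {
    const double NEG_INF = -numeric_limits<double>::infinity();
    for (size_t i = 0; i < scores.size(); ++i)
        for (size_t j = i + 1; j < scores[i].size(); ++j)
            scores[i][j] = NEG_INF;
}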
Masked Multi-Head Attention
Algorithm:
1. Linear projections:
Q = X x WQ, K = X x WK, V = X x WV
Dimensions:
WQ, WK, WV, WO : dmodel x dmodel
Q, K, V : nt x dmodel
2. Splitting into heads:
Q, K, V : nh x nt x dk
3. Attention and Output after projection:
For each head i (Qi, Ki, Vi : nt x dk):
head_i = softmax( (Qi x Ki^T) / sqrt(dk) + M ) x Vi : nt x dk
where M is the upper-triangular -∞ mask shown above.
Concatenate the heads and project:
Masked-MHA(X) = Concat(head_1, ..., head_nh) x WO : nt x dmodel
Add & Norm :
X1 = Normalize(X + Masked-MHA(X)) : nt x dmodel
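Putting the three steps together, here is a minimal sketch built on the helpers declared in util.h below (matmul, scaledAttention, initMatrix); the function name maskedMultiHeadAttention and the freshly randomized weights are our illustration, not the author's Decoder class:

#include "util.h"

// Sketch of masked multi-head attention: X (nt x dmodel) -> nt x dmodel.
// Weights are randomized here purely for illustration.
matrixd maskedMultiHeadAttention(const matrixd &X, int nh) {
    int nt = X.size(), dModel = X[0].size(), dk = dModel / nh;
    matrixd WQ, WK, WV, WO;
    initMatrix(WQ, dModel, dModel);   // step 1: projection weights
    initMatrix(WK, dModel, dModel);
    initMatrix(WV, dModel, dModel);
    initMatrix(WO, dModel, dModel);
    matrixd Q = matmul(X, WQ), K = matmul(X, WK), V = matmul(X, WV);

    matrixd concat(nt, vector<double>(dModel));
    for (int h = 0; h < nh; ++h) {               // step 2: split into heads
        matrixd Qh(nt, vector<double>(dk)), Kh(nt, vector<double>(dk)),
                Vh(nt, vector<double>(dk));
        for (int i = 0; i < nt; ++i)
            for (int j = 0; j < dk; ++j) {
                Qh[i][j] = Q[i][h * dk + j];
                Kh[i][j] = K[i][h * dk + j];
                Vh[i][j] = V[i][h * dk + j];
            }
        // step 3: causal-masked scaled dot-product attention per head
        matrixd head = scaledAttention(Qh, Kh, Vh, true);
        for (int i = 0; i < nt; ++i)             // concatenate heads
            for (int j = 0; j < dk; ++j)
                concat[i][h * dk + j] = head[i][j];
    }
    return matmul(concat, WO);                   // final projection WO
}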
Cross Multi-Head Attention
Inputs:
Query from decoder:
Q = X1 x WQ : nt x dmodel
Key from encoder:
K = E x WK : ns x dmodel
Value from encoder:
V = E x WV : ns x dmodel
Attention dimensions now:
Q x K^T : nt x ns
meaning that each decoder token attends over
all encoder tokens.
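Cross-attention only changes where K and V come from; a sketch under the same assumptions as above (random weights, helpers from util.h, the function name crossAttention is ours), with the per-head split omitted for brevity:

#include "util.h"

// Sketch of a cross-attention sublayer: queries from the decoder state
// X1 (nt x dmodel), keys and values from the encoder output E (ns x dmodel).
matrixd crossAttention(const matrixd &X1, const matrixd &E) {
    int dModel = X1[0].size();
    matrixd WQ, WK, WV;
    initMatrix(WQ, dModel, dModel);
    initMatrix(WK, dModel, dModel);
    initMatrix(WV, dModel, dModel);
    matrixd Q = matmul(X1, WQ);              // nt x dmodel
    matrixd K = matmul(E, WK);               // ns x dmodel
    matrixd V = matmul(E, WV);               // ns x dmodel
    // Q x K^T is nt x ns: every decoder token can see every encoder
    // token, so the causal mask is switched off.
    return scaledAttention(Q, K, V, false);
}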
Add & Norm :
X2 = Normalize(X1 + Cross-MHA(X1, E))
Feed-Forward Network (FFN) :
FFN(X2) = f(X2 x W1) x W2
f is an activation function such as ReLU.
We omit bias terms for simplicity.
Note: Each token is processed independently
X2 : nt x dmodel
W1 : dmodel x dff
W2 : dff x dmodel
FFN(X2) : nt x dmodel
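Since each token row is transformed independently, the FFN is just two matrix multiplications with an elementwise activation between them; a sketch using f and matmul from util.h (random weights for illustration, the function name feedForward is ours):

#include "util.h"

// Position-wise feed-forward network: FFN(X2) = f(X2 x W1) x W2.
// Each token (row of X2) is transformed independently.
matrixd feedForward(const matrixd &X2, int dff) {
    int dModel = X2[0].size();
    matrixd W1, W2;
    initMatrix(W1, dModel, dff);   // W1 : dmodel x dff
    initMatrix(W2, dff, dModel);   // W2 : dff x dmodel
    matrixd H = matmul(X2, W1);    // nt x dff
    for (auto &row : H)
        for (auto &z : row)
            z = f(z);              // elementwise activation, e.g. ReLU
    return matmul(H, W2);          // nt x dmodel
}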
Final Add & Norm :
Output Y = Normalize(X2 + FFN(X2))
Y : nt x dmodel
Y becomes the input to the next decoder layer (Y → X);
only the output of the last decoder layer goes to
the final Linear and Softmax layers.
C/C++ Implementation
// util.h
// http://forejune.co/cuda/
#ifndef __UTIL_H__
#define __UTIL_H__

#include <vector>
#include <algorithm>
#include <math.h>
#include <iostream>
#include <iomanip>
#include <random>
#include <string>
#include <unordered_map>

using namespace std;

typedef vector<vector<double>> matrixd;

// an activation function
double f(double z);

// softmax, 1D input
vector<double> softmax(const vector<double> &v);

// softmax activation function, 2D input
matrixd softmax(const matrixd &input);

// add two matrices
matrixd addMat(const matrixd &A, const matrixd &B);

// transpose a matrix: A : n x m, B = A^T : m x n
matrixd transpose(const matrixd &A);

// matrix multiplication C = A x B, A : n x m, B : m x r, C : n x r
matrixd matmul(const matrixd &A, const matrixd &B);

// scale a matrix
void scaleMat(matrixd &A, const double s);

// scaled (optionally causal-masked) attention
matrixd scaledAttention(const matrixd &Q, const matrixd &K,
                        const matrixd &V, bool mask);

// initialize an n x m input matrix with random values
void initMatrix(matrixd &M, const int n, const int m);

// print a token dictionary
void printDictionary(unordered_map<string, int> &d);

// print a matrix
void printMatrix(const matrixd &y);

// Add & Norm
matrixd addNnorm(const matrixd &A, const matrixd &B);

#endif
util.cpp (Helper functions):
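util.cpp itself is part of the materials at http://forejune.co/cuda/; as a hedged sketch, here is how the two most instructive helpers, scaledAttention and addNnorm, can be written against the declarations in util.h. The -∞ mask value and the parameter-free row-wise normalization are our assumptions:

#include <limits>
#include "util.h"

// Scaled dot-product attention; mask == true applies the
// upper-triangular (causal) -inf mask before the softmax.
matrixd scaledAttention(const matrixd &Q, const matrixd &K,
                        const matrixd &V, bool mask) {
    int dk = K[0].size();
    matrixd scores = matmul(Q, transpose(K));        // nt x ns
    scaleMat(scores, 1.0 / sqrt((double)dk));
    if (mask) {
        const double NEG_INF = -numeric_limits<double>::infinity();
        for (size_t i = 0; i < scores.size(); ++i)
            for (size_t j = i + 1; j < scores[i].size(); ++j)
                scores[i][j] = NEG_INF;
    }
    return matmul(softmax(scores), V);
}

// Add & Norm: normalize each row of A + B to zero mean and unit
// variance (no learned scale/shift parameters, as in the text).
matrixd addNnorm(const matrixd &A, const matrixd &B) {
    matrixd S = addMat(A, B);
    for (auto &row : S) {
        double mean = 0.0, var = 0.0;
        for (double v : row) mean += v;
        mean /= row.size();
        for (double v : row) var += (v - mean) * (v - mean);
        var /= row.size();
        double sd = sqrt(var + 1e-9);   // epsilon guards against /0
        for (double &v : row) v = (v - mean) / sd;
    }
    return S;
}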
transformer.h:
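transformer.h is likewise in the downloadable materials; the interface sketch below is inferred from how testDecoder.cpp (further down) uses the classes, so the exact signatures are assumptions:

// transformer.h -- interface sketch inferred from testDecoder.cpp
#ifndef __TRANSFORMER_H__
#define __TRANSFORMER_H__

#include <fstream>
#include <string>
#include <unordered_map>
#include "util.h"

class Tokenizer {
public:
    Tokenizer(ifstream &ifs);                  // build vocabulary from a file
    vector<int> tokenize(const string &sentence);
    int get_nTokens();                         // vocabulary size
    unordered_map<string, int> getTokensWords();
};

class Embedding {
public:
    Embedding(int nTokens, int dModel);
    matrixd embed(const vector<int> &tokens);  // tokens -> nt x dmodel
    int get_embedDim();
};

class PositionalEncoding {
public:
    PositionalEncoding(int maxSeqLength, int dModel);
    matrixd getPE();                           // the PE table itself
    matrixd addPE(const matrixd &embeddings);  // embeddings + PE
};

class Encoder {
public:
    Encoder(int dModel, int numHeads, int dff, const matrixd &input);
    matrixd computeOutput();                   // E : ns x dmodel
};

class Decoder {
public:
    Decoder(int dModel, int numHeads, int dff,
            const matrixd &input, const matrixd &encoderOutput);
    matrixd computeOutput();                   // Y : nt x dmodel
};

#endif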
transformer.cpp (transformer classes):
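transformer.cpp (also in the materials) implements these classes; to show the data flow inside Decoder::computeOutput, here is a hypothetical free-function version of one decoder layer's forward pass, composing the sketches above with addNnorm from util.h:

#include "util.h"

// The sketches shown earlier in this article:
matrixd maskedMultiHeadAttention(const matrixd &X, int nh);
matrixd crossAttention(const matrixd &X1, const matrixd &E);
matrixd feedForward(const matrixd &X2, int dff);

// One decoder layer's forward pass: masked self-attention,
// cross-attention over the encoder output E, then the FFN,
// each sublayer wrapped in Add & Norm.
matrixd decoderLayer(const matrixd &X, const matrixd &E, int nh, int dff) {
    matrixd X1 = addNnorm(X, maskedMultiHeadAttention(X, nh));
    matrixd X2 = addNnorm(X1, crossAttention(X1, E));
    return addNnorm(X2, feedForward(X2, dff));   // Y : nt x dmodel
}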
testDecoder.cpp (testing routine):
// testDecoder.cpp -- a main program for testing a transformer decoder
// http://forejune.co/cuda/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cstdlib>
#include <ctime>
#include "util.h"
#include "transformer.h"
using namespace std;

int main()
{
    srand(time(0));
    int num_heads = 2;
    string sentence = "The waves crashed forcefully against the shore!";

    // Tokenization
    ifstream ifs;
    char fname[] = "t.txt";
    ifs.open(fname);
    if (!ifs.is_open()) {
        cerr << "Unable to open file " << fname << endl;
        exit(1);
    }
    Tokenizer tokenizer(ifs);
    vector<int> tokens = tokenizer.tokenize(sentence);

    // Create embeddings
    const int dModel = 8;
    int totalTokens = tokenizer.get_nTokens();
    cout << "\n=== Source sentence Tokenization ===" << endl;
    cout << "Original sentence:\n\t " << sentence << "\n\n";
    cout << "Tokenization Results:\n";
    cout << "Tokens: [";
    for (size_t i = 0; i < tokens.size(); ++i) {
        cout << tokens[i];
        if (i < tokens.size() - 1) cout << ", ";
    }
    cout << "]\n";
    getchar();

    Embedding embedding_layer(totalTokens, dModel);
    matrixd embeddings = embedding_layer.embed(tokens);

    // Add positional encoding
    const int MAX_SEQ_LENGTH = 16;
    PositionalEncoding pos_encoding(MAX_SEQ_LENGTH, dModel);
    matrixd pe = pos_encoding.getPE();

    // add positional encoding to embeddings
    matrixd embeddings_with_pos = pos_encoding.addPE(embeddings);
    cout << "\nInput to Encoder:\n";
    int rows = embeddings_with_pos.size();
    int cols = embeddings_with_pos[0].size();
    for (int i = 0; i < rows; i++) {
        cout << "Token " << i << ": [";
        for (int j = 0; j < cols; ++j) {
            cout << fixed << setw(6) << setprecision(2)
                 << embeddings_with_pos[i][j];
            if (j < cols - 1) cout << ", ";
        }
        cout << "]\n";
    }
    getchar();

    // Pass through transformer encoder
    int embedDimension = embedding_layer.get_embedDim();
    int d_ff = 128;
    Encoder encoder(embedDimension, num_heads, d_ff, embeddings_with_pos);
    cout << "\nOutput of Encoder:\n";
    matrixd E = encoder.computeOutput();
    printMatrix(E);
    getchar();

    // ------------------- Decoder --------------------
    string target_sentence = "¡Las olas rompieron con fuerza contra la orilla!";
    ifs.close();
    char fnamet[] = "t1.txt";
    ifs.open(fnamet);
    if (!ifs.is_open()) {
        cerr << "Unable to open file " << fnamet << endl;
        exit(1);
    }
    Tokenizer target_tokenizer(ifs);
    unordered_map<string, int> dictionary = target_tokenizer.getTokensWords();
    vector<int> target_tokens = target_tokenizer.tokenize(target_sentence);
    cout << "\n=== Target sentence Tokenization ===" << endl;
    cout << "Original sentence:\n\t " << target_sentence << "\n\n";
    cout << "Target Tokenization Results:\n";
    cout << "Tokens: [";
    for (size_t i = 0; i < target_tokens.size(); ++i) {
        cout << target_tokens[i];
        if (i < target_tokens.size() - 1) cout << ", ";
    }
    cout << "]\n";
    getchar();

    totalTokens = target_tokenizer.get_nTokens();
    cout << "\nTarget vocabulary size: " << totalTokens;
    cout << "\nEmbedding dimension: " << dModel << "\n";
    Embedding target_embedding_layer(totalTokens, dModel);
    matrixd target_embeddings = target_embedding_layer.embed(target_tokens);
    cout << "\nTarget Token Embeddings(" << target_embeddings.size()
         << " x " << dModel << "):\n";
    printMatrix(target_embeddings);
    getchar();

    PositionalEncoding target_pos_encoding(MAX_SEQ_LENGTH, dModel);
    // add positional encoding to embeddings
    matrixd target_embeddings_with_pos = target_pos_encoding.addPE(target_embeddings);
    cout << "\nInput to Decoder:\n";
    rows = target_embeddings_with_pos.size();
    cols = target_embeddings_with_pos[0].size();
    for (int i = 0; i < rows; i++) {
        cout << "Token " << i << ": [";
        for (int j = 0; j < cols; ++j) {
            cout << fixed << setw(6) << setprecision(2)
                 << target_embeddings_with_pos[i][j];
            if (j < cols - 1) cout << ", ";
        }
        cout << "]\n";
    }
    getchar();

    Decoder decoder(dModel, num_heads, d_ff, target_embeddings_with_pos, E);
    matrixd y = decoder.computeOutput();
    cout << "Decoder output:\n";
    printMatrix(y);
    getchar();
    return 0;
}
Makefile:
PROG = testDecoder

#source codes
SRCS = $(PROG).cpp

#substitute .cpp by .o to obtain object filenames
OBJS = $(SRCS:.cpp=.o) util.o transformer.o

#$@ evaluates to the target
$(PROG): $(OBJS)
	g++ -o $@ $(OBJS)

$(OBJS):
	g++ -c -std=c++20 $*.cpp

clean:
	rm $(OBJS) $(PROG)
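Running make builds testDecoder. The program expects the vocabulary files t.txt (source) and t1.txt (target) in the working directory, and pauses at each getchar() so the intermediate matrices can be inspected step by step.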