Materials available at: http://forejune.co/cuda/
Review
|
|
Decoder-Only (GPT-style) Transformer
The Model:
Advantages:
GPT can use: any raw text
Disadvantages:
When Decoder-Only Works Best:
Less ideal for:
Summary:
Feature Decoder-Only (GPT) Encoder-Decoder Architecture Simple More complex Flexibility Very high Medium Translation quality Good Better Training data Unstructured OK Needs pairs Efficiency (long input) Lower Better Implementation Easier Harder |
Full Implementation in C/C++
Helper Functions:
#ifndef __UTIL_H__ #define __UTIL_H__ #include <vector> #include <algorithm> #include <math.h> #include <iostream> #include <iomanip> #include <random> using namespace std; typedef vector<vector<double>> matrixd; // an activation function double f(double z); // Relu activation function with matrix input matrixd relu(const matrixd &X); // Derivative of Relu matrixd relu_deriv (const matrixd &X); // calculates entropy loss double crossEntropy(const matrixd &P, const vector<int> &target); // add two matrices matrixd addMat(const matrixd &A, const matrixd &B); // Transpose matrix n x m, B = A^T : m x n matrixd transpose(const matrixd& A); // Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr matrixd matmul(const matrixd& A, const matrixd& B); // multiply a matrix by a scalar matrixd mulMats (const matrixd &A, const double s); // scale a matrix void scaleMat(matrixd &A, const double s); // apply masking to S matrixd causalMask(matrixd S); // update weight matrix W -= dW * eta void updateWeight(matrixd &W, const matrixd &dW, const double eta); // initialize an input matrix with certain random values void initMatrix(matrixd& M, const int n, const int m); void initMatrix(matrixd& M, const int n, const int m, double dModel); // print a token dictionary void printDictionary(unordered_map<string, int> &d); // print a matrix void printMatrix(const matrixd &y); // Add & Norm matrixd addNnorm(const matrixd &A, const matrixd &B); void clip(matrixd &A, double max_val); // softmax, 1D vector input vector<double> softmax(const vector<double> &v); // softmax activation function, 2D input vector matrixd softmax(const matrixd& input); // derivative function of softmax matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P); #endif |
util.cpp:
// util.cpp: helper functions
// http://forejune.co/cuda/
#include "util.h"
#include <random>
using namespace std;
// Activation function: using ReLU (Rectified Linear Unit)
double f(double z)
{
return max(0.0, z);
}
// Relu activation function with matrix input
matrixd relu(const matrixd &X)
{
int rows = X.size();
int cols = X[0].size();
matrixd Y = X; // same dimension of X
for(int i = 0; i < rows; i++)
for(int j = 0; j < cols; j++)
Y[i][j] = max(0.0, X[i][j]);
return Y;
}
// Derivative of Relu
matrixd relu_deriv(const matrixd &X)
{
int rows = X.size();
int cols = X[0].size();
matrixd Y = X; // same dimension as X
for(int i = 0; i < rows; i++)
for(int j = 0; j < cols; j++)
if ( X[i][j] > 0 )
Y[i][j] = 1;
else
Y[i][j] = 0;
return Y;
}
// calculates entropy loss
double crossEntropy(const matrixd &P, const vector<int> &target)
{
double L = 0;
int nt = P.size();
for (int i = 0; i < nt; i++){
int k = target[i];
L -= log( P[i][k] + 1e-12 );
}
return L / nt;
}
// C = A + B
matrixd addMat(const matrixd &A, const matrixd &B)
{
int n = A.size(); // rows
int m = A[0].size(); // cols
matrixd C = A; // same dimension as A
for (int i = 0; i < n; i++)
for(int j = 0; j < m; j++)
C[i][j] = A[i][j] + B[i][j];
return C;
}
// Transpose matrix n x m, B = A^T : m x n
matrixd transpose(const matrixd& A)
{
int n = A.size(); //number of rows
int m = A[0].size(); //number of columns
matrixd B(m, vector<double>(n));
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
B[j][i] = A[i][j];
return B;
}
// Matrix multiplication C = A X B A: nxm, B: mxr, C: nxr
matrixd matmul(const matrixd& A, const matrixd& B)
{
int n = A.size();
int m = A[0].size();
int r = B[0].size();
matrixd C(n, vector<double>(r, 0));
for (int i = 0; i < n; i++)
for (int j = 0; j < r; j++)
for (int k = 0; k < m; k++)
C[i][j] += A[i][k] * B[k][j];
return C;
}
// scale a matrix
void scaleMat(matrixd &A, const double s)
{
int m = A.size(); // number of rows
int n = A[0].size(); // number of columns
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
A[i][j] *= s;
}
// maultiply a matrix by a scalar s
matrixd mulMats (const matrixd &A, const double s)
{
int m = A.size(); // number of rows
int n = A[0].size(); // number of columns
matrixd B = A;
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
B[i][j] *= s;
return B;
}
void initMatrix(matrixd& M, const int n, const int m)
{
// resizing 2D vector to n x m with initial values 0
M.assign(n, vector<double>(m, 0));
random_device rd;
mt19937 gen(rd()); //psuedo random number generator
double sigma = 1.0/16.0;
normal_distribution<double> dist(0.0, sigma);
for (int i = 0; i < n; ++i)
for (int j = 0; j < m; ++j)
M[i][j] = dist(gen);
}
void printDictionary(unordered_map<string, int> &d)
{
unordered_map<string, int>::iterator it = d.begin();
int k = 0;
while ( it != d.end() ){
cout << right << setw(12) << it->first << ": " << setw(3) << it->second;
it++;
k++;
if ( k % 5 == 0 )
cout << endl;
}
}
void printMatrix(const matrixd &y)
{
int cols = y.size();
int rows = y[0].size();
for (int i = 0; i < cols; ++i) {
cout << "Token " << i << ": [";
for (int j = 0; j < rows; ++j) {
cout << fixed << setw(7) << setprecision(3) << y[i][j];
if (j < 4) cout << ", ";
}
cout << " ]\n";
}
}
matrixd addNnorm(const matrixd &A, const matrixd &B)
{
auto x = addMat(A, B); // add two matrices
int n = x.size(); // rows
int m = x[0].size(); // cols
matrixd y = x; // output same size as x
// normalize the result
double epsilon = 1e-5;
for (int i = 0; i < n; i++) {
double sum = 0; // sum of one row
double std = 0; //sigma_i square
for (int j = 0; j < m; j++)
sum += x[i][j];
double mean = sum / m; // mu_i, mean of i-th row
sum = 0;
for (int j = 0; j < m; j++) {
double diff = x[i][j] - mean;
sum += diff * diff;
}
std = sum / m;
for (int j = 0; j < m; j++) {
double xd = (x[i][j] - mean) / sqrt(std + epsilon);
y[i][j] = xd; // gamma = 1, beta = 0;
}
}
return y; // final output of Add & norm
}
matrixd scaledAttention(const matrixd &Q, const matrixd &K, const matrixd &V,
bool mask)
{
double d_k, scale;
matrixd KT, A; // K transpose, attention
d_k = Q[0].size();
scale = 1.0 / sqrt( d_k );
KT = transpose ( K );
A = matmul(Q, KT);
scaleMat(A, scale);
if ( mask ) {
for (int i = 0; i < A.size(); i++)
for (int j = i+1; j < A[0].size(); j++)
A[i][j] = -1e9; // causal mask
}
A = softmax( A );
matrixd output = matmul(A, V);
return output;
}
matrixd causalMask(matrixd S)
{
int n = S.size();
for (int i = 0; i < n; i++) {
for (int j = i+1; j < n; j++) {
S[i][j] = -1e9; // effectively -infinity
}
}
return S;
}
void updateWeight(matrixd &W, const matrixd &dW, const double eta)
{
for (int i = 0; i < W.size(); i++)
for (int j = 0; j < W[0].size(); j++)
W[i][j] -= eta * dW[i][j];
}
void clip(matrixd &A, double max_val = 5.0)
{
for (auto &row : A)
for (auto &x : row)
if (x > max_val) x = max_val;
else if (x < -max_val) x = -max_val;
}
// softmax activation function, 1D input vector
vector<double> softmax(const vector<double> &z)
{
int m = z.size();
vector<double> y(m, 0);
//maximum value of vector
double maxVal = *max_element(z.begin(), z.end());
double sum = 0;
// Subtract max for numerical stability
for (int j = 0; j < m; j++) {
y[j] = exp(z[j] - maxVal);
sum += y[j];
}
// Normalize
for (int j = 0; j < m; j++)
y[j] /= sum;
return y;
}
// softmax activation function, 2D input vector
matrixd softmax(const matrixd& input)
{
int n = input.size(), m = input[0].size(); // n x m matrix
// 2D output vector n x m y[n][m]
vector<vector<double>> y(n, vector<double>(m));
for (int i = 0; i < n; i++) {
//maximum value of the row
double maxVal = *max_element(input[i].begin(), input[i].end());
double sum = 0;
// Subtract max for numerical stability
for (int j = 0; j < m; j++) {
y[i][j] = exp(input[i][j] - maxVal);
sum += y[i][j];
}
// Normalize
for (int j = 0; j < input[i].size(); j++)
y[i][j] /= sum;
}
return y;
}
// derivative function of softmax
matrixd softmax_derivative(const matrixd &dL_dP, const matrixd &P)
{
int nt = P.size(), ns = P[0].size();
matrixd dL_dS(nt, vector<double>(ns,0));
for (int i = 0; i < nt; i++)
{
double dot = 0.0;
for (int j = 0; j < ns; j++)
dot += P[i][j] * dL_dP[i][j]; // dot product of two row-vectors
for (int j = 0; j < ns; j++) // dot used by all columns of dL/dA
dL_dS[i][j] = P[i][j] * (dL_dP[i][j] - dot);
}
return dL_dS;
} |
Transformer Classes:
transGpt.h:
#ifndef TRANSGPT_H
#define TRANSGPT_H
class Tokenizer
{
private:
unordered_map<string, int> words;
unordered_map<int, string> wordIndex;
int nTokens; // number of words
// special tokens
const string PAD = "<PAD>";
const string UNK = "<UNK>";
const string BOS = "<BOS>";
const string EOS = "<EOS>";
const string SEP = "<SEP>";
void addWord(const string &w)
{
if (words.find(w) == words.end()) {
int id = nTokens;
words[w] = id;
wordIndex[id] = w;
nTokens++;
}
}
vector<string> split(const string &str)
{
vector<string> tokens;
stringstream ss(str);
string word;
while (ss >> word) {
tokens.push_back(word);
}
return tokens;
}
public:
Tokenizer();
Tokenizer(ifstream &fs);
unordered_map<string, int> getTokensWords();
vector<int> tokenize(const string& str);
string detokenize(const vector<int>& tokenIDs);
int get_nTokens() const;
unordered_map<int, string> get_wordIndex();
};
class Embedding
{
private:
int nTokens; // number of words
int dim; // embedding dimension
matrixd em_matrix; // embedding matrix;
public:
Embedding(int sequence_length, int embeddingDimension);
// create embedding matrix with tokenIDs
matrixd embed(const vector<int>& tokenIDs);
int get_embedDim() const;
int get_nTokens() const;
};
// Positional Encoding
class PositionalEncoding
{
private:
int maxTokens; // maximum sequence length
int dModel; // embedding dimension
matrixd PE; // positional_encodings matrix;
public:
PositionalEncoding(int max_seq_length, int embedding_dimension);
matrixd addPE(const matrixd& embeddings);
matrixd getPE();
};
// Add & Norm
class AddnNorm
{
private:
vector<double> gamma; // size dModel
vector<double> beta; // size dModel
matrixd Zhat; // normalized values (nt x d)
vector<double> mean; // per row (nt)
vector<double> var; // per row (nt)
double eps; // small value
public:
// constructor
AddnNorm(int dModel);
// forward computation
matrixd forward (const matrixd &A, const matrixd &B);
// Assume forward already filled: Zhat, mean, var
matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta);
// Optional setters (to plug in forward results)
void setCache(const matrixd& zhat, const vector<double>& m,
const vector<double>& v);
// Accessors (optional)
const vector<double>& getGamma();
const vector<double>& getBeta();
};
// Feed-forward Network
class FeedForward
{
private:
matrixd W1, W2;
int dModel, d_ff;
// cache
matrixd F1; // after activation function ReLU
matrixd Z; // before ReLU (needed for derivative)
public:
// constructor
FeedForward(int dModel, int d_ff_);
// ------------------------------
// Forward (for cache)
// ------------------------------
matrixd FFoutput(const matrixd &X);
// ------------------------------
// Backprop
// ------------------------------
matrixd backProp(const matrixd &X, const matrixd &dL_dY, double eta);
};
// ----------- MultiHeadAttention -----------
class MultiHeadAttention {
private:
int dModel, nHeads, d_k;
vector<matrixd> WQ, WK, WV; // per head
matrixd WO;
// cache
vector<matrixd> Qs, Ks, Vs, Ps, As;
public:
MultiHeadAttention(int dModel, int nHeads);
matrixd computeAttention(const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,
bool mask);
matrixd backProp( const matrixd &X_Q, const matrixd &X_K, const matrixd &X_V,
const matrixd &dL_dH, double eta, bool mask);
};
// ----------------- OutputLayer ---------------
class OutputLayer {
private:
matrixd W; // dModel x vocab_size
int dModel, vocab_size;
public:
OutputLayer(int dModel, int vocab_size);
matrixd forward(const matrixd &X);
matrixd backward(const matrixd &X, const matrixd &P,
const vector<int> &targets, double eta);
};
// ------------------- GPT-style Transformer ---------------
class GPTBlock {
private:
MultiHeadAttention mha;
AddnNorm norm1;
FeedForward ffn;
AddnNorm norm2;
public:
GPTBlock(int dModel, int nHeads, int d_ff);
matrixd forward(const matrixd &X);
matrixd backward(const matrixd &X, const matrixd &dL_dY, double eta);
};
class MiniGPT {
private:
Embedding embed;
PositionalEncoding pe;
vector<GPTBlock> layers;
OutputLayer output;
int seq_len, vocab_size;
public:
MiniGPT(int vocab_size, int seq_len, int dModel,
int nHeads, int d_ff, int nLayers);
matrixd forward(const vector<int> &tokens);
double trainStep(const vector<int> &tokens, double eta);
};
#endif
|
transGpt.cpp:
// http://forejune.co/cuda
// transGpt.cpp : classes of GPT-style Transformer
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <unordered_map>
#include <cmath>
#include <random>
#include <algorithm>
#include <iomanip>
#include "util.h"
#include "transGpt.h"
using namespace std;
// ------------------------ Tokenizer class --------------------------
// -------------------------------
// Default constructor
// -------------------------------
Tokenizer::Tokenizer()
{
nTokens = 0;
// Initialize special tokens first
addWord(PAD); // 0
addWord(UNK); // 1
addWord(BOS); // 2
addWord(EOS); // 3
addWord(SEP); // 4
}
// -------------------------------
// Build vocab from file
// -------------------------------
Tokenizer::Tokenizer(ifstream &fs) : Tokenizer()
{
string line;
while (getline(fs, line)) {
vector<string> tokens = split(line); //split(line);
for (int i = 0; i < tokens.size(); i++)
addWord(tokens[i]);
}
}
unordered_map<string, int> Tokenizer::getTokensWords()
{
return words;
}
vector<int> Tokenizer::tokenize(const string& str)
{
vector<int> ids;
vector<string> tokens = split(str);
for (int i = 0; i < tokens.size(); i++) {
string w = tokens[i];
if (words.find(w) != words.end())
ids.push_back(words[w]);
else
ids.push_back(words[UNK]);
}
return ids;
}
string Tokenizer::detokenize(const vector<int>& tokenIDs)
{
string words;
int n = tokenIDs.size();
for (int i = 0; i < n; i++) {
int id = tokenIDs[i];
if (wordIndex.find(id) == wordIndex.end())
continue;
string w = wordIndex[id];
// Skip special tokens (optional behavior)
if (w == "<PAD>" || w == "<BOS>" || w == "<EOS>")
continue;
words += w + " ";
}
return words;
}
int Tokenizer::get_nTokens() const
{
return nTokens;
}
unordered_map<int, string>Tokenizer::get_wordIndex()
{
return wordIndex;
}
//------------------ Embedding Class ------------------
Embedding::Embedding(int sequence_length, int embeddingDimension)
{
nTokens = sequence_length;
dim = embeddingDimension;
// initialize embeddings as an nTokensxdim matrix with certain random values
initMatrix(em_matrix, nTokens, dim);
}
// create embedding matrix with tokenIDs
matrixd Embedding::embed(const vector<int>& tokenIDs)
{
matrixd embeddings;
embeddings.reserve(tokenIDs.size()); // set capacity of embeddings vector
for(int i = 0; i < tokenIDs.size(); i++) {
if (tokenIDs[i] >= 0 && tokenIDs[i] < nTokens)
embeddings.push_back(em_matrix[tokenIDs[i]]);
else
// Out-of-dictionary tokens
embeddings.push_back(vector<double>(dim, 0.0));
}
return embeddings;
}
int Embedding::get_embedDim() const
{
return dim;
}
int Embedding::get_nTokens() const
{
return nTokens;
}
// --------------------- Positional Encoding class ---------------
PositionalEncoding::PositionalEncoding(int max_seq_length, int embed_dimension)
{
maxTokens = max_seq_length;
dModel = embed_dimension;
PE.resize(maxTokens, vector<double>(dModel));
for (int pos = 0; pos < maxTokens; pos++) {
for (int i = 0; i < dModel/2; i++) {
PE[pos][2*i] = sin( pos / pow(10000, 2.0*i/dModel) );
PE[pos][2*i+1] = cos( pos / pow(10000, 2.0*i/dModel) );
}
}
}
// add positional encodings to embeddings
matrixd PositionalEncoding::addPE(const matrixd& embeddings)
{
matrixd x = embeddings;
int nTokens = embeddings.size();
for (int pos = 0; pos < nTokens; pos++)
for (int i = 0; i < dModel; i++)
x[pos][i] += PE[pos][i];
return x;
}
matrixd PositionalEncoding::getPE()
{
return PE;
}
// -------------------- Add & Norm Class---------------------------------
// constructor
AddnNorm::AddnNorm(int dModel)
{
gamma.assign(dModel, 1.0);
beta.assign(dModel, 0.0),
eps = 1e-5;
}
matrixd AddnNorm::forward (const matrixd &A, const matrixd &B)
{
matrixd x = addMat(A, B); // add two matrices
int n = x.size(); // rows
int m = x[0].size(); // cols
mean.resize( n );
var.resize( n );
Zhat.resize(n, vector<double>(m, 0));
matrixd Y(n, vector |
testMain.cpp:
// http://forejune.co/cuda/
// testMain.cpp
#include "util.h"
#include "transGpt.h"
#include <fstream>
using namespace std;
vector<int> generate(MiniGPT &model, vector<int> prompt, int max_len)
{
vector<int> tokens = prompt;
for (int step = 0; step < max_len; step++) {
vector<int> input(tokens.begin(), tokens.end());
matrixd P = model.forward(input);
vector<double> last = P.back(); //last row of matrix P
// last contains probability distribution over the vocabulary for the next token
// each index in last corresponds to a token ID in the vocabulary
// greedy decoding, - last.begin() converts interator to index
int next = max_element(last.begin(), last.end()) - last.begin();
tokens.push_back(next);
}
return tokens;
}
vector<vector<int>> buildDataset(Tokenizer &tokenizer)
{
vector<vector<int>> dataset;
string str = "one two three uno dos tres";
vector<int> tokenIDs = tokenizer.tokenize(str);
dataset.push_back( tokenIDs );
str = "two three four dos tres cuatro ";
tokenIDs = tokenizer.tokenize(str);
dataset.push_back( tokenIDs );
str = "three four five tres cuatro cinco";
tokenIDs = tokenizer.tokenize(str);
dataset.push_back( tokenIDs );
str = "one two three uno dos tres";
tokenIDs = tokenizer.tokenize(str);
dataset.push_back( tokenIDs );
str = "three four five tres cuatro cinco";
tokenIDs = tokenizer.tokenize(str);
dataset.push_back( tokenIDs );
return dataset;
}
int main()
{
// -------------------------
// Tokenizer setup
// -------------------------
// Build vocabulary
ifstream ifs;
char fname[] = "t.txt";
ifs.open( fname );
if (!ifs.is_open()) {
cerr << "Unable to open file " << fname << endl;
exit( 1 );
}
Tokenizer tokenizer( ifs );
int vocab_size = tokenizer.get_nTokens();
cout << "Vocab size: " << vocab_size << endl;
// -------------------------
// Model
// -------------------------
int seq_len = 50; // maximum sequence length model can handle
int dModel = 32;
int nHeads = 4;
int d_ff = 128;
int nLayers = 2;
MiniGPT model(vocab_size, seq_len, dModel, nHeads, d_ff, nLayers);
// -------------------------
// Dataset
// -------------------------
vector<vector<int>> dataset;
dataset = buildDataset( tokenizer );
// -------------------------
// Training
// -------------------------
for (int epoch = 0; epoch < 201; epoch++) {
double loss = 0;
for (auto &seq : dataset)
loss += model.trainStep(seq, 0.002);
if (epoch % 20 == 0)
cout << "Epoch " << epoch << " Loss: " << loss << endl;
}
// -------------------------
// Inference
// -------------------------
string prompt = "one two three";
vector<int> prompt_tokens = tokenizer.tokenize(prompt);
cout << "\nPrompt: " << prompt << endl;
vector<int> generated = generate(model, prompt_tokens, 3);
string output = tokenizer.detokenize(generated);
cout << "\nGenerated text:\n" << output << endl;
prompt = "three four five";
prompt_tokens = tokenizer.tokenize(prompt);
cout << "\nPrompt: " << prompt << endl;
generated = generate(model, prompt_tokens, 3);
output = tokenizer.detokenize(generated);
cout << "\nGenerated text:\n" << output << endl;
return 0;
}
|
Makefile:
PROG = testMain #source codes SRCS = $(PROG).cpp #substitute .cpp by .o to obtain object filenames OBJS = $(SRCS:.cpp=.o) util.o transGpt.o #$< evaluates to the target's dependencies, #$@ evaluates to the target $(PROG): $(OBJS) g++ -o $@ $(OBJS) $(OBJS): g++ -c -std=c++20 $*.cpp clean: rm $(OBJS) $(PROG) |
|
|