Materials available at: https://forejune.co/cuda/
Reinforcement Learning
|
|
Actions may affect not only the immediate reward
but also the next situations and all subsequent rewards
|
|
|
Tic-Tac-Toe Game
|
|
|
Player — X or O. A cell (square) is Empty (0), X (1), or O (2). A state is a configuration of the 3 x 3 board, so the total number of states is $3^9 = 19683$. Label the cells from 0 to 8 and represent a state by an integer:
Value of the state $= 0\cdot3^0 + 1\cdot3^1 + 0\cdot3^2 + 2\cdot3^3 + 2\cdot3^4 + 1\cdot3^5 + 1\cdot3^6 + 0\cdot3^7 + 0\cdot3^8 = 1191$
Digit positions (base 3, most significant first): 8 7 6 5 4 3 2 1 0 — digits: 0 0 1 1 2 2 0 1 0
// Encode the 3x3 board as a base-3 integer: cell i contributes board[i] * 3^i.
// board[] holds 9 cell values in {0, 1, 2}; the result lies in [0, 3^9).
int getState( int board[] )
{
    int s = 0;
    int weight = 1;                      // 3^i for the current cell
    for (int i = 0; i < 9; i++, weight *= 3)
        s += weight * board[i];
    return s;
}
|
#define Nstates 19683
double Q[Nstates][9]; // Q[state][action]
|
// Q-learning update: move Q[state][action] a fraction alpha toward the target
// reward + gamma * max_a Q[nextState][a].
void updateQtable(int state, int action, double reward, int nextState)
{
    // Best achievable value from the successor state (never below 0.0)
    double maxQnext = 0.0;
    for (int j = 0; j < 9; j++) {
        if (Q[nextState][j] > maxQnext)
            maxQnext = Q[nextState][j];
    }
    double target = reward + gamma * maxQnext;
    Q[state][action] += alpha * (target - Q[state][action]);
}
|
//choose best location (action) to mark X or O
// Epsilon-greedy move selection for the current player.
// With probability epsilon (when explore is enabled) a uniformly random empty
// cell is returned; otherwise the empty cell with the largest Q[s][i].
// Returns -1 when the board has no empty cell left.
int chooseAction(int s, bool explore)
{
    // rand() is consumed unconditionally, exactly as in the greedy path too
    double roll = (double)rand() / RAND_MAX;
    if (explore && roll < epsilon) {
        // Exploration: collect the empty cells, then pick one at random
        int moves[9], n = 0;
        for (int i = 0; i < 9; i++) {
            if (board[i] == EMPTY)
                moves[n++] = i;
        }
        return (n == 0) ? -1 : moves[rand() % n];
    }
    // Exploitation: empty cell with the maximum quality value
    int bestMove = -1;
    double max_q = -1000;
    for (int i = 0; i < 9; i++) {
        if (board[i] != EMPTY)
            continue;
        if (Q[s][i] > max_q) {
            max_q = Q[s][i];
            bestMove = i;
        }
    }
    return bestMove;
}
|
CUDA Implementation
cuRAND — a GPU-accelerated random number generation (RNG) library from Nvidia. curandState_t — we create an array of curandState_t objects to hold RNG state. curand_init() — initializes each state using a specified seed, sequence number, and offset. curand() — generates a sequence of pseudo-random numbers. curand_uniform() — generates uniformly distributed floating-point numbers in (0, 1].
#include <curand.h>
#include <curand_kernel.h>
cudaError_t cudaMemcpyFromSymbol (
void *dst,
const char * symbol,
size_t count,
size_t offset = 0,
enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost
)
Parameters:
dst - Destination memory address
symbol - Symbol source from device
count - Size in bytes to copy
offset - Offset from start of symbol in bytes
kind - Type of transfer
Returns:
cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol, cudaErrorInvalidDevicePointer,
cudaErrorInvalidMemcpyDirection
cudaDeviceSynchronize()
Returns
cudaSuccess
Thread 0 -- miscellaneous work Thread 1 -- Player X Thread 2 -- Player O
// tictactoe.cu
// CUDA implementation of Q-Learning for Tic-Tac-Toe
//
// trainKernel thread layout (one block of 3 threads):
//   thread 0 -- episode bookkeeping, thread 1 -- player X, thread 2 -- player O.
#include <iostream>
#include <ctime>
#include <cstdlib>
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace std;

// Learning hyper-parameters
const double alpha   = 0.1;   // learning rate
const double Gamma   = 0.9;   // discount factor ("gamma" would collide with ::gamma)
const double epsilon = 0.1;   // exploration probability

#define Nstates 19683         // 3^9 board configurations
#define BLOCK_SIZE 256

enum Player {EMPTY = 0, X, O};

// Device globals
__device__ Player d_board[9];                        // board shared by both player threads
__device__ double d_Q[Nstates][9];                   // Q-table: Q[state][action]
__device__ curandState_t d_rand_states[2][Nstates];  // per-player RNG streams (fix: stray ';' removed)

// Host board for display purposes
Player h_board[9];

/*
 * Abort with a readable message when a CUDA API call fails.
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

/*
 * Kernel: zero the Q-table and seed both players' RNG streams.
 * Launch with at least Nstates threads in total; extras do nothing.
 */
__global__ void initQAndRandomStates(unsigned int seed, unsigned int seed1)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < Nstates) {
        for (int i = 0; i < 9; i++)
            d_Q[idx][i] = 0.0;
        // One independent sequence per (player, slot) pair
        curand_init(seed,  idx, 0, &d_rand_states[0][idx]);
        curand_init(seed1, idx, 0, &d_rand_states[1][idx]);
    }
}

/* Clear the board to all-EMPTY. */
__device__ void resetBoard()
{
    for (int i = 0; i < 9; i++)
        d_board[i] = EMPTY;
}

/* Encode the board as a base-3 integer: cell i contributes d_board[i] * 3^i. */
__device__ int getState()
{
    int s = 0, m = 1;
    for (int i = 0; i < 9; i++) {
        s += m * d_board[i];
        m = m * 3;
    }
    return s;
}

/* True when 'player' owns any of the 8 winning lines. */
__device__ bool isWinning(Player player)
{
    const int w[8][3] = {
        {0,1,2},{3,4,5},{6,7,8},
        {0,3,6},{1,4,7},{2,5,8},
        {0,4,8},{2,4,6}
    };
    for (int i = 0; i < 8; i++) {
        if (d_board[w[i][0]] == player &&
            d_board[w[i][1]] == player &&
            d_board[w[i][2]] == player) {
            return true;
        }
    }
    return false;
}

/* True when no empty cell remains. */
__device__ bool isDraw()
{
    for (int i = 0; i < 9; i++)
        if (d_board[i] == EMPTY) return false;
    return true;
}

/*
 * Epsilon-greedy action selection (device version).
 * Returns a cell index 0..8, or -1 when the board is full.
 */
__device__ int chooseAction(int s, bool explore, curandState_t* state)
{
    if (explore && curand_uniform(state) < epsilon) {
        // Exploration: uniform choice among the empty cells
        int n = 0;
        int moves[9];
        for (int i = 0; i < 9; i++)
            if (d_board[i] == EMPTY) moves[n++] = i;
        if (n == 0) return -1;
        // Bug fix: curand_uniform returns a float in (0, 1], so the scaled
        // index can land exactly on n -- clamp to stay inside moves[].
        int idx = (int)(curand_uniform(state) * n);
        if (idx >= n) idx = n - 1;
        return moves[idx];
    } else {
        // Exploitation: empty cell with the maximum Q-value
        int bestMove = -1;
        double max_q = -1000.0;
        for (int i = 0; i < 9; i++) {
            if (d_board[i] == EMPTY) {
                double q = d_Q[s][i];
                if (q > max_q) {
                    max_q = q;
                    bestMove = i;
                }
            }
        }
        return bestMove;
    }
}

/*
 * Q-learning update (device version):
 *   Q[s][a] += alpha * (reward + Gamma * max_a' Q[s'][a'] - Q[s][a])
 */
__device__ void updateQtable(int state, int action, double reward, int nextState)
{
    double maxQnext = 0.0;
    for (int j = 0; j < 9; j++) {
        double q = d_Q[nextState][j];
        if (q > maxQnext) maxQnext = q;
    }
    d_Q[state][action] += alpha * (reward + Gamma * maxQnext - d_Q[state][action]);
}

/*
 * Self-play training kernel. Must be launched as <<<1, 3>>>:
 * thread 0 advances the episode counter, threads 1 and 2 play X and O.
 *
 * NOTE(review): the two player threads hand the move back and forth through
 * shared variables inside the inner loop WITHOUT a barrier. The original
 * author reports device atomics misbehave on their machine, so instead the
 * shared scalars are declared volatile here, forcing every read to go to
 * shared memory rather than a stale register. The empty printf() is kept as
 * the author's deliberate delay/ordering hack -- confirm on target hardware.
 */
__global__ void trainKernel(int episodes)
{
    // volatile: communicated between threads with no __syncthreads() in between
    __shared__ volatile int  episode;
    __shared__ volatile int  turn;        // whose move it is (1 = X, 2 = O)
    __shared__ volatile bool youLost[3];  // winner flags the loser here ([0] unused)
    __shared__ volatile int  nextState;   // board state after the latest move
    __shared__ volatile bool draw;

    int tid = threadIdx.x;                // 0: bookkeeping, 1: X, 2: O
    if (tid == 0) episode = 0;
    __syncthreads();

    while (episode < episodes) {
        Player currentPlayer;
        if (tid == 0) {
            resetBoard();
            turn = 1;                     // X always opens
        }
        __syncthreads();

        if (tid > 0) {
            // Per-episode RNG slot. Improvement: the evolved state is written
            // back below, so sequences keep advancing instead of replaying
            // once the episode index wraps past Nstates.
            int epIdx = episode % Nstates;
            curandState_t local_state = d_rand_states[tid - 1][epIdx];
            currentPlayer = (Player) tid;
            int action[9], state[9], n = 0;
            youLost[tid] = false;
            draw = false;

            int k = 0;
            while (k++ < 100) {           // hard cap; a game lasts at most 9 plies
                double reward;
                if (tid == turn) {
                    if (draw) break;      // opponent just filled the board
                    if (youLost[tid]) {
                        // Opponent won: punish every move made this episode
                        reward = -1;
                        for (int i = 0; i < n; i++) {
                            if (state[i] >= 0)
                                updateQtable(state[i], action[i], reward, nextState);
                        }
                        break;
                    }
                    state[n]  = getState();
                    action[n] = chooseAction(state[n], true, &local_state);
                    if (action[n] < 0) break;          // no space left
                    d_board[action[n]] = currentPlayer;
                    nextState = getState();
                    bool win = isWinning(currentPlayer);
                    draw = isDraw();
                    n++;
                    int t = (tid == 1) ? 2 : 1;        // index of the other thread
                    if (win || draw) {
                        if (win) {
                            reward = 1;
                            youLost[t] = true;
                        } else {
                            reward = 0;                // draw: neutral for the mover
                        }
                        for (int i = 0; i < n; i++) {
                            if (state[i] >= 0)
                                updateQtable(state[i], action[i], reward, nextState);
                        }
                        turn = t;  // hand over so the loser can observe the result
                        break;
                    }
                    turn = t;             // pass the move to the opponent
                    printf("");           // time delay (author's ordering hack)
                }
            } // while k
            d_rand_states[tid - 1][epIdx] = local_state;
        } else { // tid == 0
            episode++;
        }
        __syncthreads();                  // all three agree the episode is over
    } // while episode < episodes
}

/*
 * Host wrapper: optionally (re)initialize the Q-table and RNG states,
 * then run 'episodes' of device self-play.
 */
void train(int episodes, bool initialize)
{
    if (initialize) {
        int blocks = (Nstates + BLOCK_SIZE - 1) / BLOCK_SIZE;  // integer ceil-div
        unsigned int seed  = time(0);
        unsigned int seed1 = time(0) + 31415926;
        initQAndRandomStates<<< blocks, BLOCK_SIZE >>>(seed, seed1);
        checkCuda(cudaGetLastError(), "initQAndRandomStates launch");
        checkCuda(cudaDeviceSynchronize(), "initQAndRandomStates sync");
    }
    // One block of 3 cooperating threads (see trainKernel's contract)
    trainKernel<<< 1, 3 >>>(episodes);
    checkCuda(cudaGetLastError(), "trainKernel launch");
    checkCuda(cudaDeviceSynchronize(), "trainKernel sync");
    cout << "Training completed!" << endl;
}

/*
 * Print the host board as a 3x3 grid (host version).
 */
void printBoard()
{
    char symbols[3] = {' ', 'X', 'O'};
    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 3; ++j) {
            Player p = h_board[3*i + j];
            cout << symbols[p];
            if (j < 2) cout << " | ";
        }
        cout << endl;
        if (i < 2) cout << "---------" << endl;
    }
}

/*
 * Interactive game: the human plays O against the trained AI (X).
 * The Q-table is copied to the host once; all play is then CPU-side.
 */
void play()
{
    // Bug fix: ~1.4 MB is too large for the stack -- use static storage.
    static double h_Q[Nstates][9];
    checkCuda(cudaMemcpyFromSymbol(h_Q, d_Q, sizeof(double) * Nstates * 9),
              "copy Q-table to host");

    // Reset board
    for (int i = 0; i < 9; i++) h_board[i] = EMPTY;
    Player currentPlayer = X;
    cout << "You are 'O'. AI is 'X'!" << endl;

    while (true) {
        if (currentPlayer == O) {
            // Human's turn
            int move;
            cout << "Your move (0-8): ";
            cin >> move;
            if (move < 0 || move > 8 || h_board[move] != EMPTY) {
                cout << "Invalid move. Try again!" << endl;
                continue;
            }
            h_board[move] = O;
        } else {
            // AI's turn: encode the state, then choose greedily
            int state = 0, m = 1;
            for (int i = 0; i < 9; i++) {
                state += m * h_board[i];
                m = m * 3;
            }
            int bestMove = -1;
            double max_q = -1000.0;
            for (int i = 0; i < 9; i++) {
                if (h_board[i] == EMPTY) {
                    double q = h_Q[state][i];
                    if (q > max_q) {
                        max_q = q;
                        bestMove = i;
                    }
                }
            }
            if (bestMove == -1) {
                cout << "No valid moves left!" << endl;
                break;
            }
            cout << "AI moves at " << bestMove << endl;
            h_board[bestMove] = X;
        }
        printBoard();

        // Win check for the player who just moved
        bool win = false;
        const int w[8][3] = {
            {0,1,2},{3,4,5},{6,7,8},
            {0,3,6},{1,4,7},{2,5,8},
            {0,4,8},{2,4,6}
        };
        for (int i = 0; i < 8; i++) {
            if (h_board[w[i][0]] == currentPlayer &&
                h_board[w[i][1]] == currentPlayer &&
                h_board[w[i][2]] == currentPlayer) {
                win = true;
                break;
            }
        }
        if (win) {
            cout << (currentPlayer == X ? "AI wins!" : "You win!") << endl;
            break;
        }
        bool draw = true;
        for (int i = 0; i < 9; i++) {
            if (h_board[i] == EMPTY) { draw = false; break; }
        }
        if (draw) {
            cout << "It's a draw!" << endl;
            break;
        }
        currentPlayer = (currentPlayer == X ? O : X);
    }
}

int main()
{
    srand(time(0));
    // Train the AI from scratch (fresh Q-table and RNG states)
    train(50000, true);
    // Play against the AI, with light extra training between games
    while (true) {
        play();
        char c;
        cout << "Play another one (y/n)? ";
        cin >> c;
        if (c == 'n') break;
        train(200, false);  // further training without re-initialization
    }
    return 0;
}
![]()
![]()
![]()