Materials available at: https://forejune.co/cuda/
Reinforcement Learning
|
|
Actions may affect not only the immediate reward
but also the next situations and all subsequent rewards
|
|
|
Tic-Tac-Toe Game
|
|
|
Player — X or O. A cell (square) is Empty (0), X (1), or O (2). A state is a configuration of the 3 x 3 board, so the total number of states is $3^9 = 19683$. Label the cells from 0 to 8 and represent a state by an integer:
Value of the state $= 0\cdot3^0 + 1\cdot3^1 + 0\cdot3^2 + 2\cdot3^3 + 2\cdot3^4 + 1\cdot3^5 + 1\cdot3^6 + 0\cdot3^7 + 0\cdot3^8 = 1191$
Digit positions (base 3, most significant first): 8 7 6 5 4 3 2 1 0 — digits: 0 0 1 1 2 2 0 1 0
// Encode the 3x3 board as a base-3 integer: cell i contributes board[i] * 3^i.
// board[] holds 9 cell values in {0, 1, 2}; the result lies in [0, 3^9).
int getState( int board[] )
{
    int s = 0;
    int weight = 1;                      // 3^i for the current cell
    for (int i = 0; i < 9; i++, weight *= 3)
        s += weight * board[i];
    return s;
}
|
#define Nstates 19683
double Q[Nstates][9]; // Q[state][action]
|
// Q-learning update: move Q[state][action] a fraction alpha toward the target
// reward + gamma * max_a Q[nextState][a].
void updateQtable(int state, int action, double reward, int nextState)
{
    // Best achievable value from the successor state (never below 0.0)
    double maxQnext = 0.0;
    for (int j = 0; j < 9; j++) {
        if (Q[nextState][j] > maxQnext)
            maxQnext = Q[nextState][j];
    }
    double target = reward + gamma * maxQnext;
    Q[state][action] += alpha * (target - Q[state][action]);
}
|
//choose best location (action) to mark X or O
// Epsilon-greedy move selection for the current player.
// With probability epsilon (when explore is enabled) a uniformly random empty
// cell is returned; otherwise the empty cell with the largest Q[s][i].
// Returns -1 when the board has no empty cell left.
int chooseAction(int s, bool explore)
{
    // rand() is consumed unconditionally, exactly as in the greedy path too
    double roll = (double)rand() / RAND_MAX;
    if (explore && roll < epsilon) {
        // Exploration: collect the empty cells, then pick one at random
        int moves[9], n = 0;
        for (int i = 0; i < 9; i++) {
            if (board[i] == EMPTY)
                moves[n++] = i;
        }
        return (n == 0) ? -1 : moves[rand() % n];
    }
    // Exploitation: empty cell with the maximum quality value
    int bestMove = -1;
    double max_q = -1000;
    for (int i = 0; i < 9; i++) {
        if (board[i] != EMPTY)
            continue;
        if (Q[s][i] > max_q) {
            max_q = Q[s][i];
            bestMove = i;
        }
    }
    return bestMove;
}
|
CUDA Implementation
cuRAND — a GPU-accelerated random number generation (RNG) library from Nvidia. curandState_t — we create an array of curandState_t objects to hold RNG state. curand_init() — initializes each state using a specified seed, sequence number, and offset. curand() — generates a sequence of pseudo-random numbers. curand_uniform() — generates uniformly distributed floating-point numbers in (0, 1].
#include <curand.h>
#include <curand_kernel.h>
cudaError_t cudaMemcpyFromSymbol (
void *dst,
const char * symbol,
size_t count,
size_t offset = 0,
enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost
)
Parameters:
dst - Destination memory address
symbol - Symbol source from device
count - Size in bytes to copy
offset - Offset from start of symbol in bytes
kind - Type of transfer
Returns:
cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol, cudaErrorInvalidDevicePointer,
cudaErrorInvalidMemcpyDirection
cudaDeviceSynchronize()
Returns
cudaSuccess
Thread 0 -- miscellaneous work Thread 1 -- Player X Thread 2 -- Player O
// tictactoe.cu
// CUDA implementation of Q-Learning for Tic-Tac-Toe
//
// trainKernel thread layout (one block of 3 threads):
//   thread 0 -- episode bookkeeping, thread 1 -- player X, thread 2 -- player O.
#include <iostream>
#include <ctime>
#include <cstdlib>
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace std;

// Learning hyper-parameters
const double alpha   = 0.1;   // learning rate
const double Gamma   = 0.9;   // discount factor ("gamma" would collide with ::gamma)
const double epsilon = 0.1;   // exploration probability

#define Nstates 19683         // 3^9 board configurations
#define BLOCK_SIZE 256

enum Player {EMPTY = 0, X, O};

// Device globals
__device__ Player d_board[9];                        // board shared by both player threads
__device__ double d_Q[Nstates][9];                   // Q-table: Q[state][action]
__device__ curandState_t d_rand_states[2][Nstates];  // per-player RNG streams (fix: stray ';' removed)

// Host board for display purposes
Player h_board[9];

/*
 * Abort with a readable message when a CUDA API call fails.
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

/*
 * Kernel: zero the Q-table and seed both players' RNG streams.
 * Launch with at least Nstates threads in total; extras do nothing.
 */
__global__ void initQAndRandomStates(unsigned int seed, unsigned int seed1)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < Nstates) {
        for (int i = 0; i < 9; i++)
            d_Q[idx][i] = 0.0;
        // One independent sequence per (player, slot) pair
        curand_init(seed,  idx, 0, &d_rand_states[0][idx]);
        curand_init(seed1, idx, 0, &d_rand_states[1][idx]);
    }
}

/* Clear the board to all-EMPTY. */
__device__ void resetBoard()
{
    for (int i = 0; i < 9; i++)
        d_board[i] = EMPTY;
}

/* Encode the board as a base-3 integer: cell i contributes d_board[i] * 3^i. */
__device__ int getState()
{
    int s = 0, m = 1;
    for (int i = 0; i < 9; i++) {
        s += m * d_board[i];
        m = m * 3;
    }
    return s;
}

/* True when 'player' owns any of the 8 winning lines. */
__device__ bool isWinning(Player player)
{
    const int w[8][3] = {
        {0,1,2},{3,4,5},{6,7,8},
        {0,3,6},{1,4,7},{2,5,8},
        {0,4,8},{2,4,6}
    };
    for (int i = 0; i < 8; i++) {
        if (d_board[w[i][0]] == player &&
            d_board[w[i][1]] == player &&
            d_board[w[i][2]] == player) {
            return true;
        }
    }
    return false;
}

/* True when no empty cell remains. */
__device__ bool isDraw()
{
    for (int i = 0; i < 9; i++)
        if (d_board[i] == EMPTY) return false;
    return true;
}

/*
 * Epsilon-greedy action selection (device version).
 * Returns a cell index 0..8, or -1 when the board is full.
 */
__device__ int chooseAction(int s, bool explore, curandState_t* state)
{
    if (explore && curand_uniform(state) < epsilon) {
        // Exploration: uniform choice among the empty cells
        int n = 0;
        int moves[9];
        for (int i = 0; i < 9; i++)
            if (d_board[i] == EMPTY) moves[n++] = i;
        if (n == 0) return -1;
        // Bug fix: curand_uniform returns a float in (0, 1], so the scaled
        // index can land exactly on n -- clamp to stay inside moves[].
        int idx = (int)(curand_uniform(state) * n);
        if (idx >= n) idx = n - 1;
        return moves[idx];
    } else {
        // Exploitation: empty cell with the maximum Q-value
        int bestMove = -1;
        double max_q = -1000.0;
        for (int i = 0; i < 9; i++) {
            if (d_board[i] == EMPTY) {
                double q = d_Q[s][i];
                if (q > max_q) {
                    max_q = q;
                    bestMove = i;
                }
            }
        }
        return bestMove;
    }
}

/*
 * Q-learning update (device version):
 *   Q[s][a] += alpha * (reward + Gamma * max_a' Q[s'][a'] - Q[s][a])
 */
__device__ void updateQtable(int state, int action, double reward, int nextState)
{
    double maxQnext = 0.0;
    for (int j = 0; j < 9; j++) {
        double q = d_Q[nextState][j];
        if (q > maxQnext) maxQnext = q;
    }
    d_Q[state][action] += alpha * (reward + Gamma * maxQnext - d_Q[state][action]);
}

/*
 * Self-play training kernel. Must be launched as <<<1, 3>>>:
 * thread 0 advances the episode counter, threads 1 and 2 play X and O.
 *
 * NOTE(review): the two player threads hand the move back and forth through
 * shared variables inside the inner loop WITHOUT a barrier. The original
 * author reports device atomics misbehave on their machine, so instead the
 * shared scalars are declared volatile here, forcing every read to go to
 * shared memory rather than a stale register. The empty printf() is kept as
 * the author's deliberate delay/ordering hack -- confirm on target hardware.
 */
__global__ void trainKernel(int episodes)
{
    // volatile: communicated between threads with no __syncthreads() in between
    __shared__ volatile int  episode;
    __shared__ volatile int  turn;        // whose move it is (1 = X, 2 = O)
    __shared__ volatile bool youLost[3];  // winner flags the loser here ([0] unused)
    __shared__ volatile int  nextState;   // board state after the latest move
    __shared__ volatile bool draw;

    int tid = threadIdx.x;                // 0: bookkeeping, 1: X, 2: O
    if (tid == 0) episode = 0;
    __syncthreads();

    while (episode < episodes) {
        Player currentPlayer;
        if (tid == 0) {
            resetBoard();
            turn = 1;                     // X always opens
        }
        __syncthreads();

        if (tid > 0) {
            // Per-episode RNG slot. Improvement: the evolved state is written
            // back below, so sequences keep advancing instead of replaying
            // once the episode index wraps past Nstates.
            int epIdx = episode % Nstates;
            curandState_t local_state = d_rand_states[tid - 1][epIdx];
            currentPlayer = (Player) tid;
            int action[9], state[9], n = 0;
            youLost[tid] = false;
            draw = false;

            int k = 0;
            while (k++ < 100) {           // hard cap; a game lasts at most 9 plies
                double reward;
                if (tid == turn) {
                    if (draw) break;      // opponent just filled the board
                    if (youLost[tid]) {
                        // Opponent won: punish every move made this episode
                        reward = -1;
                        for (int i = 0; i < n; i++) {
                            if (state[i] >= 0)
                                updateQtable(state[i], action[i], reward, nextState);
                        }
                        break;
                    }
                    state[n]  = getState();
                    action[n] = chooseAction(state[n], true, &local_state);
                    if (action[n] < 0) break;          // no space left
                    d_board[action[n]] = currentPlayer;
                    nextState = getState();
                    bool win = isWinning(currentPlayer);
                    draw = isDraw();
                    n++;
                    int t = (tid == 1) ? 2 : 1;        // index of the other thread
                    if (win || draw) {
                        if (win) {
                            reward = 1;
                            youLost[t] = true;
                        } else {
                            reward = 0;                // draw: neutral for the mover
                        }
                        for (int i = 0; i < n; i++) {
                            if (state[i] >= 0)
                                updateQtable(state[i], action[i], reward, nextState);
                        }
                        turn = t;  // hand over so the loser can observe the result
                        break;
                    }
                    turn = t;             // pass the move to the opponent
                    printf("");           // time delay (author's ordering hack)
                }
            } // while k
            d_rand_states[tid - 1][epIdx] = local_state;
        } else { // tid == 0
            episode++;
        }
        __syncthreads();                  // all three agree the episode is over
    } // while episode < episodes
}

/*
 * Host wrapper: optionally (re)initialize the Q-table and RNG states,
 * then run 'episodes' of device self-play.
 */
void train(int episodes, bool initialize)
{
    if (initialize) {
        int blocks = (Nstates + BLOCK_SIZE - 1) / BLOCK_SIZE;  // integer ceil-div
        unsigned int seed  = time(0);
        unsigned int seed1 = time(0) + 31415926;
        initQAndRandomStates<<< blocks, BLOCK_SIZE >>>(seed, seed1);
        checkCuda(cudaGetLastError(), "initQAndRandomStates launch");
        checkCuda(cudaDeviceSynchronize(), "initQAndRandomStates sync");
    }
    // One block of 3 cooperating threads (see trainKernel's contract)
    trainKernel<<< 1, 3 >>>(episodes);
    checkCuda(cudaGetLastError(), "trainKernel launch");
    checkCuda(cudaDeviceSynchronize(), "trainKernel sync");
    cout << "Training completed!" << endl;
}

/*
 * Print the host board as a 3x3 grid (host version).
 */
void printBoard()
{
    char symbols[3] = {' ', 'X', 'O'};
    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 3; ++j) {
            Player p = h_board[3*i + j];
            cout << symbols[p];
            if (j < 2) cout << " | ";
        }
        cout << endl;
        if (i < 2) cout << "---------" << endl;
    }
}

/*
 * Interactive game: the human plays O against the trained AI (X).
 * The Q-table is copied to the host once; all play is then CPU-side.
 */
void play()
{
    // Bug fix: ~1.4 MB is too large for the stack -- use static storage.
    static double h_Q[Nstates][9];
    checkCuda(cudaMemcpyFromSymbol(h_Q, d_Q, sizeof(double) * Nstates * 9),
              "copy Q-table to host");

    // Reset board
    for (int i = 0; i < 9; i++) h_board[i] = EMPTY;
    Player currentPlayer = X;
    cout << "You are 'O'. AI is 'X'!" << endl;

    while (true) {
        if (currentPlayer == O) {
            // Human's turn
            int move;
            cout << "Your move (0-8): ";
            cin >> move;
            if (move < 0 || move > 8 || h_board[move] != EMPTY) {
                cout << "Invalid move. Try again!" << endl;
                continue;
            }
            h_board[move] = O;
        } else {
            // AI's turn: encode the state, then choose greedily
            int state = 0, m = 1;
            for (int i = 0; i < 9; i++) {
                state += m * h_board[i];
                m = m * 3;
            }
            int bestMove = -1;
            double max_q = -1000.0;
            for (int i = 0; i < 9; i++) {
                if (h_board[i] == EMPTY) {
                    double q = h_Q[state][i];
                    if (q > max_q) {
                        max_q = q;
                        bestMove = i;
                    }
                }
            }
            if (bestMove == -1) {
                cout << "No valid moves left!" << endl;
                break;
            }
            cout << "AI moves at " << bestMove << endl;
            h_board[bestMove] = X;
        }
        printBoard();

        // Win check for the player who just moved
        bool win = false;
        const int w[8][3] = {
            {0,1,2},{3,4,5},{6,7,8},
            {0,3,6},{1,4,7},{2,5,8},
            {0,4,8},{2,4,6}
        };
        for (int i = 0; i < 8; i++) {
            if (h_board[w[i][0]] == currentPlayer &&
                h_board[w[i][1]] == currentPlayer &&
                h_board[w[i][2]] == currentPlayer) {
                win = true;
                break;
            }
        }
        if (win) {
            cout << (currentPlayer == X ? "AI wins!" : "You win!") << endl;
            break;
        }
        bool draw = true;
        for (int i = 0; i < 9; i++) {
            if (h_board[i] == EMPTY) { draw = false; break; }
        }
        if (draw) {
            cout << "It's a draw!" << endl;
            break;
        }
        currentPlayer = (currentPlayer == X ? O : X);
    }
}

int main()
{
    srand(time(0));
    // Train the AI from scratch (fresh Q-table and RNG states)
    train(50000, true);
    // Play against the AI, with light extra training between games
    while (true) {
        play();
        char c;
        cout << "Play another one (y/n)? ";
        cin >> c;
        if (c == 'n') break;
        train(200, false);  // further training without re-initialization
    }
    return 0;
}
![]()
![]()
![]()