Arrays Addition in CUDA
C/C++
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
const int N = 10000000;
#define MAX_ERR 1e-6
// Host (CPU) element-wise addition: writes a[k] + b[k] into sum[k]
// for every k in [0, n). Does nothing when n <= 0.
void add(float *sum, float *a, float *b, int n)
{
    int idx = 0;
    while (idx < n) {
        const float lhs = a[idx];
        const float rhs = b[idx];
        sum[idx] = lhs + rhs;
        ++idx;
    }
}
// Host-only driver: fills two N-element arrays with 1.0 and 2.0, adds them
// with add(), checks every element of the result, and prints sum[0].
// Returns 0 on success, EXIT_FAILURE if allocation or verification fails.
int main()
{
    float *a, *b, *sum;
    // Byte counts belong in size_t; an int would overflow for large N.
    size_t nbytes = sizeof(float) * (size_t)N;

    // Allocate memory
    a = (float*)malloc(nbytes);
    b = (float*)malloc(nbytes);
    sum = (float*)malloc(nbytes);
    if (a == NULL || b == NULL || sum == NULL) {
        fprintf(stderr, "Failed to allocate %zu bytes per array\n", nbytes);
        // free(NULL) is a no-op, so releasing all three is safe here.
        free(a);
        free(b);
        free(sum);
        return EXIT_FAILURE;
    }

    // Initialize array (float literals avoid a double -> float conversion)
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    add(sum, a, b, N);

    // Verification: checked explicitly rather than with assert(), because
    // assert() is compiled out under NDEBUG and "PASSED" would then be
    // printed without any check having run.
    int status = 0;
    for (int i = 0; i < N; i++) {
        if (!(fabs(sum[i] - a[i] - b[i]) < MAX_ERR)) {
            fprintf(stderr, "FAILED: sum[%d] = %f, expected %f\n",
                    i, sum[i], a[i] + b[i]);
            status = EXIT_FAILURE;
            break;
        }
    }

    if (status == 0) {
        printf("sum[0] = %f\n", sum[0]);
        printf("PASSED\n");
    }

    free(a);
    free(b);
    free(sum);
    return status;
}
|
|
CUDA
// __global__ marks add as a kernel: a function that runs on the GPU and is
// started from host code with the <<<...>>> launch syntax.
__global__
void add(float *sum, float *a, float *b, int n)
// Launch configuration <<<1,1>>>: one thread block containing one thread.
add<<<1,1>>>(sum, a, b, N);
|
Need to allocate memory in GPU (device) memory
cudaMalloc(void **devPtr, size_t count);
cudaFree(void *devPtr);
e.g.
// Allocate GPU memory: nbytes is the size in bytes of one N-element float array.
int nbytes = sizeof(float) * N;
// da will hold the address of the device-side copy of host array a.
float *da;
cudaMalloc((void **) &da, nbytes);
// Transfer data from main memory (CPU) to GPU memory
cudaMemcpy(da, a, nbytes, cudaMemcpyHostToDevice);
// NOTE: db and dsum are not set up in this excerpt; presumably they are
// allocated (and db filled) the same way as da before this launch.
add<<<1,1>>>(dsum, da, db, N);
// Transfer data back to host memory
cudaMemcpy(sum, dsum, nbytes, cudaMemcpyDeviceToHost);
Profiling Performance
$nvprof ./addArrays-cu
Parallel Operations
threads are grouped into thread blocks
thread blocks are grouped into grids
Using one thread block, consisting of 256 threads
add<<<1,256>>>(dsum, da, db, N);
CUDA provides built-in variables for accessing thread information.
threadIdx.x : index of the thread within the block
blockDim.x : number of threads in the thread block
// Host side: launch one block of 256 threads.
add<<<1,256>>>(dsum, da, db, N);
// Kernel body: each thread starts at its own index within the block...
int tid = threadIdx.x;
// ...and advances by the number of threads in the block (256 for the launch
// above), so the threads of the block together visit every i in [0, n).
int stride = blockDim.x;
for(int i = tid; i < n; i += stride)
sum[i] = a[i] + b[i];
gridDim.x : number of blocks in the grid
Using multiple thread blocks, if each block contains 256 threads
# of thread blocks = ceil( N / 256 )
// Threads per block.
int blockSize = 256;
// Integer ceiling division: rounds up so nBlocks * blockSize >= N even when
// N is not a multiple of blockSize.
int nBlocks = (N + blockSize - 1) / blockSize;
// Launch nBlocks blocks of blockSize threads each.
add<<< nBlocks, blockSize >>>(dsum, da, db, N);
// Element-wise vector addition kernel: sum[i] = a[i] + b[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Every element
// is processed only if gridDim.x * blockDim.x >= n; surplus threads past the
// end of the arrays do nothing. No shared memory is used.
// sum, a and b are dereferenced on the device, so they must point to memory
// the GPU can access (e.g. obtained from cudaMalloc).
__global__ void add(float *sum, float *a, float *b, int n)
{
    // Global thread id, computed in 64 bits. blockIdx.x * blockDim.x is an
    // unsigned 32-bit product; narrowing it to int can wrap to a negative
    // value for n close to INT_MAX, which would slip past the bounds check
    // below and index out of range.
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        sum[i] = a[i] + b[i];
}