A Simple 8x8 Discrete Cosine Transform (DCT) in CUDA

Discrete Cosine Transform (DCT)

A Fourier-like transform proposed by Ahmed et al. in 1974.
Use only cosine functions as basis functions.
Present the output in the frequency domain.
Widely used in many engineering and science applications,
including MPEG video compression.

Macroblocks

Macroblock ~ a region of 8 x 8 samples
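i.e. M = N = 8, Equation (3) becomes

$$F(u,v) = a(u)\,a(v) \sum_{i=0}^{7} \sum_{j=0}^{7} f_{ij} \cos\frac{(2i+1)u\pi}{16} \cos\frac{(2j+1)v\pi}{16}, \qquad a(0) = \sqrt{1/8},\quad a(k) = \sqrt{2/8} \ \text{for } k \ge 1$$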

The separability of i and j is important.
We can arrange the f_ij values in an 8x8 matrix.

For a given pair of (u, v), the cosine values could also be arranged in an 8x8 matrix.

f₀₀	f₀₁	f₀₂	f₀₃	f₀₄	f₀₅	f₀₆	f₀₇
f₁₀	f₁₁	f₁₂	f₁₃	f₁₄	f₁₅	f₁₆	f₁₇
f₂₀	f₂₁	f₂₂	f₂₃	f₂₄	f₂₅	f₂₆	f₂₇
f₃₀	f₃₁	f₃₂	f₃₃	f₃₄	f₃₅	f₃₆	f₃₇
f₄₀	f₄₁	f₄₂	f₄₃	f₄₄	f₄₅	f₄₆	f₄₇
f₅₀	f₅₁	f₅₂	f₅₃	f₅₄	f₅₅	f₅₆	f₅₇
f₆₀	f₆₁	f₆₂	f₆₃	f₆₄	f₆₅	f₆₆	f₆₇
f₇₀	f₇₁	f₇₂	f₇₃	f₇₄	f₇₅	f₇₆	f₇₇

C₀₀	C₀₁	C₀₂	C₀₃	C₀₄	C₀₅	C₀₆	C₀₇
C₁₀	C₁₁	C₁₂	C₁₃	C₁₄	C₁₅	C₁₆	C₁₇
C₂₀	C₂₁	C₂₂	C₂₃	C₂₄	C₂₅	C₂₆	C₂₇
C₃₀	C₃₁	C₃₂	C₃₃	C₃₄	C₃₅	C₃₆	C₃₇
C₄₀	C₄₁	C₄₂	C₄₃	C₄₄	C₄₅	C₄₆	C₄₇
C₅₀	C₅₁	C₅₂	C₅₃	C₅₄	C₅₅	C₅₆	C₅₇
C₆₀	C₆₁	C₆₂	C₆₃	C₆₄	C₆₅	C₆₆	C₆₇
C₇₀	C₇₁	C₇₂	C₇₃	C₇₄	C₇₅	C₇₆	C₇₇

[] term, sum over j: dot product of the i-th row vector of f and the v-th row vector of C
left term, sum over i: dot product of the j-th column vector of f and the u-th row vector of C
So, in matrix form, we can express (10a) as

$$F = C\,f\,C^{T}$$
It turns out that C^-1 = C^T or C^T C = I
or
Sometimes DCT is referred to as Forward DCT in order to distinguish it from Inverse DCT (IDCT).

C/C++ Implementation

Cosine values are often calculated offline

//genCosine.cpp
// Offline generator for the 8x8 DCT cosine table.
// Prints an N x N matrix (5 decimal places, every value followed by ", ")
// whose entry [sample][freq] is scale(freq) * cos((2*sample + 1) * freq * PI / (2N)).
#include <iostream>
#include <iomanip>
#include <math.h>

using namespace std;

const double PI = 3.141592653589;

int main()
{
    const int N = 8;

    // Normalisation factors: sqrt(1/N) for frequency 0, sqrt(2/N) for all others.
    double scale[N];
    for (int k = 0; k < N; ++k)
        scale[k] = sqrt((k == 0 ? 1.0 : 2.0) / N);

    // Each entry depends only on its own indices, so the fill order is irrelevant.
    double table[N][N];
    for (int sample = 0; sample < N; ++sample)
        for (int freq = 0; freq < N; ++freq)
            table[sample][freq] = scale[freq] * cos((2 * sample + 1) * freq * PI / (2 * N));

    // One matrix row per output line; each row is preceded by a newline.
    cout << fixed << setprecision(5);
    for (int row = 0; row < N; ++row) {
        cout << "\n";
        for (int col = 0; col < N; ++col)
            cout << table[row][col] << ", ";
    }
    cout << endl;

    return 0;
}

8x8 DCT and IDCT in C/C++

/*
  dct.cpp
  A straight forward implementation of 8x8 DCT and IDCT.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

using namespace std;

#define PI 3.141592653589

const int N = 8;

// Cosine basis. It is indexed as C[sample][frequency] by dct() and idct() below,
// which is the same [j][v] layout that genCosine.cpp prints.
const double C[N][N] = {
    { 0.35355,  0.49039,  0.46194,  0.41573,  0.35355,  0.27779,  0.19134,  0.09755 },
    { 0.35355,  0.41573,  0.19134, -0.09755, -0.35355, -0.49039, -0.46194, -0.27779 },
    { 0.35355,  0.27779, -0.19134, -0.49039, -0.35355,  0.09755,  0.46194,  0.41573 },
    { 0.35355,  0.09755, -0.46194, -0.27779,  0.35355,  0.41573, -0.19134, -0.49039 },
    { 0.35355, -0.09755, -0.46194,  0.27779,  0.35355, -0.41573, -0.19134,  0.49039 },
    { 0.35355, -0.27779, -0.19134,  0.49039, -0.35355, -0.09755,  0.46194, -0.41573 },
    { 0.35355, -0.41573,  0.19134,  0.09755, -0.35355,  0.49039, -0.46194,  0.27779 },
    { 0.35355, -0.49039,  0.46194, -0.41573,  0.35355, -0.27779,  0.19134, -0.09755 }
};

// Forward DCT of one 8x8 block.
//   f : input samples
//   F : output coefficients, F[u][v] = sum over (i, j) of f[i][j] * C[i][u] * C[j][v]
void dct(double f[][N], double F[][N] )
{
    for (int freqRow = 0; freqRow < N; ++freqRow) {
        for (int freqCol = 0; freqCol < N; ++freqCol) {
            double acc = 0.0;
            for (int r = 0; r < N; ++r) {
                for (int c = 0; c < N; ++c) {
                    acc += f[r][c] * C[r][freqRow] * C[c][freqCol];
                }
            }
            F[freqRow][freqCol] = acc;
        }
    }
}

// Inverse DCT of one 8x8 block.
//   F : input coefficients
//   f : output samples, f[i][j] = sum over (u, v) of F[u][v] * C[i][u] * C[j][v]
void idct(double F[][N], double f[][N])
{
    for (int r = 0; r < N; ++r) {
        for (int c = 0; c < N; ++c) {
            double acc = 0.0;
            for (int freqRow = 0; freqRow < N; ++freqRow) {
                for (int freqCol = 0; freqCol < N; ++freqCol) {
                    acc += F[freqRow][freqCol] * C[r][freqRow] * C[c][freqCol];
                }
            }
            f[r][c] = acc;
        }
    }
}

// Print an 8x8 block of shorts, one matrix row per output line.
void print_elements (short f[][N] )
{
    for (int r = 0; r < N; ++r) {
        printf("\n");
        for (int c = 0; c < N; ++c) {
            printf ("%4d, ", f[r][c]);
        }
    }
}

// Widen every sample of f into f_double.
void short2double(short f[N][N], double f_double[N][N])
{
    for (int r = 0; r < N; ++r) {
        for (int c = 0; c < N; ++c) {
            f_double[r][c] = (double) f[r][c];
        }
    }
}

// Round every value of F_double to the nearest integer (via floor(x + 0.5))
// and store it in F as a short.
void double2short (double F_double[N][N], short F[N][N])
{
    for (int r = 0; r < N; ++r) {
        for (int c = 0; c < N; ++c) {
            F[r][c] = (short) (floor (F_double[r][c] + 0.5));
        }
    }
}

// Demo: build a test block, print it, print its DCT coefficients, then print
// the samples recovered by the IDCT.
int main()
{
    short f[N][N], F[N][N];

    // Test pattern: each sample is the sum of its row and column indices.
    for (int r = 0; r < N; ++r) {
        for (int c = 0; c < N; ++c) {
            f[r][c] = r + c;
        }
    }

    printf("\nOriginal sample values");
    print_elements ( f );
    printf("\n--------------------\n");

    double f_double[N][N], F_double[N][N];

    short2double(f, f_double);
    dct(f_double, F_double);            // forward transform
    double2short(F_double, F);

    printf("\nCoefficients of DCT:");
    print_elements ( F );
    printf("\n--------------------\n");

    // The inverse is fed the unrounded coefficients, not the shorts printed above.
    idct (F_double, f_double);          // inverse transform
    double2short(f_double, f);

    printf("\nValues recovered by IDCT:");
    print_elements ( f );
    printf("\n");

    return 0;
}

CUDA Implementation

Using One Thread

// Report a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, so calls can be chained with &&
// and the chain stops at the first failure.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Single-thread forward DCT of one N x N block.
//   f : device pointer to the N*N input samples, row-major
//   F : device pointer to the N*N output coefficients, row-major
//   C : device pointer to the N*N cosine table, indexed [sample*N + frequency]
// The kernel does not use threadIdx/blockIdx: every launched thread computes
// the complete transform, so it is meant to be launched as <<<1, 1>>>.
//input: f, output: F
__global__ void dct(double *f, double *F, const double *C )
{
    double sum;
    for ( int u = 0; u < N; ++u ) {
        for ( int v = 0; v < N; ++v ) {
            sum = 0.0;
            for (int i = 0; i < N; i++)
                for (int j = 0; j < N; j++)
                    sum += f[i*N+j] * C[i*N+u] * C[j*N+v];
            F[u*N+v] = sum;
        } //for v
    } //for u
}

// Single-thread inverse DCT of one N x N block.
//   F : device pointer to the N*N input coefficients, row-major
//   f : device pointer to the N*N output samples, row-major
//   C : device pointer to the N*N cosine table, indexed [sample*N + frequency]
// Like dct(), it is meant to be launched as <<<1, 1>>>.
//input F; output f
__global__ void idct(double *F, double *f, const double *C)
{
    double sum;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j ) {
            sum = 0.0;
            for (int u = 0; u < N; ++u )
                for (int v = 0; v < N; ++v )
                    sum += F[u*N+v] * C[i*N+u] * C[j*N+v];
            f[i*N+j] = sum;
        } //for j
    } //for i
}

// Demo: run the DCT and IDCT on the GPU with a single thread and print the
// original samples, the coefficients, and the recovered samples.
// Returns 0 on success and 1 if any CUDA call failed (details go to stderr).
int main()
{
    short f[N][N], F[N][N];

    //try some values for testing
    for (int i = 0; i < N; ++i )
        for (int j = 0; j < N; ++j)
            f[i][j] = i + j;

    printf("\nOriginal sample values");
    print_elements ( f );
    printf("\n--------------------\n");

    double f_double[N][N], F_double[N][N];

    // Start from null so the cudaFree calls at the end are safe even when an
    // allocation failed or was never attempted.
    double *df = NULL, *dF = NULL, *dC = NULL;
    const size_t aSize = N * N * sizeof(double);

    //Note: may use cudaMallocPitch() to create 2D array, but less convenient
    bool ok = cudaOk(cudaMalloc(&df, aSize), "cudaMalloc(df)")
           && cudaOk(cudaMalloc(&dF, aSize), "cudaMalloc(dF)")
           && cudaOk(cudaMalloc(&dC, aSize), "cudaMalloc(dC)");

    short2double(f, f_double);
    ok = ok
      && cudaOk(cudaMemcpy(df, f_double, aSize, cudaMemcpyHostToDevice), "cudaMemcpy(df <- f_double)")
      && cudaOk(cudaMemcpy(dC, C, aSize, cudaMemcpyHostToDevice), "cudaMemcpy(dC <- C)");

    // One block containing one thread.
    dim3 threadsPerBlock(1, 1);
    dim3 blocksPerGrid(1, 1);

    if (ok) {
        dct<<< blocksPerGrid, threadsPerBlock >>>(df, dF, dC);   //performing DCT
        // A kernel launch returns no status: launch errors are reported by
        // cudaGetLastError(), execution errors by the following synchronize.
        ok = cudaOk(cudaGetLastError(), "dct launch")
          && cudaOk(cudaDeviceSynchronize(), "dct execution")
          && cudaOk(cudaMemcpy(F_double, dF, aSize, cudaMemcpyDeviceToHost), "cudaMemcpy(F_double <- dF)");
    }

    if (ok) {
        double2short(F_double, F);
        printf("\nCoefficients of DCT:");
        print_elements ( F );
        printf("\n--------------------\n");

        idct<<< blocksPerGrid, threadsPerBlock >>>(dF, df, dC);  //performing IDCT
        ok = cudaOk(cudaGetLastError(), "idct launch")
          && cudaOk(cudaDeviceSynchronize(), "idct execution")
          && cudaOk(cudaMemcpy(f_double, df, aSize, cudaMemcpyDeviceToHost), "cudaMemcpy(f_double <- df)");
    }

    if (ok) {
        double2short(f_double, f);
        printf("\nValues recovered by IDCT:");
        print_elements ( f );
        printf("\n");
    }

    cudaFree(df);
    cudaFree(dF);
    cudaFree(dC);

    return ok ? 0 : 1;
}

Using Multi-Threads

   A grid contains one or more blocks, and
	a block contains one or more threads,

One thread calculates one sample.
Use one block of 64 threads to calculate 64 samples in parallel.


// Multi-threaded forward DCT (skeleton; the summation body is elided below).
// Each thread takes its output position (u, v) from its x/y index within the
// block, so the kernel expects a 2D thread block; the host sketch in this
// section launches one block of N x N threads.
//   f : input samples, F : output coefficients, C : cosine table
// NOTE(review): the elided part presumably accumulates over i and j and
// stores into F[u*N+v], as in the single-thread version - confirm.
__global__ void 
dct(double *f, double *F, const double *C )
{
  double sum;

  int u = threadIdx.x;  //Row index of output
  int v = threadIdx.y;  //Column index of output
  .....
}

// Multi-threaded inverse DCT (skeleton; the summation body is elided below).
// Each thread takes its sample position (i, j) from its x/y index within the
// block, so the kernel expects a 2D thread block; the host sketch in this
// section launches one block of N x N threads.
//   F : input coefficients, f : output samples, C : cosine table
// NOTE(review): the elided part presumably accumulates over u and v and
// stores into f[i*N+j], as in the single-thread version - confirm.
__global__ void 
idct(double *F, double *f, const double *C)
{
  double sum;
  int i = threadIdx.x;  //Row index of output sample
  int j = threadIdx.y;  //Column index of output sample
  ....
}

// Host-side sketch for the multi-threaded version; setup, kernel launches and
// cleanup are elided.
int main()
{
   ....
   // One block of N x N threads: each thread handles one of the N*N values.
   dim3 blockDim(N, N);
   dim3 gridDim(1, 1);
   ....
}

Fast DCT

   Break down the summation into stages.
   Number of calculations ~ N log N rather than N²