CUDA Examples

CUBLAS vs. Optimized BLAS

Note that double-precision linear algebra is a less than ideal application for the GPUs. Still, it is a functional example of using one of the available CUDA runtime libraries.

cuda_bm.c

{{#fileAnchor: cuda_bm.c}} Download raw source of the [{{#fileLink: cuda_bm.c}} cuda_bm.c]

#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <math.h>
#include <time.h>
#include <cublas.h>

void resuse(char *str);

double timeDiff( struct timespec *t1, struct timespec *t2)
{
  double T1, T2;
  T2 = (double)t2->tv_sec + (double)t2->tv_nsec / 1.0e9;
  T1 = (double)t1->tv_sec - (double)t1->tv_nsec / 1.0e9;
  return(T2 - T1);
}

main()
{
  int       dim = 9100;
  int       i,j,k;
  int       status;

  double    *psa, *psb, *psc;
  double    *sap, *sbp, *scp;
  double    *pda, *pdb, *pdc;
  double    *dap, *dbp, *dcp;

  double alpha   = 1.0;
  double beta    = 0.0;
  double gflops  = 0.0;
  float  deltaT  = 0.0;
  double gflopCnt = 2.0 * dim * dim * dim / 1.0e9;
  struct timespec t1;
  struct timespec t2;

  int       ptime();

  pda = NULL;
  pdb = NULL;
  pdc = NULL;
  psa = (double *) malloc(dim * dim * sizeof(*psa) );
  psb = (double *) malloc(dim * dim * sizeof(*psb) );
  psc = (double *) malloc(dim * dim * sizeof(*psc) );

  printf("Initializing Matrices...");
  clock_gettime(CLOCK_MONOTONIC, &t1); 
  sap = psa;
  sbp = psb;
  scp = psc;
  for (i = 0; i < dim; i++)
    for (j = 0; j < dim; j++) {
      *sap++ = 1.0;
      *sbp++ = 1.0;
      *scp++ = 0.0;
    }
  clock_gettime(CLOCK_MONOTONIC, &t2); 
  deltaT = timeDiff(&t1, &t2);
  printf("Done.    Elapsed Time = %6.4f secs\n", deltaT);
  fflush(stdout);

  printf("Starting parallel DGEMM...");
  fflush(stdout);
  clock_gettime(CLOCK_MONOTONIC, &t1); 
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, psa, dim, psb, dim, beta, psc, dim);
  clock_gettime(CLOCK_MONOTONIC, &t2); 
  deltaT = timeDiff(&t1, &t2);
  printf("Done.  Elapsed Time = %6.4f secs\n", deltaT);
  printf("                                 ");
  printf("GFlOP Rate   = %8.4f\n", gflopCnt/deltaT);

  if ( (float) dim - psc[0] > 1.0e-5  ||
       (float) dim - psc[dim*dim-1] > 1.0e-5 ) {
      printf("Error: Incorrect Results!\n");
      printf("C[%2d,%2d] = %10.4f\n", 1,1,psc[0]); 
      printf("C[%2d,%2d] = %10.4f\n", dim,dim,psc[dim*dim-1]); 
  }

  /* Initialize CUDA */
  printf("Initializing CUDA...");
  status = cublasInit();
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! CUBLAS initialization error\n");
      return EXIT_FAILURE;
  }
  printf("Done.\n");

  /* Re-initialize the matrices */
  printf("Re-initializing Matrices...");
  clock_gettime(CLOCK_MONOTONIC, &t1); 
  sap = psa;
  sbp = psb;
  scp = psc;
  for (i = 0; i < dim; i++) {
    for (j = 0; j < dim; j++) {
      *sap++ = 1.0;
      *sbp++ = 1.0;
      *scp++ = 0.0;
    }
  }
  clock_gettime(CLOCK_MONOTONIC, &t2); 
  deltaT = timeDiff(&t1, &t2);
  printf("Done.  Elapsed Time = %6.4f secs\n", deltaT);
  fflush(stdout);

  /* Allocate device memory for the matrices */
  printf("Starting CUDA DGEMM...");
  clock_gettime(CLOCK_MONOTONIC, &t1); 

  status = cublasAlloc(dim*dim, sizeof(*pda), (void**) &pda);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device memory allocation error (A)\n");
      return EXIT_FAILURE;
  }

  status = cublasAlloc(dim*dim, sizeof(*pdb), (void**) &pdb);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device memory allocation error (B)\n");
      return EXIT_FAILURE;
  }

  status = cublasAlloc(dim*dim, sizeof(*pdc), (void**) &pdc);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device memory allocation error (C)\n");
      return EXIT_FAILURE;
  }

  /* Initialize the device matrices with the host matrices */
  status = cublasSetVector(dim*dim, sizeof(*psa), psa, 1, pda, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device access error (write A)\n");
      return EXIT_FAILURE;
  }

  status = cublasSetVector(dim*dim, sizeof(*pdb), psb, 1, pdb, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device access error (write B)\n");
      return EXIT_FAILURE;
  }

  status = cublasSetVector(dim*dim, sizeof(*psc), psc, 1, pdc, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device access error (write C)\n");
      return EXIT_FAILURE;
  }

  /* Clear last error */
  cublasGetError();

  /* Performs operation using cublas */
  cublasDgemm('n', 'n', dim, dim, dim, alpha, pda, dim, pdb, dim, beta, pdc, dim);
  status = cublasGetError();
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! kernel execution error.\n");
      return EXIT_FAILURE;
  }

  /* Read the result back */
  status = cublasGetVector(dim*dim, sizeof(*psc), pdc, 1, psc, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf (stderr, "!!!! device access error (read C)\n");
      return EXIT_FAILURE;
  }
  clock_gettime(CLOCK_MONOTONIC, &t2); 
  deltaT = timeDiff(&t1, &t2);
  printf("Done.       Elapsed Time = %6.4f secs\n", deltaT);
  printf("                                  ");
  printf("GFlOP Rate   = %8.4f\n", gflopCnt/deltaT);

  if ( (float) dim - psc[0] > 1.0e-5  ||
       (float) dim - psc[dim*dim-1] > 1.0e-5 ) {
      printf("Error: Incorrect Results!\n");
      printf("C[%2d,%2d] = %10.4f\n", 1,1,psc[0]); 
      printf("C[%2d,%2d] = %10.4f\n", dim,dim,psc[dim*dim-1]); 
  }
}

resuse.c

{{#fileAnchor: resuse.c}} Download raw source of the [{{#fileLink: resuse.c}} resuse.c]

/*---------------------------------------------------------------------------*
 *                      resuse.c                                             *
 *                                                                           *
 * resuse_(desc)                                                              *
 * char *desc;                                                               *
 *---------------------------------------------------------------------------*
 * This routine makes use of the getrusage system call to collect and        *
 * display resource utilization from one point in a program to another.      *
 * Thus, it works somewhat like a stopwatch in the since that the first call *
 * initializes, or starts, the resource collection over the interval.  The   *
 * second call stops the collection and displays the results accumulated     *
 * since the previous call.  This means, of course, that two calls are       *
 * required for each section of code to be monitored.                        *
 *
 * The underscore at the end of the routine name is there so that the routine*
 * may be called as an integer valued FORTRAN function name RESUSE(), under  *
 * both the SunOS and Ultrix f77 compilers.  AIX inter-language calling      *
 * conventions are different so the routine must be referenced as RESUSE_()  *
 * under AIX (RISC/6000) FORTRAN (xlf).
 *
 * From a FORTRAN routine:
 *
 * INT = RESUSE("Some String")
 * 
 *    fortran code to be timed
 *
 * INT = RESUSE("Some String")
 *
 * Just ignore the returned value.
 *---------------------------------------------------------------------------*/

#include <sys/time.h>
#include <sys/resource.h>

static int first_call = 1;

static struct rusage initial;

void resuse(str)
char *str;
{
  int     pgminor, pgmajor, nswap, nvcsw, nivcsw;
  int     inblock, oublock;
  struct  rusage final;
  float   usr, sys, secs;

  getrusage(RUSAGE_SELF, &final);

  if ( ! first_call )
  {
    secs = final.ru_utime.tv_sec + final.ru_utime.tv_usec / 1000000.0;
    usr  = secs;
    secs = initial.ru_utime.tv_sec + initial.ru_utime.tv_usec / 1000000.0;
    usr  = usr - secs;

    secs = final.ru_stime.tv_sec + final.ru_stime.tv_usec / 1000000.0;
    sys  = secs;
    secs = initial.ru_stime.tv_sec + initial.ru_stime.tv_usec / 1000000.0;
    sys  = sys - secs;

    pgminor = final.ru_minflt  - initial.ru_minflt;
    pgmajor = final.ru_majflt  - initial.ru_majflt;
    nswap   = final.ru_nswap   - initial.ru_nswap;
    inblock = final.ru_inblock - initial.ru_inblock;
    oublock = final.ru_oublock - initial.ru_oublock;
    nvcsw   = final.ru_nvcsw   - initial.ru_nvcsw ;
    nivcsw  = final.ru_nivcsw  - initial.ru_nivcsw ;

    printf("=============================================================\n");
    printf("%s: Resource Usage Data...\n", str);
    printf("-------------------------------------------------------------\n");
    printf("User Time   (secs)          : %10.3f\n", usr);
    printf("System Time (secs)          : %10.3f\n", sys);
    printf("Total Time  (secs)          : %10.3f\n", usr + sys);
    printf("Minor Page Faults           : %10d\n", pgminor);
    printf("Major Page Faults           : %10d\n", pgmajor);
    printf("Swap Count                  : %10d\n", nswap);
    printf("Voluntary Context Switches  : %10d\n", nvcsw);
    printf("Involuntary Context Switches: %10d\n", nivcsw);
    printf("Block Input Operations      : %10d\n", inblock);
    printf("Block Output Operations     : %10d\n", oublock);
    printf("=============================================================\n");

  }
  else
  {
    printf("=============================================================\n");
    printf("%s: Collecting Resource Usage Data\n", str);
    printf("=============================================================\n");
  }

  first_call = !first_call;
  initial = final;

  return;
}

Makefile for Building cuda_bm

{{#fileAnchor: Makefile}} Download raw source of the [{{#fileLink: Makefile}} Makefile]

#
# Makefile
#
CC          = icc
CFLAGS      = -O3 -openmp 

MKL_INCS    = -I$(HPC_MKL_INC)
MKL_LIBS    = -L$(HPC_MKL_LIB) -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
MKL_MP_LIBS = -L$(HPC_MKL_LIB) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core

CUDA_INCS   = -I$(HPC_CUDA_INC) 
CUDA_LIBS   = -L$(HPC_CUDA_LIB) -lcublas 

LIBS        = $(MKL_MP_LIBS) $(CUDA_LIBS) -lpthread -lrt
INCS        = $(MKL_INCS) $(CUDA_INCS)

bm: bm.o ptime.o resuse.o
	$(CC) $(CFLAGS)  -o bm bm.o ptime.o resuse.o $(LIBS)

cuda_bm: cuda_bm.c resuse.o
	$(CC) $(CFLAGS) $(INCS) -o cuda_bm cuda_bm.c resuse.o $(LIBS)

cuda_sp_bm: cuda_sp_bm.c resuse.o
	$(CC) $(CFLAGS) $(INCS) -o cuda_sp_bm cuda_sp_bm.c resuse.o $(LIBS)

clean:
	rm -f *.o core.* bm cuda_bm

Compiling and Linking

Download the above three files into a single directory. Then type:

[taylor@c11a-s15 Bench]$ module load intel/2013
[taylor@c11a-s15 Bench]$ module load cuda/5.5
[taylor@c11a-s15 Bench]$ make clean
rm -f *.o core.* bm cuda_bm

[taylor@c11a-s15 Bench]$ make cuda_bm
icc -O3 -openmp -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include   -c -o resuse.o resuse.c
icc -O3 -openmp -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -I/opt/cuda/5.5/include -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -o cuda_bm cuda_bm.c resuse.o -L/opt/cuda/5.5/lib64 -lcublas -L/opt/intel/composer_xe_2013_sp1.0.080/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -lrt

Output

# Host: Single, Hex-Core Intel 5675 @ 3.0 GHz
# GPU: Kepler K20c

$ export OMP_NUM_THREADS=4
$ ./cuda_bm
Initializing Matrices...Done.    Elapsed Time = 2.3768 secs
Starting parallel DGEMM...Done.  Elapsed Time = 33.2302 secs
                                 GFlOP Rate   =  45.3546
Initializing CUDA...Done.
Re-initializing Matrices...Done.  Elapsed Time = 2.1733 secs
Starting CUDA DGEMM...Done.       Elapsed Time = 2.3522 secs
                                  GFlOP Rate   = 640.7458