CUDA Examples
CUBLAS vs. Optimized BLAS
Note that double-precision linear algebra is a less-than-ideal application for GPUs. Still, it is a functional example of using one of the available CUDA runtime libraries. The benchmark below multiplies two 9100 x 9100 matrices of ones, first on the host with a threaded MKL DGEMM and then on the GPU with CUBLAS, and reports the GFLOP rate of each.
cuda_bm.c
Download the raw source of cuda_bm.c.
#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <math.h>
#include <time.h>
#include <cublas.h>
void resuse(char *str);
/* Return the elapsed wall-clock time, in seconds, between two timespec readings */
double timeDiff( struct timespec *t1, struct timespec *t2)
{
double T1, T2;
T2 = (double)t2->tv_sec + (double)t2->tv_nsec / 1.0e9;
T1 = (double)t1->tv_sec + (double)t1->tv_nsec / 1.0e9;
return(T2 - T1);
}
int main(void)
{
int dim = 9100;
int i,j,k;
int status;
double *psa, *psb, *psc;
double *sap, *sbp, *scp;
double *pda, *pdb, *pdc;
double *dap, *dbp, *dcp;
double alpha = 1.0;
double beta = 0.0;
double gflops = 0.0;
float deltaT = 0.0;
double gflopCnt = 2.0 * dim * dim * dim / 1.0e9;  /* a DGEMM performs 2*N^3 flops */
struct timespec t1;
struct timespec t2;
int ptime();
pda = NULL;
pdb = NULL;
pdc = NULL;
psa = (double *) malloc(dim * dim * sizeof(*psa) );
psb = (double *) malloc(dim * dim * sizeof(*psb) );
psc = (double *) malloc(dim * dim * sizeof(*psc) );
if (psa == NULL || psb == NULL || psc == NULL) {
fprintf (stderr, "!!!! host memory allocation error\n");
return EXIT_FAILURE;
}
printf("Initializing Matrices...");
clock_gettime(CLOCK_MONOTONIC, &t1);
sap = psa;
sbp = psb;
scp = psc;
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
*sap++ = 1.0;
*sbp++ = 1.0;
*scp++ = 0.0;
}
}
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
fflush(stdout);
printf("Starting parallel DGEMM...");
fflush(stdout);
clock_gettime(CLOCK_MONOTONIC, &t1);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, psa, dim, psb, dim, beta, psc, dim);
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
printf(" ");
printf("GFlOP Rate = %8.4f\n", gflopCnt/deltaT);
if ( fabs((double) dim - psc[0]) > 1.0e-5 ||
     fabs((double) dim - psc[dim*dim-1]) > 1.0e-5 ) {
printf("Error: Incorrect Results!\n");
printf("C[%2d,%2d] = %10.4f\n", 1,1,psc[0]);
printf("C[%2d,%2d] = %10.4f\n", dim,dim,psc[dim*dim-1]);
}
/* Initialize CUDA */
printf("Initializing CUDA...");
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
printf("Done.\n");
/* Re-initialize the matrices */
printf("Re-initializing Matrices...");
clock_gettime(CLOCK_MONOTONIC, &t1);
sap = psa;
sbp = psb;
scp = psc;
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
*sap++ = 1.0;
*sbp++ = 1.0;
*scp++ = 0.0;
}
}
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
fflush(stdout);
/* Allocate device memory for the matrices */
printf("Starting CUDA DGEMM...");
clock_gettime(CLOCK_MONOTONIC, &t1);
status = cublasAlloc(dim*dim, sizeof(*pda), (void**) &pda);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (A)\n");
return EXIT_FAILURE;
}
status = cublasAlloc(dim*dim, sizeof(*pdb), (void**) &pdb);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (B)\n");
return EXIT_FAILURE;
}
status = cublasAlloc(dim*dim, sizeof(*pdc), (void**) &pdc);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(dim*dim, sizeof(*psa), psa, 1, pda, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(dim*dim, sizeof(*psb), psb, 1, pdb, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write B)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(dim*dim, sizeof(*psc), psc, 1, pdc, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
/* Clear last error */
cublasGetError();
/* Perform the operation using CUBLAS. Note that CUBLAS assumes
   column-major storage; with these symmetric all-ones operands the
   result is unaffected. */
cublasDgemm('n', 'n', dim, dim, dim, alpha, pda, dim, pdb, dim, beta, pdc, dim);
status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Read the result back */
status = cublasGetVector(dim*dim, sizeof(*psc), pdc, 1, psc, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
printf(" ");
printf("GFlOP Rate = %8.4f\n", gflopCnt/deltaT);
if ( fabs((double) dim - psc[0]) > 1.0e-5 ||
     fabs((double) dim - psc[dim*dim-1]) > 1.0e-5 ) {
printf("Error: Incorrect Results!\n");
printf("C[%2d,%2d] = %10.4f\n", 1,1,psc[0]);
printf("C[%2d,%2d] = %10.4f\n", dim,dim,psc[dim*dim-1]);
}
/* Release device and host resources */
cublasFree(pda);
cublasFree(pdb);
cublasFree(pdc);
cublasShutdown();
free(psa);
free(psb);
free(psc);
return EXIT_SUCCESS;
}
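The listing above uses the original CUBLAS (v1) interface from cublas.h (cublasInit, cublasAlloc, character transpose flags), which NVIDIA later deprecated. As a rough orientation only, and not part of the original example, the sketch below shows how the same device-side steps look with the handle-based cublas_v2 API. It assumes a toolkit that ships cublas_v2.h and omits most error checking for brevity.

#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Minimal sketch: C = A*B for dim x dim doubles using the cublas_v2 API */
int dgemm_v2_sketch(int dim, const double *A, const double *B, double *C)
{
    size_t bytes = (size_t)dim * dim * sizeof(double);
    double *dA, *dB, *dC;
    double alpha = 1.0, beta = 0.0;
    cublasHandle_t handle;

    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
        return EXIT_FAILURE;
    cudaMalloc((void **)&dA, bytes);   /* replaces cublasAlloc() */
    cudaMalloc((void **)&dB, bytes);
    cudaMalloc((void **)&dC, bytes);

    /* Copy the host matrices to the device (column-major layout) */
    cublasSetMatrix(dim, dim, sizeof(double), A, dim, dA, dim);
    cublasSetMatrix(dim, dim, sizeof(double), B, dim, dB, dim);

    /* The v2 call takes a handle, enum transpose flags, and scalars by pointer */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, dim, dim, dim,
                &alpha, dA, dim, dB, dim, &beta, dC, dim);

    /* Copy the result back and release resources */
    cublasGetMatrix(dim, dim, sizeof(double), dC, dim, C, dim);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    return EXIT_SUCCESS;
}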
resuse.c
Download the raw source of resuse.c.
/*---------------------------------------------------------------------------
 * resuse.c
 *
 * void resuse(char *str);
 *
 * This routine uses the getrusage() system call to collect and display
 * resource utilization from one point in a program to another. It works
 * somewhat like a stopwatch, in the sense that the first call initializes,
 * or starts, the resource collection over the interval, and the second call
 * stops the collection and displays the results accumulated since the
 * previous call. This means, of course, that two calls are required for
 * each section of code to be monitored.
 *
 * A trailing underscore may be added to the routine name so that it can be
 * called as an integer-valued FORTRAN function named RESUSE() under both
 * the SunOS and Ultrix f77 compilers. AIX inter-language calling
 * conventions are different, so the routine must be referenced as RESUSE_()
 * under AIX (RISC/6000) FORTRAN (xlf).
 *
 * From a FORTRAN routine:
 *
 *     INT = RESUSE("Some String")
 *
 *     ... fortran code to be timed ...
 *
 *     INT = RESUSE("Some String")
 *
 * Just ignore the returned value.
 *---------------------------------------------------------------------------*/
#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>
static int first_call = 1;
static struct rusage initial;
void resuse(char *str)
{
int pgminor, pgmajor, nswap, nvcsw, nivcsw;
int inblock, oublock;
struct rusage final;
float usr, sys, secs;
getrusage(RUSAGE_SELF, &final);
if ( ! first_call )
{
secs = final.ru_utime.tv_sec + final.ru_utime.tv_usec / 1000000.0;
usr = secs;
secs = initial.ru_utime.tv_sec + initial.ru_utime.tv_usec / 1000000.0;
usr = usr - secs;
secs = final.ru_stime.tv_sec + final.ru_stime.tv_usec / 1000000.0;
sys = secs;
secs = initial.ru_stime.tv_sec + initial.ru_stime.tv_usec / 1000000.0;
sys = sys - secs;
pgminor = final.ru_minflt - initial.ru_minflt;
pgmajor = final.ru_majflt - initial.ru_majflt;
nswap = final.ru_nswap - initial.ru_nswap;
inblock = final.ru_inblock - initial.ru_inblock;
oublock = final.ru_oublock - initial.ru_oublock;
nvcsw = final.ru_nvcsw - initial.ru_nvcsw ;
nivcsw = final.ru_nivcsw - initial.ru_nivcsw ;
printf("=============================================================\n");
printf("%s: Resource Usage Data...\n", str);
printf("-------------------------------------------------------------\n");
printf("User Time (secs) : %10.3f\n", usr);
printf("System Time (secs) : %10.3f\n", sys);
printf("Total Time (secs) : %10.3f\n", usr + sys);
printf("Minor Page Faults : %10d\n", pgminor);
printf("Major Page Faults : %10d\n", pgmajor);
printf("Swap Count : %10d\n", nswap);
printf("Voluntary Context Switches : %10d\n", nvcsw);
printf("Involuntary Context Switches: %10d\n", nivcsw);
printf("Block Input Operations : %10d\n", inblock);
printf("Block Output Operations : %10d\n", oublock);
printf("=============================================================\n");
}
else
{
printf("=============================================================\n");
printf("%s: Collecting Resource Usage Data\n", str);
printf("=============================================================\n");
}
first_call = !first_call;
initial = final;
return;
}
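resuse() is never called in cuda_bm.c above, though the Makefile links it in; it can bracket any region of interest. Below is a minimal, hypothetical usage from C; the sleep() call is a placeholder for the code being measured and is not part of the original example.

#include <unistd.h>

void resuse(char *str);

int main(void)
{
    resuse("Work loop");   /* first call: start collecting */
    sleep(1);              /* ... code to be measured ... */
    resuse("Work loop");   /* second call: stop and print the report */
    return 0;
}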
Makefile for Building cuda_bm
Download the raw source of the Makefile.
#
# Makefile
#
CC = icc
CFLAGS = -O3 -openmp
MKL_INCS = -I$(HPC_MKL_INC)
MKL_LIBS = -L$(HPC_MKL_LIB) -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
MKL_MP_LIBS = -L$(HPC_MKL_LIB) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
CUDA_INCS = -I$(HPC_CUDA_INC)
CUDA_LIBS = -L$(HPC_CUDA_LIB) -lcublas
LIBS = $(MKL_MP_LIBS) $(CUDA_LIBS) -lpthread -lrt
INCS = $(MKL_INCS) $(CUDA_INCS)
bm: bm.o ptime.o resuse.o
	$(CC) $(CFLAGS) -o bm bm.o ptime.o resuse.o $(LIBS)

cuda_bm: cuda_bm.c resuse.o
	$(CC) $(CFLAGS) $(INCS) -o cuda_bm cuda_bm.c resuse.o $(LIBS)

cuda_sp_bm: cuda_sp_bm.c resuse.o
	$(CC) $(CFLAGS) $(INCS) -o cuda_sp_bm cuda_sp_bm.c resuse.o $(LIBS)

clean:
	rm -f *.o core.* bm cuda_bm
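The Makefile also references cuda_sp_bm.c, a single-precision variant that is not reproduced on this page. Hypothetically, converting the benchmark to single precision amounts to declaring the matrices as float and swapping the two GEMM calls for their single-precision counterparts, roughly as follows:

/* Hypothetical single-precision analogues of the two GEMM calls above;
   psa/psb/psc and pda/pdb/pdc would be declared float* instead of double* */
float alpha = 1.0f, beta = 0.0f;

/* Host (MKL) */
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
            dim, dim, dim, alpha, psa, dim, psb, dim, beta, psc, dim);

/* Device (legacy CUBLAS) */
cublasSgemm('n', 'n', dim, dim, dim, alpha, pda, dim, pdb, dim, beta, pdc, dim);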
Compiling and Linking
Download the above three files into a single directory. Then type:
[taylor@c11a-s15 Bench]$ module load intel/2013
[taylor@c11a-s15 Bench]$ module load cuda/5.5
[taylor@c11a-s15 Bench]$ make clean
rm -f *.o core.* bm cuda_bm
[taylor@c11a-s15 Bench]$ make cuda_bm
icc -O3 -openmp -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -c -o resuse.o resuse.c
icc -O3 -openmp -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -I/opt/cuda/5.5/include -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -o cuda_bm cuda_bm.c resuse.o -L/opt/cuda/5.5/lib64 -lcublas -L/opt/intel/composer_xe_2013_sp1.0.080/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -lrt
Output
# Host: single hex-core Intel 5675 @ 3.0 GHz
# GPU:  Kepler K20c
$ export OMP_NUM_THREADS=4
$ ./cuda_bm
Initializing Matrices...Done. Elapsed Time = 2.3768 secs
Starting parallel DGEMM...Done. Elapsed Time = 33.2302 secs
    GFLOP Rate = 45.3546
Initializing CUDA...Done.
Re-initializing Matrices...Done. Elapsed Time = 2.1733 secs
Starting CUDA DGEMM...Done. Elapsed Time = 2.3522 secs
    GFLOP Rate = 640.7458

For this 9100 x 9100 DGEMM, the K20c delivers roughly a 14x speedup over the host run with four MKL threads (640.7 vs. 45.4 GFLOPS).