==Example: CUBLAS vs. Optimized BLAS==

Note that double-precision linear algebra is a less-than-ideal application for GPUs. Still, this is a functional example of using one of the available CUDA runtime libraries: it compares a CUBLAS DGEMM on the GPU against a multithreaded MKL DGEMM on the host.
===cuda_bm.c===

{{#fileAnchor: cuda_bm.c}}

Download the raw source of [{{#fileLink: cuda_bm.c}} cuda_bm.c].
<source lang=c>
#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <math.h>
#include <time.h>
#include <cublas.h>

void resuse(char *str);

double timeDiff(struct timespec *t1, struct timespec *t2)
{
    double T1, T2;
    T2 = (double)t2->tv_sec + (double)t2->tv_nsec / 1.0e9;
    T1 = (double)t1->tv_sec + (double)t1->tv_nsec / 1.0e9;
    return (T2 - T1);
}

int main(void)
{
    int dim = 9100;
    int i, j;
    int status;

    double *psa, *psb, *psc;   /* host matrices A, B, C */
    double *sap, *sbp, *scp;   /* walking pointers for initialization */
    double *pda, *pdb, *pdc;   /* device matrices A, B, C */

    double alpha = 1.0;
    double beta  = 0.0;
    double deltaT = 0.0;
    double gflopCnt = 2.0 * dim * dim * dim / 1.0e9;  /* 2*dim^3 flops per DGEMM */
    struct timespec t1;
    struct timespec t2;

    pda = NULL;
    pdb = NULL;
    pdc = NULL;
    psa = (double *) malloc(dim * dim * sizeof(*psa));
    psb = (double *) malloc(dim * dim * sizeof(*psb));
    psc = (double *) malloc(dim * dim * sizeof(*psc));

    printf("Initializing Matrices...");
    clock_gettime(CLOCK_MONOTONIC, &t1);
    sap = psa;
    sbp = psb;
    scp = psc;
    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            *sap++ = 1.0;
            *sbp++ = 1.0;
            *scp++ = 0.0;
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    fflush(stdout);

    /* Host (MKL) DGEMM: C = alpha*A*B + beta*C */
    printf("Starting parallel DGEMM...");
    fflush(stdout);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim,
                alpha, psa, dim, psb, dim, beta, psc, dim);
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    printf(" ");
    printf("GFlOP Rate = %8.4f\n", gflopCnt / deltaT);

    /* With A and B all ones, every element of C should equal dim */
    if (fabs((double)dim - psc[0])         > 1.0e-5 ||
        fabs((double)dim - psc[dim*dim-1]) > 1.0e-5) {
        printf("Error: Incorrect Results!\n");
        printf("C[%2d,%2d] = %10.4f\n", 1, 1, psc[0]);
        printf("C[%2d,%2d] = %10.4f\n", dim, dim, psc[dim*dim-1]);
    }

    /* Initialize CUDA */
    printf("Initializing CUDA...");
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
    printf("Done.\n");

    /* Re-initialize the matrices */
    printf("Re-initializing Matrices...");
    clock_gettime(CLOCK_MONOTONIC, &t1);
    sap = psa;
    sbp = psb;
    scp = psc;
    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            *sap++ = 1.0;
            *sbp++ = 1.0;
            *scp++ = 0.0;
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    fflush(stdout);

    /* Allocate device memory for the matrices */
    printf("Starting CUDA DGEMM...");
    clock_gettime(CLOCK_MONOTONIC, &t1);

    status = cublasAlloc(dim*dim, sizeof(*pda), (void **)&pda);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    status = cublasAlloc(dim*dim, sizeof(*pdb), (void **)&pdb);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    status = cublasAlloc(dim*dim, sizeof(*pdc), (void **)&pdc);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(dim*dim, sizeof(*psa), psa, 1, pda, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(dim*dim, sizeof(*psb), psb, 1, pdb, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(dim*dim, sizeof(*psc), psc, 1, pdc, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Clear last error */
    cublasGetError();

    /* Perform the operation using CUBLAS. The legacy API is column-major,
       but with A = B = all ones the result matches the row-major host run. */
    cublasDgemm('n', 'n', dim, dim, dim, alpha, pda, dim, pdb, dim, beta, pdc, dim);
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(dim*dim, sizeof(*psc), pdc, 1, psc, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }
    clock_gettime(CLOCK_MONOTONIC, &t2);
    deltaT = timeDiff(&t1, &t2);
    printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
    printf(" ");
    printf("GFlOP Rate = %8.4f\n", gflopCnt / deltaT);

    if (fabs((double)dim - psc[0])         > 1.0e-5 ||
        fabs((double)dim - psc[dim*dim-1]) > 1.0e-5) {
        printf("Error: Incorrect Results!\n");
        printf("C[%2d,%2d] = %10.4f\n", 1, 1, psc[0]);
        printf("C[%2d,%2d] = %10.4f\n", dim, dim, psc[dim*dim-1]);
    }

    /* Release device and host memory */
    cublasFree(pda);
    cublasFree(pdb);
    cublasFree(pdc);
    cublasShutdown();
    free(psa);
    free(psb);
    free(psc);

    return 0;
}
</source>
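The listing above uses the legacy CUBLAS API (<code>cublas.h</code>, <code>cublasInit()</code>, <code>cublasAlloc()</code>). Newer CUDA toolkits also ship a handle-based API in <code>cublas_v2.h</code>. The following is a minimal sketch of the same DGEMM with that API; the wrapper name <code>dgemm_v2</code> is illustrative only, and error checking is trimmed to the essentials.

<source lang=c>
/* Sketch: C = A*B with the handle-based CUBLAS v2 API (column-major). */
#include <cuda_runtime.h>
#include <cublas_v2.h>

int dgemm_v2(const double *A, const double *B, double *C, int dim)
{
    cublasHandle_t handle;
    double *dA, *dB, *dC;
    double alpha = 1.0, beta = 0.0;
    size_t bytes = (size_t)dim * dim * sizeof(double);

    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) return 1;
    cudaMalloc((void **)&dA, bytes);
    cudaMalloc((void **)&dB, bytes);
    cudaMalloc((void **)&dC, bytes);

    /* Host -> device copies; cublasSetMatrix() would work equally well */
    cudaMemcpy(dA, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B, bytes, cudaMemcpyHostToDevice);

    /* In the v2 API, alpha and beta are passed by pointer and the
       routine returns a status directly instead of cublasGetError() */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, dim, dim, dim,
                &alpha, dA, dim, dB, dim, &beta, dC, dim);

    cudaMemcpy(C, dC, bytes, cudaMemcpyDeviceToHost);

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    return 0;
}
</source>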
===resuse.c===

{{#fileAnchor: resuse.c}}

Download the raw source of [{{#fileLink: resuse.c}} resuse.c].
<source lang=c>
/*---------------------------------------------------------------------------*
 * resuse.c                                                                  *
 *                                                                           *
 * resuse_(desc)                                                             *
 *     char *desc;                                                           *
 *---------------------------------------------------------------------------*
 * This routine makes use of the getrusage system call to collect and       *
 * display resource utilization from one point in a program to another.     *
 * Thus, it works somewhat like a stopwatch in the sense that the first     *
 * call initializes, or starts, the resource collection over the interval.  *
 * The second call stops the collection and displays the results            *
 * accumulated since the previous call. This means, of course, that two     *
 * calls are required for each section of code to be monitored.             *
 *                                                                           *
 * The underscore at the end of the routine name is there so that the       *
 * routine may be called as an integer-valued FORTRAN function named        *
 * RESUSE() under both the SunOS and Ultrix f77 compilers. AIX              *
 * inter-language calling conventions are different, so the routine must    *
 * be referenced as RESUSE_() under AIX (RISC/6000) FORTRAN (xlf).          *
 *                                                                           *
 * From a FORTRAN routine:                                                  *
 *                                                                           *
 *     INT = RESUSE("Some String")                                          *
 *     ... fortran code to be timed ...                                     *
 *     INT = RESUSE("Some String")                                          *
 *                                                                           *
 * Just ignore the returned value.                                          *
 *---------------------------------------------------------------------------*/

#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>

static int first_call = 1;

static struct rusage initial;

void resuse(char *str)
{
    int pgminor, pgmajor, nswap, nvcsw, nivcsw;
    int inblock, oublock;
    struct rusage final;
    float usr, sys, secs;

    getrusage(RUSAGE_SELF, &final);

    if ( ! first_call )
    {
        /* User time consumed over the interval */
        secs = final.ru_utime.tv_sec + final.ru_utime.tv_usec / 1000000.0;
        usr = secs;
        secs = initial.ru_utime.tv_sec + initial.ru_utime.tv_usec / 1000000.0;
        usr = usr - secs;

        /* System time consumed over the interval */
        secs = final.ru_stime.tv_sec + final.ru_stime.tv_usec / 1000000.0;
        sys = secs;
        secs = initial.ru_stime.tv_sec + initial.ru_stime.tv_usec / 1000000.0;
        sys = sys - secs;

        pgminor = final.ru_minflt  - initial.ru_minflt;
        pgmajor = final.ru_majflt  - initial.ru_majflt;
        nswap   = final.ru_nswap   - initial.ru_nswap;
        inblock = final.ru_inblock - initial.ru_inblock;
        oublock = final.ru_oublock - initial.ru_oublock;
        nvcsw   = final.ru_nvcsw   - initial.ru_nvcsw;
        nivcsw  = final.ru_nivcsw  - initial.ru_nivcsw;

        printf("=============================================================\n");
        printf("%s: Resource Usage Data...\n", str);
        printf("-------------------------------------------------------------\n");
        printf("User Time (secs)            : %10.3f\n", usr);
        printf("System Time (secs)          : %10.3f\n", sys);
        printf("Total Time (secs)           : %10.3f\n", usr + sys);
        printf("Minor Page Faults           : %10d\n", pgminor);
        printf("Major Page Faults           : %10d\n", pgmajor);
        printf("Swap Count                  : %10d\n", nswap);
        printf("Voluntary Context Switches  : %10d\n", nvcsw);
        printf("Involuntary Context Switches: %10d\n", nivcsw);
        printf("Block Input Operations      : %10d\n", inblock);
        printf("Block Output Operations     : %10d\n", oublock);
        printf("=============================================================\n");
    }
    else
    {
        printf("=============================================================\n");
        printf("%s: Collecting Resource Usage Data\n", str);
        printf("=============================================================\n");
    }

    first_call = !first_call;
    initial = final;

    return;
}
</source>
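As the header comment describes, <code>resuse()</code> works like a stopwatch: the first call starts the collection and the second call prints the resources consumed over the interval. A minimal usage sketch (the loop is just a stand-in for any code to be measured; link against <code>resuse.o</code> as in the Makefile below):

<source lang=c>
#include <stdio.h>

void resuse(char *str);    /* from resuse.c */

int main(void)
{
    double sum = 0.0;
    long i;

    resuse("Loop");                    /* first call: start collecting  */
    for (i = 0; i < 100000000L; i++)   /* ... code to be timed ...      */
        sum += (double)i;
    resuse("Loop");                    /* second call: print the report */

    printf("sum = %f\n", sum);
    return 0;
}
</source>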
===Makefile for Building cuda_bm===

{{#fileAnchor: Makefile}}

Download the raw source of the [{{#fileLink: Makefile}} Makefile].
<source lang=make>
#
# Makefile
#
CC     = icc
CFLAGS = -O3 -openmp

MKL_INCS    = -I$(HPC_MKL_INC)
MKL_LIBS    = -L$(HPC_MKL_LIB) -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
MKL_MP_LIBS = -L$(HPC_MKL_LIB) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core

CUDA_INCS = -I$(HPC_CUDA_INC)
CUDA_LIBS = -L$(HPC_CUDA_LIB) -lcublas

LIBS = $(MKL_MP_LIBS) $(CUDA_LIBS) -lpthread -lrt
INCS = $(MKL_INCS) $(CUDA_INCS)

bm: bm.o ptime.o resuse.o
	$(CC) $(CFLAGS) -o bm bm.o ptime.o resuse.o $(LIBS)

cuda_bm: cuda_bm.c resuse.o
	$(CC) $(CFLAGS) $(INCS) -o cuda_bm cuda_bm.c resuse.o $(LIBS)

cuda_sp_bm: cuda_sp_bm.c resuse.o
	$(CC) $(CFLAGS) $(INCS) -o cuda_sp_bm cuda_sp_bm.c resuse.o $(LIBS)

clean:
	rm -f *.o core.* bm cuda_bm
</source>
=== Compiling and Linking ===

Download the above three files into a single directory. Then type:
<pre>
[taylor@c11a-s15 Bench]$ module load intel/2013
[taylor@c11a-s15 Bench]$ module load cuda/5.5
[taylor@c11a-s15 Bench]$ make clean
rm -f *.o core.* bm cuda_bm

[taylor@c11a-s15 Bench]$ make cuda_bm
icc -O3 -openmp -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -c -o resuse.o resuse.c
icc -O3 -openmp -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -I/opt/cuda/5.5/include -I/opt/intel/composer_xe_2013_sp1.0.080/mkl/include -o cuda_bm cuda_bm.c resuse.o -L/opt/cuda/5.5/lib64 -lcublas -L/opt/intel/composer_xe_2013_sp1.0.080/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -lrt
</pre>
=== Output ===

* Host: single hex-core Intel Xeon 5675 @ 3.0 GHz
* GPU: NVIDIA Kepler K20c
<pre>
$ export OMP_NUM_THREADS=4
$ ./cuda_bm
Initializing Matrices...Done. Elapsed Time = 2.3768 secs
Starting parallel DGEMM...Done. Elapsed Time = 33.2302 secs
 GFlOP Rate =  45.3546
Initializing CUDA...Done.
Re-initializing Matrices...Done. Elapsed Time = 2.1733 secs
Starting CUDA DGEMM...Done. Elapsed Time = 2.3522 secs
 GFlOP Rate = 640.7458
</pre>
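These rates follow directly from the operation count coded into <code>gflopCnt</code>: a dim x dim DGEMM performs 2*dim^3 floating-point operations. A small stand-alone check (not part of the benchmark; the timings are pasted in from the run above):

<source lang=c>
/* Sanity check on the reported rates for dim = 9100 */
#include <stdio.h>

int main(void)
{
    double dim = 9100.0;
    double gflop = 2.0 * dim * dim * dim / 1.0e9;   /* ~1507.14 GFLOP */

    printf("Total work       : %9.2f GFLOP\n", gflop);
    printf("MKL    @ 33.2302s: %9.4f GFLOP/s\n", gflop / 33.2302); /* ~45.35  */
    printf("CUBLAS @  2.3522s: %9.4f GFLOP/s\n", gflop / 2.3522);  /* ~640.75 */
    return 0;
}
</source>

Note that the CUDA timer starts before <code>cublasAlloc()</code> and stops after <code>cublasGetVector()</code>, so the 640 GFLOP/s figure includes device allocation and host-device transfer overhead and understates the raw kernel rate.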