#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <math.h>
#include <time.h>
#include <cublas.h>
void resuse(char *str);
double timeDiff( struct timespec *t1, struct timespec *t2)
{
double T1, T2;
T2 = (double)t2->tv_sec + (double)t2->tv_nsec / 1.0e9;
T1 = (double)t1->tv_sec - (double)t1->tv_nsec / 1.0e9;
return(T2 - T1);
}
main()
{
int dim = 9100;
int i,j,k;
int status;
double *psa, *psb, *psc;
double *sap, *sbp, *scp;
double *pda, *pdb, *pdc;
double *dap, *dbp, *dcp;
double alpha = 1.0;
double beta = 0.0;
double gflops = 0.0;
float deltaT = 0.0;
double gflopCnt = 2.0 * dim * dim * dim / 1.0e9;
struct timespec t1;
struct timespec t2;
int ptime();
pda = NULL;
pdb = NULL;
pdc = NULL;
psa = (double *) malloc(dim * dim * sizeof(*psa) );
psb = (double *) malloc(dim * dim * sizeof(*psb) );
psc = (double *) malloc(dim * dim * sizeof(*psc) );
printf("Initializing Matrices...");
clock_gettime(CLOCK_MONOTONIC, &t1);
sap = psa;
sbp = psb;
scp = psc;
for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++) {
*sap++ = 1.0;
*sbp++ = 1.0;
*scp++ = 0.0;
}
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
fflush(stdout);
printf("Starting parallel DGEMM...");
fflush(stdout);
clock_gettime(CLOCK_MONOTONIC, &t1);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, psa, dim, psb, dim, beta, psc, dim);
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
printf(" ");
printf("GFlOP Rate = %8.4f\n", gflopCnt/deltaT);
if ( (float) dim - psc[0] > 1.0e-5 ||
(float) dim - psc[dim*dim-1] > 1.0e-5 ) {
printf("Error: Incorrect Results!\n");
printf("C[%2d,%2d] = %10.4f\n", 1,1,psc[0]);
printf("C[%2d,%2d] = %10.4f\n", dim,dim,psc[dim*dim-1]);
}
/* Initialize CUDA */
printf("Initializing CUDA...");
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
printf("Done.\n");
/* Re-initialize the matrices */
printf("Re-initializing Matrices...");
clock_gettime(CLOCK_MONOTONIC, &t1);
sap = psa;
sbp = psb;
scp = psc;
for (i = 0; i < dim; i++) {
for (j = 0; j < dim; j++) {
*sap++ = 1.0;
*sbp++ = 1.0;
*scp++ = 0.0;
}
}
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
fflush(stdout);
/* Allocate device memory for the matrices */
printf("Starting CUDA DGEMM...");
clock_gettime(CLOCK_MONOTONIC, &t1);
status = cublasAlloc(dim*dim, sizeof(*pda), (void**) &pda);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (A)\n");
return EXIT_FAILURE;
}
status = cublasAlloc(dim*dim, sizeof(*pdb), (void**) &pdb);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (B)\n");
return EXIT_FAILURE;
}
status = cublasAlloc(dim*dim, sizeof(*pdc), (void**) &pdc);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(dim*dim, sizeof(*psa), psa, 1, pda, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(dim*dim, sizeof(*pdb), psb, 1, pdb, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write B)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(dim*dim, sizeof(*psc), psc, 1, pdc, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
/* Clear last error */
cublasGetError();
/* Performs operation using cublas */
cublasDgemm('n', 'n', dim, dim, dim, alpha, pda, dim, pdb, dim, beta, pdc, dim);
status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Read the result back */
status = cublasGetVector(dim*dim, sizeof(*psc), pdc, 1, psc, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
clock_gettime(CLOCK_MONOTONIC, &t2);
deltaT = timeDiff(&t1, &t2);
printf("Done. Elapsed Time = %6.4f secs\n", deltaT);
printf(" ");
printf("GFlOP Rate = %8.4f\n", gflopCnt/deltaT);
if ( (float) dim - psc[0] > 1.0e-5 ||
(float) dim - psc[dim*dim-1] > 1.0e-5 ) {
printf("Error: Incorrect Results!\n");
printf("C[%2d,%2d] = %10.4f\n", 1,1,psc[0]);
printf("C[%2d,%2d] = %10.4f\n", dim,dim,psc[dim*dim-1]);
}
}