#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * This example demonstrates a simple vector sum on the GPU and on the host.
 * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the
 * GPU, one element per thread, using a 1D grid of 1D thread blocks.
 * sumArraysOnHost sequentially iterates through vector elements on the host.
 * This version of sumArrays adds host timers to measure GPU and CPU
 * performance.
 */

/*
 * Compare the host reference result with the GPU result element by element.
 * On the first mismatch, prints the offending pair and its index and stops;
 * if every element agrees within epsilon, prints "Arrays match.".
 *
 * hostRef: values computed on the host (N floats)
 * gpuRef:  values copied back from the device (N floats)
 * N:       number of elements to compare
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: the integer abs() truncates the float difference
        // toward zero and would hide every mismatch smaller than 1.0.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

/*
 * Fill ip[0..size) with pseudo-random values in the range [0.0, 25.5].
 *
 * ip:   destination array on the host
 * size: number of elements to write
 */
void initialData(float *ip, int size)
{
    // Seed the generator once per process. Re-seeding with time() on every
    // call hands the same seed to calls made within the same second, which
    // makes consecutively initialised arrays identical.
    static bool seeded = false;

    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/*
 * Sequential reference implementation: C[idx] = A[idx] + B[idx] for every
 * idx in [0, N).
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Element-wise vector sum on the device: C[i] = A[i] + B[i].
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total; each
 * thread handles the single element at its global index, and threads whose
 * index is >= N do nothing. No shared memory is used.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

/*
 * Driver: builds two random input vectors, sums them on the host and on the
 * GPU, times both with the host timer seconds(), and verifies that the two
 * results agree.
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors
    int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    // Bail out cleanly instead of dereferencing a null pointer below.
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Error: failed to allocate %lu bytes of host memory\n",
                (unsigned long)nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    double iStart, iElaps;

    // initialize data at host side
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int iLen = 512;
    dim3 block (iLen);
    dim3 grid  ((nElem + block.x - 1) / block.x);   // ceil-div

    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // check kernel launch error (e.g. invalid configuration) right away,
    // before anything is reported about the run
    CHECK(cudaGetLastError());
    // the launch is asynchronous: wait for completion so the host timer
    // covers the kernel's execution, and surface any execution error
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec\n", grid.x,
           block.x, iElaps);

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}


#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#ifndef _COMMON_H
#define _COMMON_H

/*
 * Error-checking wrappers for CUDA runtime and CUDA library calls.
 *
 * Each macro evaluates `call` exactly once; if the returned status is not the
 * library's success code it prints the failing file/line to stderr and
 * terminates the process with exit(1).
 *
 * The bodies are wrapped in do { ... } while (0) so that an invocation
 * followed by a semicolon is a single statement and is safe inside an
 * unbraced if/else. Always terminate an invocation with a semicolon.
 *
 * The CUBLAS/CURAND/CUFFT/CUSPARSE variants only compile where the matching
 * library header has been included by the user of this header.
 */

/* Check a CUDA runtime call returning cudaError_t. */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t check_error_ = (call);                                   \
    if (check_error_ != cudaSuccess)                                           \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", (int)check_error_,           \
                cudaGetErrorString(check_error_));                             \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Check a cuBLAS call returning cublasStatus_t. */
#define CHECK_CUBLAS(call)                                                     \
do {                                                                           \
    cublasStatus_t check_err_;                                                 \
    if ((check_err_ = (call)) != CUBLAS_STATUS_SUCCESS)                        \
    {                                                                          \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", (int)check_err_,     \
                __FILE__, __LINE__);                                           \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Check a cuRAND call returning curandStatus_t. */
#define CHECK_CURAND(call)                                                     \
do {                                                                           \
    curandStatus_t check_err_;                                                 \
    if ((check_err_ = (call)) != CURAND_STATUS_SUCCESS)                        \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", (int)check_err_,     \
                __FILE__, __LINE__);                                           \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Check a cuFFT call returning cufftResult. */
#define CHECK_CUFFT(call)                                                      \
do {                                                                           \
    cufftResult check_err_;                                                    \
    if ((check_err_ = (call)) != CUFFT_SUCCESS)                                \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", (int)check_err_,      \
                __FILE__, __LINE__);                                           \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * Check a cuSPARSE call returning cusparseStatus_t. On failure it also
 * reports any pending CUDA runtime error before exiting.
 */
#define CHECK_CUSPARSE(call)                                                   \
do {                                                                           \
    cusparseStatus_t check_err_;                                               \
    if ((check_err_ = (call)) != CUSPARSE_STATUS_SUCCESS)                      \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", (int)check_err_,            \
                __FILE__, __LINE__);                                           \
        cudaError_t check_cuda_err_ = cudaGetLastError();                      \
        if (check_cuda_err_ != cudaSuccess)                                    \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(check_cuda_err_));                      \
        }                                                                      \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * Host wall-clock timer.
 *
 * Returns the current time of day from gettimeofday() as seconds (with a
 * microsecond fractional part). Subtract two readings to get an elapsed
 * time in seconds.
 */
inline double seconds()
{
    struct timeval tp;

    // The timezone argument of gettimeofday() is obsolete; pass NULL.
    gettimeofday(&tp, NULL);

    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

#endif // _COMMON_H


-bash-4.1$ ./sumArraysOnGPU-timer
./sumArraysOnGPU-timer Starting...
Using Device 0: Tesla K40c
Vector size 16777216
initialData Time elapsed 0.521010 sec
sumArraysOnHost Time elapsed 0.025647 sec
sumArraysOnGPU <<<  32768, 512  >>>  Time elapsed 0.001233 sec
Arrays match.

-bash-4.1$


