该源程序来自《CUDA C语言编程中文译文版》,如有侵权,联系删除。此处只为学习交流。

cuda程序如下:

#include "../common/common.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * This example demonstrates a simple vector sum on the GPU and on the host.
 * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the
 * GPU, using a 1D grid of 1D thread blocks with one thread per element.
 * sumArraysOnHost sequentially iterates through vector elements on the host.
 * This version of sumArrays adds host timers to measure GPU and CPU
 * performance.
 */

/*
 * Compare the host reference result against the GPU result element by element.
 *
 * hostRef: result computed on the host, N elements.
 * gpuRef:  result copied back from the device, N elements.
 * N:       number of elements to compare.
 *
 * Prints the first mismatching pair (if any) and stops; otherwise prints
 * "Arrays match.". Nothing is returned.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // fabsf keeps the comparison in floating point; a bare abs() may
        // resolve to the integer overload and truncate small differences to 0.
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");

    return;
}

/*
 * Fill ip[0..size) with pseudo-random values in the range [0.0, 25.5].
 *
 * ip:   destination buffer with room for at least `size` floats.
 * size: number of elements to generate.
 *
 * The generator is seeded from the wall clock only on the first call.
 * Reseeding on every call would give two buffers filled within the same
 * second exactly the same contents.
 */
void initialData(float *ip, int size)
{
    static bool seeded = false;

    if (!seeded)
    {
        srand((unsigned)time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }

    return;
}

/*
 * Sequential host reference: C[idx] = A[idx] + B[idx] for idx in [0, N).
 *
 * A, B: input vectors, N elements each.
 * C:    output vector, N elements.
 * N:    number of elements; nothing is written when N <= 0.
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}
/*
 * Element-wise vector sum on the device: C[i] = A[i] + B[i] for i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The flat index
 * is built from blockIdx.x, blockDim.x and threadIdx.x; threads whose index
 * is >= N do nothing, so the grid may be rounded up past N.
 *
 * A, B: input vectors, passed as device pointers by main.
 * C:    output vector, passed as a device pointer by main.
 * N:    number of elements.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

/*
 * Driver: builds two random vectors on the host, sums them on the host and on
 * the GPU, times both with the host timer seconds(), and compares the results.
 *
 * Returns 0 on success and 1 if a host allocation fails. CUDA API failures
 * are handled by the CHECK macro.
 */
int main(int argc, char **argv)
{
    (void)argc;
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors
    int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    // bail out before touching any buffer if a host allocation failed
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Error: host malloc of %zu bytes failed\n", nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    double iStart, iElaps;

    // initialize data at host side
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side; the grid is rounded up so every element
    // gets a thread
    int iLen = 512;
    dim3 block (iLen);
    dim3 grid  ((nElem + block.x - 1) / block.x);

    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // check kernel launch error before trusting the measured time
    CHECK(cudaGetLastError());
    // the launch is asynchronous: wait for the kernel to finish before
    // reading the host timer
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec\n", grid.x,
           block.x, iElaps);

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}

common.h如下:

#ifndef _COMMON_H
#define _COMMON_H

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/*
 * Error-checking wrappers. Each macro evaluates `call` exactly once and, if
 * the returned status is not the library's success value, prints the failing
 * file and line to stderr and terminates the process with exit(1).
 *
 * The bodies are wrapped in do { ... } while (0) so that a use followed by a
 * semicolon forms a single statement and is safe inside unbraced if/else.
 */

/* Check a CUDA runtime call returning cudaError_t. */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", (int)error,                  \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Check a cuBLAS call returning cublasStatus_t. */
#define CHECK_CUBLAS(call)                                                     \
do {                                                                           \
    cublasStatus_t err;                                                        \
    if ((err = (call)) != CUBLAS_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", (int)err, __FILE__,  \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Check a cuRAND call returning curandStatus_t. */
#define CHECK_CURAND(call)                                                     \
do {                                                                           \
    curandStatus_t err;                                                        \
    if ((err = (call)) != CURAND_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", (int)err, __FILE__,  \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Check a cuFFT call returning cufftResult. */
#define CHECK_CUFFT(call)                                                      \
do {                                                                           \
    cufftResult err;                                                           \
    if ( (err = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", (int)err, __FILE__,   \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * Check a cuSPARSE call returning cusparseStatus_t. On failure it also
 * reports any pending CUDA runtime error from cudaGetLastError().
 */
#define CHECK_CUSPARSE(call)                                                   \
do {                                                                           \
    cusparseStatus_t err;                                                      \
    if ((err = (call)) != CUSPARSE_STATUS_SUCCESS)                             \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", (int)err, __FILE__,         \
                __LINE__);                                                     \
        cudaError_t cuda_err = cudaGetLastError();                             \
        if (cuda_err != cudaSuccess)                                           \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(cuda_err));                             \
        }                                                                      \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * Return the current wall-clock time from gettimeofday() as seconds with a
 * microsecond fractional part. Intended for interval timing by subtracting
 * two readings.
 */
static inline double seconds(void)
{
    struct timeval tp;

    // the timezone argument is obsolete, so pass NULL
    gettimeofday(&tp, NULL);

    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

#endif // _COMMON_H

编译与运行:

-bash-4.1$ ./sumArraysOnGPU-timer
./sumArraysOnGPU-timer Starting...
Using Device 0: Tesla K40c
Vector size 16777216
initialData Time elapsed 0.521010 sec
sumArraysOnHost Time elapsed 0.025647 sec
sumArraysOnGPU <<<  32768, 512  >>>  Time elapsed 0.001233 sec
Arrays match.

-bash-4.1$

“用CPU计时器统计CUDA核函数的运行时间”相关推荐

  1. 关于top指令及cpu占用统计

    原文链接: https://blog.csdn.net/cfy_phonex/article/details/18733731 通常,有如下方式可以得到 cpu 利用率情况: 1. top 命令 e. ...

  2. linux统计数据查看工具,CPU性能统计数据及Linux监控工具使用

    一.CPU性能统计数据 1.Run Queue Statistics-- 运行队列统计 In Linux, a process can be either runnable or blocked wai ...

  3. c语言 cuda核函数,CUDA核函数与线程配置

    CUDA核函数 在GPU上调用的函数称为CUDA核函数(Kernel function),核函数会被GPU上的多个线程执行.每个线程都会执行核函数里的代码,当然由于线程编号的不同,执行的代码路径可能会 ...

  4. 【记录一个问题】cuda核函数可能存在栈溢出,导致main()函数退出后程序卡死30秒CUDA...

    调试一个CUDA核函数过程中发现一个奇怪的问题: 调用某个核函数,程序耗时33秒,并且主要时间是main()函数结束后的33秒: 而注释掉此核函数,程序执行不到1秒. 由此可见,可能是某种栈溢出,导致 ...

  5. freeRtos学习笔记 (9) 移植和CPU利用率统计

    freeRtos学习笔记 (9) 移植和CPU利用率统计 使用官方固件移植 首先准备一个能跑的裸机工程 注意,freertos需要使用systick定时器,而stm32HAL库默认使用systick作 ...

  6. 详解CUDA核函数及运行时参数

    详解CUDA核函数及运行时参数 核函数是GPU每个thread上运行的程序.必须通过__global__函数类型限定符定义.形式如下: __global__ void kernel(param lis ...

  7. Linux性能优化2.1 CPU性能统计信息

    摘要 性能工具:系统CPU 本章概述了系统级的Linux性能工具.这些工具是你追踪性能问题时的第一道防线.它们能展示整个系统的性能情况和哪些部分表现不好.本章将讨论这些工具可以测量的统计信息,以及如何 ...

  8. 【记录一个问题】linux+opencv+cuvid解码1080P视频,当使用CUDA核函数的时候,必然崩溃...

    崩溃的信息如下: 1 OpenCV(4.1.0-dev) Error: Gpu API call (invalid configuration argument) in videoDecPostPro ...

  9. 非参数与半参数统计之核函数

    非参数与半参数统计之核函数 核函数的本质是什么 核函数的产生过程 常见的核函数 核函数的本质是什么 核函数的产生过程 众所周知,分布函数的导数是密度函数,即: f ( x ) = F ′ ( x ) ...

最新文章

  1. HTML 資訊汲取(中篇) - Default namespace 問題
  2. 异步的两种写法: async 与 BeginInvoke
  3. js 解析php serialize,php如何解析jquery serialize 提交后的数据
  4. 托管非托管_如何利用Kubernetes的力量来优化您的托管成本
  5. myeclipse搭建SSH框架
  6. AndroidStudio_安卓原生开发_判断蓝牙_定位是否开启---Android原生开发工作笔记162
  7. “21天好习惯”第一期-17
  8. c语言i=5.6a=(int)i,(PSIM仿真)从零开始设计BOOST数字控制器
  9. Winform SplitContainer控件可调整大小
  10. Hive实现同比环比计算
  11. 苹果id怎么注册?老果粉教你创建新的Apple ID
  12. springboot配置文件密文解密
  13. .NET中XML 注释 SandCastle 帮助文件.hhp 使用HTML Help Workshop生成CHM文件
  14. Java项目:毕业设计管理系统(java+SSM+jsp+mysql+maven)
  15. 上传artifacts到maven仓库
  16. SAS EM之SAS Credit Scoring不能使用
  17. [LeetCode 中等 树]545. 二叉树的边界
  18. 初中数学分几个模块_初中数学有几部分
  19. Toronto Research Chemicals 双(乙酰丙酮)铂(II)
  20. 离散数学 (II) 习题 4

热门文章

  1. Pandas一些小技巧
  2. 爬虫—Requests高级用法
  3. 2010版EXCEL下方sheet工作表隐藏了
  4. mtr和nmap命令
  5. jeecg中的树形控件demo
  6. vue再次入手(数据传递①)
  7. 通过Filter统一全站编码
  8. 《教孩子编程(Python语言版)》课程介绍
  9. 2-自定义 Mybatis 框架
  10. 服务器pcie性能最大,首款PCIe 4.0服务器处理器为数据中心加速