This post contains my notes from NVIDIA's $90 CUDA course.

Whether judged by outstanding performance or by ease of use, the CUDA compute platform is the key to accelerated computing. CUDA provides a coding paradigm that extends languages such as C, C++, Python, and Fortran, allowing accelerated, massively parallel code to run on NVIDIA GPUs, among the world's most powerful parallel processors. CUDA can dramatically accelerate applications with little effort, offers an ecosystem of highly optimized libraries for DNNs, BLAS, graph analytics, FFTs, and more, and also ships with powerful command-line and visual profilers.

CUDA supports acceleration in the following domains; see https://www.nvidia.com/en-us/gpu-accelerated-applications/ for a catalog of GPU-accelerated applications.

Concepts

gridDim.x: the number of blocks in the grid (2 in the figure)
blockIdx.x: the index of the current block within the grid (0 or 1 in the figure)
blockDim.x: the number of threads in each block (4 in the figure)
threadIdx.x: the index of the current thread within its block (0, 1, 2, or 3 in the figure)
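A minimal sketch of how these combine (the kernel name here is illustrative, not from the course): each thread derives a globally unique index from its block index and thread index.

#include <stdio.h>

__global__ void printGlobalIndex()
{
  // Combine the built-in variables into a globally unique thread index.
  int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
  printf("block %d, thread %d -> global index %d\n",
         blockIdx.x, threadIdx.x, globalIdx);
}

int main()
{
  printGlobalIndex<<<2, 4>>>(); // 2 blocks of 4 threads, matching the figure
  cudaDeviceSynchronize();      // wait for the device-side printf output
}

With 2 blocks of 4 threads, the global indices cover 0 through 7; this `threadIdx.x + blockIdx.x * blockDim.x` expression is exactly the `index` computed in the examples below.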

Streaming Multiprocessors (SMs)
Unified Memory (UM)
Nsight Systems (nsys)

Example commands (nvcc's -run flag compiles and then immediately runs the executable; nsys profile --stats=true profiles the run, writes a report file, and prints summary statistics):
nvcc -o vector-add-no-prefetch 01-vector-add/01-vector-add.cu -run
nsys profile --stats=true -o vector-add-no-prefetch-report ./vector-add-no-prefetch

Example 1

This example covers:
a grid-stride loop for processing arrays larger than the grid
CUDA error handling

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

/*
 * Wrap CUDA runtime calls to report and assert on any error.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

void initWith(float num, float *a, int N)
{
  for(int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  /*
   * Grid-stride loop: each thread handles every `stride`-th element,
   * so a grid smaller than the array still covers all elements.
   */
  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

void checkElementsAre(float target, float *array, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      exit(1);
    }
  }
  printf("SUCCESS! All values added correctly.\n");
}

int main()
{
  const int N = 2<<20;
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  checkCuda( cudaMallocManaged(&a, size) );
  checkCuda( cudaMallocManaged(&b, size) );
  checkCuda( cudaMallocManaged(&c, size) );

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  threadsPerBlock = 256;
  numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  checkCuda( cudaGetLastError() );      // catches kernel launch errors
  checkCuda( cudaDeviceSynchronize() ); // catches asynchronous errors during execution

  checkElementsAre(7, c, N);

  checkCuda( cudaFree(a) );
  checkCuda( cudaFree(b) );
  checkCuda( cudaFree(c) );
}
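Note that `numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock` rounds up, so every element gets a thread even when N is not a multiple of 256; any excess threads simply fail the `i < N` check. Example 2 switches to sizing the grid by SM count instead.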

Example 2

This example covers:
querying device information
asynchronous memory prefetching

cudaMemPrefetchAsync(pointerToSomeUMData, size, deviceId);        // prefetch to the GPU with this device ID
cudaMemPrefetchAsync(pointerToSomeUMData, size, cudaCpuDeviceId); // prefetch the memory back to the CPU

#include <stdio.h>
#include <stdlib.h>

void initWith(float num, float *a, int N)
{
  for(int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
  printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

  const int N = 2<<24;
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  /*
   * Prefetching to the CPU before host initialization prevents CPU page faults.
   */
  cudaMemPrefetchAsync(a, size, cudaCpuDeviceId);
  cudaMemPrefetchAsync(b, size, cudaCpuDeviceId);
  cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  /*
   * Prefetch to the GPU before launching the kernel.
   */
  cudaMemPrefetchAsync(a, size, deviceId);
  cudaMemPrefetchAsync(b, size, deviceId);
  cudaMemPrefetchAsync(c, size, deviceId);

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  threadsPerBlock = 256;
  numberOfBlocks = 32 * numberOfSMs;

  cudaError_t addVectorsErr;
  cudaError_t asyncErr;

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  addVectorsErr = cudaGetLastError();
  if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

  asyncErr = cudaDeviceSynchronize();
  if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

  /*
   * Prefetch c back to the CPU before validating on the host, again avoiding CPU page faults.
   */
  cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

  checkElementsAre(7, c, N);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
}
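Compared with example 1, the explicit prefetches replace many small on-demand page migrations with a few large transfers; in the nsys report this should show up as fewer (and larger) unified-memory migration operations and a faster kernel run.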

Example 3

This example covers concurrent CUDA streams.

cudaStream_t stream;       // CUDA streams have the type cudaStream_t
cudaStreamCreate(&stream); // note that a pointer must be passed to cudaStreamCreate

someKernel<<<number_of_blocks, threads_per_block, 0, stream>>>(); // the stream is passed as the 4th execution-configuration argument

cudaStreamDestroy(stream); // note that the value (not a pointer) is passed to cudaStreamDestroy

Streams are used below to run the data-initialization kernels in parallel:

#include <stdio.h>
#include <stdlib.h>

__global__
void initWith(float num, float *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}

__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

  const int N = 2<<24;
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  cudaMemPrefetchAsync(a, size, deviceId);
  cudaMemPrefetchAsync(b, size, deviceId);
  cudaMemPrefetchAsync(c, size, deviceId);

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  threadsPerBlock = 256;
  numberOfBlocks = 32 * numberOfSMs;

  cudaError_t addVectorsErr;
  cudaError_t asyncErr;

  /*
   * Create 3 streams so the 3 data vectors can be initialized in parallel.
   */
  cudaStream_t stream1, stream2, stream3;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);
  cudaStreamCreate(&stream3);

  /*
   * Give each `initWith` launch its own non-default stream.
   */
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  addVectorsErr = cudaGetLastError();
  if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

  asyncErr = cudaDeviceSynchronize();
  if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

  cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

  checkElementsAre(7, c, N);

  /*
   * Destroy streams when they are no longer needed.
   */
  cudaStreamDestroy(stream1);
  cudaStreamDestroy(stream2);
  cudaStreamDestroy(stream3);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
}
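Because each `initWith` launch goes into its own non-default stream, the three initializations are free to overlap on the device. `addVectorsInto` is launched into the default stream, which does not begin until previously issued work in the other blocking streams has completed, so no extra synchronization is needed before the addition.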

Example 4

Code that manages memory through manual CUDA API calls. This example covers:
manually allocating host and device memory
using streams to overlap data transfers with computation
overlapping kernel execution with memory copies back to the host

int *host_a, *device_a; // Define host-specific and device-specific arrays.
cudaMalloc(&device_a, size); // device_a is immediately available on the GPU.
cudaMallocHost(&host_a, size); // host_a is immediately available on CPU, and is page-locked, or pinned.

initializeOnHost(host_a, N); // No CPU page faulting since memory is already allocated on the host.

// cudaMemcpy takes the destination, source, size, and a CUDA-provided variable for the direction of the copy.
cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice);

kernel<<<blocks, threads, 0, someStream>>>(device_a, N);

// cudaMemcpy can also copy data from device to host.
cudaMemcpy(host_a, device_a, size, cudaMemcpyDeviceToHost);

verifyOnHost(host_a, N);

cudaFree(device_a);
cudaFreeHost(host_a); // Free pinned memory like this.

#include <stdio.h>
#include <stdlib.h>

__global__
void initWith(float num, float *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}

__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

  const int N = 2<<24;
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;
  float *h_c;

  cudaMalloc(&a, size);
  cudaMalloc(&b, size);
  cudaMalloc(&c, size);
  cudaMallocHost(&h_c, size); // pinned (page-locked) host memory for async copies

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  threadsPerBlock = 256;
  numberOfBlocks = 32 * numberOfSMs;

  cudaError_t addVectorsErr;
  cudaError_t asyncErr;

  /*
   * Create 3 streams so the 3 data vectors can be initialized in parallel.
   */
  cudaStream_t stream1, stream2, stream3;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);
  cudaStreamCreate(&stream3);

  /*
   * Give each `initWith` launch its own non-default stream.
   */
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);

  /*
   * Non-default streams do not synchronize with each other, so wait for
   * initialization to finish before launching the segmented additions.
   */
  cudaDeviceSynchronize();

  /*
   * Process the vectors in 4 segments, each in its own stream, so the
   * device-to-host copy of one segment can overlap with kernels still
   * computing other segments.
   */
  for (int i = 0; i < 4; ++i)
  {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    addVectorsInto<<<numberOfBlocks/4, threadsPerBlock, 0, stream>>>(&c[i*N/4], &a[i*N/4], &b[i*N/4], N/4);
    cudaMemcpyAsync(&h_c[i*N/4], &c[i*N/4], size/4, cudaMemcpyDeviceToHost, stream);

    // Destroying the stream here is safe: its queued work still runs to completion.
    cudaStreamDestroy(stream);
  }

  addVectorsErr = cudaGetLastError();
  if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

  asyncErr = cudaDeviceSynchronize();
  if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

  checkElementsAre(7, h_c, N);

  /*
   * Destroy streams when they are no longer needed.
   */
  cudaStreamDestroy(stream1);
  cudaStreamDestroy(stream2);
  cudaStreamDestroy(stream3);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  cudaFreeHost(h_c);
}
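The point of the 4-segment loop is overlap: each segment's kernel and its device-to-host copy share a stream, so the copy starts as soon as that segment is computed while kernels for other segments keep running in their own streams. The pinned buffer from cudaMallocHost is what allows cudaMemcpyAsync to overlap with computation; pageable host memory would make the copies effectively synchronous.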

Exercise

https://yangwc.com/2019/06/20/NbodySimulation/
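The linked write-up walks through the n-body simulation used as the course's final assessment. As a rough sketch only (the Body struct, SOFTENING constant, and bodyForce signature follow the common starter code for this exercise and may differ from your version), the all-pairs force computation maps naturally to one GPU thread per body:

#include <stdio.h>

#define SOFTENING 1e-9f

typedef struct { float x, y, z, vx, vy, vz; } Body;

__global__ void bodyForce(Body *p, float dt, int n)
{
  // One thread per body: thread i accumulates the force on body i
  // from all n bodies, then integrates its velocity.
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
  {
    float Fx = 0.0f, Fy = 0.0f, Fz = 0.0f;

    for (int j = 0; j < n; j++)
    {
      float dx = p[j].x - p[i].x;
      float dy = p[j].y - p[i].y;
      float dz = p[j].z - p[i].z;
      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; // softening avoids division by zero
      float invDist = rsqrtf(distSqr);
      float invDist3 = invDist * invDist * invDist;

      Fx += dx * invDist3;
      Fy += dy * invDist3;
      Fz += dz * invDist3;
    }

    p[i].vx += dt * Fx;
    p[i].vy += dt * Fy;
    p[i].vz += dt * Fz;
  }
}

int main()
{
  int nBodies = 1 << 12;
  float dt = 0.01f;
  Body *p;

  cudaMallocManaged(&p, nBodies * sizeof(Body));
  for (int i = 0; i < nBodies; i++) // simple deterministic initialization
  {
    p[i].x = i * 0.001f; p[i].y = i * 0.002f; p[i].z = i * 0.003f;
    p[i].vx = p[i].vy = p[i].vz = 0.0f;
  }

  int threadsPerBlock = 256;
  int numberOfBlocks = (nBodies + threadsPerBlock - 1) / threadsPerBlock;

  bodyForce<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
  cudaDeviceSynchronize();

  printf("body 0 velocity: %f %f %f\n", p[0].vx, p[0].vy, p[0].vz);
  cudaFree(p);
}

From there, the same techniques practiced above apply: prefetch the managed buffer as in example 2, size the grid by SM count, and profile with nsys to verify the speedup.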

