CUDA加速计算的基础C/C++
本文是Nvidia 90美金的课程笔记
无论是从出色的性能,还是从易用性来看,CUDA计算平台都是加速计算的制胜法宝。CUDA 提供了一种可扩展 C、C++、Python 和 Fortran 等语言的编码范式,该范式能够在世界上性能超强劲的并行处理器 NVIDIA GPU 上运行经加速的大规模并行代码。CUDA 可以毫不费力地大幅加速应用程序,具有适用于DNN、BLAS、图形分析和FFT等更多运算的高度优化库生态系统,并且还附带功能强大的命令行和可视化性能分析器。
CUDA 支持以下领域
概念
https://www.nvidia.com/en-us/gpu-accelerated-applications/
gridDim.x 网格中的块数,图中为2
blockIdx.x网格中块的索引,图中为0,1
blockDim.x块中线程数 图中为4
threadIdx.x 块中线程的索引,图中为 0,1,2,3
流多处理器(Streaming Multiprocessors)
统一内存(UM)
nsight-sys
命令示例
nvcc -o vector-add-no-prefetch 01-vector-add/01-vector-add.cu -run
nsys profile --stats=true -o vector-add-no-prefetch-report ./vector-add-no-prefetch
示例一
包含
使用跨网格循环来处理比网格更大的数组
CUDA错误处理功能
#include <stdio.h>
#include <assert.h>

/*
 * Forward a CUDA runtime status, printing a readable message and
 * aborting (via assert) when the call failed. Returning the status
 * lets callers wrap API calls inline.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

/* Fill a[0..N) with `num` on the host. */
void initWith(float num, float *a, int N)
{
  for (int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

/*
 * Element-wise vector addition using a grid-stride loop, so any launch
 * configuration can cover an array larger than the grid.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/* Verify on the host that every element equals `target`; exit(1) on the first mismatch. */
void checkElementsAre(float target, float *array, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      exit(1);
    }
  }
  printf("SUCCESS! All values added correctly.\n");
}

int main()
{
  const int N = 2 << 20;  // 2^21 elements
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  // Unified memory: reachable from both host and device; pages migrate on demand.
  checkCuda(cudaMallocManaged(&a, size));
  checkCuda(cudaMallocManaged(&b, size));
  checkCuda(cudaMallocManaged(&c, size));

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  size_t threadsPerBlock = 256;
  size_t numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-division

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  checkCuda(cudaGetLastError());       // catches launch-configuration errors
  checkCuda(cudaDeviceSynchronize());  // catches asynchronous execution errors

  checkElementsAre(7, c, N);

  checkCuda(cudaFree(a));
  checkCuda(cudaFree(b));
  checkCuda(cudaFree(c));
}
示例二
包含
查询设备信息
异步内存预取
cudaMemPrefetchAsync(pointerToSomeUMData, size, deviceId);
cudaMemPrefetchAsync(pointerToSomeUMData, size, cudaCpuDeviceId);
将内存预取回CPU
#include <stdio.h>
#include <assert.h>

/*
 * Abort with a readable message when a CUDA runtime call fails.
 * (Added for consistency with Example 1 — the original left every
 * CUDA API call unchecked, so an early failure would surface as a
 * mysterious error much later.)
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

/* Fill a[0..N) with `num` on the host (CPU). */
void initWith(float num, float *a, int N)
{
  for (int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

/*
 * Element-wise vector addition with a grid-stride loop: correct for any
 * grid size, even when the array is larger than the grid.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/* Verify on the host that every element equals `target`; exit(1) on the first mismatch. */
void checkElementsAre(float target, float *vector, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  // Query the device so the grid can be sized from the SM count.
  int deviceId;
  int numberOfSMs;
  checkCuda(cudaGetDevice(&deviceId));
  checkCuda(cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
  printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

  const int N = 2 << 24;  // 2^25 elements
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  checkCuda(cudaMallocManaged(&a, size));
  checkCuda(cudaMallocManaged(&b, size));
  checkCuda(cudaMallocManaged(&c, size));

  /*
   * Prefetch to the CPU before host-side initialization so initWith
   * does not trigger CPU page faults.
   */
  checkCuda(cudaMemPrefetchAsync(a, size, cudaCpuDeviceId));
  checkCuda(cudaMemPrefetchAsync(b, size, cudaCpuDeviceId));
  checkCuda(cudaMemPrefetchAsync(c, size, cudaCpuDeviceId));

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  // Prefetch to the GPU so the kernel does not fault pages over on demand.
  checkCuda(cudaMemPrefetchAsync(a, size, deviceId));
  checkCuda(cudaMemPrefetchAsync(b, size, deviceId));
  checkCuda(cudaMemPrefetchAsync(c, size, deviceId));

  size_t threadsPerBlock = 256;
  size_t numberOfBlocks = 32 * numberOfSMs;  // several blocks per SM keeps the GPU saturated

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  checkCuda(cudaGetLastError());       // launch-configuration errors
  checkCuda(cudaDeviceSynchronize());  // asynchronous execution errors

  // Prefetch the result back so host-side verification does not page fault.
  checkCuda(cudaMemPrefetchAsync(c, size, cudaCpuDeviceId));

  checkElementsAre(7, c, N);

  checkCuda(cudaFree(a));
  checkCuda(cudaFree(b));
  checkCuda(cudaFree(c));
}
示例三
包含CUDA并发流
cudaStream_t stream; // CUDA流的类型为 cudaStream_t
cudaStreamCreate(&stream); // 注意,必须将一个指针传递给 cudaStreamCreate
someKernel<<<number_of_blocks, threads_per_block, 0, stream>>>(); // stream 作为第4个执行配置(execution configuration)参数传递
cudaStreamDestroy(stream); // 注意,将值(而不是指针)传递给 cudaStreamDestroy
流用于并行进行数据初始化的核函数
#include <stdio.h>
#include <assert.h>

/* Abort with a readable message when a CUDA runtime call fails. */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

/* Fill a[0..N) with `num` on the device, using a grid-stride loop. */
__global__ void initWith(float num, float *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}

/* Element-wise vector addition with a grid-stride loop. */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/* Verify on the host that every element equals `target`; exit(1) on the first mismatch. */
void checkElementsAre(float target, float *vector, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  int deviceId;
  int numberOfSMs;
  checkCuda(cudaGetDevice(&deviceId));
  checkCuda(cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));

  const int N = 2 << 24;  // 2^25 elements
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  checkCuda(cudaMallocManaged(&a, size));
  checkCuda(cudaMallocManaged(&b, size));
  checkCuda(cudaMallocManaged(&c, size));

  checkCuda(cudaMemPrefetchAsync(a, size, deviceId));
  checkCuda(cudaMemPrefetchAsync(b, size, deviceId));
  checkCuda(cudaMemPrefetchAsync(c, size, deviceId));

  size_t threadsPerBlock = 256;
  size_t numberOfBlocks = 32 * numberOfSMs;

  /* Create 3 streams so the 3 data vectors can be initialized in parallel. */
  cudaStream_t stream1, stream2, stream3;
  checkCuda(cudaStreamCreate(&stream1));
  checkCuda(cudaStreamCreate(&stream2));
  checkCuda(cudaStreamCreate(&stream3));

  /* Give each `initWith` launch its own non-default stream. */
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);
  checkCuda(cudaGetLastError());

  /*
   * BUG FIX: non-default streams never synchronize with each other. The
   * addVectorsInto launch below goes to the default stream, which waits
   * for the init streams only under LEGACY default-stream semantics; if
   * compiled with --default-stream per-thread it would race with the
   * initializers. Synchronize explicitly so correctness does not depend
   * on a compiler flag.
   */
  checkCuda(cudaStreamSynchronize(stream1));
  checkCuda(cudaStreamSynchronize(stream2));
  checkCuda(cudaStreamSynchronize(stream3));

  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  checkCuda(cudaGetLastError());       // launch-configuration errors
  checkCuda(cudaDeviceSynchronize());  // asynchronous execution errors

  checkCuda(cudaMemPrefetchAsync(c, size, cudaCpuDeviceId));

  checkElementsAre(7, c, N);

  /* Destroy streams when they are no longer needed. */
  checkCuda(cudaStreamDestroy(stream1));
  checkCuda(cudaStreamDestroy(stream2));
  checkCuda(cudaStreamDestroy(stream3));

  checkCuda(cudaFree(a));
  checkCuda(cudaFree(b));
  checkCuda(cudaFree(c));
}
示例四
手动内存管理CUDA API 调用的代码。
手动分配主机和设备内存
使用流实现数据传输和代码的重叠执行
核函数和内存复制回主机重叠执行
int *host_a, *device_a; // Define host-specific and device-specific arrays.
cudaMalloc(&device_a, size);   // device_a is immediately available on the GPU.
cudaMallocHost(&host_a, size); // host_a is immediately available on CPU, and is page-locked, or pinned.
initializeOnHost(host_a, N); // No CPU page faulting since memory is already allocated on the host.
// cudaMemcpy takes the destination, source, size, and a CUDA-provided variable for the direction of the copy.
cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice);
kernel<<<blocks, threads, 0, someStream>>>(device_a, N);
// cudaMemcpy can also copy data from device to host.
cudaMemcpy(host_a, device_a, size, cudaMemcpyDeviceToHost);
verifyOnHost(host_a, N);
cudaFree(device_a);
cudaFreeHost(host_a); // Free pinned memory like this.
#include <stdio.h>
#include <assert.h>

/* Abort with a readable message when a CUDA runtime call fails. */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

/* Fill a[0..N) with `num` on the device, using a grid-stride loop. */
__global__ void initWith(float num, float *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}

/* Element-wise vector addition with a grid-stride loop. */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/* Verify on the host that every element equals `target`; exit(1) on the first mismatch. */
void checkElementsAre(float target, float *vector, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  int deviceId;
  int numberOfSMs;
  checkCuda(cudaGetDevice(&deviceId));
  checkCuda(cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));

  const int N = 2 << 24;  // 2^25 elements (divisible by the chunk count below)
  size_t size = N * sizeof(float);

  // Manual memory management: device buffers plus a pinned host buffer.
  float *a;
  float *b;
  float *c;
  float *h_c;

  checkCuda(cudaMalloc(&a, size));
  checkCuda(cudaMalloc(&b, size));
  checkCuda(cudaMalloc(&c, size));
  checkCuda(cudaMallocHost(&h_c, size));  // pinned (page-locked): required for true async D2H copies

  size_t threadsPerBlock = 256;
  size_t numberOfBlocks = 32 * numberOfSMs;

  /* Create 3 streams so the 3 data vectors can be initialized in parallel. */
  cudaStream_t stream1, stream2, stream3;
  checkCuda(cudaStreamCreate(&stream1));
  checkCuda(cudaStreamCreate(&stream2));
  checkCuda(cudaStreamCreate(&stream3));

  /* Give each `initWith` launch its own non-default stream. */
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
  initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);
  checkCuda(cudaGetLastError());

  /*
   * BUG FIX: the per-chunk streams created below have NO dependency on
   * stream1-3 — non-default streams never synchronize with each other —
   * so addVectorsInto could read a/b before initWith finished writing
   * them. Wait for all initialization before starting the chunked
   * compute/copy overlap.
   */
  checkCuda(cudaDeviceSynchronize());

  /*
   * Split the work into 4 chunks, each in its own stream: the D2H copy
   * of a finished chunk overlaps with the kernels of later chunks.
   */
  const int chunks = 4;
  for (int i = 0; i < chunks; ++i)
  {
    cudaStream_t stream;
    checkCuda(cudaStreamCreate(&stream));

    addVectorsInto<<<numberOfBlocks / chunks, threadsPerBlock, 0, stream>>>(
        &c[i * N / chunks], &a[i * N / chunks], &b[i * N / chunks], N / chunks);
    checkCuda(cudaMemcpyAsync(&h_c[i * N / chunks], &c[i * N / chunks],
                              size / chunks, cudaMemcpyDeviceToHost, stream));

    // Safe even with work still queued: stream destruction is deferred
    // until the stream's pending work has completed.
    checkCuda(cudaStreamDestroy(stream));
  }

  checkCuda(cudaGetLastError());       // launch-configuration errors
  checkCuda(cudaDeviceSynchronize());  // wait for every chunk's kernel and copy

  checkElementsAre(7, h_c, N);

  /* Destroy streams when they are no longer needed. */
  checkCuda(cudaStreamDestroy(stream1));
  checkCuda(cudaStreamDestroy(stream2));
  checkCuda(cudaStreamDestroy(stream3));

  checkCuda(cudaFree(a));
  checkCuda(cudaFree(b));
  checkCuda(cudaFree(c));
  checkCuda(cudaFreeHost(h_c));  // pinned memory must be freed with cudaFreeHost
}
练习作业
https://yangwc.com/2019/06/20/NbodySimulation/
CUDA加速计算的基础C/C++相关推荐
- CUDA加速计算矩阵乘法进阶玩法(共享内存)
CUDA加速计算矩阵乘法&进阶玩法~共享内存 一.基础版矩阵乘法 二.为什么可以利用共享内存加速矩阵乘法 1.CUDA内存读写速度比较 2.申请共享内存 三.改进版矩阵乘法(利用共享内存) 一 ...
- NMS算法的GPU实现(使用CUDA加速计算)
版权声明:本文为博主原创文章,遵循 CC 4.0 by-sa 版权协议,转载请附上原文出处链接和本声明. 本文链接: https://blog.csdn.net/qq_21368481/article ...
- 一个基于pycuda的矩阵乘法实现,它使用CUDA核心来加速计算。
一个基于pycuda的矩阵乘法实现,它使用CUDA核心来加速计算.该实现使用了共享内存和线程块,以最大化GPU的性能. ```python import pycuda.driver as cuda i ...
- mtensor一个tensor计算库,支持cuda延迟计算
1 mtensor mtensor是一个tensor计算库, 支持cuda的延迟计算, 项目地址为https://github.com/matazure/mtensor. 2 背景 延迟计算具有避免额 ...
- linux运行并行计算cuda,并行化计算与CUDA编程
原标题:并行化计算与CUDA编程 近年来,显卡的更新换代也比较快,NVIDIA今年的发布会也宣布了RTX3080TI即将到来.显卡的运算能力也越来越强. 很多人对显卡的了解可能源于游戏,但是你的显卡不 ...
- MATLAB上的GPU加速计算
概述 怎样在MATLAB上做GPU计算呢?主要分为三个步骤:数据的初始化.对GPU数据进行操作.把GPU上的数据回传给CPU 一.数据的初始化 首先要进行数据的初始化.有两种方法可以进行初始化:一是先 ...
- gpu处理信号_GPU显卡不仅用来打游戏那么简单,它还可以用于通用加速计算
如今,显卡不仅在工作站.个人PC中变得非常重要,而且在数据中心也处于举足轻重的地位.CPU负责通用计算.GPU负责加速计算已经成为绝大数数据中心一种常态.用于加速计算的GPU专用处理器,它将计算密集型 ...
- MATLAB上的GPU加速计算——学习笔记
MATLAB目前只支持Nvidia的显卡.如果你的显卡是AMD的或者是Intel的,就得考虑另寻它路了. MATLAB可谓工程计算中的神器,一方面它自带丰富的函数库,另一方面它所有的数据都是内建的矩阵 ...
- 写一个python函数 用cuda加速
好的,下面是一个用 CUDA 加速的 Python 函数的示例: import numpy as np import cupy as cpdef accelerate_with_cuda(x, y): ...
最新文章
- PHPCMS V9 杂记一
- 经典游戏大合集,你的青春也许就在里面!
- 利用python进行数据分析 英文-如何学习和评价《利用python进行数据分析》这本书?...
- 使用Navicat for Oracle新建用户无法登陆(用户名大小写问题)
- C/C++内存泄露,如何解决内存泄露?
- tensorflow教程 学习笔记 之 Eager execution 急切执行
- 自动开料机器人_工业4.0智能家具无人化生产线 自动化无人化大势所趋
- Win10 OpenCV3.3.0+VS2013配置大坑,OpenCV解决方案编译报错“找不到python36_d.lib”错误...
- matplotlib之legend图例和标注(笔记三)
- 云存储技术与云存储服务
- for linux pdf转mobi_linux PDF转换为SWF
- 视频教程-华为HCIA网络基础-网络技术
- 短视频源码开启“短视频+”
- 时间管理 android app推荐,干货星球 篇十三:【强烈安利】分享10个时间管理APP,每一个都堪称精品!...
- js pug 代码_PUG 系列 | 第二课 JS 代码混合、包含引入
- mysql e 变量_MySQL变量分类及用法简析
- Java类加载的加载,验证,准备,解析,初始化小结
- [ACM]辽宁省赛2010 (HZNU 1081-1089)
- 【微信小程序/云开发bug解决方案合集】持续更新中(最新22-11-21)
- linux cli运行脚本,AWS学习笔记(四)--CLI创建EC2时执行脚本