3.1. Compilation with NVCC

nvcc is essentially a simplified C++ compiler driver (which explains why function pointers can get lost when compiling C dynamic libraries).

3.2. CUDA Runtime

3.2.1. Initialization

There is no explicit function to initialize the GPU device; the runtime initializes automatically on the first runtime API call.
cudaDeviceReset() destroys the current device's primary context.
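
A minimal sketch of this lazy-initialization behavior (the cudaFree(0) idiom here is my own illustration, not part of the original notes):

// Hypothetical sketch: the runtime initializes lazily on the first API call.
cudaSetDevice(0);   // select a device; its primary context is created lazily
cudaFree(0);        // common idiom: a harmless call that forces initialization now
// ... kernel launches and memory operations reuse the initialized context ...
cudaDeviceReset();  // destroys the primary context; the next call re-initializes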

3.2.2. Device Memory

Device memory can be allocated either as linear memory or as CUDA arrays.
cudaMalloc() allocates linear memory, cudaFree() frees it, and cudaMemcpy() transfers data between host and device memory.

// Device code
__global__ void VecAdd(float* A, float* B, float* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// Host code
int main()
{
    int N = ...;
    size_t size = N * sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    // Allocate output vector h_C in host memory (missing in the original listing)
    float* h_C = (float*)malloc(size);

    // Initialize input vectors
    ...

    // Allocate vectors in device memory
    float* d_A;
    cudaMalloc(&d_A, size);
    float* d_B;
    cudaMalloc(&d_B, size);
    float* d_C;
    cudaMalloc(&d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory
    ...
}

Use cudaMallocPitch() and cudaMemcpy2D() to allocate and copy 2D arrays.
Use cudaMalloc3D() to allocate 3D arrays.

The following code allocates a width x height 2D array of floats:

// Host code
int width = 64, height = 64;
float* devPtr;
size_t pitch; // pitch: the allocated width of each row, in bytes
cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
MyKernel<<<100, 512>>>(devPtr, pitch, width, height);

// Device code
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height)
{
    for (int r = 0; r < height; ++r) {
        float* row = (float*)((char*)devPtr + r * pitch);
        for (int c = 0; c < width; ++c) {
            float element = row[c];
        }
    }
}

The following code allocates a width x height x depth 3D array of floats:

// Host code
int width = 64, height = 64, depth = 64;
cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);
cudaPitchedPtr devPitchedPtr;
cudaMalloc3D(&devPitchedPtr, extent);
MyKernel<<<100, 512>>>(devPitchedPtr, width, height, depth);

// Device code
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, int width, int height, int depth)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;
    size_t slicePitch = pitch * height;
    for (int z = 0; z < depth; ++z) {
        char* slice = devPtr + z * slicePitch;
        for (int y = 0; y < height; ++y) {
            float* row = (float*)(slice + y * pitch);
            for (int x = 0; x < width; ++x) {
                float element = row[x];
            }
        }
    }
}

The following code shows other runtime APIs for accessing variables declared in global memory space:

__constant__ float constData[256];
float data[256];
cudaMemcpyToSymbol(constData, data, sizeof(data));
cudaMemcpyFromSymbol(data, constData, sizeof(data));

__device__ float devData;
float value = 3.14f;
cudaMemcpyToSymbol(devData, &value, sizeof(float));

__device__ float* devPointer;
float* ptr;
cudaMalloc(&ptr, 256 * sizeof(float));
cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));

cudaGetSymbolAddress() retrieves the address of the memory allocated for a variable declared in global memory space.
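
A minimal sketch of how cudaGetSymbolAddress() can be used (the surrounding variable names, and the companion call cudaGetSymbolSize(), are my illustration rather than part of the original notes):

// Hypothetical sketch: obtain the device address of a __device__ variable
// so it can be passed to APIs that take plain pointers.
__device__ float devData;

float* devDataPtr;
size_t devDataSize;
cudaGetSymbolAddress((void**)&devDataPtr, devData); // device address of devData
cudaGetSymbolSize(&devDataSize, devData);           // sizeof(float) here
cudaMemset(devDataPtr, 0, devDataSize);             // regular pointer APIs now work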

3.2.3. Shared Memory

__shared__ memory is faster than global memory and is shared within a thread block.
The following code computes matrix multiplication without using shared memory: each thread reads one row of A and one column of B.
As a result, A is read B.width times from global memory, and B is read A.height times.

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
typedef struct {
    int width;
    int height;
    float* elements;
} Matrix;

// Thread block size
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Each thread computes one element of C
    // by accumulating results into Cvalue
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e]
                * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}

Figure 9. Matrix multiplication without shared memory

The following code uses shared memory.
A is read only B.width / BLOCK_SIZE times from global memory, and B only A.height / BLOCK_SIZE times.
__device__ helper functions are used to get and set elements and to build a sub-matrix from a matrix.

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;

// Thread block size (moved before GetSubMatrix so the macro is defined at use)
#define BLOCK_SIZE 16

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width    = BLOCK_SIZE;
    Asub.height   = BLOCK_SIZE;
    Asub.stride   = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                + BLOCK_SIZE * col];
    return Asub;
}

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;

    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();

        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}

Figure 10. Matrix multiplication with shared memory

3.2.4. Page-Locked Host Memory

  1. Page-locked (also known as pinned) host memory, as opposed to regular pageable host memory allocated by malloc().
  2. cudaHostAlloc() and cudaFreeHost() allocate and free page-locked host memory; cudaHostRegister() page-locks a range already allocated by malloc().
  3. Advantages of page-locked memory (see the sketch after this list):
    1) Copies between page-locked host memory and device memory can be performed concurrently with kernel execution.
    2) Page-locked host memory can be mapped into the device address space.
    3) Bandwidth between host and device memory is higher.
  4. Page-locked memory is a scarce resource: pages are pinned at fixed physical addresses, so excessive use can degrade overall system performance.
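
A minimal sketch of the two ways to obtain pinned memory (buffer names and sizes are illustrative assumptions):

// Hypothetical sketch: allocate pinned memory, or pin an existing allocation.
float* pinned;
cudaHostAlloc((void**)&pinned, 1024 * sizeof(float), cudaHostAllocDefault); // allocate pinned
cudaFreeHost(pinned);                                                       // free pinned

float* pageable = (float*)malloc(1024 * sizeof(float));
cudaHostRegister(pageable, 1024 * sizeof(float), cudaHostRegisterDefault);  // pin in place
cudaHostUnregister(pageable);                                               // unpin
free(pageable);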

3.2.4.1. Portable Memory

3.2.4.2. Write-Combining Memory

3.2.4.3. Mapped Memory

Pass the cudaHostAllocMapped flag to cudaHostAlloc() to allocate page-locked memory that is also mapped into the device address space (see the sketch at the end of this subsection).

Advantages:
1) There is no need to explicitly copy data between device and host; transfers are performed implicitly as the kernel accesses the memory.
2) There is no need to use streams to overlap data transfers with kernel execution; kernel-originated accesses overlap with execution automatically.

Since page-locked memory is shared between host and device, the application must synchronize accesses with streams or events to avoid read-after-write, write-after-read, or write-after-write hazards.
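
A minimal sketch of using mapped memory (the kernel name and buffer size are illustrative assumptions):

// Hypothetical sketch: map a pinned host buffer into the device address space.
cudaSetDeviceFlags(cudaDeviceMapHost);       // must be set before other CUDA work
float* hostPtr;
cudaHostAlloc((void**)&hostPtr, 1024 * sizeof(float), cudaHostAllocMapped);
float* devPtr;
cudaHostGetDevicePointer((void**)&devPtr, hostPtr, 0); // device-side alias of hostPtr
MyKernel<<<4, 256>>>(devPtr);                // kernel reads/writes host memory directly
cudaDeviceSynchronize();                     // synchronize before the host touches hostPtr
cudaFreeHost(hostPtr);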

3.2.5. Asynchronous Concurrent Execution

The following operations can execute concurrently with one another:
Computation on the host;
Computation on the device;
Memory transfers from the host to the device;
Memory transfers from the device to the host;
Memory transfers within the memory of a given device;
Memory transfers among devices.

3.2.5.1. Concurrent Execution between Host and Device

Concurrent host execution is facilitated through asynchronous library functions that return control to the host thread before the device has completed the requested task.
Setting the CUDA_LAUNCH_BLOCKING environment variable to 1 makes these launches synchronous (useful for debugging).
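
A minimal sketch of what this asynchrony means for host code (the kernel and helper names are illustrative assumptions):

// Hypothetical sketch: a kernel launch is asynchronous with respect to the host.
MyKernel<<<100, 512>>>(devPtr);   // returns to the host immediately
doIndependentHostWork();          // hypothetical helper; overlaps with device execution
cudaDeviceSynchronize();          // block the host until the kernel has finished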

3.2.5.2. Concurrent Kernel Execution

3.2.5.3. Overlap of Data Transfer and Kernel Execution

3.2.5.4. Concurrent Data Transfers

3.2.5.5. Streams

Applications manage concurrent operations through streams.
A stream is a sequence of commands (possibly issued by different host threads) that execute in order.

3.2.5.5.1. Creation and Destruction

The following code creates two streams and allocates page-locked memory:

cudaStream_t stream[2];
for (int i = 0; i < 2; ++i)
    cudaStreamCreate(&stream[i]);
float* hostPtr;
cudaMallocHost(&hostPtr, 2 * size);

The following code issues work to the two streams: each stream copies its input from host to device, runs the kernel, and copies the result back to the host:

for (int i = 0; i < 2; ++i) {
    cudaMemcpyAsync(inputDevPtr + i * size, hostPtr + i * size,
                    size, cudaMemcpyHostToDevice, stream[i]);
    MyKernel<<<100, 512, 0, stream[i]>>>
        (outputDevPtr + i * size, inputDevPtr + i * size, size);
    cudaMemcpyAsync(hostPtr + i * size, outputDevPtr + i * size,
                    size, cudaMemcpyDeviceToHost, stream[i]);
}

Use cudaStreamDestroy() to destroy a stream:

for (int i = 0; i < 2; ++i)
    cudaStreamDestroy(stream[i]);

If the device is still doing work in the stream when cudaStreamDestroy() is called, the call returns immediately, and the stream's resources are released automatically once the device has completed all work in it.

3.2.5.5.2. Default Stream

3.2.5.5.3. Explicit Synchronization

cudaDeviceSynchronize() waits until all preceding commands in all streams of all host threads have completed.
cudaStreamSynchronize() takes a stream as parameter and waits until all preceding commands in that stream have completed.
cudaStreamWaitEvent() takes a stream and an event as parameters and makes all commands added to the stream afterwards wait until the event has completed.
cudaStreamQuery() checks whether all preceding commands in a stream have completed. A sketch combining these follows.
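
A minimal sketch of these primitives working together (ProducerKernel, ConsumerKernel, and buf are hypothetical names):

// Hypothetical sketch: make stream[1] wait for a point recorded in stream[0].
cudaEvent_t ev;
cudaEventCreate(&ev);
ProducerKernel<<<100, 512, 0, stream[0]>>>(buf);
cudaEventRecord(ev, stream[0]);                 // mark a point in stream[0]
cudaStreamWaitEvent(stream[1], ev, 0);          // stream[1] waits for that point
ConsumerKernel<<<100, 512, 0, stream[1]>>>(buf);
if (cudaStreamQuery(stream[1]) == cudaSuccess)  // non-blocking completion check
    printf("stream[1] is already done\n");
cudaStreamSynchronize(stream[1]);               // block until stream[1] finishes
cudaEventDestroy(ev);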

3.2.5.5.4. Implicit Synchronization

Two commands from different streams cannot run concurrently if any of the following operations is issued in between them by the host thread:
a page-locked host memory allocation,
a device memory allocation,
a device memory set,
a memory copy between two addresses to the same device memory,
any CUDA command to the NULL stream,
a switch between the L1/shared memory configurations described in Compute Capability 3.x and Compute Capability 7.x.

3.2.5.5.5. Overlapping Behavior

// Issue all host-to-device copies first, then all kernels, then all
// device-to-host copies, so operations from the two streams can overlap.
for (int i = 0; i < 2; ++i)
    cudaMemcpyAsync(inputDevPtr + i * size, hostPtr + i * size,
                    size, cudaMemcpyHostToDevice, stream[i]);
for (int i = 0; i < 2; ++i)
    MyKernel<<<100, 512, 0, stream[i]>>>
        (outputDevPtr + i * size, inputDevPtr + i * size, size);
for (int i = 0; i < 2; ++i)
    cudaMemcpyAsync(hostPtr + i * size, outputDevPtr + i * size,
                    size, cudaMemcpyDeviceToHost, stream[i]);

3.2.5.5.6. Host Functions (Callbacks)

cudaLaunchHostFunc() enqueues a host function into a stream; it executes after all preceding commands in the stream have completed.

// Note: cudaLaunchHostFunc() takes a cudaHostFn_t, i.e. void (*)(void*);
// the original listing used the older three-argument callback signature.
void CUDART_CB MyCallback(void *data)
{
    printf("Inside callback %zu\n", (size_t)data);
}
...
for (size_t i = 0; i < 2; ++i) {
    cudaMemcpyAsync(devPtrIn[i], hostPtr[i], size, cudaMemcpyHostToDevice, stream[i]);
    MyKernel<<<100, 512, 0, stream[i]>>>(devPtrOut[i], devPtrIn[i], size);
    cudaMemcpyAsync(hostPtr[i], devPtrOut[i], size, cudaMemcpyDeviceToHost, stream[i]);
    cudaLaunchHostFunc(stream[i], MyCallback, (void*)i);
}

3.2.5.5.7. Stream Priorities

cudaStreamCreateWithPriority() creates a stream with a given priority.
cudaDeviceGetStreamPriorityRange() returns the range of priorities supported by the current device.
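
A minimal sketch of creating high- and low-priority streams (the stream names are illustrative):

// Hypothetical sketch: query the valid priority range, then create streams.
int leastPriority, greatestPriority; // note: a lower number means higher priority
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
cudaStream_t highPrio, lowPrio;
cudaStreamCreateWithPriority(&highPrio, cudaStreamNonBlocking, greatestPriority);
cudaStreamCreateWithPriority(&lowPrio, cudaStreamNonBlocking, leastPriority);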

3.2.5.6. Graphs

3.2.5.7. Events

3.2.5.7.1. Creation and Destruction

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

3.2.5.7.2. Elapsed Time

Timing with events:

cudaEventRecord(start, 0);
for (int i = 0; i < 2; ++i) {
    cudaMemcpyAsync(inputDev + i * size, inputHost + i * size,
                    size, cudaMemcpyHostToDevice, stream[i]);
    MyKernel<<<100, 512, 0, stream[i]>>>
        (outputDev + i * size, inputDev + i * size, size);
    cudaMemcpyAsync(outputHost + i * size, outputDev + i * size,
                    size, cudaMemcpyDeviceToHost, stream[i]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);

3.2.6. Multi-Device System

3.2.6.1. Device Enumeration

A machine can have multiple GPUs; the following code enumerates them and queries their properties:

int deviceCount;
cudaGetDeviceCount(&deviceCount);
int device;
for (device = 0; device < deviceCount; ++device) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);
    printf("Device %d has compute capability %d.%d.\n",
           device, deviceProp.major, deviceProp.minor);
}

3.2.6.2. Device Selection

A host thread can call cudaSetDevice() at any time to select the current device. Device 0 is the default:

size_t size = 1024 * sizeof(float);
cudaSetDevice(0);            // Set device 0 as current
float* p0;
cudaMalloc(&p0, size);       // Allocate memory on device 0
MyKernel<<<1000, 128>>>(p0); // Launch kernel on device 0
cudaSetDevice(1);            // Set device 1 as current
float* p1;
cudaMalloc(&p1, size);       // Allocate memory on device 1
MyKernel<<<1000, 128>>>(p1); // Launch kernel on device 1

3.2.6.3. Stream and Event Behavior

A kernel launch fails if it is issued to a stream that is not associated with the current device:

cudaSetDevice(0);               // Set device 0 as current
cudaStream_t s0;
cudaStreamCreate(&s0);          // Create stream s0 on device 0
MyKernel<<<100, 64, 0, s0>>>(); // Launch kernel on device 0 in s0
cudaSetDevice(1);               // Set device 1 as current
cudaStream_t s1;
cudaStreamCreate(&s1);          // Create stream s1 on device 1
MyKernel<<<100, 64, 0, s1>>>(); // Launch kernel on device 1 in s1

// This kernel launch will fail:
MyKernel<<<100, 64, 0, s0>>>(); // Launch kernel on device 1 in s0

cudaEventRecord() fails if the event and the input stream are associated with different devices.
cudaEventElapsedTime() fails if the two events are associated with different devices.
cudaEventSynchronize() and cudaEventQuery() succeed even if the event is associated with a device other than the current one.
cudaStreamWaitEvent() succeeds even if the stream and the event are associated with different devices, so it can be used to synchronize devices with each other.

3.2.6.4. Peer-to-Peer Memory Access

If cudaDeviceCanAccessPeer() returns true, peer-to-peer memory access is supported: one GPU can directly address another GPU's memory.
Enable it with cudaDeviceEnablePeerAccess():

cudaSetDevice(0);                   // Set device 0 as current
float* p0;
size_t size = 1024 * sizeof(float);
cudaMalloc(&p0, size);              // Allocate memory on device 0
MyKernel<<<1000, 128>>>(p0);        // Launch kernel on device 0
cudaSetDevice(1);                   // Set device 1 as current
cudaDeviceEnablePeerAccess(0, 0);   // Enable peer-to-peer access
                                    // with device 0

// Launch kernel on device 1
// This kernel launch can access memory on device 0 at address p0
MyKernel<<<1000, 128>>>(p0);

3.2.6.5. Peer-to-Peer Memory Copy

Use cudaMemcpyPeer(), cudaMemcpyPeerAsync(), cudaMemcpy3DPeer(), or cudaMemcpy3DPeerAsync() to perform peer-to-peer copies:

cudaSetDevice(0);                   // Set device 0 as current
float* p0;
size_t size = 1024 * sizeof(float);
cudaMalloc(&p0, size);              // Allocate memory on device 0
cudaSetDevice(1);                   // Set device 1 as current
float* p1;
cudaMalloc(&p1, size);              // Allocate memory on device 1
cudaSetDevice(0);                   // Set device 0 as current
MyKernel<<<1000, 128>>>(p0);        // Launch kernel on device 0
cudaSetDevice(1);                   // Set device 1 as current
cudaMemcpyPeer(p1, 1, p0, 0, size); // Copy p0 to p1
MyKernel<<<1000, 128>>>(p1);        // Launch kernel on device 1

3.2.7. Unified Virtual Address Space

cudaPointerGetAttributes() determines which memory space a pointer refers to.
The cudaMemcpyKind parameter of the cudaMemcpy*() functions can be set to cudaMemcpyDefault; the copy direction is then inferred from the pointers.
Host memory allocated with cudaHostAlloc() is automatically portable across devices under the unified address space. A sketch follows.
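
A minimal sketch of classifying a pointer (the attr.type field name follows the CUDA 10+ runtime; ptr is an illustrative assumption):

// Hypothetical sketch: classify a pointer under unified virtual addressing.
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, ptr);
if (attr.type == cudaMemoryTypeDevice)
    printf("ptr is device memory on device %d\n", attr.device);
else if (attr.type == cudaMemoryTypeHost)
    printf("ptr is host memory\n");
// With UVA, cudaMemcpy(dst, src, size, cudaMemcpyDefault) infers the direction.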

3.2.8. Interprocess Communication

3.2.9. Error Checking

3.2.10. Call Stack

3.2.11. Texture and Surface Memory

Using the memory type suited to an access pattern can improve performance.
Texture memory is cached, on-chip, read-only memory designed for graphics applications whose memory accesses exhibit significant spatial locality.
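
A minimal sketch of reading linear memory through the texture object API (the kernel name and buffer size are illustrative assumptions):

// Device code: read through the texture object (and its cache)
__global__ void ReadTex(cudaTextureObject_t tex, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = tex1Dfetch<float>(tex, i);
}

// Host code: describe the resource and create the texture object
int n = 256;
float* devBuf;
cudaMalloc(&devBuf, n * sizeof(float));

cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = devBuf;
resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
resDesc.res.linear.sizeInBytes = n * sizeof(float);

cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;

cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
// ... launch ReadTex, then clean up ...
cudaDestroyTextureObject(tex);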

References:
https://blog.csdn.net/qq_24990189/article/details/89606221
https://blog.csdn.net/venom_snake/article/details/83857537

3.2.12. Graphics Interoperability

3.3. External Resource Interoperability

3.4. Versioning and Compatibility

3.5. Compute Modes

3.6. Mode Switches

3.7. Tesla Compute Cluster Mode for Windows
