CUDA C编程权威指南 第二章 CUDA编程模型
CUDA6.0开始 有"统一寻址"(Unified Memory)编程模型,可以用单个指针访问CPU和GPU内存,无须手动拷贝
主机启动内核后,管理权立刻返回给主机(类似启动线程后,不join)
C函数 | CUDA C函数 |
---|---|
malloc | cudaMalloc |
memcpy | cudaMemcpy |
memset | cudaMemset |
free | cudaFree |
cudaError_t cudaMalloc(void** devPtr, size_t size)
分配线性内存,devPtr是内存指针
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count,cudaMemcpyKind kind)
主机和设备之间的数据传输,从src向dst复制字节,复制方向由kind指定
kind:
cudaMemcpyHostToHost
cudaMemcpyHostToDevice
cudaMemcpyDeviceToHost
cudaMemcpyDeviceToDevice
cudaMemcpy以同步方式执行:在cudaMemcpy函数返回以及传输操作完成之前,主机是阻塞的
返回错误枚举类型cudaError_t,成功返回cudaSuccess,失败返回cudaErrorMemoryAllocation
const char* cudaGetErrorString(cudaError_t error)
获得报错信息,和C语言中的strerror函数类似
内存层次结构
#include <stdlib.h>
#include <time.h>

// Host reference: element-wise vector addition, C[i] = A[i] + B[i] for i in [0, N).
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int i = 0; i < N; i++)
    {
        C[i] = A[i] + B[i];
    }
}

// Fill an array with pseudo-random floats in [0.0, 25.5].
// rand() & 0xFF keeps only the low 8 bits of rand()'s result to bound the value.
// NOTE(review): srand(time(...)) is re-seeded on every call, so two calls made
// within the same second produce identical arrays.
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned) time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

int main(int argc, char **argv)
{
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);

    // Allocate and initialize the host buffers.
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return (0);
}
nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum
-Xcompiler用于指定命令行选项是指向C编译器还是预处理器
-std=c99传递给编译器,指定编译标准
参考: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
线程管理
- 同一grid所有线程共享相同的全局内存空间
- 一个grid由多个block构成,一个线程block包含一组thread
- 同一block内线程:1. 同步,2.共享内存;不同block不能协作
- 索引
- 线程依靠blockIdx(block在grid里的索引)和threadIdx(block内索引),基于坐标将不同的数据分配给不同的线程
- 坐标变量基于uint3定义的CUDA内置向量类型(blockIdx.x,blockIdx.y,blockIdx.z;threadIdx.x,threadIdx.y,threadIdx.z)
- grid和block都可以组织为3维的,一个grid,默认是二维,一个block默认三维
- gridDim线程格维度(每个grid中有多少个block) blockDim线程块的维度(每个block中有多少个线程)是dim3类型的变量
- dim3类型的变量,所有未指定的元素都被初始化为1,可以通过x,y,z获得各个维度的长度(blockDim.x,blockDim.y,blockDim.z)
- host端使用dim3定义grid和block,在device端会以uint3类型显示;即host端的dim3对应了device端的uint3(block对应blockDim,grid对应gridDim)
host和device对应
#include <cuda_runtime.h>
#include <stdio.h>

// Print the built-in index/dimension variables from inside the kernel.
// With a 1-D launch the y/z indices print as 0 and the unspecified
// y/z dimensions print as 1.
__global__ void checkIndex(void)
{
    printf("threadIdx:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z);
    printf("blockIdx:(%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z);
    printf("blockDim:(%d, %d, %d)\n", blockDim.x, blockDim.y, blockDim.z);
    printf("gridDim:(%d, %d, %d)\n", gridDim.x, gridDim.y, gridDim.z);
}

int main(int argc, char **argv)
{
    int nElem = 6;                               // total number of data elements
    dim3 block(3);                               // 3 threads per block
    dim3 grid((nElem + block.x - 1) / block.x);  // ceil(nElem / block.x) = 2 blocks

    // Inspect the launch configuration from the host side.
    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    // Inspect the same configuration from the device side.
    checkIndex<<<grid, block>>>();

    cudaDeviceReset();
    return (0);
}
重置grid和block
#include <cuda_runtime.h>
#include <stdio.h>

// Show how the 1-D grid size adapts to the 1-D block size for a fixed
// element count: grid.x = ceil(nElem / block.x), written with the
// (n + b - 1) / b integer ceiling-division idiom.
// (The original repeated the same three statements four times; the
// repetition is folded into a loop with identical output.)
int main(int argc, char **argv)
{
    int nElem = 1024;  // total number of data elements

    // Start with 1024 threads per block -> a single block.
    dim3 block (1024);
    dim3 grid ((nElem + block.x - 1) / block.x);
    printf("grid.x %d block.x %d \n", grid.x, block.x);

    // Halving the block size doubles the number of blocks.
    const int blockSizes[] = {512, 256, 128};
    for (int i = 0; i < 3; i++)
    {
        block.x = blockSizes[i];
        grid.x = (nElem + block.x - 1) / block.x;
        printf("grid.x %d block.x %d \n", grid.x, block.x);
    }

    cudaDeviceReset();
    return (0);
}
启动核函数
kernel_name <<< grid,block>>>(argument list);
相比c普通函数function_name(argument list)
多了<<< grid,block>>>
- 假设有8个元素,启动2个1维块,每个块有4个thread
kernel_name<<< 2, 4>>>(argument list)
3. kernel函数调用和host是异步的,调用后,控制权立刻返回给host,可以使用cudaError_t cudaDeviceSynchronize(void)
来同步
4. 有些CUDA运行时函数是同步的,如cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
,拷贝完成后,控制权才返回主机
编写核函数
限定符 | 调用位置 | 执行位置 | 备注 |
---|---|---|---|
global | host | device | 必须有void返回类型 |
device | device | device | |
host | host | host | 可以省略 |
__device__
和__host__
可以一起使用,这样函数同时在主机和设备端进行编译- 核函数限制
1)只能访问设备内存
2)必须有void返回类型
3)不支持可变数量的参数
4)不支持静态变量
5)基本都是异步 - cpu:
// Host reference: element-wise sum, C[i] = A[i] + B[i] for 0 <= i < N.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int idx = 0;
    while (idx < N)
    {
        C[idx] = A[idx] + B[idx];
        idx++;
    }
}
gpu:
// GPU version: the host-side loop disappears — each thread handles the one
// element selected by its threadIdx.x (a single-block launch is assumed, so
// there is no blockIdx term and no bounds guard in this introductory form).
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
    int tid = threadIdx.x;
    C[tid] = A[tid] + B[tid];
}
验证核函数
验证结果
// Compare the host-computed result (h_C) with the result copied back from
// the GPU (gpuRef), element by element, within a small absolute tolerance.
// Prints the first mismatch and stops, or "Arrays match." if all agree.
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: integer abs() truncates any fractional difference
        // to 0, which would make every comparison pass silently.
        if (fabs(h_C[i] - gpuRef[i]) > epsilon)
        {
            match = 0;  // clear the flag so "Arrays match." is not printed
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", h_C[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}
处理错误
综合
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>    // malloc/free/rand/srand (was missing; only pulled in transitively before)
#include <string.h>    // memset (was missing)
#include <math.h>      // fabs
#include <time.h>
#include <sys/time.h>

// Compare the host result with the result copied back from the GPU,
// element by element, within a small absolute tolerance.
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: integer abs() truncates fractional differences to 0,
        // which would make every comparison pass silently.
        if (fabs(h_C[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", h_C[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

// Fill an array with pseudo-random floats in [0.0, 25.5].
// NOTE(review): re-seeds srand on every call, so h_A and h_B are usually
// seeded identically when the calls happen within the same second.
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned) time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

// Host reference implementation of the vector sum.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
        C[idx] = A[idx] + B[idx];
}

// One thread per element; the i < N guard keeps threads past the end idle.
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main(int argc, char **argv)
{
    int nElem = 32;

    // Allocate host memory.
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *)malloc(nBytes);     // host input A
    float *h_B = (float *)malloc(nBytes);     // host input B
    float *h_C = (float *)malloc(nBytes);     // host-computed result
    float *gpuRef = (float *)malloc(nBytes);  // device result copied back to host

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(h_C, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // Allocate device memory; cudaMalloc takes the address of the pointer and
    // rebinds it to device memory.
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    // Copy the host inputs (and zeroed output) to the device.
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);

    // One block of nElem threads.
    dim3 block (nElem);
    dim3 grid (1);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

    sumArraysOnHost(h_A, h_B, h_C, nElem);
    checkResult(h_C, gpuRef, nElem);

    // Free device global memory, then host memory.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(gpuRef);

    cudaDeviceReset();
    return (0);
}
计时
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>    // malloc/free/rand/srand (was missing)
#include <string.h>    // memset (was missing)
#include <math.h>      // fabs
#include <time.h>
#include <sys/time.h>

// Compare the host result with the result copied back from the GPU,
// element by element, within a small absolute tolerance.
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: integer abs() truncates fractional differences to 0.
        if (fabs(h_C[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", h_C[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

// Wall-clock time in seconds with microsecond resolution.
double seconds()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);  // the timezone argument is obsolete; pass NULL
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

// Fill an array with pseudo-random floats in [0.0, 25.5].
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned) time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
    return;
}

// Host reference implementation of the vector sum.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

// One thread per element across the whole grid.  The i < N guard is required
// because the global index of the last block can run past the array end.
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);

    // Host buffers.
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    double iStart, iElaps;

    // Time host-side initialization.
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);

    memset(h_C, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // Time the host-side sum.
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, h_C, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // Device buffers.  The (float**) cast is optional; it only makes the
    // pointer-to-pointer argument explicit.
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);

    // Launch configuration: 512 threads per block, grid sized by ceiling division.
    int iLen = 512;
    dim3 block (iLen);
    dim3 grid ((nElem + block.x - 1) / block.x);

    // Time the kernel; cudaDeviceSynchronize is needed because the launch is
    // asynchronous with respect to the host.
    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    cudaDeviceSynchronize();
    iElaps = seconds() - iStart;
    printf("sumArraysOnGPU <<< %d, %d >>> Time elapsed %f sec\n", grid.x,
           block.x, iElaps);

    // Copy the result back and verify against the host reference.
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
    checkResult(h_C, gpuRef, nElem);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(gpuRef);
    return (0);
}
nvprof
CUDA5.0以来,提供给了程序执行的信息
nvprof [nvprof_args] <application> [application_args]
nvprof --help
nvprof ./sumArraysOnGPU-timer
~/code/cuda$ nvprof ./sum
./sum Starting...
Vector size 32
initialData Time elapsed 0.000004 sec
sumArraysOnHost Time elapsed 0.000000 sec
==5121== NVPROF is profiling process 5121, command: ./sum
sumArraysOnGPU <<< 1, 512 >>> Time elapsed 0.000025 sec
Arrays match.==5121== Profiling application: ./sum
==5121== Profiling result:Type Time(%) Time Calls Avg Min Max NameGPU activities: 43.48% 1.9200us 3 640ns 576ns 768ns [CUDA memcpy HtoD]42.03% 1.8560us 1 1.8560us 1.8560us 1.8560us sumArraysOnGPU(float*, float*, float*, int)14.49% 640ns 1 640ns 640ns 640ns [CUDA memcpy DtoH]API calls: 98.84% 143.30ms 3 47.768ms 3.2070us 143.29ms cudaMalloc0.72% 1.0374ms 96 10.806us 519ns 507.17us cuDeviceGetAttribute0.25% 364.98us 1 364.98us 364.98us 364.98us cuDeviceTotalMem0.08% 113.68us 1 113.68us 113.68us 113.68us cuDeviceGetName0.06% 85.675us 3 28.558us 4.0100us 75.346us cudaFree0.03% 41.697us 4 10.424us 4.4080us 24.272us cudaMemcpy0.01% 20.049us 1 20.049us 20.049us 20.049us cudaLaunchKernel0.01% 7.6710us 1 7.6710us 7.6710us 7.6710us cuDeviceGetPCIBusId0.00% 4.1670us 3 1.3890us 516ns 2.7560us cuDeviceGetCount0.00% 3.7380us 1 3.7380us 3.7380us 3.7380us cudaDeviceSynchronize0.00% 3.5770us 2 1.7880us 533ns 3.0440us cuDeviceGet0.00% 1.0310us 1 1.0310us 1.0310us 1.0310us cuDeviceGetUuid
组织并行线程
使用block和thread建立矩阵索引
使用线性存储二维矩阵
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // malloc/free (was missing)

// Print an nx-by-ny matrix stored row-major in a linear buffer.
void printMatrix(int *C, const int nx, const int ny)
{
    int *ic = C;
    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            printf("%3d", ic[ix]);
        }
        ic += nx;  // advance the row pointer by one full row of elements
        printf("\n");
    }
    printf("\n");
    return;
}

// Each thread prints its block-local index, its block index, its global 2-D
// matrix coordinate (ix, iy), and the linear index / value it maps to.
// Launched here with dim3 block(4, 2): x in [0,3], y in [0,1].
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index"
           " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
           ix, iy, idx, A[idx]);
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int nx = 8;                      // matrix width  (elements per row)
    int ny = 6;                      // matrix height (number of rows; the original comment called this "columns")
    int nxy = nx * ny;               // total element count
    int nBytes = nxy * sizeof(int);  // fixed: was sizeof(float), but the data is int (same size only by coincidence)

    // Host matrix, stored row-major in linear memory and filled with 0..nxy-1.
    int *h_A = (int *)malloc(nBytes);
    for (int i = 0; i < nxy; i++)
    {
        h_A[i] = i;
    }
    printMatrix(h_A, nx, ny);

    // Device copy of the matrix.
    int *d_MatA;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);

    // 2-D grid of 2-D blocks: grid (2, 3) of blocks (4, 2) covers 8 x 6 exactly.
    dim3 block(4, 2);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);

    cudaFree(d_MatA);
    free(h_A);
    cudaDeviceReset();
    return (0);
}
二维网格和二维块对矩阵求和
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // malloc/free/rand (was missing)
#include <string.h>  // memset (was missing)
#include <math.h>    // fabs

// Fill an array with pseudo-random floats in [0.0, 25.5].
void initialData(float *ip, const int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

// Host reference: element-wise sum of two nx-by-ny row-major matrices.
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }
        // After each row, advance all three row pointers by one row.
        ia += nx;
        ib += nx;
        ic += nx;
    }
    return;
}

// Compare host and device results element-wise within a small tolerance.
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: integer abs() truncates fractional differences to 0.
        if (fabs(h_C[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("host %f gpu %f\n", h_C[i], gpuRef[i]);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}

// 2-D grid of 2-D blocks: each thread handles matrix element (ix, iy).
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;  // column
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;  // row
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // 16384 x 16384 matrix of floats (~1 GiB per buffer).
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    size_t nBytes = nxy * sizeof(float);  // size_t: an int byte count would overflow for larger matrices

    // Host buffers.
    float *h_A, *h_B, *h_C, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    h_C = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    initialData(h_A, nxy);
    initialData(h_B, nxy);
    memset(h_C, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    sumMatrixOnHost(h_A, h_B, h_C, nx, ny);

    // Device buffers.
    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

    // 32 x 32 threads per block; grid sized to cover the whole matrix.
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();  // wait for the kernel before reading results
    cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);

    checkResult(h_C, gpuRef, nxy);

    // Release device and host memory.
    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);
    free(h_A);
    free(h_B);
    free(h_C);
    free(gpuRef);
    cudaDeviceReset();
    return (0);
}
使用一维网格一维块对矩阵求和
// 1-D grid, 1-D block: each thread owns one column ix and iterates over
// every row iy of that column.
__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if (ix >= nx) return;  // columns past the matrix edge do nothing
    for (int iy = 0; iy < ny; iy++)
    {
        int idx = iy * nx + ix;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}
dim3 block(32, 1);
dim3 grid((nx + block.x - 1) / block.x, 1);
使用二维网格和一维块对矩阵求和
// 2-D grid of 1-D blocks: blockIdx.y selects the row directly, while the
// x dimensions of grid and block together cover the columns.
__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;  // column
    unsigned int iy = blockIdx.y;                             // row
    if (ix < nx && iy < ny)
    {
        unsigned int idx = iy * nx + ix;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}
int dimx = 32;
dim3 block(dimx, 1);
dim3 grid((nx + block.x - 1) / block.x, ny);
设备管理
使用运行时API查询GPU信息
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // exit, EXIT_SUCCESS (was missing)
#include <math.h>    // pow (was missing)

// Query device 0 through the runtime API and print its main properties.
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // Number of CUDA-capable GPUs in the system.
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    int dev = 0, driverVersion = 0, runtimeVersion = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("Device %d: \"%s\"\n", dev, deviceProp.name);

    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf(" CUDA Capability Major/Minor version number: %d.%d\n",
           deviceProp.major, deviceProp.minor);
    printf(" Total amount of global memory: %.2f GBytes (%llu "
           "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
           (unsigned long long)deviceProp.totalGlobalMem);
    printf(" GPU Clock rate: %.0f MHz (%0.2f "
           "GHz)\n", deviceProp.clockRate * 1e-3f,
           deviceProp.clockRate * 1e-6f);
    printf(" Memory Clock rate: %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf(" Memory Bus Width: %d-bit\n",
           deviceProp.memoryBusWidth);
    if (deviceProp.l2CacheSize)
    {
        printf(" L2 Cache Size: %d bytes\n",
               deviceProp.l2CacheSize);
    }
    printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), "
           "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
           deviceProp.maxTexture3D[2]);
    printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, "
           "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
           deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
           deviceProp.maxTexture2DLayered[1],
           deviceProp.maxTexture2DLayered[2]);
    printf(" Total amount of constant memory: %lu bytes\n",
           deviceProp.totalConstMem);
    printf(" Total amount of shared memory per block: %lu bytes\n",
           deviceProp.sharedMemPerBlock);
    // Fixed: the source had a literal line break inside this format string,
    // which is a compile error; the string is now a single literal.
    printf(" Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf(" Warp size: %d\n",
           deviceProp.warpSize);
    printf(" Maximum number of threads per multiprocessor: %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf(" Maximum number of threads per block: %d\n",
           deviceProp.maxThreadsPerBlock);
    printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
           deviceProp.maxThreadsDim[0],
           deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
           deviceProp.maxGridSize[0],
           deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf(" Maximum memory pitch: %lu bytes\n",
           deviceProp.memPitch);
    exit(EXIT_SUCCESS);
}
确定最优gpu
// Pick the GPU with the most streaming multiprocessors when more than one
// device is present.
int numDevices = 0;
cudaGetDeviceCount(&numDevices);
if (numDevices > 1)
{
    int maxMultiprocessors = 0, maxDevice = 0;
    for (int device = 0; device < numDevices; device++)
    {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, device);
        // Fixed field name: the original read "multipProcessorCount",
        // which is not a cudaDeviceProp member and would not compile.
        if (maxMultiprocessors < props.multiProcessorCount)
        {
            maxMultiprocessors = props.multiProcessorCount;
            maxDevice = device;
        }
    }
    // Actually select the winning device; the original computed maxDevice
    // but never used it.
    cudaSetDevice(maxDevice);
}
使用nvidia-smi查询GPU信息
nvidia-smi -L
nvidia-smi -q -i 0
nvidia-smi -q -i 0 -d MEMORY | tail -n 5
nvidia-smi -q -i 0 -d UTILIZATION | tail -n 4
运行时设置设备
有N个GPU的系统,nvidia-smi从0到N-1来记录设备,使用环境变量CUDA_VISIBLE_DEVICES
来设置
CUDA C编程权威指南 第二章 CUDA编程模型相关推荐
- CUDA C编程权威指南 第一章
基础 CUDA(Compute Unified Device Architecture)是NVIDIA提出 CPU和GPU是的PCI-Express总线相连 cpu CPU则负责管理设备端的资源; C ...
- 02 Confluent_Kafka权威指南 第二章:安装kafka
文章目录 CHAPTER 2 Installing Kafka kafka的安装配置 First Things First Choosing an Operating System Installin ...
- Asterisk权威指南/第二章 Asterisk架构
Asterisk和其他传统的PBX完全不同,因为Asterisk的拨号计划以同样的方式处理所有的入局信道(incoming channels). 传统的PBX在逻辑上区分工作站信道(连接电话机)和电话 ...
- JavaScript权威指南 第二章 词法结构
JavaScript程序是用unicode字符集编写的. JavaScript是区分大小写的语言. HTML是不区分大小写的语言. JavaScript会忽略程序中标识(token)之间的空格. Ja ...
- 《CUDA C编程权威指南》——3.4 避免分支分化
本节书摘来自华章计算机<CUDA C编程权威指南>一书中的第3章,第3.4节,作者 [美] 马克斯·格罗斯曼(Max Grossman),译 颜成钢 殷建 李亮,更多章节内容可以访问云栖社 ...
- 《CUDA C编程权威指南》——1.5节总结
本节书摘来自华章社区<CUDA C编程权威指南>一书中的第1章,第1.5节总结,作者[美] 马克斯·格罗斯曼(Max Grossman) ,更多章节内容可以访问云栖社区"华章社区 ...
- c cuda 指定gpu_《CUDA C编程权威指南》——1.3 用GPU输出Hello World-阿里云开发者社区...
本节书摘来自华章计算机<CUDA C编程权威指南>一书中的第1章,第1.3节,作者 [美] 马克斯·格罗斯曼(Max Grossman),译 颜成钢 殷建 李亮,更多章节内容可以访问云栖社 ...
- 《CUDA C编程权威指南》——2.2 给核函数计时
本节书摘来自华章计算机<CUDA C编程权威指南>一书中的第2章,第2.2节,作者 [美] 马克斯·格罗斯曼(Max Grossman),译 颜成钢 殷建 李亮,更多章节内容可以访问云栖社 ...
- 《CUDA C编程权威指南》——2.4节设备管理
本节书摘来自华章社区<CUDA C编程权威指南>一书中的第2章,第2.4节设备管理,作者[美] 马克斯·格罗斯曼(Max Grossman) ,更多章节内容可以访问云栖社区"华章 ...
最新文章
- 用Javascript修正12个常见的浏览器问题
- 创建Django项目和模型(创建工程、子应用、设置pycharm环境、使用Django进行数据库开发的步骤)
- php后台图片压缩_php实现等比例压缩图片
- ParlAI:Facebook开源的一个AI对话框架
- java子类有参构造函数吗_为什么我需要在这个Java通用子类中有一个构造函数?...
- python乒乓球比赛规则介绍_乒乓球比赛规则及活动方案
- C#基础知识整理:基础知识(1) Main方法
- 从《跨过鸭绿江》中看程序员的时间管理实践
- Redis 为什么是单线程的?
- python创建函数、可以接受任意多个整数参数并求和_pythonxlwt:求和函数最大参数限制?...
- 在div中设置文字与内部div垂直居中
- Lambda项目:迈向多核及超越
- Linux下,C++编程论坛题目抽取
- Linux 终端下 dstat 监控工具
- linux 压缩命令
- 苹果推出Find My Friends好友追踪iOS应用
- 大数据之-入门_大数据发展前景---大数据之hadoop工作笔记0005
- ros中使用boost::thread多线程boost::bind绑定参数,多线程发送topic
- Linux内核五个安全模块简述
- 计算机专业毕业论文写作指导(案例超详解)
热门文章
- php算法两数之和 复杂度,每天一道leetcode算法题:两数之和-php版
- LeetCode力扣(27. 移除元素)----Java/JavaScript/C
- 董付国老师1900页系列Python教学PPT阅读地址汇总
- Python可以这样学(第七季:pillow扩展库图像编程)-董付国-专题视频课程
- java报错信息怎么看_AE-E3D插件无效或提示OPENGL E3D Debug等错误报错信息怎么办?...
- redis依赖包_092-Redis集群、JedisCluster
- logback-spring.xml 文件路径 相对路径_Web前端必会知识点:VUE路径问题解析-Web前端教程...
- python中提取几列_自己录制的公开课视频中提取字幕(python+opencv+Tesseract-OCR)
- 两个vlan如何互通_网络交换机VLAN的常识与划分方法,你知道吗?
- jdialog 数据量大加载出现白板_王者荣耀:队友真的有人机?白板熟练进排位,资料面都是假的...