CUDA6.0开始 有"统一内存"(Unified Memory)编程模型,可以用单个指针访问CPU和GPU内存,无须手动拷贝

主机启动内核后,管理权立刻返回给主机(类似启动线程后,不join)

C函数 CUDA C函数
malloc cudaMalloc
memcpy cudaMemcpy
memset cudaMemset
free cudaFree
cudaError_t cudaMalloc(void** devPtr, size_t size)

分配线性内存,devPtr是内存指针

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count,cudaMemcpyKind kind)
  1. 主机和设备之间的数据传输,从src向dst复制字节,复制方向由kind指定

  2. kind:
    cudaMemcpyHostToHost
    cudaMemcpyHostToDevice
    cudaMemcpyDeviceToHost
    cudaMemcpyDeviceToDevice

  3. 同步方式,在cudaMemcpy函数返回及传输操作完成之前主机是阻塞的

  4. 返回错误枚举类型cudaError_t,成功返回cudaSuccess,失败返回cudaErrorMemoryAllocation

const char* cudaGetErrorString(cudaError_t error)

获得报错信息,和C语言中的strerror函数类似

内存层次结构

#include <stdlib.h>
#include <time.h>

/* Element-wise sum of two float arrays on the host: C[i] = A[i] + B[i]. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++) {
        C[idx] = A[idx] + B[idx];
    }
}

/* Fill ip[0..size-1] with pseudo-random floats in [0, 25.5].
 * rand() & 0xFF keeps only the low 8 bits (0..255); dividing by 10.0f
 * bounds the values. The caller is expected to seed rand() once. */
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

int main(int argc, char **argv)
{
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);

    /* Seed once here. The original seeded inside initialData, so two calls
     * within the same second reused the same seed and h_A/h_B came out
     * identical. */
    srand((unsigned)time(NULL));
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return (0);
}

nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum
-Xcompiler用于指定命令行选项是指向C编译器还是预处理器
-std=c99传递给编译器,指定编译标准
参考: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html

线程管理

  1. 同一grid所有线程共享相同的全局内存空间
  2. 一个grid由多个block构成,一个线程block包含一组thread
  3. 同一block内线程:1. 同步,2.共享内存;不同block不能协作

  1. 索引

    1. 线程依靠blockIdx(block在grid里的索引)和threadIdx(block内索引),基于坐标将不同的数据分配给不同的线程
    2. 坐标变量基于uint3定义的CUDA内置向量类型(blockIdx.x,blockIdx.y,blockIdx.z;threadIdx.x,threadIdx.y,threadIdx.z)
    3. grid和block都可以组织为3维的,一个grid,默认是二维,一个block默认三维
    4. gridDim线程格维度(每个grid中有多少个block) blockDim线程块的维度(每个block中有多少个线程)是dim3类型的变量
    5. dim3类型的变量,所有未指定的元素都被初始化为1,可以通过x,y,z获得各个维度的长度(blockDim.x,blockDim.y,blockDim.z)
    6. host端使用dim3定义grid和block,在device端会以uint3类型显示;即host端的dim3对应了device端的uint3(block对应blockDim,grid对应gridDim)

host和device对应

#include <cuda_runtime.h>
#include <stdio.h>

/* Each thread prints its own thread/block coordinates plus the launch
 * dimensions, so the host-side and device-side views can be compared.
 * For a 1-D launch the y/z index components are 0 and the y/z dimension
 * components default to 1. */
__global__ void checkIndex(void)
{
    printf("threadIdx:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z);
    printf("blockIdx:(%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z);
    printf("blockDim:(%d, %d, %d)\n", blockDim.x, blockDim.y, blockDim.z);
    printf("gridDim:(%d, %d, %d)\n", gridDim.x, gridDim.y, gridDim.z);
}

int main(int argc, char **argv)
{
    int nElem = 6;                                // 6 data elements in total
    dim3 block(3);                                // 3 threads per block
    dim3 grid((nElem + block.x - 1) / block.x);   // ceil-div: 2 blocks

    // host-side view of the launch configuration
    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    // device-side view of the same configuration
    checkIndex<<<grid, block>>>();

    cudaDeviceReset();
    return (0);
}

重置grid和block

#include <cuda_runtime.h>
#include <stdio.h>

/* Demonstrate how the grid size changes as the block size shrinks for a
 * fixed element count. The original repeated the same three statements
 * four times; a table-driven loop prints the identical four lines. */
int main(int argc, char **argv)
{
    int nElem = 1024;                                      // total elements
    const unsigned int blockSizes[] = {1024, 512, 256, 128};

    dim3 block(blockSizes[0]);
    dim3 grid(1);
    for (int i = 0; i < 4; i++) {
        block.x = blockSizes[i];
        grid.x = (nElem + block.x - 1) / block.x;          // ceil-div
        printf("grid.x %d block.x %d \n", grid.x, block.x);
    }

    cudaDeviceReset();
    return (0);
}

启动核函数

  1. kernel_name <<< grid,block>>>(argument list); 相比c普通函数function_name(argument list)多了<<< grid,block>>>
  2. 假设有8个元素,启动2个1维块,每个块有4个thread kernel_name<<< 2, 4>>>(argument list)


3. kernel函数调用和host是异步的,调用后,控制权立刻返回给host,可以使用cudaError_t cudaDeviceSynchronize(void)来同步
4. 有些CUDA运行时API函数是同步的,如cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind),拷贝完成后,控制权才返回主机

编写核函数

限定符 调用位置 执行位置 备注
__global__ host device 必须有void返回类型
__device__ device device
__host__ host host 可以省略
  1. __device__和__host__可以一起使用,这样函数同时在主机和设备端进行编译
  2. 核函数限制
    1)只能访问设备内存
    2)必须有void返回类型
    3)不支持可变数量的参数
    4)不支持静态变量
    5)基本都是异步
  3. cpu:
/* CPU reference implementation: element-wise C[i] = A[i] + B[i] over the
 * first N elements. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int i = 0;
    while (i < N) {
        C[i] = A[i] + B[i];
        ++i;
    }
}
gpu:
/* GPU version of the array add: the per-thread index replaces the host
 * loop, so each thread computes exactly one element.
 * NOTE(review): no bounds guard — assumes a single 1-D block launched
 * with exactly the array length in threads; confirm at the call site. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C)
{
    int tid = threadIdx.x;      // 1-D block: threadIdx.x is the element index
    C[tid] = A[tid] + B[tid];
}

验证核函数

验证结果

/* Compare the host result (h_C) with the GPU result (gpuRef) element by
 * element; print the first mismatch and stop, or "Arrays match." when all
 * N elements agree within epsilon. Requires <math.h>/<cmath> for fabs.
 * Fixed: the original called abs(), which converts the double difference
 * to int — any mismatch smaller than 1.0 truncated to 0 and was never
 * reported. fabs() keeps the fractional part. */
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", h_C[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

处理错误

综合

#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>   // malloc/free/rand/srand (missing in the original)
#include <string.h>   // memset (missing in the original)
#include <time.h>     // time (missing in the original)
#include <math.h>     // fabs (missing in the original)

/* Compare the host result with the GPU result element by element.
 * Fixed: the original used abs(), which truncates the double difference
 * to int and hides any mismatch smaller than 1.0; fabs() is required. */
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", h_C[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

/* Fill ip with pseudo-random floats in [0, 25.5].
 * NOTE(review): reseeding on every call means two calls within the same
 * second produce identical arrays; seed once in main if that matters. */
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned) time(&t));
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

/* CPU reference: element-wise add over N elements. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
        C[idx] = A[idx] + B[idx];
}

/* One thread per element; the i < N guard keeps surplus threads idle. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main(int argc, char **argv)
{
    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);

    // host buffers: inputs, CPU result, and the copy of the GPU result
    float *h_A    = (float *)malloc(nBytes);
    float *h_B    = (float *)malloc(nBytes);
    float *h_C    = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(h_C, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // device buffers: cudaMalloc takes the address of the pointer so it can
    // rewrite it to point into device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);   // zero-fill d_C

    // one block of nElem threads covers the whole array
    dim3 block(nElem);
    dim3 grid(1);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);

    // blocking copy: implicitly waits for the kernel to finish
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

    sumArraysOnHost(h_A, h_B, h_C, nElem);
    checkResult(h_C, gpuRef, nElem);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(gpuRef);
    cudaDeviceReset();
    return (0);
}

计时

#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>   // malloc/free/rand/srand (missing in the original)
#include <string.h>   // memset (missing in the original)
#include <time.h>     // time (missing in the original)
#include <math.h>     // fabs (missing in the original)

/* Compare the host result with the GPU result element by element.
 * Fixed: the original used abs(), which truncates the double difference
 * to int and hides any mismatch smaller than 1.0; fabs() is required. */
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", h_C[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

/* Wall-clock time in seconds with microsecond resolution. */
double seconds()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);   // original stored the return value in an unused local
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

/* Fill ip with pseudo-random floats in [0, 25.5].
 * NOTE(review): reseeding on every call means two calls within the same
 * second produce identical arrays; seed once in main if that matters. */
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned) time(&t));
    for (int i = 0; i < size; i++) {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
    return;
}

/* CPU reference: element-wise add over N elements. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++) {
        C[idx] = A[idx] + B[idx];
    }
}

/* One thread per element, indexed across the whole grid. The guard is
 * required because gridDim.x * blockDim.x may exceed N. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);

    float *h_A    = (float *)malloc(nBytes);
    float *h_B    = (float *)malloc(nBytes);
    float *h_C    = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    double iStart, iElaps;

    // time host-side initialization
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);

    memset(h_C, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // time the CPU reference computation
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, h_C, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // device setup; the (float**) cast just documents the double pointer
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);

    int iLen = 512;
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);   // ceil-div

    // time the kernel: the launch is asynchronous, so synchronize before
    // reading the clock again
    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    cudaDeviceSynchronize();
    iElaps = seconds() - iStart;
    printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec\n", grid.x,
           block.x, iElaps);

    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
    checkResult(h_C, gpuRef, nElem);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    free(gpuRef);
    return (0);
}

nvprof

nvprof是CUDA 5.0起提供的命令行profiler,用于收集程序执行的信息
nvprof [nvprof_args] <application> [application_args]
nvprof --help
nvprof ./sumArraysOnGPU-timer

~/code/cuda$ nvprof ./sum
./sum Starting...
Vector size 32
initialData Time elapsed 0.000004 sec
sumArraysOnHost Time elapsed 0.000000 sec
==5121== NVPROF is profiling process 5121, command: ./sum
sumArraysOnGPU <<<  1, 512  >>>  Time elapsed 0.000025 sec
Arrays match.==5121== Profiling application: ./sum
==5121== Profiling result:Type  Time(%)      Time     Calls       Avg       Min       Max  NameGPU activities:   43.48%  1.9200us         3     640ns     576ns     768ns  [CUDA memcpy HtoD]42.03%  1.8560us         1  1.8560us  1.8560us  1.8560us  sumArraysOnGPU(float*, float*, float*, int)14.49%     640ns         1     640ns     640ns     640ns  [CUDA memcpy DtoH]API calls:   98.84%  143.30ms         3  47.768ms  3.2070us  143.29ms  cudaMalloc0.72%  1.0374ms        96  10.806us     519ns  507.17us  cuDeviceGetAttribute0.25%  364.98us         1  364.98us  364.98us  364.98us  cuDeviceTotalMem0.08%  113.68us         1  113.68us  113.68us  113.68us  cuDeviceGetName0.06%  85.675us         3  28.558us  4.0100us  75.346us  cudaFree0.03%  41.697us         4  10.424us  4.4080us  24.272us  cudaMemcpy0.01%  20.049us         1  20.049us  20.049us  20.049us  cudaLaunchKernel0.01%  7.6710us         1  7.6710us  7.6710us  7.6710us  cuDeviceGetPCIBusId0.00%  4.1670us         3  1.3890us     516ns  2.7560us  cuDeviceGetCount0.00%  3.7380us         1  3.7380us  3.7380us  3.7380us  cudaDeviceSynchronize0.00%  3.5770us         2  1.7880us     533ns  3.0440us  cuDeviceGet0.00%  1.0310us         1  1.0310us  1.0310us  1.0310us  cuDeviceGetUuid

组织并行线程

使用block和thread建立矩阵索引

使用线性存储二维矩阵

#include <cuda_runtime.h>
#include <stdio.h>

/* Print an nx-by-ny matrix of ints stored row-major in linear memory. */
void printMatrix(int *C, const int nx, const int ny)
{
    int *ic = C;
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            printf("%3d", ic[ix]);
        }
        ic += nx;     // advance the pointer by one full row
        printf("\n");
    }
    printf("\n");
    return;
}

/* Each thread prints its (threadIdx, blockIdx) pair, its global matrix
 * coordinate (ix, iy), the linearised index, and the element it owns.
 * A bounds guard is added so the kernel stays safe if the grid overshoots
 * the matrix (it fits exactly with the launch below). */
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index"
               " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
               ix, iy, idx, A[idx]);
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int nx = 8;                        // matrix width (elements per row)
    int ny = 6;                        // matrix height (number of rows)
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(int);    // fixed: original used sizeof(float) for an int array

    // host matrix in linear memory, filled with 0..nxy-1
    int *h_A = (int *)malloc(nBytes);
    for (int i = 0; i < nxy; i++) {
        h_A[i] = i;
    }
    printMatrix(h_A, nx, ny);

    int *d_MatA;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);

    dim3 block(4, 2);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);  // (2, 3)
    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);

    cudaFree(d_MatA);
    free(h_A);
    cudaDeviceReset();
    return (0);
}

二维网格和二维块对矩阵求和

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>   // malloc/free/rand (missing in the original)
#include <string.h>   // memset (missing in the original)
#include <math.h>     // fabs (missing in the original)

/* Fill ip with pseudo-random floats in [0, 25.5]; rand() is never seeded
 * here, so runs are deterministic. */
void initialData(float *ip, const int size)
{
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

/* Row-by-row matrix add on the host: C = A + B for an nx-by-ny matrix
 * stored row-major in linear memory. */
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx;   // advance all three row pointers
        ib += nx;
        ic += nx;
    }
    return;
}

/* Compare host and GPU results element-wise.
 * Fixed: the original used abs(), which truncates the double difference
 * to int and masks sub-1.0 mismatches; fabs() is required. */
void checkResult(float *h_C, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("host %f gpu %f\n", h_C[i], gpuRef[i]);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n\n");
    else
        printf("Arrays do not match.\n\n");
}

/* 2-D grid / 2-D block matrix add: ix is the column, iy the row; the
 * guard protects the tail blocks in both dimensions. */
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int nx = 1 << 14;                       // matrix width
    int ny = 1 << 14;                       // matrix height
    int nxy = nx * ny;
    size_t nBytes = nxy * sizeof(float);    // fixed: byte counts belong in size_t, not int

    float *h_A    = (float *)malloc(nBytes);
    float *h_B    = (float *)malloc(nBytes);
    float *h_C    = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    initialData(h_A, nxy);
    initialData(h_B, nxy);
    memset(h_C, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    sumMatrixOnHost(h_A, h_B, h_C, nx, ny);

    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();

    cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);
    checkResult(h_C, gpuRef, nxy);

    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);
    free(h_A);
    free(h_B);
    free(h_C);
    free(gpuRef);
    cudaDeviceReset();
    return (0);
}

使用一维网格一维块对矩阵求和

/* 1-D grid / 1-D block matrix add: each thread owns one column (ix) and
 * walks down all ny rows of it. */
__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if (ix >= nx) return;               // guard the grid tail
    for (int iy = 0; iy < ny; iy++) {
        int idx = iy * nx + ix;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}
// Launch configuration for sumMatrixOnGPU1D: one thread per matrix column.
dim3 block(32, 1);                            // 1-D block: 32 threads along x
dim3 grid((nx + block.x - 1) / block.x, 1);   // ceil(nx / 32) blocks along x

使用二维网格和一维块对矩阵求和

/* 2-D grid / 1-D block matrix add: blockIdx.y selects the row directly,
 * while the 1-D blocks tile the columns. */
__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int row = blockIdx.y;
    if (col < nx && row < ny) {
        unsigned int idx = row * nx + col;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}
// Launch configuration for sumMatrixOnGPUMix: a 2-D grid of 1-D blocks.
int dimx = 32;                                  // threads per block along x
dim3 block(dimx, 1);                            // 1-D block
dim3 grid((nx + block.x - 1) / block.x, ny);    // ceil(nx/dimx) x ny blocks; grid.y indexes rows

设备管理

使用运行时API查询GPU信息

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>   // exit/EXIT_SUCCESS (missing in the original)
#include <math.h>     // pow (missing in the original)

/* Query device 0 via the runtime API and print its main properties. */
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    int dev = 0, driverVersion = 0, runtimeVersion = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("Device %d: \"%s\"\n", dev, deviceProp.name);

    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    // versions are encoded as 1000*major + 10*minor
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
           deviceProp.major, deviceProp.minor);
    printf("  Total amount of global memory:                 %.2f GBytes (%llu "
           "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
           (unsigned long long)deviceProp.totalGlobalMem);
    printf("  GPU Clock rate:                                %.0f MHz (%0.2f "
           "GHz)\n", deviceProp.clockRate * 1e-3f,
           deviceProp.clockRate * 1e-6f);
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf("  Memory Bus Width:                              %d-bit\n",
           deviceProp.memoryBusWidth);
    if (deviceProp.l2CacheSize) {
        printf("  L2 Cache Size:                                 %d bytes\n",
               deviceProp.l2CacheSize);
    }
    printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), "
           "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
           deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
           deviceProp.maxTexture3D[2]);
    printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, "
           "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
           deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
           deviceProp.maxTexture2DLayered[1],
           deviceProp.maxTexture2DLayered[2]);
    /* fixed: the original had a raw line break inside this string literal
     * ("%lu \n bytes\n"), which does not compile */
    printf("  Total amount of constant memory:               %lu bytes\n",
           deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block:       %lu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
           deviceProp.warpSize);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of threads per block:           %d\n",
           deviceProp.maxThreadsPerBlock);
    printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %lu bytes\n",
           deviceProp.memPitch);
    exit(EXIT_SUCCESS);
}

确定最优gpu

// Pick the device with the most streaming multiprocessors when several
// GPUs are present. Fixed: the original read `props.multipProcessorCount`
// (typo) — that field does not exist in cudaDeviceProp and fails to
// compile; the correct field is `multiProcessorCount`.
int numDevices = 0;
cudaGetDeviceCount(&numDevices);
if (numDevices > 1) {
    int maxMultiprocessors = 0, maxDevice = 0;
    for (int device = 0; device < numDevices; device++) {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, device);
        if (maxMultiprocessors < props.multiProcessorCount) {
            maxMultiprocessors = props.multiProcessorCount;
            maxDevice = device;
        }
    }
    // NOTE(review): the snippet only computes maxDevice; callers typically
    // follow with cudaSetDevice(maxDevice).
}

使用nvidia-smi查询GPU信息

nvidia-smi -L
nvidia-smi -q -i 0
nvidia-smi -q -i 0 -d MEMORY | tail -n 5
nvidia-smi -q -i 0 -d UTILIZATION | tail -n 4

运行时设置设备

有N个GPU的系统,nvidia-smi从0到N-1来记录设备,使用环境变量CUDA_VISIBLE_DEVICES来设置

CUDA C编程权威指南 第二章 CUDA编程模型相关推荐

  1. CUDA C编程权威指南 第一章

    基础 CUDA(Compute Unified Device Architecture)是NVIDIA提出 CPU和GPU是的PCI-Express总线相连 cpu CPU则负责管理设备端的资源; C ...

  2. 02 Confluent_Kafka权威指南 第二章:安装kafka

    文章目录 CHAPTER 2 Installing Kafka kafka的安装配置 First Things First Choosing an Operating System Installin ...

  3. Asterisk权威指南/第二章 Asterisk架构

    Asterisk和其他传统的PBX完全不同,因为Asterisk的拨号计划以同样的方式处理所有的入局信道(incoming channels). 传统的PBX在逻辑上区分工作站信道(连接电话机)和电话 ...

  4. JavaScript权威指南 第二章 词法结构

    JavaScript程序是用unicode字符集编写的. JavaScript是区分大小写的语言. HTML是不区分大小写的语言. JavaScript会忽略程序中标识(token)之间的空格. Ja ...

  5. 《CUDA C编程权威指南》——3.4 避免分支分化

    本节书摘来自华章计算机<CUDA C编程权威指南>一书中的第3章,第3.4节,作者 [美] 马克斯·格罗斯曼(Max Grossman),译 颜成钢 殷建 李亮,更多章节内容可以访问云栖社 ...

  6. 《CUDA C编程权威指南》——1.5节总结

    本节书摘来自华章社区<CUDA C编程权威指南>一书中的第1章,第1.5节总结,作者[美] 马克斯·格罗斯曼(Max Grossman) ,更多章节内容可以访问云栖社区"华章社区 ...

  7. c cuda 指定gpu_《CUDA C编程权威指南》——1.3 用GPU输出Hello World-阿里云开发者社区...

    本节书摘来自华章计算机<CUDA C编程权威指南>一书中的第1章,第1.3节,作者 [美] 马克斯·格罗斯曼(Max Grossman),译 颜成钢 殷建 李亮,更多章节内容可以访问云栖社 ...

  8. 《CUDA C编程权威指南》——2.2 给核函数计时

    本节书摘来自华章计算机<CUDA C编程权威指南>一书中的第2章,第2.2节,作者 [美] 马克斯·格罗斯曼(Max Grossman),译 颜成钢 殷建 李亮,更多章节内容可以访问云栖社 ...

  9. 《CUDA C编程权威指南》——2.4节设备管理

    本节书摘来自华章社区<CUDA C编程权威指南>一书中的第2章,第2.4节设备管理,作者[美] 马克斯·格罗斯曼(Max Grossman) ,更多章节内容可以访问云栖社区"华章 ...

最新文章

  1. 用Javascript修正12个常见的浏览器问题
  2. 创建Django项目和模型(创建工程、子应用、设置pycharm环境、使用Django进行数据库开发的步骤)
  3. php后台图片压缩_php实现等比例压缩图片
  4. ParlAI:Facebook开源的一个AI对话框架
  5. java子类有参构造函数吗_为什么我需要在这个Java通用子类中有一个构造函数?...
  6. python乒乓球比赛规则介绍_乒乓球比赛规则及活动方案
  7. C#基础知识整理:基础知识(1) Main方法
  8. 从《跨过鸭绿江》中看程序员的时间管理实践
  9. Redis 为什么是单线程的?
  10. python创建函数、可以接受任意多个整数参数并求和_pythonxlwt:求和函数最大参数限制?...
  11. 在div中设置文字与内部div垂直居中
  12. Lambda项目:迈向多核及超越
  13. Linux下,C++编程论坛题目抽取
  14. Linux 终端下 dstat 监控工具
  15. linux 压缩命令
  16. 苹果推出Find My Friends好友追踪iOS应用
  17. 大数据之-入门_大数据发展前景---大数据之hadoop工作笔记0005
  18. ros中使用boost::thread多线程boost::bind绑定参数,多线程发送topic
  19. Linux内核五个安全模块简述
  20. 计算机专业毕业论文写作指导(案例超详解)

热门文章

  1. php算法两数之和 复杂度,每天一道leetcode算法题:两数之和-php版
  2. LeetCode力扣(27. 移除元素)----Java/JavaScript/C
  3. 董付国老师1900页系列Python教学PPT阅读地址汇总
  4. Python可以这样学(第七季:pillow扩展库图像编程)-董付国-专题视频课程
  5. java报错信息怎么看_AE-E3D插件无效或提示OPENGL E3D Debug等错误报错信息怎么办?...
  6. redis依赖包_092-Redis集群、JedisCluster
  7. logback-spring.xml 文件路径 相对路径_Web前端必会知识点:VUE路径问题解析-Web前端教程...
  8. python中提取几列_自己录制的公开课视频中提取字幕(python+opencv+Tesseract-OCR)
  9. 两个vlan如何互通_网络交换机VLAN的常识与划分方法,你知道吗?
  10. jdialog 数据量大加载出现白板_王者荣耀:队友真的有人机?白板熟练进排位,资料面都是假的...