1.使用线程实现小型矢量加法add


#include <stdio.h>
#include <stdlib.h>

#define N   10

// Abort with file/line context when a CUDA runtime call reports an error.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                     __LINE__, cudaGetErrorString( err_ ) );              \
            exit( EXIT_FAILURE );                                         \
        }                                                                 \
    } while (0)

// Adds two N-element int vectors on the GPU: c[i] = a[i] + b[i].
// Each thread handles the element selected by threadIdx.x only, so the
// kernel covers the whole vector when launched as one block with at least
// N threads (main uses add<<<1,N>>>). The tid < N guard keeps any extra
// threads from touching memory past the end of the arrays.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// Host driver: fills a[i] = i and b[i] = i*i, adds them on the GPU and
// prints every sum. Returns 0 on success; exits with EXIT_FAILURE if any
// CUDA call fails.
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );

    add<<<1,N>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns nothing; a bad launch configuration is
    // only visible through cudaGetLastError()
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    return 0;
}

输出：

2.使用线程实现大型矢量加法add

note:
1.线程格每一维上的线程块数量不能超过65535
2.单个线程块包含的线程数不能超过512（这是早期硬件的限制；计算能力2.0及以上的设备上限为1024，具体数值可通过cudaGetDeviceProperties返回的maxThreadsPerBlock查询）

#include <stdio.h>
#include <stdlib.h>

#define N   (33 * 1024)

// Abort with file/line context when a CUDA runtime call reports an error.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                     __LINE__, cudaGetErrorString( err_ ) );              \
            exit( EXIT_FAILURE );                                         \
        }                                                                 \
    } while (0)

// Adds two N-element int vectors on the GPU: c[i] = a[i] + b[i].
// Works for any 1D launch configuration: each thread starts at its global
// index and keeps stepping by the total number of launched threads
// (blockDim.x * gridDim.x) until it runs past N, so a launch with fewer
// threads than elements still covers the whole vector.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

// Host driver: fills a[i] = i and b[i] = 2*i, adds them on the GPU and
// verifies every element on the CPU. Returns 0 on normal completion;
// exits with EXIT_FAILURE if a host allocation or a CUDA call fails.
int main( void ) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "host allocation failed\n" );
        free( a );
        free( b );
        free( c );
        return EXIT_FAILURE;
    }

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );

    add<<<128,128>>>( dev_a, dev_b, dev_c );
    // or: add<<<(N+127)/128,128>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns nothing; a bad launch configuration is
    // only visible through cudaGetLastError()
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost ) );

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf( "Error:  %d + %d != %d\n", a[i], b[i], c[i] );
            success = false;
        }
    }
    if (success)    printf( "We did it!\n" );

    // free the memory we allocated on the GPU
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    // free the memory we allocated on the CPU
    free( a );
    free( b );
    free( c );

    return 0;
}

输出：
We did it!

Note：
当线程数量足够时，调用GPU时使用以下调用方式可减少资源浪费

add<<<(N+127)/128,128>>>( dev_a, dev_b, dev_c );

当线程数量不够时，可以让每个线程执行多次，如

// Adds two N-element int vectors when the launch has fewer threads than
// elements: c[i] = a[i] + b[i].
// Each thread starts at its global index and then advances by the total
// number of launched threads until it runs past N.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        // only blockDim.x * gridDim.x threads are running, so each one
        // steps forward by blockDim.x * gridDim.x per iteration
        tid += blockDim.x * gridDim.x;
    }
}

类似于CPU中：

// CPU counterpart of the GPU vector add: c[i] = a[i] + b[i] for every
// i in [0, N), computed sequentially.
void add( int *a, int *b, int *c ) {
    int tid = 0;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        // a single CPU walks the elements one at a time, so step by 1
        tid++;
    }
}

3.在GPU上使用线程实现纹波效果

#include <GL/glut.h>
// Host-side RGBA frame buffer (4 bytes per pixel) that drives a GLUT
// animation loop. The owner registers a frame generator and an exit hook
// through anim_and_exit(); GLUT then calls back into the static methods
// below, which find the active instance through get_bitmap_ptr().
struct CPUAnimBitmap {
    unsigned char    *pixels;                    // width * height * 4 bytes, owned
    int     width, height;
    void    *dataBlock;                          // opaque pointer handed to every callback
    void (*fAnim)(void*,int);                    // frame generator: (dataBlock, ticks)
    void (*animExit)(void*);                     // called once when ESC is pressed
    void (*clickDrag)(void*,int,int,int,int);    // optional: (dataBlock, x0, y0, x1, y1)
    int     dragStartX, dragStartY;

    // Allocates a w x h RGBA buffer. d is passed back unchanged to the
    // callbacks. All callbacks start out unset.
    CPUAnimBitmap( int w, int h, void *d = NULL ) {
        width = w;
        height = h;
        pixels = new unsigned char[width * height * 4];
        dataBlock = d;
        fAnim = NULL;
        animExit = NULL;
        clickDrag = NULL;
        dragStartX = 0;
        dragStartY = 0;
    }

    ~CPUAnimBitmap() {
        delete [] pixels;
    }

    // Pointer to the first byte of the pixel buffer.
    unsigned char* get_ptr( void ) const   { return pixels; }

    // Size of the pixel buffer in bytes.
    long image_size( void ) const { return width * height * 4; }

    // Registers a click-and-drag handler; must be called before
    // anim_and_exit() for the mouse callback to be installed.
    void click_drag( void (*f)(void*,int,int,int,int)) {
        clickDrag = f;
    }

    // Creates the window, installs the GLUT callbacks and enters
    // glutMainLoop(). f is called from the idle callback to produce each
    // frame; e is called when ESC is pressed, just before exit(0).
    void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
        CPUAnimBitmap**   bitmap = get_bitmap_ptr();
        *bitmap = this;
        fAnim = f;
        animExit = e;
        // a bug in the Windows GLUT implementation prevents us from
        // passing zero arguments to glutInit()
        int c=1;
        // writable storage: binding a string literal to char* is not
        // valid in C++11 and later
        char  emptyArg[] = "";
        char* dummy = emptyArg;
        glutInit( &c, &dummy );
        glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
        glutInitWindowSize( width, height );
        glutCreateWindow( "bitmap" );
        glutKeyboardFunc(Key);
        glutDisplayFunc(Draw);
        if (clickDrag != NULL)
            glutMouseFunc( mouse_func );
        glutIdleFunc( idle_func );
        glutMainLoop();
    }

    // static method used for glut callbacks
    // Returns the address of the slot holding the active instance.
    static CPUAnimBitmap** get_bitmap_ptr( void ) {
        static CPUAnimBitmap*   gBitmap;
        return &gBitmap;
    }

    // static method used for glut callbacks
    // Records the press position of the left button and reports the
    // press/release pair to clickDrag on release.
    static void mouse_func( int button, int state,
                            int mx, int my ) {
        if (button == GLUT_LEFT_BUTTON) {
            CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
            if (state == GLUT_DOWN) {
                bitmap->dragStartX = mx;
                bitmap->dragStartY = my;
            } else if (state == GLUT_UP) {
                bitmap->clickDrag( bitmap->dataBlock,
                                   bitmap->dragStartX,
                                   bitmap->dragStartY,
                                   mx, my );
            }
        }
    }

    // static method used for glut callbacks
    // Generates the next frame (ticks counts up from 1) and requests a
    // redraw.
    static void idle_func( void ) {
        static int ticks = 1;
        CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
        if (bitmap->fAnim != NULL)
            bitmap->fAnim( bitmap->dataBlock, ticks++ );
        glutPostRedisplay();
    }

    // static method used for glut callbacks
    // ESC (key code 27) runs the exit hook, if one was given, and
    // terminates the process.
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {
                CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
                if (bitmap->animExit != NULL)
                    bitmap->animExit( bitmap->dataBlock );
                //delete bitmap;
                exit(0);
            }
        }
    }

    // static method used for glut callbacks
    // Clears the window, draws the pixel buffer and swaps buffers.
    static void Draw( void ) {
        CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->width, bitmap->height, GL_RGBA,
                      GL_UNSIGNED_BYTE, bitmap->pixels );
        glutSwapBuffers();
    }
};

#define DIM 1024
#define PI 3.1415926535897932f

// Renders one frame of a ripple centred on the image.
// Expects a 2D launch in which every thread maps to exactly one pixel
// (generate_frame uses DIM/16 x DIM/16 blocks of 16 x 16 threads). ptr
// must hold 4 bytes (RGBA) for each launched thread. ticks is the frame
// counter that shifts the phase of the wave.
__global__ void kernel( unsigned char *ptr, int ticks ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x - DIM/2;
    float fy = y - DIM/2;
    float d = sqrtf( fx * fx + fy * fy );   // distance from the image centre
    // cosf keeps the whole expression in single precision, matching sqrtf
    unsigned char grey = (unsigned char)(128.0f + 127.0f *
                                         cosf(d/10.0f - ticks/7.0f) /
                                         (d/10.0f + 1.0f));
    ptr[offset*4 + 0] = grey;
    ptr[offset*4 + 1] = grey;
    ptr[offset*4 + 2] = grey;
    ptr[offset*4 + 3] = 255;
}

// State shared between main and the animation callbacks.
struct DataBlock {
    unsigned char   *dev_bitmap;   // device-side frame buffer
    CPUAnimBitmap  *bitmap;        // host-side frame buffer and window
};

// Frame callback: renders frame number `ticks` on the GPU and copies it
// into the host bitmap for display.
void generate_frame( DataBlock *d, int ticks ) {
    dim3    blocks(DIM/16,DIM/16);
    dim3    threads(16,16);
    kernel<<<blocks,threads>>>( d->dev_bitmap, ticks );

    cudaMemcpy( d->bitmap->get_ptr(),
                d->dev_bitmap,
                d->bitmap->image_size(),
                cudaMemcpyDeviceToHost );
}

// clean up memory allocated on the GPU
void cleanup( DataBlock *d ) {
    cudaFree( d->dev_bitmap );
}

// Allocates the device frame buffer and hands control to the animation
// loop. Returns 1 if the device allocation fails.
int main( void ) {
    DataBlock   data;
    CPUAnimBitmap  bitmap( DIM, DIM, &data );
    data.bitmap = &bitmap;

    // without a device buffer every frame would write through an
    // invalid pointer, so stop here on failure
    if (cudaMalloc( (void**)&data.dev_bitmap,
                    bitmap.image_size() ) != cudaSuccess)
        return 1;

    bitmap.anim_and_exit( (void (*)(void*,int))generate_frame,
                          (void (*)(void*))cleanup );
    return 0;
}

编译，输出：

4.共享内存和同步

关键字：__shared__
用于单个线程块中多个线程之间的通信与协作，其他线程块上的线程无法读取和写入该线程块的共享内存中的变量。数据传输效率高。

4.1 点积运算

#include <stdio.h>
#include <stdlib.h>

// Arguments are parenthesized so that expression arguments expand safely.
#define imin(a,b) ((a)<(b)?(a):(b))

// Closed form of 0^2 + 1^2 + ... + x^2, used to check the GPU result.
#define sum_squares(x)  ((x)*((x)+1)*(2*(x)+1)/6)

// Abort with file/line context when a CUDA runtime call reports an error.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                     __LINE__, cudaGetErrorString( err_ ) );              \
            exit( EXIT_FAILURE );                                         \
        }                                                                 \
    } while (0)

const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

// Computes per-block partial sums of the dot product of a and b.
// Launch layout: 1D grid, blockDim.x == threadsPerBlock, which must be a
// power of 2 for the reduction below. Uses threadsPerBlock floats of
// shared memory per block. c must hold gridDim.x floats; block k writes
// its partial sum to c[k], and the caller adds the partial sums together.
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // each thread accumulates the products of the elements it owns
    float   temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // outside the if: every thread in the block must reach the barrier
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}

// Host driver: builds a[i] = i and b[i] = 2*i, computes their dot product
// on the GPU, finishes the sum on the CPU and prints it next to the
// closed-form value. Exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main( void ) {
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf( stderr, "host allocation failed\n" );
        free( a );
        free( b );
        free( partial_c );
        return EXIT_FAILURE;
    }

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a,
                            N*sizeof(float) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b,
                            N*sizeof(float) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_partial_c,
                            blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, N*sizeof(float),
                            cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, N*sizeof(float),
                            cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns nothing; a bad launch configuration is
    // only visible through cudaGetLastError()
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( partial_c, dev_partial_c,
                            blocksPerGrid*sizeof(float),
                            cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}

NOTE：__syncthreads()不能放在只有部分线程会进入的条件分支（例如依赖线程索引的if语句）中，它必须被线程块内的所有线程执行。否则，未进入该分支的线程永远不会到达同步点，其余线程将一直等待，导致程序挂起。

输出：

Does GPU value 2.57236e+13 = 2.57236e+13?

4.2 位图

/** Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.** NVIDIA Corporation and its licensors retain all intellectual property and * proprietary rights in and to this software and related documentation. * Any use, reproduction, disclosure, or distribution of this software * and related documentation without an express license agreement from* NVIDIA Corporation is strictly prohibited.** Please refer to the applicable NVIDIA end user license agreement (EULA) * associated with this source code for terms and conditions that govern * your use of this NVIDIA software.* */#include <stdio.h>
#include <GL/glut.h>

// Host-side RGBA image (4 bytes per pixel) that can be shown in a GLUT
// window. GLUT calls back into the static methods below, which find the
// active instance through get_bitmap_ptr().
struct CPUBitmap {
    unsigned char    *pixels;        // x * y * 4 bytes, owned
    int     x, y;                    // image width and height in pixels
    void    *dataBlock;              // opaque pointer handed to bitmapExit
    void (*bitmapExit)(void*);       // optional hook run when ESC is pressed

    // Allocates a width x height RGBA buffer. d is passed back unchanged
    // to the exit hook.
    CPUBitmap( int width, int height, void *d = NULL ) {
        pixels = new unsigned char[width * height * 4];
        x = width;
        y = height;
        dataBlock = d;
        bitmapExit = NULL;
    }

    ~CPUBitmap() {
        delete [] pixels;
    }

    // Pointer to the first byte of the pixel buffer.
    unsigned char* get_ptr( void ) const   { return pixels; }

    // Size of the pixel buffer in bytes.
    long image_size( void ) const { return x * y * 4; }

    // Creates the window, installs the GLUT callbacks and enters
    // glutMainLoop(). e, if given, is called with dataBlock when ESC is
    // pressed, just before exit(0).
    void display_and_exit( void(*e)(void*) = NULL ) {
        CPUBitmap**   bitmap = get_bitmap_ptr();
        *bitmap = this;
        bitmapExit = e;
        // a bug in the Windows GLUT implementation prevents us from
        // passing zero arguments to glutInit()
        int c=1;
        // writable storage: binding a string literal to char* is not
        // valid in C++11 and later
        char  emptyArg[] = "";
        char* dummy = emptyArg;
        glutInit( &c, &dummy );
        glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
        glutInitWindowSize( x, y );
        glutCreateWindow( "bitmap" );
        glutKeyboardFunc(Key);
        glutDisplayFunc(Draw);
        glutMainLoop();
    }

    // static method used for glut callbacks
    // Returns the address of the slot holding the active instance.
    static CPUBitmap** get_bitmap_ptr( void ) {
        static CPUBitmap   *gBitmap;
        return &gBitmap;
    }

    // static method used for glut callbacks
    // ESC (key code 27) runs the exit hook when both the hook and the
    // data block are set, then terminates the process.
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {
                CPUBitmap*   bitmap = *(get_bitmap_ptr());
                if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
                    bitmap->bitmapExit( bitmap->dataBlock );
                exit(0);
            }
        }
    }

    // static method used for glut callbacks
    // Clears the window and draws the pixel buffer.
    static void Draw( void ) {
        CPUBitmap*   bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->x, bitmap->y, GL_RGBA,
                      GL_UNSIGNED_BYTE, bitmap->pixels );
        glFlush();
    }
};

#define DIM 1024
#define PI 3.1415926535897932f

// Fills the green channel of an RGBA image with a 2D sine pattern, routed
// through shared memory to show why __syncthreads() is needed.
// Launch layout: 2D grid of 16 x 16 blocks (the shared tile is sized for
// exactly that), one thread per pixel. ptr must hold 4 bytes for each
// launched thread.
__global__ void kernel( unsigned char *ptr ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    __shared__ float    shared[16][16];

    // now calculate the value at that position
    const float period = 128.0f;

    shared[threadIdx.x][threadIdx.y] =
            255 * (sinf(x*2.0f*PI/ period) + 1.0f) *
                  (sinf(y*2.0f*PI/ period) + 1.0f) / 4.0f;

    // removing this syncthreads shows graphically what happens
    // when it doesn't exist.  this is an example of why we need it.
    __syncthreads();

    // each thread reads the value written by the mirrored thread in the
    // tile, so every write must be complete before any read
    ptr[offset*4 + 0] = 0;
    ptr[offset*4 + 1] = shared[15-threadIdx.x][15-threadIdx.y];
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}

// globals needed by the update routine
struct DataBlock {
    unsigned char   *dev_bitmap;
};

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cuda_ok( cudaError_t err, const char *what ) {
    if (err == cudaSuccess)
        return true;
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( err ) );
    return false;
}

// Renders the pattern once on the GPU, copies it to the host bitmap and
// displays it. Returns 1 if any CUDA step fails.
int main( void ) {
    DataBlock   data;
    CPUBitmap bitmap( DIM, DIM, &data );
    unsigned char    *dev_bitmap;

    if (!cuda_ok( cudaMalloc( (void**)&dev_bitmap,
                              bitmap.image_size() ), "cudaMalloc" ))
        return 1;
    data.dev_bitmap = dev_bitmap;

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);
    kernel<<<grids,threads>>>( dev_bitmap );

    // a kernel launch returns nothing; launch errors only show up through
    // cudaGetLastError(), execution errors through the blocking copy
    bool ok = cuda_ok( cudaGetLastError(), "kernel launch" ) &&
              cuda_ok( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
                                   bitmap.image_size(),
                                   cudaMemcpyDeviceToHost ), "cudaMemcpy" );

    cuda_ok( cudaFree( dev_bitmap ), "cudaFree" );
    // the device buffer is gone; do not leave a dangling pointer behind
    data.dev_bitmap = NULL;

    if (!ok)
        return 1;

    bitmap.display_and_exit();
    return 0;
}

删除其中的__syncthreads，得到以下输出：

添加其中的__syncthreads，得到以下输出：

CUDA 学习——线程协助（CUDA实战第五章）相关推荐

CUDA学习笔记之 CUDA存储器模型
CUDA学习笔记之 CUDA存储器模型标签: cuda存储bindingcache编程api 2010-12-14 01:33 1223人阅读评论(0) 收藏举报分类: CUDA(26) GP ...
cuda学习笔记5——CUDA实现图像形态学腐蚀、膨胀
cuda学习笔记5--CUDA实现图像形态学腐蚀.膨胀代码 linux如何编译cuda和opencv代码耗时情况代码 #include "cuda_runtime.h" #i ...
陈宝林《最优化理论与算法》超详细学习笔记（七）————第五章运输问题
陈宝林<最优化理论与算法>超详细学习笔记 (七)----第五章运输问题第1节运输问题的数学模型第2节表上作业法 2.1 确定初始基可行解 2.2 最优解的判别 2.3 改进的方法 ...
《C指针》学习笔记（第四、五章）指针与字符串、指针与多维数组
第四章:指针与字符串字符串就是字符序列.它存储的是字节数组.一个特殊字符串结束符是用来标记字符串的结束.结束符用转义序列'\0'表示. 定义字符串变量的方法 char *strptr = " ...
《深度学习进阶自然语言处理》第五章：RNN通俗介绍
文章目录 5.1 概率和语言模型 5.1.1 概率视角下的word2vec 5.1.2 语言模型 5.1.3 将CBOW模型用作语言模型的效果怎么样? 5.2 RNN 5.2.1 循环神经网络 5.2 ...
cuda学习(5):使用cuda核函数加速warpaffine
1. warpaffine 介绍放射变换(warpaffine),主要解决图像的缩放和平移来处理目标检测中常见的预处理行为. 比如有一张猫的图片,对图片进行letterbox变换,将图片进行缩放,并 ...
Cuda 学习教程：Cuda 程序初始化
Cuda程序初始化目前,cuda里面没有对设备的初始化函数InitDevice(),只能每次调用的api函数的时候,加载设备的上下文,自动进行初始化,这将带来问题: First函数调用的时候,需要自 ...
OpenGL学习笔记一之实战篇五 2D游戏(Breakout)之渲染精灵
转载自 https://learnopengl-cn.github.io/06%20In%20Practice/2D-Game/03%20Rendering%20Sprites/ 本节暂未进行完全的重 ...
DirectX 12 3D 游戏开发与实战第五章内容
渲染流水线学习目标: 了解用于在2D图像中表现出场景立体感和空间深度感等真实效果的关键因素探索如何用Direct3D表示3D对象学习如何建立虚拟摄像机理解渲染流水线,根据给定的3D场景的几何描 ...

CUDA 学习——线程协助（CUDA实战第五章）

1.使用线程实现小型矢量加法add

2.使用线程实现大型矢量加法add

3.在GPU上使用线程实现纹波效果

4.共享内存和同步

4.1 点积运算

4.2 位图

CUDA 学习——线程协助（CUDA实战第五章）相关推荐

最新文章

热门文章

CUDA 学习——线程协助（CUDA实战 第五章）

1.使用线程实现小型矢量加法add

2.使用线程实现大型矢量加法add

3.在GPU上使用线程实现纹波效果

4.共享内存和同步

4.1 点积运算

4.2 位图

CUDA 学习——线程协助（CUDA实战 第五章）相关推荐

最新文章

热门文章

CUDA 学习——线程协助（CUDA实战第五章）

CUDA 学习——线程协助（CUDA实战第五章）相关推荐