1.使用线程实现小型矢量加法add


#include <stdio.h>
#include <stdlib.h>

#define N   10

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                     cudaGetErrorString( err_ ) );                         \
            exit( EXIT_FAILURE );                                          \
        }                                                                  \
    } while (0)

// Element-wise vector addition: c[i] = a[i] + b[i] for the first N elements.
// Indexing uses threadIdx.x only, so the kernel is meant to be launched with
// a single block; threads whose index is >= N do nothing.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// Adds two N-element vectors on the GPU with one block of N threads and
// prints every sum. Exits with a failure status if any CUDA call fails.
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, bytes ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, bytes ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, bytes ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice ) );

    add<<<1,N>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns nothing; launch errors must be queried explicitly
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, bytes, cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    return 0;
}

输出:

2.使用线程实现大型矢量加法add

note:
1.线程块数量(网格每一维)不能超过65535
2.单个线程块包含的线程数不能超过512(这是早期设备的上限,较新的设备为1024,具体以设备属性 maxThreadsPerBlock 为准)

#include <stdio.h>
#include <stdlib.h>

#define N   (33 * 1024)

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                     cudaGetErrorString( err_ ) );                         \
            exit( EXIT_FAILURE );                                          \
        }                                                                  \
    } while (0)

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
// Each thread starts at its global 1D index and then advances by the total
// number of launched threads (blockDim.x * gridDim.x), so the result is
// complete for any 1D launch configuration, including ones with fewer than
// N threads.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

// Adds two N-element vectors on the GPU and verifies every element on the
// CPU. Exits with a failure status if an allocation or CUDA call fails.
int main( void ) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    // allocate the memory on the CPU
    a = (int*)malloc( bytes );
    b = (int*)malloc( bytes );
    c = (int*)malloc( bytes );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "Host allocation failed\n" );
        free( a );
        free( b );
        free( c );
        return EXIT_FAILURE;
    }

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, bytes ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, bytes ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, bytes ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice ) );

    // 128 blocks of 128 threads: fewer threads than elements, so every
    // thread handles several elements inside the kernel's loop.
    // or: add<<<(N+127)/128,128>>>( dev_a, dev_b, dev_c );
    add<<<128,128>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns nothing; launch errors must be queried explicitly
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, bytes, cudaMemcpyDeviceToHost ) );

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf( "Error:  %d + %d != %d\n", a[i], b[i], c[i] );
            success = false;
        }
    }
    if (success)    printf( "We did it!\n" );

    // free the memory we allocated on the GPU
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    // free the memory we allocated on the CPU
    free( a );
    free( b );
    free( c );

    return 0;
}

输出:
We did it!

Note:
当线程数量足够时,调用GPU时使用以下调用方式可减少资源浪费

add<<<(N+127)/128,128>>>( dev_a, dev_b, dev_c );

当线程数量不够时,可以让每个线程执行多次,如

// Vector addition c[i] = a[i] + b[i] for i in [0, N), written so that each
// thread may process more than one element when fewer than N threads are
// launched.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        // Only blockDim.x * gridDim.x threads exist in the launch, so each
        // thread steps by that amount to reach its next element.
        tid += blockDim.x * gridDim.x;
    }
}

类似于CPU中:

// Sequential CPU version of the vector addition, c[i] = a[i] + b[i] for
// i in [0, N), shown for comparison with the GPU kernel.
void add( int *a, int *b, int *c ) {
    int tid = 0;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid ++;   // a single CPU does all the work, so advance by 1
    }
}

3.在GPU上使用线程实现波纹效果

#include <stdio.h>
#include <stdlib.h>
#include <GL/glut.h>
// Host-side RGBA pixel buffer (4 bytes per pixel) that is displayed and
// animated through GLUT.
//
// The caller supplies a per-frame callback and an exit callback through
// anim_and_exit(); both receive the opaque 'dataBlock' pointer passed to the
// constructor. GLUT callbacks are plain functions, so the active instance is
// published through a function-local static pointer (get_bitmap_ptr).
struct CPUAnimBitmap {
    unsigned char    *pixels;       // width * height * 4 bytes, owned by this object
    int     width, height;
    void    *dataBlock;             // opaque user data handed to every callback
    void (*fAnim)(void*,int);       // per-frame callback: (dataBlock, tick count)
    void (*animExit)(void*);        // called once when ESC is pressed
    void (*clickDrag)(void*,int,int,int,int);  // (dataBlock, startX, startY, endX, endY)
    int     dragStartX, dragStartY;

    // Allocates a w x h RGBA buffer; 'd' is stored and passed back to callbacks.
    CPUAnimBitmap( int w, int h, void *d = NULL ) {
        width = w;
        height = h;
        pixels = new unsigned char[image_size()];
        dataBlock = d;
        // start with every callback cleared so none is called uninitialised
        fAnim = NULL;
        animExit = NULL;
        clickDrag = NULL;
        dragStartX = 0;
        dragStartY = 0;
    }

    ~CPUAnimBitmap() {
        delete [] pixels;
    }

    unsigned char* get_ptr( void ) const   { return pixels; }

    // Buffer size in bytes; computed in long so large dimensions do not
    // overflow the int multiplication.
    long image_size( void ) const { return (long)width * height * 4; }

    // Registers the callback invoked when a left-button drag is released.
    // Must be called before anim_and_exit() to take effect.
    void click_drag( void (*f)(void*,int,int,int,int)) {
        clickDrag = f;
    }

    // Creates the window, installs the GLUT callbacks and enters the GLUT
    // main loop. 'f' is called from the idle callback before each redisplay;
    // 'e' is called when ESC is pressed, just before exit(0).
    void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
        CPUAnimBitmap**   bitmap = get_bitmap_ptr();
        *bitmap = this;
        fAnim = f;
        animExit = e;
        // a bug in the Windows GLUT implementation prevents us from
        // passing zero arguments to glutInit()
        int c=1;
        // writable buffer: binding a string literal to char* is not valid C++
        char arg0[] = "";
        char* dummy = arg0;
        glutInit( &c, &dummy );
        glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
        glutInitWindowSize( width, height );
        glutCreateWindow( "bitmap" );
        glutKeyboardFunc(Key);
        glutDisplayFunc(Draw);
        if (clickDrag != NULL)
            glutMouseFunc( mouse_func );
        glutIdleFunc( idle_func );
        glutMainLoop();
    }

    // static method used for glut callbacks
    // Returns the address of the pointer that holds the active instance.
    static CPUAnimBitmap** get_bitmap_ptr( void ) {
        static CPUAnimBitmap*   gBitmap;
        return &gBitmap;
    }

    // static method used for glut callbacks
    // Records the press position and reports the drag on release.
    static void mouse_func( int button, int state,
                            int mx, int my ) {
        if (button == GLUT_LEFT_BUTTON) {
            CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
            if (state == GLUT_DOWN) {
                bitmap->dragStartX = mx;
                bitmap->dragStartY = my;
            } else if (state == GLUT_UP) {
                bitmap->clickDrag( bitmap->dataBlock,
                                   bitmap->dragStartX,
                                   bitmap->dragStartY,
                                   mx, my );
            }
        }
    }

    // static method used for glut callbacks
    // Produces the next frame and requests a redraw.
    static void idle_func( void ) {
        static int ticks = 1;
        CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
        if (bitmap->fAnim != NULL)
            bitmap->fAnim( bitmap->dataBlock, ticks++ );
        glutPostRedisplay();
    }

    // static method used for glut callbacks
    // ESC (key code 27) runs the exit callback, then terminates the process.
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {
                CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
                if (bitmap->animExit != NULL)
                    bitmap->animExit( bitmap->dataBlock );
                //delete bitmap;
                exit(0);
            }
        }
    }

    // static method used for glut callbacks
    // Draws the current contents of 'pixels' and swaps the buffers.
    static void Draw( void ) {
        CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
        glutSwapBuffers();
    }

private:
    // This object owns 'pixels'; a copy would delete the buffer twice, so
    // copying is declared but intentionally left undefined.
    CPUAnimBitmap( const CPUAnimBitmap& );
    CPUAnimBitmap& operator=( const CPUAnimBitmap& );
};

#define DIM 1024
#define PI 3.1415926535897932f

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                     cudaGetErrorString( err_ ) );                         \
            exit( EXIT_FAILURE );                                          \
        }                                                                  \
    } while (0)

// Renders one frame of a ripple centred at (DIM/2, DIM/2) into an RGBA buffer.
// One thread writes one pixel (4 bytes). The row pitch used for 'offset' is
// blockDim.x * gridDim.x, and there is no bounds check, so the 2D launch must
// cover exactly the image held in 'ptr'.
__global__ void kernel( unsigned char *ptr, int ticks ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x - DIM/2;
    float fy = y - DIM/2;
    float d = sqrtf( fx * fx + fy * fy );   // distance from the image centre
    // cosine wave in distance and time, damped by distance; cosf keeps the
    // whole expression in single precision
    unsigned char grey = (unsigned char)(128.0f + 127.0f *
                                         cosf(d/10.0f - ticks/7.0f) /
                                         (d/10.0f + 1.0f));
    ptr[offset*4 + 0] = grey;
    ptr[offset*4 + 1] = grey;
    ptr[offset*4 + 2] = grey;
    ptr[offset*4 + 3] = 255;
}

// State shared between main() and the animation callbacks.
struct DataBlock {
    unsigned char   *dev_bitmap;   // device buffer the kernel renders into
    CPUAnimBitmap  *bitmap;        // host bitmap that receives each frame
};

// Per-frame callback: renders frame 'ticks' on the GPU with a
// (DIM/16 x DIM/16) grid of 16x16 blocks and copies it into the host bitmap.
void generate_frame( DataBlock *d, int ticks ) {
    dim3    blocks(DIM/16,DIM/16);
    dim3    threads(16,16);
    kernel<<<blocks,threads>>>( d->dev_bitmap, ticks );
    // a kernel launch returns nothing; launch errors must be queried explicitly
    CUDA_CHECK( cudaGetLastError() );
    CUDA_CHECK( cudaMemcpy( d->bitmap->get_ptr(),
                            d->dev_bitmap,
                            d->bitmap->image_size(),
                            cudaMemcpyDeviceToHost ) );
}

// clean up memory allocated on the GPU
void cleanup( DataBlock *d ) {
    CUDA_CHECK( cudaFree( d->dev_bitmap ) );
}

// Allocates the device frame buffer and hands control to the bitmap's
// animation loop, with generate_frame/cleanup as its callbacks.
int main( void ) {
    DataBlock   data;
    CPUAnimBitmap  bitmap( DIM, DIM, &data );
    data.bitmap = &bitmap;

    CUDA_CHECK( cudaMalloc( (void**)&data.dev_bitmap,
                            bitmap.image_size() ) );

    bitmap.anim_and_exit( (void (*)(void*,int))generate_frame,
                          (void (*)(void*))cleanup );
    return 0;
}

编译,输出:

4.共享内存和同步

关键字:__shared__
用于单个线程块中多个线程之间的通信与协作,其他线程块上的线程无法读取和写入该线程块的共享内存中的变量。数据传输效率高。

4.1 点积运算

#include <stdio.h>
#include <stdlib.h>

// Arguments are parenthesised so that expressions can be passed safely.
// Note that both macros evaluate their arguments more than once.
#define imin(a,b) ((a)<(b)?(a):(b))
#define sum_squares(x)  ((x)*((x)+1)*(2*(x)+1)/6)

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                     cudaGetErrorString( err_ ) );                         \
            exit( EXIT_FAILURE );                                          \
        }                                                                  \
    } while (0)

const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid =
            imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

// Partial dot product of a and b (N elements each).
// Every thread accumulates a private sum over its elements, the block then
// reduces those sums in shared memory, and thread 0 of each block writes the
// block's total to c[blockIdx.x]; the caller adds up the gridDim.x partials.
// Preconditions: blockDim.x must be a power of 2 (required by the halving
// reduction) and must not exceed threadsPerBlock (the size of 'cache').
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    float   temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // the barrier sits outside the 'if' so every thread in the block reaches it
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}

// Computes the dot product of a[i] = i and b[i] = 2*i on the GPU, finishes
// the sum of the per-block partials on the CPU, and prints it next to the
// closed-form value 2 * sum_squares(N - 1).
int main( void ) {
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf( stderr, "Host allocation failed\n" );
        free( a );
        free( b );
        free( partial_c );
        return EXIT_FAILURE;
    }

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a,
                            N*sizeof(float) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b,
                            N*sizeof(float) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_partial_c,
                            blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, N*sizeof(float),
                            cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, N*sizeof(float),
                            cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns nothing; launch errors must be queried explicitly
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( partial_c, dev_partial_c,
                            blocksPerGrid*sizeof(float),
                            cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}

NOTE:__syncthreads()不能放在只有部分线程会进入的if分支里。线程块中的所有线程都必须执行到同一个__syncthreads();如果某些线程因条件不成立而跳过它,其余线程就会一直等待下去,导致程序挂起。

输出:

Does GPU value 2.57236e+13 = 2.57236e+13?

4.2 位图

/** Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.** NVIDIA Corporation and its licensors retain all intellectual property and * proprietary rights in and to this software and related documentation. * Any use, reproduction, disclosure, or distribution of this software * and related documentation without an express license agreement from* NVIDIA Corporation is strictly prohibited.** Please refer to the applicable NVIDIA end user license agreement (EULA) * associated with this source code for terms and conditions that govern * your use of this NVIDIA software.* */#include <stdio.h>
#include <GL/glut.h>

// Host-side RGBA pixel buffer (4 bytes per pixel) that is shown once in a
// GLUT window.
//
// GLUT callbacks are plain functions, so the active instance is published
// through a function-local static pointer (get_bitmap_ptr).
struct CPUBitmap {
    unsigned char    *pixels;      // x * y * 4 bytes, owned by this object
    int     x, y;                  // image width and height in pixels
    void    *dataBlock;            // opaque user data handed to bitmapExit
    void (*bitmapExit)(void*);     // optional callback run when ESC is pressed

    // Allocates a width x height RGBA buffer; 'd' is stored for the exit callback.
    CPUBitmap( int width, int height, void *d = NULL ) {
        x = width;
        y = height;
        pixels = new unsigned char[image_size()];
        dataBlock = d;
        // cleared so Key() never sees an uninitialised pointer
        bitmapExit = NULL;
    }

    ~CPUBitmap() {
        delete [] pixels;
    }

    unsigned char* get_ptr( void ) const   { return pixels; }

    // Buffer size in bytes; computed in long so large dimensions do not
    // overflow the int multiplication.
    long image_size( void ) const { return (long)x * y * 4; }

    // Creates the window, installs the GLUT callbacks and enters the GLUT
    // main loop. 'e', if given, is called with dataBlock when ESC is pressed.
    void display_and_exit( void(*e)(void*) = NULL ) {
        CPUBitmap**   bitmap = get_bitmap_ptr();
        *bitmap = this;
        bitmapExit = e;
        // a bug in the Windows GLUT implementation prevents us from
        // passing zero arguments to glutInit()
        int c=1;
        // writable buffer: binding a string literal to char* is not valid C++
        char arg0[] = "";
        char* dummy = arg0;
        glutInit( &c, &dummy );
        glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
        glutInitWindowSize( x, y );
        glutCreateWindow( "bitmap" );
        glutKeyboardFunc(Key);
        glutDisplayFunc(Draw);
        glutMainLoop();
    }

    // static method used for glut callbacks
    // Returns the address of the pointer that holds the active instance.
    static CPUBitmap** get_bitmap_ptr( void ) {
        static CPUBitmap   *gBitmap;
        return &gBitmap;
    }

    // static method used for glut callbacks
    // ESC (key code 27) runs the exit callback when both it and dataBlock
    // are set, then terminates the process.
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {
                CPUBitmap*   bitmap = *(get_bitmap_ptr());
                if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
                    bitmap->bitmapExit( bitmap->dataBlock );
                exit(0);
            }
        }
    }

    // static method used for glut callbacks
    // Draws the contents of 'pixels' into the single-buffered window.
    static void Draw( void ) {
        CPUBitmap*   bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
        glFlush();
    }

private:
    // This object owns 'pixels'; a copy would delete the buffer twice, so
    // copying is declared but intentionally left undefined.
    CPUBitmap( const CPUBitmap& );
    CPUBitmap& operator=( const CPUBitmap& );
};

#define DIM 1024
#define PI 3.1415926535897932f

// Fills the green channel of an RGBA image with a 2D sine pattern that each
// block stages in shared memory and then reads back mirrored.
// Launch layout: 2D grid of 16x16 blocks (the shared tile is [16][16] and is
// read at index 15 - threadIdx). The row pitch used for 'offset' is
// blockDim.x * gridDim.x and there is no bounds check, so the launch must
// cover exactly the image held in 'ptr'.
__global__ void kernel( unsigned char *ptr ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    __shared__ float    shared[16][16];

    // now calculate the value at that position
    const float period = 128.0f;

    shared[threadIdx.x][threadIdx.y] =
            255 * (sinf(x*2.0f*PI/ period) + 1.0f) *
                  (sinf(y*2.0f*PI/ period) + 1.0f) / 4.0f;

    // removing this syncthreads shows graphically what happens
    // when it doesn't exist.  this is an example of why we need it.
    // Each thread reads an entry written by a different thread of the block,
    // so all writes must be complete before any read.
    __syncthreads();

    ptr[offset*4 + 0] = 0;
    ptr[offset*4 + 1] = (unsigned char)shared[15-threadIdx.x][15-threadIdx.y];
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}

// globals needed by the update routine
struct DataBlock {
    unsigned char   *dev_bitmap;   // device buffer the kernel renders into
};

// Reports a failed CUDA runtime call on stderr.
// Returns true when 'err' is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t err, const char *what ) {
    if (err == cudaSuccess)
        return true;
    fprintf( stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString( err ) );
    return false;
}

// Renders the pattern once on the GPU, copies it into the host bitmap,
// releases the device buffer and displays the image.
// Returns 1 if any CUDA call fails.
int main( void ) {
    DataBlock   data;
    CPUBitmap bitmap( DIM, DIM, &data );
    unsigned char    *dev_bitmap;

    if (!cudaOk( cudaMalloc( (void**)&dev_bitmap,
                             bitmap.image_size() ), "cudaMalloc" ))
        return 1;
    data.dev_bitmap = dev_bitmap;

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);
    kernel<<<grids,threads>>>( dev_bitmap );

    // a kernel launch returns nothing; launch errors must be queried explicitly
    bool ok = cudaOk( cudaGetLastError(), "kernel launch" ) &&
              cudaOk( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
                                  bitmap.image_size(),
                                  cudaMemcpyDeviceToHost ), "cudaMemcpy" );

    // the image now lives on the host, so the device buffer can go either way
    ok = cudaOk( cudaFree( dev_bitmap ), "cudaFree" ) && ok;
    data.dev_bitmap = NULL;   // do not leave a dangling device pointer behind

    if (!ok)
        return 1;

    bitmap.display_and_exit();
    return 0;
}

删除其中的__syncthreads,得到以下输出:

添加其中的__syncthreads,得到以下输出:

CUDA 学习——线程协作(CUDA实战 第五章)相关推荐

  1. CUDA学习笔记之 CUDA存储器模型

    CUDA学习笔记之 CUDA存储器模型 标签: cuda存储bindingcache编程api 2010-12-14 01:33 1223人阅读 评论(0) 收藏 举报 分类: CUDA(26) GP ...

  2. cuda学习笔记5——CUDA实现图像形态学腐蚀、膨胀

    cuda学习笔记5--CUDA实现图像形态学腐蚀.膨胀 代码 linux如何编译cuda和opencv代码 耗时情况 代码 #include "cuda_runtime.h" #i ...

  3. 陈宝林《最优化理论与算法》超详细学习笔记 (七)————第五章 运输问题

    陈宝林<最优化理论与算法>超详细学习笔记 (七)----第五章 运输问题 第1节 运输问题的数学模型 第2节 表上作业法 2.1 确定初始基可行解 2.2 最优解的判别 2.3 改进的方法 ...

  4. 《C指针》学习笔记( 第四、五章)指针与字符串、指针与多维数组

    第四章:指针与字符串 字符串就是字符序列.它存储的是字节数组.一个特殊字符串结束符是用来标记字符串的结束.结束符用转义序列'\0'表示. 定义字符串变量的方法 char *strptr = " ...

  5. 《深度学习进阶 自然语言处理》第五章:RNN通俗介绍

    文章目录 5.1 概率和语言模型 5.1.1 概率视角下的word2vec 5.1.2 语言模型 5.1.3 将CBOW模型用作语言模型的效果怎么样? 5.2 RNN 5.2.1 循环神经网络 5.2 ...

  6. cuda学习(5):使用cuda核函数加速warpaffine

    1. warpaffine 介绍 放射变换(warpaffine),主要解决图像的缩放和平移来处理目标检测中常见的预处理行为. 比如有一张猫的图片,对图片进行letterbox变换,将图片进行缩放,并 ...

  7. Cuda 学习教程:Cuda 程序初始化

    Cuda程序初始化 目前,cuda里面没有对设备的初始化函数InitDevice(),只能每次调用的api函数的时候,加载设备的上下文,自动进行初始化,这将带来问题: First函数调用的时候,需要自 ...

  8. OpenGL学习笔记一之实战篇五 2D游戏(Breakout)之渲染精灵

    转载自 https://learnopengl-cn.github.io/06%20In%20Practice/2D-Game/03%20Rendering%20Sprites/ 本节暂未进行完全的重 ...

  9. DirectX 12 3D 游戏开发与实战第五章内容

    渲染流水线 学习目标: 了解用于在2D图像中表现出场景立体感和空间深度感等真实效果的关键因素 探索如何用Direct3D表示3D对象 学习如何建立虚拟摄像机 理解渲染流水线,根据给定的3D场景的几何描 ...

最新文章

  1. HttpHand和HttpModule的详细解释,包括Asp.Net对Http请求的处理流程。
  2. 美国多个城市禁止、我国却蓬勃发展的人脸识别是什么样子?
  3. 用python写一个简单的推荐系统 1
  4. Qomo OpenProject beta1 发布!
  5. mysql中int型的zerofill参数
  6. ThinkPHP--栏目增删改查ADSF
  7. 如何召开一次无效的会议?
  8. 找CTO杜仲:再谈中年危机和应对策略
  9. Written English-书面-一般现在时
  10. 理解快速傅里叶变换(FFT)算法
  11. Xlim函数--Matplotlib
  12. matlab 读取同一文件中所有图像_matlab 批量读取文件夹内所有图片的几种方法
  13. 计算机基础a3知识点,《计算机应用基础》期末试卷A(A3版).doc
  14. 硬件信息查看PHP,如何查看电脑硬件信息|查看电脑硬件信息软件 查看电脑CPU/显卡/内存条/主板型号等配置...
  15. opencv convertTo函数
  16. Home键监听与电源键
  17. 基于RTMP协议的Flash流媒体网页直播播放器
  18. 大型桥梁结构健康监测系统之GNSS位移监测系统
  19. jws webservice 跳过https认证_【大连学为贵5周年庆典】多邻国考试不能认证是怎么回事?这些雷区不要踩!...
  20. matlab的NNF算法,NNF是什么意思

热门文章

  1. C++接口实现汉字拼音转换
  2. 全志A33移植openharmony3.1标准系统之添加产品编译
  3. 错误 3 error C2181: 没有匹配 if 的非法 else c:\users\zhong\desktop\我的程序\c++qt程序\test\sineware\sineware\sinewa
  4. 「macOS无法验证此APP不包含恶意软件」的处理方式
  5. 计算机桌面底下显示条,详细教您电脑屏幕出现条纹怎么办
  6. 查看服务器型号和cpu参数等信息
  7. 无法从共享目录中映射段 linux,dopen():未作为root用户运行时,“无法从共享对象映射段”...
  8. AcWing寒假每日一题
  9. “诸神之眼”——Nmap端口扫描工具使用小手册
  10. LeetCode 1218 最长定差子序列