CUDA 学习(CUDA实战 第四章)
1.矢量求和
1.1 CPU
#include <stdio.h>#define N 10void add( int *a, int *b, int *c ) {int tid = 0; // this is CPU zero, so we start at zerowhile (tid < N) {c[tid] = a[tid] + b[tid];tid += 1; // we have one CPU, so we increment by one}
}int main( void ) {int a[N], b[N], c[N];// fill the arrays 'a' and 'b' on the CPUfor (int i=0; i<N; i++) {a[i] = -i;b[i] = i * i;}add( a, b, c );// display the resultsfor (int i=0; i<N; i++) {printf( "%d + %d = %d\n", a[i], b[i], c[i] );}return 0;
}
输出:
1.2 GPU
#include <stdio.h>
#include <stdlib.h>

#define N 10

// Evaluate a CUDA runtime call; on failure print the location and the
// runtime's error string to stderr, then terminate the program.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf( stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                     __LINE__, cudaGetErrorString( err_ ) );                \
            exit( EXIT_FAILURE );                                           \
        }                                                                   \
    } while (0)

/*
 * Kernel: c[i] = a[i] + b[i], one element per block.
 *
 * Launch layout: a 1-D grid with one thread per block. The element index
 * is taken from blockIdx.x only (threadIdx is not used), so the grid needs
 * at least N blocks to cover every element; blocks with an index >= N do
 * nothing.
 *
 * a, b : device pointers to at least N ints (inputs)
 * c    : device pointer to at least N ints (output)
 */
__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;    // each block handles the element at its block index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

/*
 * Host driver: builds a[i] = -i and b[i] = i*i, sums them on the GPU with
 * N single-thread blocks, and prints one "a + b = c" line per element.
 */
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );

    add<<<N,1>>>( dev_a, dev_b, dev_c );
    // a launch returns no status itself; a bad configuration shows up here
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i = 0; i < N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    return 0;
}
1.3 GPU (数据大)
#include <stdio.h>
#include <stdlib.h>

#define N (32 * 1024)

// Evaluate a CUDA runtime call; on failure print the location and the
// runtime's error string to stderr, then terminate the program.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf( stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                     __LINE__, cudaGetErrorString( err_ ) );                \
            exit( EXIT_FAILURE );                                           \
        }                                                                   \
    } while (0)

/*
 * Kernel: c[i] = a[i] + b[i] for i in [0, N), for any number of blocks.
 *
 * Launch layout: a 1-D grid with one thread per block. Each block starts
 * at element blockIdx.x and advances by gridDim.x until it passes N, so
 * the grid may have fewer blocks than there are elements. threadIdx is
 * not used.
 *
 * a, b : device pointers to at least N ints (inputs)
 * c    : device pointer to at least N ints (output)
 */
__global__ void add( int *a, int *b, int *c ) {
    int tid = blockIdx.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += gridDim.x;    // step over the elements the other blocks handle
    }
}

/*
 * Host driver: builds a[i] = i and b[i] = 2*i on the heap, sums them on the
 * GPU with 128 single-thread blocks, and compares every element against
 * the sum computed on the host.
 */
int main( void ) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "host allocation failed\n" );
        free( a );    // free(NULL) is a no-op, so this is safe for any subset
        free( b );
        free( c );
        return EXIT_FAILURE;
    }

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                            cudaMemcpyHostToDevice ) );

    add<<<128,1>>>( dev_a, dev_b, dev_c );
    // a launch returns no status itself; a bad configuration shows up here
    CUDA_CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost ) );

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf( "Error: %d + %d != %d\n", a[i], b[i], c[i] );
            success = false;
        }
    }
    if (success) printf( "We did it!\n" );

    // free the memory we allocated on the GPU
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    // free the memory we allocated on the CPU
    free( a );
    free( b );
    free( c );

    return 0;
}
2. Julia集曲线
2.1 CPU
#include <stdlib.h>
#include <GL/glut.h>

#define DIM 1000

/*
 * CPUBitmap: a host-side pixel buffer (4 bytes per pixel, drawn as
 * GL_RGBA / GL_UNSIGNED_BYTE) that can display itself in a GLUT window.
 *
 * The GLUT callbacks are static and locate the bitmap being shown through
 * the single pointer returned by get_bitmap_ptr().
 */
struct CPUBitmap {
    unsigned char *pixels;        // width * height * 4 bytes, owned by this object
    int x, y;                     // width and height in pixels
    void *dataBlock;              // opaque pointer passed to bitmapExit
    void (*bitmapExit)(void*);    // optional callback run just before exit

    // Allocates the pixel buffer; 'd' is stored untouched in dataBlock.
    CPUBitmap( int width, int height, void *d = NULL ) {
        pixels = new unsigned char[width * height * 4];
        x = width;
        y = height;
        dataBlock = d;
        bitmapExit = NULL;    // never leave the callback pointer indeterminate
    }

    ~CPUBitmap() {
        delete [] pixels;
    }

    unsigned char* get_ptr( void ) const { return pixels; }

    // Buffer size in bytes; widened before multiplying so the product is
    // computed in long rather than int.
    long image_size( void ) const { return (long)x * y * 4; }

    // Registers this bitmap for the callbacks, opens a window of the
    // bitmap's size and enters the GLUT main loop. 'e', if given, is called
    // with dataBlock when the key with code 27 (ESC) is pressed.
    void display_and_exit( void(*e)(void*) = NULL ) {
        CPUBitmap** bitmap = get_bitmap_ptr();
        *bitmap = this;
        bitmapExit = e;
        // a bug in the Windows GLUT implementation prevents us from
        // passing zero arguments to glutInit()
        int argc = 1;
        char arg0[] = "";                 // writable storage, not a string literal
        char *argv[] = { arg0, NULL };
        glutInit( &argc, argv );
        glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
        glutInitWindowSize( x, y );
        glutCreateWindow( "bitmap" );
        glutKeyboardFunc(Key);
        glutDisplayFunc(Draw);
        glutMainLoop();
    }

    // static method used for glut callbacks
    static CPUBitmap** get_bitmap_ptr( void ) {
        static CPUBitmap *gBitmap;
        return &gBitmap;
    }

    // static method used for glut callbacks
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {    // ESC: run the exit callback (if any), then quit
                CPUBitmap* bitmap = *(get_bitmap_ptr());
                if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
                    bitmap->bitmapExit( bitmap->dataBlock );
                exit(0);
            }
        }
    }

    // static method used for glut callbacks
    static void Draw( void ) {
        CPUBitmap* bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
        glFlush();
    }
};

/*
 * Minimal single-precision complex number: r is the real part, i the
 * imaginary part.
 */
struct cuComplex {
    float r;
    float i;

    cuComplex( float a, float b ) : r(a), i(b) {}

    // Squared magnitude r*r + i*i.
    float magnitude2( void ) { return r * r + i * i; }

    cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }

    cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};

/*
 * Julia-set membership test for pixel (x, y) of a DIM x DIM image.
 *
 * The pixel is mapped to the complex plane with the image centre at the
 * origin and a half-width of 'scale'. It then iterates a = a*a + c up to
 * 200 times with c = -0.8 + 0.156i.
 *
 * Returns 0 as soon as |a|^2 exceeds 1000, otherwise 1.
 */
int julia( int x, int y ) {
    const float scale = 1.5f;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8f, 0.156f);
    cuComplex a(jx, jy);

    for (int i = 0; i < 200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

/*
 * Fills a DIM x DIM buffer of 4-byte pixels: byte 0 is 255 where julia()
 * returns 1 and 0 elsewhere, bytes 1 and 2 are 0, and byte 3 is 255.
 *
 * ptr : buffer of at least DIM * DIM * 4 bytes
 */
void kernel( unsigned char *ptr ) {
    for (int y = 0; y < DIM; y++) {
        for (int x = 0; x < DIM; x++) {
            int offset = x + y * DIM;

            int juliaValue = julia( x, y );
            ptr[offset*4 + 0] = 255 * juliaValue;
            ptr[offset*4 + 1] = 0;
            ptr[offset*4 + 2] = 0;
            ptr[offset*4 + 3] = 255;
        }
    }
}

// Renders the set into a CPUBitmap on the host and shows it in a window.
int main( void ) {
    CPUBitmap bitmap( DIM, DIM );
    unsigned char *ptr = bitmap.get_ptr();

    kernel( ptr );

    bitmap.display_and_exit();
    return 0;
}
编译,运行:
2.2 GPU
// nvcc julia_gpu.cu -o julia -lglut -lGL

#include <stdio.h>
#include <stdlib.h>
#include <GL/glut.h>

#ifndef DIM
#define DIM 1000
#endif

// Evaluate a CUDA runtime call; on failure print the location and the
// runtime's error string to stderr, then terminate the program.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf( stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                     __LINE__, cudaGetErrorString( err_ ) );                \
            exit( EXIT_FAILURE );                                           \
        }                                                                   \
    } while (0)

/*
 * CPUBitmap: a host-side pixel buffer (4 bytes per pixel, drawn as
 * GL_RGBA / GL_UNSIGNED_BYTE) that can display itself in a GLUT window.
 *
 * The GLUT callbacks are static and locate the bitmap being shown through
 * the single pointer returned by get_bitmap_ptr().
 */
struct CPUBitmap {
    unsigned char *pixels;        // width * height * 4 bytes, owned by this object
    int x, y;                     // width and height in pixels
    void *dataBlock;              // opaque pointer passed to bitmapExit
    void (*bitmapExit)(void*);    // optional callback run just before exit

    // Allocates the pixel buffer; 'd' is stored untouched in dataBlock.
    CPUBitmap( int width, int height, void *d = NULL ) {
        pixels = new unsigned char[width * height * 4];
        x = width;
        y = height;
        dataBlock = d;
        bitmapExit = NULL;    // never leave the callback pointer indeterminate
    }

    ~CPUBitmap() {
        delete [] pixels;
    }

    unsigned char* get_ptr( void ) const { return pixels; }

    // Buffer size in bytes; widened before multiplying so the product is
    // computed in long rather than int.
    long image_size( void ) const { return (long)x * y * 4; }

    // Registers this bitmap for the callbacks, opens a window of the
    // bitmap's size and enters the GLUT main loop. 'e', if given, is called
    // with dataBlock when the key with code 27 (ESC) is pressed.
    void display_and_exit( void(*e)(void*) = NULL ) {
        CPUBitmap** bitmap = get_bitmap_ptr();
        *bitmap = this;
        bitmapExit = e;
        // a bug in the Windows GLUT implementation prevents us from
        // passing zero arguments to glutInit()
        int argc = 1;
        char arg0[] = "";                 // writable storage, not a string literal
        char *argv[] = { arg0, NULL };
        glutInit( &argc, argv );
        glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
        glutInitWindowSize( x, y );
        glutCreateWindow( "bitmap" );
        glutKeyboardFunc(Key);
        glutDisplayFunc(Draw);
        glutMainLoop();
    }

    // static method used for glut callbacks
    static CPUBitmap** get_bitmap_ptr( void ) {
        static CPUBitmap *gBitmap;
        return &gBitmap;
    }

    // static method used for glut callbacks
    static void Key(unsigned char key, int x, int y) {
        switch (key) {
            case 27: {    // ESC: run the exit callback (if any), then quit
                CPUBitmap* bitmap = *(get_bitmap_ptr());
                if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
                    bitmap->bitmapExit( bitmap->dataBlock );
                exit(0);
            }
        }
    }

    // static method used for glut callbacks
    static void Draw( void ) {
        CPUBitmap* bitmap = *(get_bitmap_ptr());
        glClearColor( 0.0, 0.0, 0.0, 1.0 );
        glClear( GL_COLOR_BUFFER_BIT );
        glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
        glFlush();
    }
};

/*
 * Minimal single-precision complex number for device code: r is the real
 * part, i the imaginary part. Every member function is __device__ only.
 */
struct cuComplex {
    float r;
    float i;

    __device__ cuComplex( float a, float b ) : r(a), i(b) {}

    // Squared magnitude r*r + i*i.
    __device__ float magnitude2( void ) {
        return r * r + i * i;
    }

    __device__ cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }

    __device__ cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};

/*
 * Julia-set membership test for pixel (x, y) of a DIM x DIM image.
 *
 * The pixel is mapped to the complex plane with the image centre at the
 * origin and a half-width of 'scale'. It then iterates a = a*a + c up to
 * 200 times with c = -0.8 + 0.156i.
 *
 * Returns 0 as soon as |a|^2 exceeds 1000, otherwise 1.
 */
__device__ int julia( int x, int y ) {
    const float scale = 1.5f;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8f, 0.156f);
    cuComplex a(jx, jy);

    for (int i = 0; i < 200; i++) {
        a = a * a + c;
        if (a.magnitude2() > 1000)
            return 0;
    }

    return 1;
}

/*
 * Kernel: colours one pixel per block.
 *
 * Launch layout: a 2-D grid with one thread per block; block (x, y) writes
 * the 4 bytes of pixel x + y * gridDim.x. threadIdx is not used and there
 * is no bounds check, so the grid must match the image the buffer was
 * sized for (main launches DIM x DIM blocks).
 *
 * ptr : device buffer of at least gridDim.x * gridDim.y * 4 bytes
 */
__global__ void kernel( unsigned char *ptr ) {
    // map from blockIdx to pixel position
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y * gridDim.x;

    // now calculate the value at that position
    int juliaValue = julia( x, y );
    ptr[offset*4 + 0] = 255 * juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}

// globals needed by the update routine
struct DataBlock {
    unsigned char *dev_bitmap;
};

/*
 * Renders the set on the GPU into a device buffer, copies it into a
 * CPUBitmap on the host, releases the device buffer, and shows the image.
 */
int main( void ) {
    DataBlock data;
    CPUBitmap bitmap( DIM, DIM, &data );
    unsigned char *dev_bitmap;

    CUDA_CHECK( cudaMalloc( (void**)&dev_bitmap, bitmap.image_size() ) );
    data.dev_bitmap = dev_bitmap;

    dim3 grid(DIM,DIM);
    kernel<<<grid,1>>>( dev_bitmap );
    // a launch returns no status itself; a bad configuration shows up here
    CUDA_CHECK( cudaGetLastError() );

    CUDA_CHECK( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
                            bitmap.image_size(),
                            cudaMemcpyDeviceToHost ) );

    CUDA_CHECK( cudaFree( dev_bitmap ) );
    data.dev_bitmap = NULL;    // do not keep a pointer to freed device memory

    bitmap.display_and_exit();
    return 0;
}
编译,运行:
CUDA 学习(CUDA实战 第四章)相关推荐
- C++ Primer 学习笔记(第四章:表达式)
2019独角兽企业重金招聘Python工程师标准>>> ##C++ Primer 学习笔记(第四章:表达式) [TOC] ###4.1 基础 左值和右值: 当一个对象被用作右值的时候 ...
- 机器学习理论《统计学习方法》学习笔记:第四章 朴素贝叶斯法
机器学习理论《统计学习方法》学习笔记:第四章 朴素贝叶斯法 4 朴素贝叶斯法 4.1 朴素贝叶斯法的学习与分类 4.1.1 基本方法 4.1.2 后验概率最大化的含义 4.2 朴素贝叶斯法 ...
- 尚学堂java实战第四章课后习题
尚学堂java实战第四章课后习题 文章中的题目答案仅供参考 选择题答案: 1.B 解析:一个java类必然存在构造器,即使没有定义构造器,也会存在一个默认的无参构造器. 2.D 3.AC 解析: A( ...
- 零基础学Python课后实战第四章
零基础学Python课后实战第四章 实战一:输出王者荣耀的游戏角色 实战二:模拟火车订票系统 实战三:电视剧的收视率排行榜 tips 实战一:输出王者荣耀的游戏角色 列表的创建.遍历列表 代码 pri ...
- javascript学习笔记(第四章图片库--初步了解)
javascript学习笔记(第四章图片库–初步了解) 通过前三章的学习我们已经对这个新的语言有了一个了解,js的语法基本和C语言一致,我们可以通过调用一些document对象中的函数来对实现一些很简 ...
- 学习笔记-第十四章 恶意代码分析实战
第十四章 恶意代码的网络特征 1.网络应对措施. 网络行为的基本属性包括IP地址,TCP端口,以及流量内容等,网络和安全 设备可以利用它们,来提供网络应对措施.根据IP地址和端口,防火墙和路由器可以限 ...
- android movie 资源释放,Android 资讯类App项目实战 第四章 电影模块
前言: 正在做一个资讯类app,打算一边做一边整理,供自己学习与巩固.用到的知识复杂度不高,仅适于新手.经验不多,如果写出来的代码有不好的地方欢迎讨论. 以往的内容 第四章 电影模块 本章内容最终效果 ...
- 机器学习实战——第四章(分类):朴素贝叶斯
前言 首先感谢博主:Jack-Cui 主页:http://blog.csdn.net/c406495762 朴素贝叶斯博文地址: https://blog.csdn.net/c406495762/ar ...
- Python入门:对Excel数据处理的学习笔记【第四章】字符串类型处理技术
注:该学习笔记是根据曾志贤老师编写的《从Excel到Python,用Python轻松处理Excel数据》所学习整理的笔记. 第四章 字符串类型处理技术 目录 第四章 字符串类型处理技术 一 ...
- 【OpenCV 学习笔记】第四章: 色彩空间类型转换
第四章: 色彩空间类型转换 为什么有色彩空间这个概念? 世界本是无颜色的,我们人类看到的各种有色光只是特定波长的电磁波能够刺激人眼的锥体细胞,进而在人脑中形成颜色信号而已,实际上电磁波的波长域是非常广 ...
最新文章
- linux7安装haproxy,Centos7 源码编译安装haproxy
- python函数内部变量能改变外部吗_python中,如何利用函数修改外部变量,注意我的要求...
- Windows下通过VNC远程访问Linux服务器,并实现可视化
- octave进行积分运算
- 2020 职场,哪类程序员会是王者?!
- HAProxy介绍及配置文件详解
- 神奇的linux发行版 tiny core linux
- pyecharts实现多节点、长路径的sankey桑基图
- 第01期:salesforce开发环境的搭建
- Linux命令(五)之service服务查找、启动/停止等相关操作
- ZZULIOJ 1030~1039(oj入门题)
- 189邮箱smpt服务器,189邮箱登录(常用邮箱客户端设置指南)
- hashmap的底层
- 什么是索引回表,如何避免(索引覆盖)
- 《SLA by Short brain》—学好英语口语的终极法宝!
- python实现汇率转换
- jdk9 jdk10 jdk11启动rocketMQ的问题
- Eclipse里看到project 存在向上或者向下的箭头
- 《激荡三十年》——来了解我们的时代
- 提交form表单方法
热门文章
- 计算机中cpu是不是内存,电脑卡是cpu还是内存
- Android根据图片路径获取图片名字
- 【Verilog】FPGA控制RGB灯WS2812B
- 亲测绝对有用,,电脑不显示可连接WiFi列表,无法连接WiFi的解决办法。
- MyBatis_Learning_3.5.7
- Java最基础的算法逻辑题,新手一定要看看呀。
- 百度地图地图及定位实现
- 项目经理如何做好项目管理PMP - 持续更新
- python搭建qt开发环境_QT开发环境搭建(Windows)
- 小米手环NFC 模拟门禁卡 不买东西最简解决方案(100字)