Since I have only just started learning CUDA, I have not yet organized a complete CUDA class of my own; I am simply modifying the kernel skeleton that Nvidia provides.

But it is more than enough for a first taste of the benefits the GPU brings us.

Here is the code:

/* Matrix multiplication: CPU version vs. GPU versions */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;time.h&gt;
#include &lt;math.h&gt;
#include &lt;Windows.h&gt;
#include &lt;string&gt;
#include &lt;malloc.h&gt;

// Used to select between the different GPU optimization versions
enum Type
{
    // Mode 1: assign one thread to each element C[i][j].
    Mode1 = 1,
    // Mode 2: instead of letting one thread compute a full C[i][j], note from
    // C(i,j) = sum{ A(i,k) * B(k,j) } that we can subdivide further:
    //     Csub(i,j) = sum{ A(i, ksub + offsetA) * B(ksub + offsetB, j) },  0 <= ksub < blockSize
    //     C(i,j)    = sum{ Csub(i,j) }
    // That is, split the matrices into n*n large sub-blocks; each block computes the
    // sub-product of sub-block i and sub-block j, and the partial results are summed
    // at the end. The main optimization here is the use of shared memory.
    Mode2 = 2
};

cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode);

__global__ void MatrixMulGPU_1(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    float sum = 0;
    // Find the row and column this thread is in
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Thread(row, col) is responsible for computing C(row, col)
    for (int i = 0; i < WA; ++i)
    {
        sum += a[row * WA + i] * b[i * WB + col];
    }
    c[row * WB + col] = sum;
}

template<int BLOCK_SIZE> __global__ void MatrixMulGPU_2(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * WB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int i = aBegin, j = bBegin; i <= aEnd; i += aStep, j += bStep)
    {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = a[i + WA * ty + tx];
        Bs[ty][tx] = b[j + WB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[k + WB * ty + tx] = Csub;
}

// CPU version
void MatrixMulCPU(float *_C, const float* _A, const float* _B, int WA, int HA, int WB, int HB)
{
    if (WA != HB)
    {
        printf("the matrix A and B cannot be multiplied!");
        exit(1);
    }
    for (int i = 0; i < HA; ++i)
    {
        for (int j = 0; j < WB; ++j)
        {
            for (int k = 0; k < WA; ++k)
            {
                _C[i * WB + j] += _A[i * WA + k] * _B[k * WB + j];
            }
        }
    }
}

// Give the initial matrices random values
void randomInit(float* _data, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        _data[i] = rand() / (float)RAND_MAX * 100;
    }
}

// Print the matrix
void printMatrix(float* m_Matrix, int W, int H)
{
    for (int i = 0; i < W * H; ++i)
    {
        printf("%2.1f ", m_Matrix[i]);
        if ((i + 1) % W == 0) printf("\n");
    }
    printf("\n");
}

bool CheckAnswer(const float* _C, const float* _D, unsigned int size)
{
    bool isRight = true;
    for (unsigned int i = 0; i < size && isRight; ++i)
    {
        // Compare with a small relative tolerance: the GPU versions accumulate
        // in a different order, so exact float equality is too strict.
        if (fabsf(_C[i] - _D[i]) > 1e-3f * fabsf(_C[i]))
            isRight = false;
    }
    return isRight;
}

int main()
{
    const int width_A = 1024;
    const int height_A = 1024;
    const int width_B = 1024;
    const int height_B = 1024;

    float *B = (float *)malloc(sizeof(float) * height_B * width_B);
    float *A = (float *)malloc(sizeof(float) * height_A * width_A);
    float *C = (float *)malloc(sizeof(float) * height_A * width_B);
    float *D = (float *)malloc(sizeof(float) * height_A * width_B);
    float *E = (float *)malloc(sizeof(float) * height_A * width_B);

    memset(A, 0, sizeof(float) * height_A * width_A);
    memset(B, 0, sizeof(float) * height_B * width_B);
    memset(C, 0, sizeof(float) * height_A * width_B);
    memset(D, 0, sizeof(float) * height_A * width_B);
    memset(E, 0, sizeof(float) * height_A * width_B);

    // Seed the random number generator
    srand((unsigned)time(0));
    randomInit(B, height_B * width_B);
    randomInit(A, height_A * width_A);
    //printMatrix(B, width_B, height_B);
    //printMatrix(A, width_A, height_A);

    // CPU computation
    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C, A, B, width_A, height_A, width_B, height_B);
    printf("CPU use time : %dms\n", GetTickCount() - tick1);

    // GPU computation
    Type m_Mode = Mode1;
    unsigned int tick2 = GetTickCount();
    cudaError_t cudaStatus = addWithCuda(D, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode1 use time : %dms\n", GetTickCount() - tick2);

    m_Mode = Mode2;
    unsigned int tick3 = GetTickCount();
    cudaStatus = addWithCuda(E, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode2 use time : %dms\n", GetTickCount() - tick3);

    // Check whether the GPU results match the CPU result
    if (!CheckAnswer(C, D, height_A * width_B) || !CheckAnswer(C, E, height_A * width_B))
        printf("The answer is wrong!");
    else
        printf("The answer is right!");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    free(A); free(B); free(C); free(D); free(E);
    return 0;
}

// Helper function that uses CUDA to multiply the matrices on the GPU.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, HA * WB * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, HA * WA * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, HB * WB * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, HA * WA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, HB * WB * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Assign one thread to compute each C[i][j].
    // (Scoped in a block so the earlier gotos do not jump over the
    // dim3 initializations, which C++ forbids.)
    {
        int block_size = 16;
        dim3 Threads(block_size, block_size);
        dim3 Blocks(WB / block_size, HA / block_size);

        // Launch a kernel on the GPU with one thread for each element.
        if (mode == Mode1)
        {
            MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
        if (mode == Mode2)
        {
            MatrixMulGPU_2<16><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching the kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, HA * WB * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
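One caveat: the launch configuration above assumes the matrix dimensions are exact multiples of block_size (1024 / 16 here), so no threads fall outside the matrices. For arbitrary sizes, the usual approach is to round the grid up and bounds-check inside the kernel. A minimal sketch (the name MatrixMulGPU_1_Any and the extra HA parameter are mine, not part of the code above):

// Hypothetical variant of MatrixMulGPU_1 for sizes that are not
// multiples of the block size.
__global__ void MatrixMulGPU_1_Any(float *c, const float *a, const float *b,
                                   unsigned int WA, unsigned int WB, unsigned int HA)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= (int)HA || col >= (int)WB) return;  // guard threads outside the matrix

    float sum = 0;
    for (int k = 0; k < WA; ++k)
        sum += a[row * WA + k] * b[k * WB + col];
    c[row * WB + col] = sum;
}

// Launch with a rounded-up grid:
//   dim3 Threads(block_size, block_size);
//   dim3 Blocks((WB + block_size - 1) / block_size,
//               (HA + block_size - 1) / block_size);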

The code runs the computation three ways in total: once on the CPU and with the two GPU variants. The final results are as follows:

(Screenshot of the timing output not reproduced here.)

It is clear that the GPU runs much faster than the CPU, and that the more finely the work is subdivided, the faster it runs.
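One caveat on the numbers: GetTickCount has coarse resolution (roughly 10-16 ms) and, as used above, also measures the host-side allocations and copies inside addWithCuda. For timing the kernels themselves, CUDA events are the usual tool. A minimal sketch, placed around the Mode1 launch inside addWithCuda (the variable names start, stop, and ms are mine):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);                    // record before the kernel launch
MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
cudaEventRecord(stop, 0);                     // record after the launch
cudaEventSynchronize(stop);                   // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);       // elapsed time in milliseconds
printf("kernel time : %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);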

Going forward, I would also like to keep optimizing with further techniques (for example, texture binding).
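For reference, here is a minimal sketch of what that could look like with the CUDA texture-object API, reading A through a texture while B stays in global memory. The kernel name MatrixMulGPU_tex and the variable texA are hypothetical; this is an untested sketch, not code from the post:

// Hypothetical kernel reading A through a texture object.
__global__ void MatrixMulGPU_tex(float *c, cudaTextureObject_t texA, const float *b,
                                 unsigned int WA, unsigned int WB)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0;
    for (int k = 0; k < WA; ++k)
        sum += tex1Dfetch<float>(texA, row * WA + k) * b[k * WB + col];
    c[row * WB + col] = sum;
}

// Host side: wrap the existing dev_a buffer in a texture object.
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = dev_a;
resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
resDesc.res.linear.sizeInBytes = HA * WA * sizeof(float);

cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;

cudaTextureObject_t texA = 0;
cudaCreateTextureObject(&texA, &resDesc, &texDesc, NULL);
// ... launch MatrixMulGPU_tex, then clean up:
cudaDestroyTextureObject(texA);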

Original post: http://www.cnblogs.com/stormhan/p/5467187.html
