Since I have only just started learning CUDA, I have not yet organized a complete CUDA class of my own; I am simply modifying the kernel skeleton that Nvidia provides.

But it is more than enough for a first taste of the benefits the GPU brings us.

Here is the code:

/* Matrix multiplication: CPU version vs. GPU versions */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;time.h&gt;
#include &lt;math.h&gt;
#include &lt;Windows.h&gt;
#include &lt;string&gt;
#include &lt;malloc.h&gt;

// Used to select between the different GPU optimization versions
enum Type
{
    // Mode 1: assign one thread to each element C[i][j].
    Mode1 = 1,
    // Mode 2: instead of letting one thread compute a full C[i][j], note from
    // C(i,j) = sum{ A(i,k) * B(k,j) } that we can subdivide further:
    //     Csub(i,j) = sum{ A(i, ksub + offsetA) * B(ksub + offsetB, j) },  0 <= ksub < blockSize
    //     C(i,j)    = sum{ Csub(i,j) }
    // That is, split the matrices into n*n large sub-blocks; each block computes the
    // sub-product of sub-block i and sub-block j, and the partial results are summed
    // at the end. The main optimization here is the use of shared memory.
    Mode2 = 2
};

cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode);

__global__ void MatrixMulGPU_1(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    float sum = 0;
    // Find the row and column this thread is in
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Thread(row, col) is responsible for computing C(row, col)
    for (int i = 0; i < WA; ++i)
    {
        sum += a[row * WA + i] * b[i * WB + col];
    }
    c[row * WB + col] = sum;
}

template<int BLOCK_SIZE> __global__ void MatrixMulGPU_2(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * WB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int i = aBegin, j = bBegin; i <= aEnd; i += aStep, j += bStep)
    {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = a[i + WA * ty + tx];
        Bs[ty][tx] = b[j + WB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[k + WB * ty + tx] = Csub;
}

// CPU version
void MatrixMulCPU(float *_C, const float* _A, const float* _B, int WA, int HA, int WB, int HB)
{
    if (WA != HB)
    {
        printf("the matrix A and B cannot be multiplied!");
        exit(1);
    }
    for (int i = 0; i < HA; ++i)
    {
        for (int j = 0; j < WB; ++j)
        {
            for (int k = 0; k < WA; ++k)
            {
                _C[i * WB + j] += _A[i * WA + k] * _B[k * WB + j];
            }
        }
    }
}

// Give the initial matrices random values
void randomInit(float* _data, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        _data[i] = rand() / (float)RAND_MAX * 100;
    }
}

// Print the matrix
void printMatrix(float* m_Matrix, int W, int H)
{
    for (int i = 0; i < W * H; ++i)
    {
        printf("%2.1f ", m_Matrix[i]);
        if ((i + 1) % W == 0) printf("\n");
    }
    printf("\n");
}

bool CheckAnswer(const float* _C, const float* _D, unsigned int size)
{
    bool isRight = true;
    for (unsigned int i = 0; i < size && isRight; ++i)
    {
        // Compare with a small relative tolerance: the GPU versions accumulate
        // in a different order, so exact float equality is too strict.
        if (fabsf(_C[i] - _D[i]) > 1e-3f * fabsf(_C[i]))
            isRight = false;
    }
    return isRight;
}

int main()
{
    const int width_A = 1024;
    const int height_A = 1024;
    const int width_B = 1024;
    const int height_B = 1024;

    float *B = (float *)malloc(sizeof(float) * height_B * width_B);
    float *A = (float *)malloc(sizeof(float) * height_A * width_A);
    float *C = (float *)malloc(sizeof(float) * height_A * width_B);
    float *D = (float *)malloc(sizeof(float) * height_A * width_B);
    float *E = (float *)malloc(sizeof(float) * height_A * width_B);

    memset(A, 0, sizeof(float) * height_A * width_A);
    memset(B, 0, sizeof(float) * height_B * width_B);
    memset(C, 0, sizeof(float) * height_A * width_B);
    memset(D, 0, sizeof(float) * height_A * width_B);
    memset(E, 0, sizeof(float) * height_A * width_B);

    // Seed the random number generator
    srand((unsigned)time(0));
    randomInit(B, height_B * width_B);
    randomInit(A, height_A * width_A);
    //printMatrix(B, width_B, height_B);
    //printMatrix(A, width_A, height_A);

    // CPU computation
    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C, A, B, width_A, height_A, width_B, height_B);
    printf("CPU use time : %dms\n", GetTickCount() - tick1);

    // GPU computation
    Type m_Mode = Mode1;
    unsigned int tick2 = GetTickCount();
    cudaError_t cudaStatus = addWithCuda(D, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode1 use time : %dms\n", GetTickCount() - tick2);

    m_Mode = Mode2;
    unsigned int tick3 = GetTickCount();
    cudaStatus = addWithCuda(E, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode2 use time : %dms\n", GetTickCount() - tick3);

    // Check whether the GPU results match the CPU result
    if (!CheckAnswer(C, D, height_A * width_B) || !CheckAnswer(C, E, height_A * width_B))
        printf("The answer is wrong!");
    else
        printf("The answer is right!");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    free(A); free(B); free(C); free(D); free(E);
    return 0;
}

// Helper function that uses CUDA to multiply the matrices on the GPU.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, HA * WB * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, HA * WA * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, HB * WB * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, HA * WA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, HB * WB * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Assign one thread to compute each C[i][j].
    // (Scoped in a block so the earlier gotos do not jump over the
    // dim3 initializations, which C++ forbids.)
    {
        int block_size = 16;
        dim3 Threads(block_size, block_size);
        dim3 Blocks(WB / block_size, HA / block_size);

        // Launch a kernel on the GPU with one thread for each element.
        if (mode == Mode1)
        {
            MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
        if (mode == Mode2)
        {
            MatrixMulGPU_2<16><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching the kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, HA * WB * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
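One caveat: the launch configuration above assumes the matrix dimensions are exact multiples of block_size (1024 / 16 here), so no threads fall outside the matrices. For arbitrary sizes, the usual approach is to round the grid up and bounds-check inside the kernel. A minimal sketch (the name MatrixMulGPU_1_Any and the extra HA parameter are mine, not part of the code above):

// Hypothetical variant of MatrixMulGPU_1 for sizes that are not
// multiples of the block size.
__global__ void MatrixMulGPU_1_Any(float *c, const float *a, const float *b,
                                   unsigned int WA, unsigned int WB, unsigned int HA)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= (int)HA || col >= (int)WB) return;  // guard threads outside the matrix

    float sum = 0;
    for (int k = 0; k < WA; ++k)
        sum += a[row * WA + k] * b[k * WB + col];
    c[row * WB + col] = sum;
}

// Launch with a rounded-up grid:
//   dim3 Threads(block_size, block_size);
//   dim3 Blocks((WB + block_size - 1) / block_size,
//               (HA + block_size - 1) / block_size);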

The code runs the computation three ways in total: once on the CPU and with the two GPU variants. The final results are as follows:

(Screenshot of the timing output not reproduced here.)

It is clear that the GPU runs much faster than the CPU, and that the more finely the work is subdivided, the faster it runs.
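One caveat on the numbers: GetTickCount has coarse resolution (roughly 10-16 ms) and, as used above, also measures the host-side allocations and copies inside addWithCuda. For timing the kernels themselves, CUDA events are the usual tool. A minimal sketch, placed around the Mode1 launch inside addWithCuda (the variable names start, stop, and ms are mine):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);                    // record before the kernel launch
MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
cudaEventRecord(stop, 0);                     // record after the launch
cudaEventSynchronize(stop);                   // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);       // elapsed time in milliseconds
printf("kernel time : %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);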

Going forward, I would also like to keep optimizing with further techniques (for example, texture binding).
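For reference, here is a minimal sketch of what that could look like with the CUDA texture-object API, reading A through a texture while B stays in global memory. The kernel name MatrixMulGPU_tex and the variable texA are hypothetical; this is an untested sketch, not code from the post:

// Hypothetical kernel reading A through a texture object.
__global__ void MatrixMulGPU_tex(float *c, cudaTextureObject_t texA, const float *b,
                                 unsigned int WA, unsigned int WB)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0;
    for (int k = 0; k < WA; ++k)
        sum += tex1Dfetch<float>(texA, row * WA + k) * b[k * WB + col];
    c[row * WB + col] = sum;
}

// Host side: wrap the existing dev_a buffer in a texture object.
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = dev_a;
resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
resDesc.res.linear.sizeInBytes = HA * WA * sizeof(float);

cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;

cudaTextureObject_t texA = 0;
cudaCreateTextureObject(&texA, &resDesc, &texDesc, NULL);
// ... launch MatrixMulGPU_tex, then clean up:
cudaDestroyTextureObject(texA);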

Original post: http://www.cnblogs.com/stormhan/p/5467187.html
