c++实现maxpooling+利用OpenMP、SIMD优化代码

1.概述

最近接触了一个问题：如何用C++实现图像中的maxpooling操作。在网上找了一些资料，发现只有一些简单的实现，这些实现实际运行时开销较高。因此，本文从C++的多线程（OpenMP）、“单指令多数据”（SIMD）等方向出发，探究如何优化maxpooling操作。

2. maxpooling

参考博客对Max Pooling的理解_117瓶果粒橙-CSDN博客_maxpooling中的理解，整个图片被不重叠的分割成若干个同样大小的小块（pooling size）。每个小块内只取最大的数字，再舍弃其他节点后，保持原有的平面结构得出 output。

在这篇文章中设定最大池化操作的参数kernel_size=3,stride=2,padding=1（在图像周围填充一格0）,那么当输入矩阵src1为：

1	1	2	4
5	6	7	8
3	2	1	0
1	2	3	4

最大池化的结果应该为

6	8
6	8

3.pytorch实现和c++实现

pytorch

c++(版本1)

//求maxpooling操作vector<vector<vector<vector<int>>>> maxpooling(){int batch = src1.size(); int channel = src1[0].size();int height=src1[0][0].size(); int width=src1[0][0][0].size();int out_h = (height-3+2)/2, out_w = (width-3+2)/2;int mod_h = (height-3+2)%2, mod_w =(width-3+2)%2;if (mod_h != 0) out_h++;if (mod_w != 0) out_w++;//先填充vector<vector<vector<vector<int>>>> pad_map;for(int b=0;b<batch;b++){vector<vector<vector<int>>> c_tmp;for(int c=0;c<channel;c++){vector<vector<int>> h_tmp(height+2,vector<int>(width+2,0));for(int h=0;h<height+2;h++){for(int w=0;w<width+2;w++){if(h>=1&&h<height+2-1&&w>=1&&w<width+2-1){h_tmp[h][w]=src1[b][c][h-1][w-1];}}}c_tmp.push_back(h_tmp);}pad_map.push_back(c_tmp);c_tmp.clear();}// cout<<pad_map.size()<<"###"; //求maxpoling后的结果vector<vector<vector<vector<int>>>> pool_out;vector<vector<vector<vector<vector<int>>>>> max_index;for(int b=0;b<batch;b++){vector<vector<vector<int>>> c_tmp;vector<vector<vector<vector<int>>>> c_index;for(int c=0;c<channel;c++){vector<vector<int>> res(out_h, vector<int>(out_w, 0));vector<vector<vector<int>>> res_index;for(int i=0;i<out_h;i++){vector<vector<int>> tmp;for (int j = 0; j < out_w; j++){int start_x = i*2;int start_y = j*2;// vector<int> temp;int max_ele=pad_map[b][c][start_x][start_y];vector<int> m_idx(2,0);for(int ii=0;ii<3;ii++)for (int jj = 0; jj <3; jj++){// temp.push_back(pad_map[b][c][start_y + jj][start_x + ii]);if(pad_map[b][c][start_x + ii][start_y + jj]>max_ele){max_ele=pad_map[b][c][start_x + ii][start_y + jj];m_idx[0]=(start_x + ii);m_idx[1]=(start_y+jj);}}// sort(temp.begin(), temp.end());tmp.push_back(m_idx);res[i][j] = max_ele;}res_index.push_back(tmp);tmp.clear();}c_tmp.push_back(res);c_index.push_back(res_index);res_index.clear();res.clear();}pool_out.push_back(c_tmp);max_index.push_back(c_index);c_index.clear();c_tmp.clear();}argmax_index=max_index;return pool_out;}

从代码中可以看到，maxpooling的操作无非就是先填充，然后取滑动窗口中数值的最大值。计算maxpooling需要六层循环，计算量为batch*channel*out_h*out_w*3*3，可见开销挺高的。

4. 代码优化

（1）版本1的代码中存在着大量vector的插入、删除操作（哈哈，还不是很熟悉c++），这些操作消耗了大量时间；

（2）引入OpenMP，利用多线程操作，完成最大值池化，可以参考OpenMP共享内存并行编程详解 - liangliangh - 博客园 (cnblogs.com)；

（3）gcc编译器本身支持代码优化，使用-O3选项，可对代码进行自动优化；

最终优化的代码C++

//求maxpooling操作vector<vector<vector<vector<vector<int>>>>> maxpooling(vector<vector<vector<vector<int>>>> &res){cout<<clock()<<endl;int batch = src1.size(); int channel = src1[0].size();int height=src1[0][0].size(); int width=src1[0][0][0].size();int out_h = (height-3+2)/2, out_w = (width-3+2)/2;int mod_h = (height-3+2)%2, mod_w =(width-3+2)%2;if (mod_h != 0) out_h++;if (mod_w != 0) out_w++;// cout<<"out size";// cout<<out_h<<" ";// cout<<out_w<<endl;// cout<<clock()<<endl;//先填充// vector<vector<vector<vector<int>>>> pad_map(batch,vector<vector<vector<int>>>(channel,vector<vector<int>>(height+2,vector<int>(width+2,0))));//  #pragma omp parallel num_threads(4)// {//     #pragma omp for//         for(int b=0;b<batch;b++){//             for(int c=0;c<channel;c++){//                 for(int h=0;h<height+2;h++){//                     for(int w=0;w<width+2;w++){//                         if(h>=1&&h<height+2-1&&w>=1&&w<width+2-1){//                             pad_map[b][c][h][w]=src1[b][c][h-1][w-1];//                         }//                     }//                 }//             }//         }// }cout<<clock()<<endl;// cout<<"pad_map";// cout<<pad_map.size()<<" ";// cout<<pad_map[0].size()<<" ";// cout<<pad_map[0][0].size()<<" ";// cout<<pad_map[0][0][0].size()<<endl;//求maxpoling后的结果// vector<vector<vector<vector<int>>>> pool_out;// pool_out.resize(channel);vector<vector<vector<vector<int>>>> pool_out;vector<vector<vector<vector<vector<int>>>>> max_index;max_index.resize(batch);pool_out.resize(batch);#pragma omp parallel forfor(int b=0;b<batch;b++){pool_out[b].resize(channel);max_index[b].resize(channel);for(int c=0;c<channel;c++){pool_out[b][c].resize(out_h);max_index[b][c].resize(out_h);for(int i=0;i<out_h;i++){pool_out[b][c][i].resize(out_w);max_index[b][c][i].resize(out_w);for(int j=0;j<out_w;j++){max_index[b][c][i][j].resize(2);}}}}//vector<vector<vector<vector<int>>>> 
pool_out(batch,vector<vector<vector<int>>>(channel,vector<vector<int>>(out_h,vector<int>(out_w,0))));//vector<vector<vector<vector<vector<int>>>>> max_index(batch,vector<vector<vector<vector<int>>>>(channel,vector<vector<vector<int>>>(out_h,vector<vector<int>>(out_w,vector<int>(2,0)))));// cout<<"2222"<<endl;    cout<<clock()<<endl; // #pragma omp parallel num_threads(4)// {#pragma omp  parallel forfor(int b=0;b<batch;b++){for(int c=0;c<channel;c++){for(int i=0;i<out_h;i++){for (int j = 0; j < out_w; j++){int max_ele=INT_LEAST32_MIN;max_index[b][c][i][j][0]=(i*2);max_index[b][c][i][j][1]=( j*2);for(int ii=0;ii<3;ii++)for (int jj = 0; jj <3; jj++){int tmp_x=i*2+ii;int tmp_y=j*2+jj;if(tmp_x==0 || tmp_x==height+1 || tmp_y==0 || tmp_y==width+1){continue;}if(src1[b][c][tmp_x-1][tmp_y-1]>max_ele){max_ele=src1[b][c][tmp_x-1][tmp_y-1];max_index[b][c][i][j][0]=(tmp_x);max_index[b][c][i][j][1]=(tmp_y);}}pool_out[b][c][i][j]=max_ele;}}    }}// }cout<<clock()<<endl;// cout<<"#####"<<endl;// argmax_index=max_index;cout<<clock()<<endl;res=pool_out;return max_index;}

5.实际测试的时间消耗

在设置输入矩阵的维度为[32,64,112,112]

代码	时间消耗（s）
版本1	10.69
+（优化vector的使用）	4.67
+openMp	3.62
+gcc本身指令优化（O3）	0.41

6. SIMD

在maxpooling上，暂时没想到比较好的单条指令多数据并行的方式，因此在元素求和上进行了一些尝试。

c++代码

 // Writes out[w] = lhs[w] + rhs[w] for w in [0, width). A side flagged as scalar
// contributes its element 0 to every column (width broadcasting).
// With SSE2 available, four 32-bit ints are added per step; the remaining
// columns (and every column without SSE2) go through the scalar loop.
static void add_row_broadcast(const int *lhs, bool lhs_is_scalar,
                              const int *rhs, bool rhs_is_scalar,
                              int *out, int width) {
    int w = 0;
#if defined(__SSE2__) || defined(_M_X64)
    static_assert(sizeof(int) == 4, "SSE2 path assumes 32-bit int");
    // Broadcast values are loop-invariant, so build them once.
    const __m128i lhs_splat = _mm_set1_epi32(lhs[0]);
    const __m128i rhs_splat = _mm_set1_epi32(rhs[0]);
    for (; w + 4 <= width; w += 4) {
        const __m128i a =
            lhs_is_scalar
                ? lhs_splat
                : _mm_loadu_si128(reinterpret_cast<const __m128i *>(lhs + w));
        const __m128i b =
            rhs_is_scalar
                ? rhs_splat
                : _mm_loadu_si128(reinterpret_cast<const __m128i *>(rhs + w));
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out + w),
                         _mm_add_epi32(a, b));
    }
#endif
    for (; w < width; ++w) {
        out[w] = lhs[lhs_is_scalar ? 0 : w] + rhs[rhs_is_scalar ? 0 : w];
    }
}

// Element-wise sum of two tensors indexed as [batch][channel][row][col].
//
// Along each axis the two sizes must be equal, or one of them must be 1, in
// which case that tensor is repeated along the axis. The result has the larger
// size on every axis. Shapes are taken from element [0] at each level; both
// tensors are assumed to be rectangular.
//
// If an axis is incompatible, a message is printed to stdout and a
// zero-filled tensor of the result shape is returned. If either tensor is
// empty at any level, an empty tensor is returned.
std::vector<std::vector<std::vector<std::vector<int>>>> add(
    const std::vector<std::vector<std::vector<std::vector<int>>>> &tensor1,
    const std::vector<std::vector<std::vector<std::vector<int>>>> &tensor2) {
    using Row = std::vector<int>;
    using Plane = std::vector<Row>;
    using Volume = std::vector<Plane>;
    using Tensor4 = std::vector<Volume>;

    // Guard the tensor[0]... lookups below against empty input.
    if (tensor1.empty() || tensor1[0].empty() || tensor1[0][0].empty() ||
        tensor1[0][0][0].empty() || tensor2.empty() || tensor2[0].empty() ||
        tensor2[0][0].empty() || tensor2[0][0][0].empty()) {
        return Tensor4();
    }

    const int batch_1 = static_cast<int>(tensor1.size());
    const int channel_1 = static_cast<int>(tensor1[0].size());
    const int height_1 = static_cast<int>(tensor1[0][0].size());
    const int width_1 = static_cast<int>(tensor1[0][0][0].size());

    const int batch_2 = static_cast<int>(tensor2.size());
    const int channel_2 = static_cast<int>(tensor2[0].size());
    const int height_2 = static_cast<int>(tensor2[0][0].size());
    const int width_2 = static_cast<int>(tensor2[0][0][0].size());

    const int batch_3 = std::max(batch_1, batch_2);
    const int channel_3 = std::max(channel_1, channel_2);
    const int height_3 = std::max(height_1, height_2);
    const int width_3 = std::max(width_1, width_2);

    Tensor4 add_res(batch_3, Volume(channel_3, Plane(height_3, Row(width_3, 0))));

    if (batch_1 != batch_2 && batch_1 != 1 && batch_2 != 1) {
        std::cout << "batch不一致" << std::endl;
        return add_res;
    }
    if (channel_1 != channel_2 && channel_1 != 1 && channel_2 != 1) {
        std::cout << "channel不一致" << std::endl;
        return add_res;
    }
    if (height_1 != height_2 && height_1 != 1 && height_2 != 1) {
        std::cout << "height不一致" << std::endl;
        return add_res;
    }
    if (width_1 != width_2 && width_1 != 1 && width_2 != 1) {
        std::cout << "width不一致" << std::endl;
        return add_res;
    }

    const bool lhs_row_is_scalar = (width_1 == 1);
    const bool rhs_row_is_scalar = (width_2 == 1);

    // Each iteration writes only add_res[b], so batches can run in parallel.
#pragma omp parallel for
    for (int b = 0; b < batch_3; ++b) {
        // min(i, size - 1) maps every index to 0 on a size-1 (broadcast) axis
        // and is the identity otherwise.
        const int b1 = std::min(b, batch_1 - 1);
        const int b2 = std::min(b, batch_2 - 1);
        for (int c = 0; c < channel_3; ++c) {
            const int c1 = std::min(c, channel_1 - 1);
            const int c2 = std::min(c, channel_2 - 1);
            for (int h = 0; h < height_3; ++h) {
                const int h1 = std::min(h, height_1 - 1);
                const int h2 = std::min(h, height_2 - 1);
                add_row_broadcast(tensor1[b1][c1][h1].data(), lhs_row_is_scalar,
                                  tensor2[b2][c2][h2].data(), rhs_row_is_scalar,
                                  add_res[b][c][h].data(), width_3);
            }
        }
    }
    return add_res;
}

时间消耗的对比（矩阵维度为（32，1，64，64））

代码	时间消耗（s）
版本1	0.81
+优化vector使用	0.47
+openMP	0.31
+SIMD	0.24
+gcc O3优化	0.11

c++实现maxpooling+利用OpenMP、SIMD优化代码相关推荐

MAT之PSO：利用PSO算法优化二元函数，寻找最优个体适应度
MAT之PSO:利用PSO算法优化二元函数,寻找最优个体适应度目录实现结果设计代码实现结果设计代码 figure [x,y] = meshgrid(-5:0.1:5,-5:0.1:5); z ...
利用对象池优化数据库操作
简介:这是利用对象池优化数据库操作的详细页面,介绍了和asp.net,.Net,创建,对象池,示例有关的知识,要查看更多相关信息,请点击此处说到对象池,大家都不陌生.很多人都实现过,网上的代码也满天 ...
使用pickle模块序列化数据，优化代码
使用pickle模块序列化数据,优化代码 pickle是Python标准库中的一个二进制序列化和反序列化库. 可以以二进制的形式将数据持久化保存到磁盘文件中.可以将数据和代码分离,提高代码可读性和优雅 ...
BUAA-2021春-数据结构-综合作业-文本摘要生成（Hash实现 + SIMD优化终测最速）
题目内容问题描述在自然语言文本处理中,有一种分析文本.自动抽取文本主题思想的方法(通常用于文本摘要生成),其方法如下: 1. 首先分析文本中非停用词(stop-word)的出现频度: ...
java优化代码常见套路
目录程序员的痛点(烂代码) 该如何优化代码前台后台两次md5加盐加密 JSR303和全局异常处理 Redis通用的key生成策略和通用的RedisService方法程序猿的必读书籍程序员的痛点 ...
利用OpenMP加速拉伸图像操作
前面的博客<OpenCV拉伸图像>介绍了如何利用OpenCV的现成函数实现图像的透视变换.本文受到了<http://blog.csdn.net/xiaowei_cqu/article ...
利用DelayLoad来优化应用程序的性能.拦截API.
翻译 <Under the hood -by Matt Pietrek > 源文件 http://www.microsoft.com/msj/0200/hood/hood0200.asp ...
不要写完代码就束之高阁，适当地优化代码结构，能够为以后的开发带来许多方便，这《重构:改善既有代码的设计》就向你介绍了这方面的技巧，说得非常详细。...
"不要写完代码就束之高阁,适当地优化代码结构,能够为以后的开发带来许多方便,这<重构:改善既有代码的设计>就向你介绍了这方面的技巧,说得非常详细." "程序几 ...
TVM yolov3优化代码修改（编译运行OK）
TVM yolov3优化代码修改(编译运行OK) yolov3_quantize_sample.py 附https://github.com/makihiro/tvm_yolov3_sample代码: ...

c++实现maxpooling+利用OpenMP、SIMD优化代码

c++实现maxpooling+利用OpenMP、SIMD优化代码相关推荐

最新文章

热门文章