1.BaseConvolutionLayer & ConvolutionLayer



/// @brief The spatial dimensions of a filter kernel.  // kernel的形状 = [kernel_h, kernel_w]  Blob<int> kernel_shape_;  /// @brief The spatial dimensions of the stride.  // 步长形状 = [stride_h, stride_w]  Blob<int> stride_;  /// @brief The spatial dimensions of the padding.  // pad的形状 = [pad_h, pad_w]  Blob<int> pad_;  /// @brief The spatial dimensions of the convolution input.  // 卷积的输入形状 = [输入图像通道数, 输入图像h,    输入图像w]  Blob<int> conv_input_shape_;  /// @brief The spatial dimensions of the col_buffer.  // col_buffer的形状 = [kernel_dim_, conv_out_spatial_dim_ ]  vector<int> col_buffer_shape_;  /// @brief The spatial dimensions of the output.  // 输出的形状  vector<int> output_shape_;  // 输入的形状  const vector<int>* bottom_shape_;  // 空间轴个数  int num_spatial_axes_;  // 输入度维度 = 输入图像通道数*输入图像的h*输入图像w  int bottom_dim_;  // 输出维度 = 输出通道数*输出h*输出w  int top_dim_;  // 输入图像的第几个轴是通道  int channel_axis_;  // batchsize  int num_;  // 输入图像的通道数  int channels_;  // 卷积组的大小  int group_;  // 输出空间维度 = 卷积之后的图像长*卷积之后图像的宽  int out_spatial_dim_;  // 使用卷积组用到的  int weight_offset_;  // 卷积后的图像的通道数  int num_output_;  // 是否启用偏置  bool bias_term_;  // 是不是1x1卷积  bool is_1x1_;  // 强制使用n维通用卷积  bool force_nd_im2col_;  // conv_in_channels_ * conv_out_spatial_dim_  int num_kernels_im2col_;  // num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_  int num_kernels_col2im_;  // 卷积的输出通道数 ,在参数配置文件中设置  int conv_out_channels_;  // 卷积的输入通道数 (即输入图像的通道数)  int conv_in_channels_;  // 卷积的输出的空间维度 = 卷积后图像h*卷积后图像w  int conv_out_spatial_dim_;  // 卷积核的维度 = 输入图像的维度*卷积核的h*卷积核的w  int kernel_dim_;  // 在使用gropu参数的时候使用的offset  int col_offset_;  int output_offset_;  // im2col的时候使用的存储空间  Blob<Dtype> col_buffer_;  // 将偏置扩展成矩阵的东东  Blob<Dtype> bias_multiplier_;



 public://构造函数explicit BaseConvolutionLayer(const LayerParameter& param): Layer<Dtype>(param) {}//初始化virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);//重设形状virtual void Reshape(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);


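GEMM 的全称是 General Matrix-Matrix Multiply(通用矩阵乘法)。其基本形式为 $C \leftarrow \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C$,其中 $\mathrm{op}(X)$ 表示 $X$ 本身或其转置 $X^{T}$。Caffe 在前向计算中对它的调用如下: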

/**
 * Forward convolution of one input item as a matrix product, one GEMM per
 * group.
 *
 * @param input       data of one input item
 * @param weights     filter weights
 * @param output      destination for the convolution result (overwritten:
 *                    the GEMM is issued with beta = 0)
 * @param skip_im2col when true, the contents already in col_buffer_ are
 *                    reused instead of unrolling `input` again
 *
 * NOTE(review): the descriptions of the GEMM calls assume caffe_cpu_gemm
 * follows the BLAS convention C = alpha * op(A) * op(B) + beta * C -- confirm
 * against its declaration.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
    const Dtype* weights, Dtype* output, bool skip_im2col) {
  // A 1x1 convolution multiplies the input directly; no unrolling is done.
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    if (!skip_im2col) {
      conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
    }
    col_buff = col_buffer_.cpu_data();
  }
  for (int g = 0; g < group_; ++g) {
    // Per group: (conv_out_channels_ / group_) x kernel_dim_ weights times
    // kernel_dim_ x conv_out_spatial_dim_ columns.
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
        group_, conv_out_spatial_dim_, kernel_dim_,
        (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
        (Dtype)0., output + output_offset_ * g);
  }
}

/**
 * Adds the bias to one output item: the num_output_ x 1 bias vector is
 * multiplied by the 1 x out_spatial_dim_ bias_multiplier_ and accumulated
 * into `output` (the GEMM is issued with beta = 1).
 *
 * @param output output of one item, updated in place
 * @param bias   bias values, one per output channel
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
    const Dtype* bias) {
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
      out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
      (Dtype)1., output);
}


  inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {if (!force_nd_im2col_ && num_spatial_axes_ == 2) {im2col_cpu(data, conv_in_channels_,conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],pad_.cpu_data()[0], pad_.cpu_data()[1],stride_.cpu_data()[0], stride_.cpu_data()[1],dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);} else {im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),col_buffer_shape_.data(), kernel_shape_.cpu_data(),pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);}}inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {if (!force_nd_im2col_ && num_spatial_axes_ == 2) {col2im_cpu(col_buff, conv_in_channels_,conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],pad_.cpu_data()[0], pad_.cpu_data()[1],stride_.cpu_data()[0], stride_.cpu_data()[1],dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);} else {col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),col_buffer_shape_.data(), kernel_shape_.cpu_data(),pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data);}}


/**
 * CPU forward pass: for every bottom/top pair and every item in the batch,
 * runs the GEMM-based convolution and then adds the bias when enabled.
 *
 * @param bottom input blobs
 * @param top    output blobs; top[i] receives the result for bottom[i]
 */
template <typename Dtype>
void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* top_data = top[i]->mutable_cpu_data();
    // Items are processed one at a time; bottom_dim_ / top_dim_ are the
    // per-item strides into the input and output buffers.
    for (int n = 0; n < this->num_; ++n) {
      this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,
          top_data + n * this->top_dim_);
      if (this->bias_term_) {
        const Dtype* bias = this->blobs_[1]->cpu_data();
        this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
      }
    }
  }
}


/**
 * CPU backward pass: computes the bias gradient, the weight gradient and the
 * gradient with respect to the bottom data, each only when requested.
 *
 * @param top            output blobs whose diffs hold the incoming gradient
 * @param propagate_down propagate_down[i] tells whether the gradient for
 *                       bottom[i] must be computed
 * @param bottom         input blobs; their diffs receive the gradient
 */
template <typename Dtype>
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->cpu_diff();
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
    // Bias gradient, if necessary.
    if (this->bias_term_ && this->param_propagate_down_[1]) {
      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
      for (int n = 0; n < this->num_; ++n) {
        this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
      }
    }
    if (this->param_propagate_down_[0] || propagate_down[i]) {
      for (int n = 0; n < this->num_; ++n) {
        // gradient w.r.t. weight. Note that we will accumulate diffs.
        if (this->param_propagate_down_[0]) {
          this->weight_cpu_gemm(bottom_data + n * this->bottom_dim_,
              top_diff + n * this->top_dim_, weight_diff);
        }
        // gradient w.r.t. bottom data, if necessary.
        if (propagate_down[i]) {
          this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight,
              bottom_diff + n * this->bottom_dim_);
        }
      }
    }
  }
}

反向传播时分别计算 bias、weight 和 bottom 的导数(梯度),并把 bottom 的梯度继续向后传递。

/**
 * Gradient with respect to the input of one item: multiplies the transposed
 * weights by the output gradient, one GEMM per group, then folds the columns
 * back into image layout unless the convolution is 1x1.
 *
 * @param output  gradient of one output item
 * @param weights filter weights
 * @param input   destination for the gradient of the corresponding input item
 *
 * NOTE(review): the descriptions of the GEMM/GEMV calls in this group of
 * functions assume the BLAS conventions (C = alpha * op(A) * op(B) + beta * C
 * and y = alpha * op(A) * x + beta * y) -- confirm against the declarations
 * of caffe_cpu_gemm / caffe_cpu_gemv.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
    const Dtype* weights, Dtype* input) {
  Dtype* col_buff = col_buffer_.mutable_cpu_data();
  // For a 1x1 convolution the GEMM writes straight into `input`.
  if (is_1x1_) {
    col_buff = input;
  }
  for (int g = 0; g < group_; ++g) {
    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
        conv_out_spatial_dim_, conv_out_channels_ / group_,
        (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
        (Dtype)0., col_buff + col_offset_ * g);
  }
  if (!is_1x1_) {
    conv_col2im_cpu(col_buff, input);
  }
}

/**
 * Gradient with respect to the weights for one item. The result is
 * accumulated into `weights` (the GEMM is issued with beta = 1), so repeated
 * calls sum their contributions.
 *
 * @param input   data of one input item
 * @param output  gradient of the corresponding output item
 * @param weights weight-gradient buffer, updated in place
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
    const Dtype* output, Dtype* weights) {
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
    col_buff = col_buffer_.cpu_data();
  }
  for (int g = 0; g < group_; ++g) {
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
        kernel_dim_, conv_out_spatial_dim_,
        (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
        (Dtype)1., weights + weight_offset_ * g);
  }
}

/**
 * Gradient with respect to the bias for one item: the num_output_ x
 * out_spatial_dim_ matrix `input` is multiplied by bias_multiplier_ and
 * accumulated into `bias` (the GEMV is issued with beta = 1).
 *
 * @param bias  bias-gradient buffer, updated in place
 * @param input gradient of one output item
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
    const Dtype* input) {
  caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_, 1.,
      input, bias_multiplier_.cpu_data(), 1., bias);
}


在 Caffe 中,卷积运算就是先对数据进行 im2col 操作,再进行矩阵乘法(即内积运算,inner product)。这样做比直接按定义逐窗口计算卷积的速度更快。

下图来自论文《High Performance Convolutional Neural Networks for Document Processing》,图示了 Caffe 中卷积计算的原理。




def conv(a, v, full=0):  # valid:0  full:1
    """Slide the 2-D kernel ``v`` over the 2-D array ``a`` and sum the products.

    The kernel is applied as-is (it is not flipped), so each output element
    is the sum of the element-wise product of ``v`` with one window of ``a``.

    Args:
        a: 2-D array-like input.
        v: 2-D array-like kernel.
        full: when falsy ("valid" mode) only windows lying entirely inside
            ``a`` are used; when truthy ("full" mode) ``a`` is first
            zero-padded by ``vh - 1`` rows and ``vw - 1`` columns on every
            side.

    Returns:
        A list of rows (list of lists) holding one sum per window position.
    """
    # Accept nested lists as well as arrays: the 2-D slicing below needs
    # ndarray indexing.
    a = np.asarray(a)
    v = np.asarray(v)
    ah, aw = np.shape(a)
    vh, vw = np.shape(v)
    # Debug output of the input and kernel sizes (kept from the original).
    print(ah, aw, vh, vw)
    if full:
        temp = np.zeros((ah + 2 * vh - 2, aw + 2 * vw - 2))
        temp[vh - 1:vh - 1 + ah, vw - 1:vw - 1 + aw] = a
        a = temp
        ah, aw = np.shape(a)
    k = [[np.sum(np.multiply(a[i:i + vh, j:j + vw], v))
          for j in range(aw - vw + 1)]
         for i in range(ah - vh + 1)]
    return k






/**
 * @brief Unrolls every kernel window of an image into the column buffer,
 *        with support for dilated kernels.
 *
 * data_im is indexed as [channels, height, width]. data_col is indexed as
 * [channels * kernel_h * kernel_w, height_col, width_col], where
 *   height_col = (height + 2*pad_h - (dilation_h*(kernel_h-1) + 1)) / stride_h + 1
 * and width_col is computed the same way. Window positions that fall into
 * the padding are written as 0.
 *
 * @param data_im    input image
 * @param channels   number of image channels
 * @param height     image height
 * @param width      image width
 * @param kernel_h   kernel height
 * @param kernel_w   kernel width
 * @param pad_h      zero padding added above and below the image
 * @param pad_w      zero padding added left and right of the image
 * @param stride_h   vertical step between windows
 * @param stride_w   horizontal step between windows
 * @param dilation_h vertical spacing between kernel taps (1 = dense kernel)
 * @param dilation_w horizontal spacing between kernel taps (1 = dense kernel)
 * @param data_col   output buffer; every element is overwritten
 */
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    Dtype* data_col) {
  const int height_col =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_col =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    // Decompose the column-buffer row into (image channel, kernel row,
    // kernel column).
    const int w_offset = c_col % kernel_w;
    const int h_offset = (c_col / kernel_w) % kernel_h;
    const int c_im = c_col / kernel_h / kernel_w;
    for (int h_col = 0; h_col < height_col; ++h_col) {
      const int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
      for (int w_col = 0; w_col < width_col; ++w_col) {
        const int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
        data_col[(c_col * height_col + h_col) * width_col + w_col] =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im[(c_im * height + h_im) * width + w_im] : 0;
      }
    }
  }
}

/**
 * @brief Unrolls every kernel window of an image into the column buffer
 *        using a dense (non-dilated) kernel.
 *
 * Same contract as the overload taking dilation_h / dilation_w, with both
 * dilations fixed to 1.
 */
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    Dtype* data_col) {
  im2col_cpu(data_im, channels, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, 1, 1, data_col);
}
/* im2col_cpu:将图片按照卷积窗口的大小切成若干子图,并把每个子图拉成一列。 */
template <typename Dtype>
inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,const int num_spatial_axes, const int* im_shape, const int* col_shape,const int* kernel_shape, const int* pad, const int* stride,Dtype* data_output)
/* im2col_nd_core_cpu:针对输入的空间维度(spatial dimension)不是二维的情况。 */
/**
 * @brief Folds a column buffer back into image layout, with support for
 *        dilated kernels (the inverse data movement of im2col_cpu).
 *
 * data_im is first cleared with caffe_set, then every column entry whose
 * window position lies inside the image is added to the corresponding pixel,
 * so pixels covered by several windows receive the sum of their
 * contributions. data_col is indexed as
 * [channels * kernel_h * kernel_w, height_col, width_col] and data_im as
 * [channels, height, width], where
 *   height_col = (height + 2*pad_h - (dilation_h*(kernel_h-1) + 1)) / stride_h + 1
 * and width_col is computed the same way.
 *
 * @param data_col   input column buffer
 * @param channels   number of image channels
 * @param height     image height
 * @param width      image width
 * @param kernel_h   kernel height
 * @param kernel_w   kernel width
 * @param pad_h      zero padding above and below the image
 * @param pad_w      zero padding left and right of the image
 * @param stride_h   vertical step between windows
 * @param stride_w   horizontal step between windows
 * @param dilation_h vertical spacing between kernel taps (1 = dense kernel)
 * @param dilation_w horizontal spacing between kernel taps (1 = dense kernel)
 * @param data_im    output image; overwritten
 */
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    Dtype* data_im) {
  caffe_set(height * width * channels, Dtype(0), data_im);
  const int height_col =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_col =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    // Decompose the column-buffer row into (image channel, kernel row,
    // kernel column).
    const int w_offset = c_col % kernel_w;
    const int h_offset = (c_col / kernel_w) % kernel_h;
    const int c_im = c_col / kernel_h / kernel_w;
    for (int h_col = 0; h_col < height_col; ++h_col) {
      const int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
      for (int w_col = 0; w_col < width_col; ++w_col) {
        const int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
        // Entries that correspond to padding carry no image data; skip them.
        if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) {
          data_im[(c_im * height + h_im) * width + w_im] +=
              data_col[(c_col * height_col + h_col) * width_col + w_col];
        }
      }
    }
  }
}

/**
 * @brief Folds a column buffer back into image layout using a dense
 *        (non-dilated) kernel.
 *
 * Same contract as the overload taking dilation_h / dilation_w, with both
 * dilations fixed to 1.
 */
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    Dtype* data_im) {
  col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, 1, 1, data_im);
}


