





YOLOv5-Lite网络结构的Backbone主要由含Shuffle channel的Shuffle block组成;

检测 Head 依旧沿用 YOLOv5 head,但采用的是其简化版本

Shuffle block示意图如下:

YOLOv5 backbone:在原先U版的 YOLOv5 Backbone中,作者在特征提取的上层结构中采用了4次slice操作组成了Focus层

YOLOv5 head:






Kh × Kw × Cin × Cout


Kh × Kw × Cin × Cout × H × W,即(当前层filter × 输出的feature map)= params × H × W


在yolov5s的网络结构中,可以看到,Focus模块的卷积核是3 × 3,输出通道是32:


普通下采样:即将一张640×640×3的图片输入3×3的卷积中,步长为2,输出通道32,下采样后得到320 × 320 × 32的特征图,那么普通卷积下采样理论的计算量为:

FLOPs(conv)=3×3×3×32×320×320=88473600(不考虑bias情况下) params参数量(conv)=3×3×3×32+32+32=928(后面两个32分别为bias和BN层参数)


FLOPs(Focus)=3×3×12×32×320×320=353894400(不考虑bias情况下) params参数量(Focus)=3×3×12×32+32+32=3520(为了呼应上图输出的参数量,这里把后面两个32(分别为bias和BN层的参数)也考虑进去,通常这两项占比较小,可以忽略)



对于Focus层,是将一个正方形中每 4 个相邻像素拆分开,并生成一个具有 4 倍通道数的feature map,类似于对上级图层进行了4次下采样操作,再将结果concat到一起,最主要的功能还是在不降低模型特征提取能力的前提下,对模型进行降参和加速。


class Focus(nn.Module):
    """Focus wh information into c-space.

    The input is sliced into four sub-grids (every second row/column, with
    offsets 0 and 1), the slices are concatenated on the channel axis, and the
    result is passed through a single ``Conv``.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        """Build the layer.

        Args are forwarded to ``Conv``: ch_in, ch_out, kernel, stride,
        padding, groups, activation.
        """
        super(Focus, self).__init__()
        # The four slices are concatenated, so the conv sees 4x the input channels.
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
        """Slice ``x`` into four interleaved sub-grids, concatenate and convolve."""
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))


cuda _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15079MB, multi_processor_count=40)
      Params       FLOPS    forward (ms)   backward (ms)                   input                  output
        7040       23.07           62.89           87.79       (16, 3, 640, 640)      (16, 64, 320, 320)
        7040       23.07           15.52           48.69       (16, 3, 640, 640)      (16, 64, 320, 320)

cuda _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15079MB, multi_processor_count=40)
      Params       FLOPS    forward (ms)   backward (ms)                   input                  output
        7040       23.07           11.61           79.72       (16, 3, 640, 640)      (16, 64, 320, 320)
        7040       23.07           12.54           42.94       (16, 3, 640, 640)      (16, 64, 320, 320)






  1. 同等通道大小可以最小化内存访问量

  2. 过量使用组卷积会增加MAC

  3. 网络过于碎片化(特别是多路)会降低并行度

  4. 不能忽略元素级操作(比如shortcut和Add)


  1. 摘除Focus层,避免多次采用slice操作

  2. 避免多次使用C3 Layer以及高通道的C3 Layer(C3 Layer是YOLOv5作者提出的CSPBottleneck改进版本,它更简单、更快、更轻,在近乎相似的损耗上能取得更好的结果。但C3 Layer采用多路分离卷积,测试证明,频繁使用C3 Layer以及通道数较高的C3 Layer,会占用较多的缓存空间,降低运行速度)

class C3(nn.Module):
    """CSP Bottleneck with 3 convolutions.

    NOTE(review): only the constructor appears in this excerpt; the forward
    pass is not shown here.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Create the three 1x1 convolutions and the stack of ``n`` bottlenecks.

        Args: ch_in, ch_out, number, shortcut, groups, expansion.
        """
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        # cv3 takes 2 * c_ input channels and produces the c2 output channels.
        self.cv3 = Conv(2 * c_, c2, 1)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
        # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])
  1. 对yolov5 head进行通道剪枝,剪枝细则参考G1

  2. 摘除shufflenetv2 backbone的1024 conv 和 5×5 pooling




1. init_tengine


2. create_graph


3. prerun_graph


/* Runtime options for a graph run. */
struct options
{
    int num_thread;    // number of cores to use
    int cluster;       // big/little core selection, one of TENGINE_CLUSTER_[ALL,BIG,MEDIUM,LITTLE]
    int precision;     // precision, one of TENGINE_MODE_[FP32,FP16,HYBRID_INT8,UINT8,INT8]
    uint64_t affinity; // core affinity mask, binds execution to specific cores
};

4. run_graph


5. postrun_graph


6. destroy_graph




但如果填充的比较多,则存在信息冗余,影响推理速度。Yolov5在推理阶段,采用缩减黑边的方式,来提高推理的速度。在代码datasets.py的letterbox函数中进行了修改,对原始图像自适应的添加最少的黑边。eg:“比如我1000×800的图片不是直接缩放到608×608的大小,而是计算608/1000=0.608 然后缩放至608×486的大小,然后计算608-486=122 然后np.mod(122,32)取余数得到26,再平均成13填充到图片高度两端,最后是608×512。”

def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize and pad image while meeting stride-multiple constraints.

    Args:
        img: image array; ``img.shape[:2]`` is read as (height, width).
        new_shape: target (height, width), or a single int used for both.
        color: border color passed to ``cv2.copyMakeBorder``.
        auto: if True, pad only up to the next multiple of ``stride``
            (minimum rectangle) instead of the full ``new_shape``.
        scaleFill: if True (and ``auto`` is False), stretch to ``new_shape``
            without padding.
        scaleup: if False, the image is only ever scaled down.
        stride: modulus used for the ``auto`` padding.

    Returns:
        (img, ratio, (dw, dh)): the padded image, the (width, height) scale
        ratios, and the per-side padding in x and y.
    """
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The -0.1 / +0.1 offsets send an odd padding pixel to the bottom/right side.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)


void get_input_data_focus(const char* image_file, float* input_data, int img_h, int img_w, const float* mean, const float* scale)
{cv::Mat sample = cv::imread(image_file, 1);cv::Mat img;const int target_size = 640;int imge_w = img.cols;int imge_h = img.rows;int w = imge_w;int h = imge_h;float scale_im = 1.f;if (w > h){scale_im = (float)target_size / w;w = target_size;h = h * scale_im;}else{scale_im = (float)target_size / h;h = target_size;w = w * scale_im;}cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);cv::resize(img, img, cv::Size(w, h));// pad to target_size rectangleint wpad = (w + 31) / 32 * 32 - w;int hpad = (h + 31) / 32 * 32 - h;cv::Mat in_pad;cv::copy_make_border(img, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, cv::BORDER_CONSTANT, 114.f);img.convertTo(img, CV_32FC3);float* img_data = (float*)img.data;/* nhwc to nchw */for (int h = 0; h < img_h; h++){for (int w = 0; w < img_w; w++){for (int c = 0; c < 3; c++){int in_index = h * img_w * 3 + w * 3 + c;int out_index = c * img_h * img_w + h * img_w + w;input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c];}}}


/* set runtime options */
struct options opt;
opt.num_thread = num_thread;
opt.cluster = TENGINE_CLUSTER_ALL;
opt.precision = TENGINE_MODE_FP32;
opt.affinity = 0;

/* inital tengine */
if (init_tengine() != 0)
{
    fprintf(stderr, "Initial tengine failed.\n");
    return -1;
}
fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());

/* create graph, load tengine model xxx.tmfile */
graph_t graph = create_graph(nullptr, "tengine", model_file);
if (graph == nullptr)
{
    fprintf(stderr, "Create graph failed.\n");
    /* the library was initialised above, so release it before bailing out */
    release_tengine();
    return -1;
}

3 获取推理结果

/* yolov5 postprocess */// 0: 1, 3, 20, 20, 85// 1: 1, 3, 40, 40, 85// 2: 1, 3, 80, 80, 85tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);tensor_t p32_output = get_graph_output_tensor(graph, 2, 0);float* p8_data = (float*)get_tensor_buffer(p8_output);float* p16_data = (float*)get_tensor_buffer(p16_output);float* p32_data = (float*)get_tensor_buffer(p32_output);/* postprocess */const float prob_threshold = 0.55;const float nms_threshold = 0.5;std::vector<Object> proposals;std::vector<Object> objects8;std::vector<Object> objects16;std::vector<Object> objects32;std::vector<Object> objects;generate_proposals(32, p32_data, prob_threshold, objects32, letterbox_cols, letterbox_rows);proposals.insert(proposals.end(), objects32.begin(), objects32.end());generate_proposals(16, p16_data, prob_threshold, objects16, letterbox_cols, letterbox_rows);proposals.insert(proposals.end(), objects16.begin(), objects16.end());generate_proposals(8, p8_data, prob_threshold, objects8, letterbox_cols, letterbox_rows);proposals.insert(proposals.end(), objects8.begin(), objects8.end());qsort_descent_inplace(proposals);std::vector<int> picked;nms_sorted_bboxes(proposals, picked, nms_threshold);

4 后处理

static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
{picked.clear();const int n = faceobjects.size();std::vector<float> areas(n);for (int i = 0; i < n; i++){areas[i] = faceobjects[i].rect.area();}for (int i = 0; i < n; i++){const Object& a = faceobjects[i];int keep = 1;for (int j = 0; j < (int)picked.size(); j++){const Object& b = faceobjects[picked[j]];// intersection over unionfloat inter_area = intersection_area(a, b);float union_area = areas[i] + areas[picked[j]] - inter_area;// float IoU = inter_area / union_areaif (inter_area / union_area > nms_threshold)keep = 0;}if (keep)picked.push_back(i);}
}static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects,int letterbox_cols, int letterbox_rows)
{//static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326};static float anchors[18] = {10, 13, 16, 30, 33, 23,30, 61, 62, 45, 59, 119,116, 90, 156, 198, 373, 326};int anchor_num = 3;int feat_w = letterbox_cols / stride;int feat_h = letterbox_rows / stride;int cls_num = 80;int anchor_group;if (stride == 8)anchor_group = 1;if (stride == 16)anchor_group = 2;if (stride == 32)anchor_group = 3;for (int h = 0; h <= feat_h - 1; h++){for (int w = 0; w <= feat_w - 1; w++){for (int a = 0; a <= anchor_num - 1; a++){//process cls scoreint class_index = 0;float class_score = -FLT_MAX;for (int s = 0; s <= cls_num - 1; s++){float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5];if (score > class_score){class_index = s;class_score = score;}}//process box scorefloat box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4];float final_score = sigmoid(box_score) * sigmoid(class_score);if (final_score >= prob_threshold){int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5);float dx = sigmoid(feat[loc_idx + 0]);float dy = sigmoid(feat[loc_idx + 1]);float dw = sigmoid(feat[loc_idx + 2]);float dh = sigmoid(feat[loc_idx + 3]);float pred_cx = (dx * 2.0f - 0.5f + w) * stride;float pred_cy = (dy * 2.0f - 0.5f + h) * stride;float anchor_w = anchors[(anchor_group - 1) * 6 + a * 2 + 0];float anchor_h = anchors[(anchor_group - 1) * 6 + a * 2 + 1];float pred_w = dw * dw * 4.0f * anchor_w;float pred_h = dh * dh * 4.0f * anchor_h;float x0 = pred_cx - pred_w * 0.5f;float y0 = pred_cy - pred_h * 0.5f;float x1 = pred_cx + pred_w * 0.5f;float y1 = pred_cy + pred_h * 0.5f;Object obj;obj.rect.x = x0;obj.rect.y = y0;obj.rect.width = x1 - x0;obj.rect.height = y1 - y0;obj.label = class_index;obj.prob = final_score;objects.push_back(obj);}}}}

5 绘图

/**
 * Draw the detected boxes and their labels on a copy of bgr, print each
 * detection to stderr, and write the result to "yolov5s_out.jpg".
 *
 * @param bgr      image to draw on (it is cloned, the input is not modified)
 * @param objects  detections; rect is used as-is for drawing, label indexes
 *                 the class name table
 */
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"};
    const int class_count = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // Guard the table lookup so an out-of-range label cannot read past the array.
        const char* name = (obj.label >= 0 && obj.label < class_count) ? class_names[obj.label] : "unknown";

        fprintf(stderr, "%2d: %3.0f%%, [%4.0f, %4.0f, %4.0f, %4.0f], %s\n", obj.label, obj.prob * 100, obj.rect.x,
                obj.rect.y, obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, name);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        char text[256];
        snprintf(text, sizeof(text), "%s %.1f%%", name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 2, &baseLine);

        // Place the label above the box, pulled back inside the image at the top and right edges.
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5,
                    cv::Scalar(0, 0, 0));
    }

    cv::imwrite("yolov5s_out.jpg", image);
}
/* yolov5 draw the result */float scale_letterbox;int resize_rows;int resize_cols;if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)){scale_letterbox = letterbox_rows * 1.0 / img.rows;}else{scale_letterbox = letterbox_cols * 1.0 / img.cols;}resize_cols = int(scale_letterbox * img.cols);resize_rows = int(scale_letterbox * img.rows);int tmp_h = (letterbox_rows - resize_rows) / 2;int tmp_w = (letterbox_cols - resize_cols) / 2;float ratio_x = (float)img.rows / resize_rows;float ratio_y = (float)img.cols / resize_cols;int count = picked.size();fprintf(stderr, "detection num: %d\n", count);objects.resize(count);for (int i = 0; i < count; i++){objects[i] = proposals[picked[i]];float x0 = (objects[i].rect.x);float y0 = (objects[i].rect.y);float x1 = (objects[i].rect.x + objects[i].rect.width);float y1 = (objects[i].rect.y + objects[i].rect.height);x0 = (x0 - tmp_w) * ratio_x;y0 = (y0 - tmp_h) * ratio_y;x1 = (x1 - tmp_w) * ratio_x;y1 = (y1 - tmp_h) * ratio_y;x0 = std::max(std::min(x0, (float)(img.cols - 1)), 0.f);y0 = std::max(std::min(y0, (float)(img.rows - 1)), 0.f);x1 = std::max(std::min(x1, (float)(img.cols - 1)), 0.f);y1 = std::max(std::min(y1, (float)(img.rows - 1)), 0.f);objects[i].rect.x = x0;objects[i].rect.y = y0;objects[i].rect.width = x1 - x0;objects[i].rect.height = y1 - y0;}draw_objects(img, objects);/* release tengine */postrun_graph(graph);destroy_graph(graph);release_tengine();

6 可视化推理结果






