YoloV7目标检测(Pytorch版)【详解】
文章目录
- 一、网络结构
- 1、总体网络结构
- 2、主干网络介绍(backbone)
- 2.1 多分支模块堆叠
- 2.2 下采样网络结构
- 2.3 整个backbone代码
- 3、FPN特征金字塔
- 二、预测结果的解码
- 1、 获得预测框、置信度、种类的数值
- 2、得分筛选与非极大抑制
- 三、训练过程
- 1、正样本匹配过程
- 1.1 匹配anchor与特征点
- 1.2 SimOTA自适应匹配
- 2、Loss的组成
一、网络结构
1、总体网络结构
主干网络示意图如下,其采用了与 YoloV3、YoloV4、YoloV5 类似的网络结构。
2、主干网络介绍(backbone)
2.1 多分支模块堆叠
代码如下,多分支模块堆叠的类名为:Multi_Concat_Block
import torch
import torch.nn as nn


def autopad(k, p=None):
    """Return the padding to use for a convolution with kernel size ``k``.

    Args:
        k: Kernel size, either an int or an iterable of ints (one per dim).
        p: Explicit padding. When given (not ``None``) it is returned as is.

    Returns:
        ``p`` if it was supplied, otherwise ``k // 2`` (element-wise, as a
        list, when ``k`` is not an int).
    """
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p


class SiLU(nn.Module):
    """SiLU activation: ``x * sigmoid(x)``."""

    @staticmethod
    def forward(x):
        return x * torch.sigmoid(x)


class Conv(nn.Module):
    """Conv2d (no bias) followed by BatchNorm2d and an activation.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel size.
        s: Stride.
        p: Padding; ``None`` selects ``autopad(k)``.
        g: Number of convolution groups.
        act: ``True`` selects ``LeakyReLU(0.1)``; an ``nn.Module`` instance is
            used directly (the default is ``SiLU``); anything else disables
            the activation (``nn.Identity``).
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=SiLU()):
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
        # With the default argument this resolves to the SiLU branch.
        if act is True:
            self.act = nn.LeakyReLU(0.1, inplace=True)
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        # Forward pass that skips BatchNorm; intended for use once the BN
        # statistics have been folded into ``self.conv``.
        return self.act(self.conv(x))


class Multi_Concat_Block(nn.Module):
    """Multi-branch stack whose selected intermediate features are concatenated.

    Two 1x1 convolutions produce ``x_1`` and ``x_2``; ``x_2`` is then passed
    through ``n`` stacked 3x3 convolutions. The features
    ``[x_1, x_2, conv_1_out, ..., conv_n_out]`` picked by ``ids`` are
    concatenated along the channel axis and fused by a final 1x1 convolution.

    Args:
        c1: Number of input channels.
        c2: Number of channels produced by each 3x3 convolution.
        c3: Number of output channels.
        n: Number of stacked 3x3 convolutions.
        e: Ratio giving the width of the two 1x1 branches, ``int(c2 * e)``.
        ids: Indices into the feature list selecting what is concatenated.
            The input width of the fusing convolution is computed as
            ``2 * int(c2 * e) + c2 * (len(ids) - 2)``, so ``ids`` has to select
            features whose channel counts add up to that value.
    """

    # ``ids`` defaults to an immutable tuple instead of a shared list object.
    def __init__(self, c1, c2, c3, n=4, e=1, ids=(0,)):
        super(Multi_Concat_Block, self).__init__()
        c_ = int(c2 * e)

        self.ids = ids
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = nn.ModuleList(
            [Conv(c_ if i == 0 else c2, c2, 3, 1) for i in range(n)]
        )
        self.cv4 = Conv(c_ * 2 + c2 * (len(ids) - 2), c3, 1, 1)

    def forward(self, x):
        x_1 = self.cv1(x)
        x_2 = self.cv2(x)

        features = [x_1, x_2]
        for conv in self.cv3:
            x_2 = conv(x_2)
            features.append(x_2)

        # dim=1 concatenates along the channel axis.
        return self.cv4(torch.cat([features[idx] for idx in self.ids], 1))


if __name__ == '__main__':
    ids = {
        'l': [-1, -3, -5, -6],
        'x': [-1, -3, -5, -7, -8],
    }['l']
    x = torch.randn(2, 3, 5, 5)
    print(x.shape)
    out = Multi_Concat_Block(3, 3, 5, n=4, ids=ids)(x)
    print(out.shape)
输出:
torch.Size([2, 3, 5, 5])
torch.Size([2, 5, 5, 5])
2.2 下采样网络结构
结合了 maxpooling 和步长为 $2 \times 2$ 的卷积来完成下采样。
代码如下,下采样结构类名为Transition_Block,
import torch
import torch.nn as nn


def autopad(k, p=None):
    """Return the padding to use for a convolution with kernel size ``k``.

    Args:
        k: Kernel size, either an int or an iterable of ints (one per dim).
        p: Explicit padding. When given (not ``None``) it is returned as is.

    Returns:
        ``p`` if it was supplied, otherwise ``k // 2`` (element-wise, as a
        list, when ``k`` is not an int).
    """
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p


class SiLU(nn.Module):
    """SiLU activation: ``x * sigmoid(x)``."""

    @staticmethod
    def forward(x):
        return x * torch.sigmoid(x)


class Conv(nn.Module):
    """Conv2d (no bias) followed by BatchNorm2d and an activation.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel size.
        s: Stride.
        p: Padding; ``None`` selects ``autopad(k)``.
        g: Number of convolution groups.
        act: ``True`` selects ``LeakyReLU(0.1)``; an ``nn.Module`` instance is
            used directly (the default is ``SiLU``); anything else disables
            the activation (``nn.Identity``).
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=SiLU()):
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
        # With the default argument this resolves to the SiLU branch.
        if act is True:
            self.act = nn.LeakyReLU(0.1, inplace=True)
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        # Forward pass that skips BatchNorm; intended for use once the BN
        # statistics have been folded into ``self.conv``.
        return self.act(self.conv(x))


class MP(nn.Module):
    """Max-pooling layer with kernel ``k``, stride ``t`` and padding 1."""

    def __init__(self, k=3, t=2):
        super(MP, self).__init__()
        self.m = nn.MaxPool2d(kernel_size=k, stride=t, padding=1)

    def forward(self, x):
        return self.m(x)


class Transition_Block(nn.Module):
    """Down-sampling block that concatenates a pooling and a conv branch.

    Branch 1: max-pool, then a 1x1 convolution to ``c2`` channels.
    Branch 2: 1x1 convolution to ``c2`` channels, then a 3x3 stride-2
    convolution. The two results are concatenated on the channel axis, so the
    output has ``2 * c2`` channels.

    Args:
        c1: Number of input channels.
        c2: Number of channels produced by each branch.
    """

    def __init__(self, c1, c2):
        super(Transition_Block, self).__init__()
        self.cv1 = Conv(c1, c2, 1, 1)
        self.cv2 = Conv(c1, c2, 1, 1)
        self.cv3 = Conv(c2, c2, 3, 2)

        self.mp = MP()

    def forward(self, x):
        pooled = self.cv1(self.mp(x))
        strided = self.cv3(self.cv2(x))
        # The strided-conv branch comes first in the concatenation.
        return torch.cat([strided, pooled], 1)


if __name__ == '__main__':
    x = torch.randn(2, 3, 9, 9)
    print(x.shape)
    out = Transition_Block(3, 5)(x)
    print(out.shape)
输出:
torch.Size([2, 3, 9, 9])
torch.Size([2, 10, 5, 5])
2.3 整个backbone代码
整个主干网络实现代码为:
import torch
import torch.nn as nn


def autopad(k, p=None):
    """Return the padding to use for a convolution with kernel size ``k``.

    Args:
        k: Kernel size, either an int or an iterable of ints (one per dim).
        p: Explicit padding. When given (not ``None``) it is returned as is.

    Returns:
        ``p`` if it was supplied, otherwise ``k // 2`` (element-wise, as a
        list, when ``k`` is not an int).
    """
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p


class SiLU(nn.Module):
    """SiLU activation: ``x * sigmoid(x)``."""

    @staticmethod
    def forward(x):
        return x * torch.sigmoid(x)


class Conv(nn.Module):
    """Conv2d (no bias) followed by BatchNorm2d and an activation.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel size.
        s: Stride.
        p: Padding; ``None`` selects ``autopad(k)``.
        g: Number of convolution groups.
        act: ``True`` selects ``LeakyReLU(0.1)``; an ``nn.Module`` instance is
            used directly (the default is ``SiLU``); anything else disables
            the activation (``nn.Identity``).
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=SiLU()):
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
        # With the default argument this resolves to the SiLU branch.
        if act is True:
            self.act = nn.LeakyReLU(0.1, inplace=True)
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        # Forward pass that skips BatchNorm; intended for use once the BN
        # statistics have been folded into ``self.conv``.
        return self.act(self.conv(x))


class Multi_Concat_Block(nn.Module):
    """Multi-branch stack whose selected intermediate features are concatenated.

    Two 1x1 convolutions produce ``x_1`` and ``x_2``; ``x_2`` is then passed
    through ``n`` stacked 3x3 convolutions. The features
    ``[x_1, x_2, conv_1_out, ..., conv_n_out]`` picked by ``ids`` are
    concatenated along the channel axis and fused by a final 1x1 convolution.

    Args:
        c1: Number of input channels.
        c2: Number of channels produced by each 3x3 convolution.
        c3: Number of output channels.
        n: Number of stacked 3x3 convolutions.
        e: Ratio giving the width of the two 1x1 branches, ``int(c2 * e)``.
        ids: Indices into the feature list selecting what is concatenated.
            The input width of the fusing convolution is computed as
            ``2 * int(c2 * e) + c2 * (len(ids) - 2)``, so ``ids`` has to select
            features whose channel counts add up to that value.
    """

    # ``ids`` defaults to an immutable tuple instead of a shared list object.
    def __init__(self, c1, c2, c3, n=4, e=1, ids=(0,)):
        super(Multi_Concat_Block, self).__init__()
        c_ = int(c2 * e)

        self.ids = ids
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = nn.ModuleList(
            [Conv(c_ if i == 0 else c2, c2, 3, 1) for i in range(n)]
        )
        self.cv4 = Conv(c_ * 2 + c2 * (len(ids) - 2), c3, 1, 1)

    def forward(self, x):
        x_1 = self.cv1(x)
        x_2 = self.cv2(x)

        features = [x_1, x_2]
        for conv in self.cv3:
            x_2 = conv(x_2)
            features.append(x_2)

        # dim=1 concatenates along the channel axis.
        return self.cv4(torch.cat([features[idx] for idx in self.ids], 1))


class MP(nn.Module):
    """Max-pooling layer whose stride equals its kernel size ``k``."""

    def __init__(self, k=2):
        super(MP, self).__init__()
        self.m = nn.MaxPool2d(kernel_size=k, stride=k)

    def forward(self, x):
        return self.m(x)


class Transition_Block(nn.Module):
    """Down-sampling block that concatenates a pooling and a conv branch.

    Branch 1: max-pool, then a 1x1 convolution to ``c2`` channels.
    Branch 2: 1x1 convolution to ``c2`` channels, then a 3x3 stride-2
    convolution. The two results are concatenated on the channel axis, so the
    output has ``2 * c2`` channels.

    Args:
        c1: Number of input channels.
        c2: Number of channels produced by each branch.
    """

    def __init__(self, c1, c2):
        super(Transition_Block, self).__init__()
        self.cv1 = Conv(c1, c2, 1, 1)
        self.cv2 = Conv(c1, c2, 1, 1)
        self.cv3 = Conv(c2, c2, 3, 2)

        self.mp = MP()

    def forward(self, x):
        pooled = self.cv1(self.mp(x))
        strided = self.cv3(self.cv2(x))
        # The strided-conv branch comes first in the concatenation.
        return torch.cat([strided, pooled], 1)


class Backbone(nn.Module):
    """YoloV7 backbone returning three feature maps for the neck.

    Args:
        transition_channels: Base channel width; layer widths are multiples
            of this value.
        block_channels: Base width of the 3x3 convolutions inside each
            ``Multi_Concat_Block``.
        n: Number of stacked 3x3 convolutions per ``Multi_Concat_Block``.
        phi: Model variant, ``'l'`` or ``'x'``; selects the concatenation
            indices and the pretrained-weight URL.
        pretrained: When true, download the backbone weights for ``phi`` into
            ``./model_data`` and load them with ``strict=False``.

    The forward pass returns ``(feat1, feat2, feat3)`` with
    ``transition_channels * 16``, ``* 32`` and ``* 32`` channels, down-sampled
    by 8, 16 and 32 relative to the input.
    """

    def __init__(self, transition_channels, block_channels, n, phi, pretrained=False):
        super().__init__()
        # -----------------------------------------------#
        #   The reference input image is 640, 640, 3
        # -----------------------------------------------#
        ids = {
            'l': [-1, -3, -5, -6],
            'x': [-1, -3, -5, -7, -8],
        }[phi]
        self.stem = nn.Sequential(
            Conv(3, transition_channels, 3, 1),
            Conv(transition_channels, transition_channels * 2, 3, 2),
            Conv(transition_channels * 2, transition_channels * 2, 3, 1),
        )
        self.dark2 = nn.Sequential(
            Conv(transition_channels * 2, transition_channels * 4, 3, 2),
            Multi_Concat_Block(transition_channels * 4, block_channels * 2,
                               transition_channels * 8, n=n, ids=ids),
        )
        self.dark3 = nn.Sequential(
            Transition_Block(transition_channels * 8, transition_channels * 4),
            Multi_Concat_Block(transition_channels * 8, block_channels * 4,
                               transition_channels * 16, n=n, ids=ids),
        )
        self.dark4 = nn.Sequential(
            Transition_Block(transition_channels * 16, transition_channels * 8),
            Multi_Concat_Block(transition_channels * 16, block_channels * 8,
                               transition_channels * 32, n=n, ids=ids),
        )
        self.dark5 = nn.Sequential(
            Transition_Block(transition_channels * 32, transition_channels * 16),
            Multi_Concat_Block(transition_channels * 32, block_channels * 8,
                               transition_channels * 32, n=n, ids=ids),
        )

        if pretrained:
            url = {
                "l": 'https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/yolov7_backbone_weights.pth',
                "x": 'https://github.com/bubbliiiing/yolov7-pytorch/releases/download/v1.0/yolov7_x_backbone_weights.pth',
            }[phi]
            checkpoint = torch.hub.load_state_dict_from_url(
                url=url, map_location="cpu", model_dir="./model_data")
            self.load_state_dict(checkpoint, strict=False)
            print("Load weights from " + url.split('/')[-1])

    def forward(self, x):
        x = self.stem(x)
        x = self.dark2(x)
        # -----------------------------------------------#
        #   dark3 output: 80, 80, 512 for a 640x640 input
        #   with transition_channels=32; first feature layer
        # -----------------------------------------------#
        x = self.dark3(x)
        feat1 = x
        # -----------------------------------------------#
        #   dark4 output: 40, 40, 1024; second feature layer
        # -----------------------------------------------#
        x = self.dark4(x)
        feat2 = x
        # -----------------------------------------------#
        #   dark5 output: 20, 20, 1024; third feature layer
        # -----------------------------------------------#
        x = self.dark5(x)
        feat3 = x
        return feat1, feat2, feat3


if __name__ == '__main__':
    x = torch.randn(16, 3, 640, 640)
    print("x.shape:", x.shape)
    out1, out2, out3 = Backbone(3, 5, n=4, phi='l')(x)
    print("out1.shape:", out1.shape, '\n', "out2.shape:", out2.shape, '\n', "out3.shape:", out3.shape)
输出:
x.shape: torch.Size([16, 3, 640, 640])
out1.shape: torch.Size([16, 48, 80, 80])
out2.shape: torch.Size([16, 96, 40, 40])
out3.shape: torch.Size([16, 96, 20, 20])
3、FPN特征金字塔
backbone与FPN以及head代码:
import os
import sys

import numpy as np
import torch
import torch.nn as nn

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from nets.backbone import Backbone, Multi_Concat_Block, Conv, SiLU, Transition_Block, autopad


class SPPCSPC(nn.Module):
    """Spatial-pyramid-pooling block with a cross-stage-partial shortcut.

    CSP reference: https://github.com/WongKinYiu/CrossStagePartialNetworks

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        n, shortcut, g: Accepted for signature compatibility; not used here.
        e: Ratio giving the hidden width, ``int(2 * c2 * e)``.
        k: Kernel sizes of the stride-1 max-pool layers applied in parallel.
    """

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(5, 9, 13)):
        super(SPPCSPC, self).__init__()
        c_ = int(2 * c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(c_, c_, 3, 1)
        self.cv4 = Conv(c_, c_, 1, 1)
        self.m = nn.ModuleList(
            [nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]
        )
        # The pooled maps are concatenated with the un-pooled one, so the
        # width is (len(k) + 1) * c_ (equal to 4 * c_ for the default k).
        self.cv5 = Conv((len(k) + 1) * c_, c_, 1, 1)
        self.cv6 = Conv(c_, c_, 3, 1)
        self.cv7 = Conv(2 * c_, c2, 1, 1)

    def forward(self, x):
        x1 = self.cv4(self.cv3(self.cv1(x)))
        pooled = torch.cat([x1] + [m(x1) for m in self.m], 1)
        y1 = self.cv6(self.cv5(pooled))
        y2 = self.cv2(x)
        return self.cv7(torch.cat((y1, y2), dim=1))


class RepConv(nn.Module):
    """Re-parameterisable 3x3 convolution (https://arxiv.org/abs/2101.03697).

    In training form the block sums a 3x3 conv+BN branch, a 1x1 conv+BN branch
    and, when ``c1 == c2`` and ``s == 1``, a BN-only identity branch. In deploy
    form a single biased 3x3 convolution (``rbr_reparam``) is used.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel size; must be 3.
        s: Stride.
        p: Padding; ``autopad(k, p)`` must equal 1.
        g: Number of convolution groups.
        act: ``True`` selects ``LeakyReLU(0.1)``; an ``nn.Module`` instance is
            used directly; anything else disables the activation.
        deploy: Build the single-convolution deploy form directly.
    """

    def __init__(self, c1, c2, k=3, s=1, p=None, g=1, act=SiLU(), deploy=False):
        super(RepConv, self).__init__()
        self.deploy = deploy
        self.groups = g
        self.in_channels = c1
        self.out_channels = c2

        assert k == 3
        assert autopad(k, p) == 1

        padding_11 = autopad(k, p) - k // 2
        if act is True:
            self.act = nn.LeakyReLU(0.1, inplace=True)
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

        if deploy:
            self.rbr_reparam = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=True)
        else:
            self.rbr_identity = (
                nn.BatchNorm2d(num_features=c1, eps=0.001, momentum=0.03)
                if c2 == c1 and s == 1 else None
            )
            self.rbr_dense = nn.Sequential(
                nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False),
                nn.BatchNorm2d(num_features=c2, eps=0.001, momentum=0.03),
            )
            self.rbr_1x1 = nn.Sequential(
                nn.Conv2d(c1, c2, 1, s, padding_11, groups=g, bias=False),
                nn.BatchNorm2d(num_features=c2, eps=0.001, momentum=0.03),
            )

    def forward(self, inputs):
        if hasattr(self, "rbr_reparam"):
            return self.act(self.rbr_reparam(inputs))
        if self.rbr_identity is None:
            id_out = 0
        else:
            id_out = self.rbr_identity(inputs)
        return self.act(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)

    def get_equivalent_kernel_bias(self):
        """Return the single 3x3 kernel and bias equivalent to all branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
        return (
            kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid,
            bias3x3 + bias1x1 + biasid,
        )

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        # Zero-pad a 1x1 kernel by one on every side so it can be added to a
        # 3x3 kernel.
        if kernel1x1 is None:
            return 0
        return nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's BatchNorm into an equivalent ``(kernel, bias)``.

        ``branch`` is ``None`` (returns ``(0, 0)``), a conv+BN ``Sequential``,
        or a bare ``BatchNorm2d`` (the identity branch, for which an identity
        3x3 kernel is built once and cached on ``self.id_tensor``).
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.Sequential):
            kernel = branch[0].weight
            running_mean = branch[1].running_mean
            running_var = branch[1].running_var
            gamma = branch[1].weight
            beta = branch[1].bias
            eps = branch[1].eps
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, "id_tensor"):
                input_dim = self.in_channels // self.groups
                kernel_value = np.zeros(
                    (self.in_channels, input_dim, 3, 3), dtype=np.float32
                )
                for i in range(self.in_channels):
                    kernel_value[i, i % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
            kernel = self.id_tensor
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std

    def repvgg_convert(self):
        """Return the equivalent kernel and bias as NumPy arrays."""
        kernel, bias = self.get_equivalent_kernel_bias()
        return (
            kernel.detach().cpu().numpy(),
            bias.detach().cpu().numpy(),
        )

    def fuse_conv_bn(self, conv, bn):
        """Return a new biased Conv2d equivalent to ``bn(conv(x))``."""
        std = (bn.running_var + bn.eps).sqrt()
        bias = bn.bias - bn.running_mean * bn.weight / std

        t = (bn.weight / std).reshape(-1, 1, 1, 1)
        weights = conv.weight * t

        conv = nn.Conv2d(in_channels=conv.in_channels,
                         out_channels=conv.out_channels,
                         kernel_size=conv.kernel_size,
                         stride=conv.stride,
                         padding=conv.padding,
                         dilation=conv.dilation,
                         groups=conv.groups,
                         bias=True,
                         padding_mode=conv.padding_mode)

        conv.weight = torch.nn.Parameter(weights)
        conv.bias = torch.nn.Parameter(bias)
        return conv

    def fuse_repvgg_block(self):
        """Collapse the training branches into ``rbr_reparam`` in place."""
        if self.deploy:
            return
        print("RepConv.fuse_repvgg_block")
        self.rbr_dense = self.fuse_conv_bn(self.rbr_dense[0], self.rbr_dense[1])

        self.rbr_1x1 = self.fuse_conv_bn(self.rbr_1x1[0], self.rbr_1x1[1])
        rbr_1x1_bias = self.rbr_1x1.bias
        weight_1x1_expanded = torch.nn.functional.pad(self.rbr_1x1.weight, [1, 1, 1, 1])

        # Fuse self.rbr_identity
        if isinstance(self.rbr_identity,
                      (nn.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
            # Express the identity branch as a 1x1 convolution with an
            # identity weight matrix so its BN can be folded like the others.
            identity_conv_1x1 = nn.Conv2d(
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                groups=self.groups,
                bias=False)
            identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.to(
                self.rbr_1x1.weight.data.device)
            identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.squeeze().squeeze()
            identity_conv_1x1.weight.data.fill_(0.0)
            identity_conv_1x1.weight.data.fill_diagonal_(1.0)
            identity_conv_1x1.weight.data = identity_conv_1x1.weight.data.unsqueeze(2).unsqueeze(3)

            identity_conv_1x1 = self.fuse_conv_bn(identity_conv_1x1, self.rbr_identity)
            bias_identity_expanded = identity_conv_1x1.bias
            weight_identity_expanded = torch.nn.functional.pad(
                identity_conv_1x1.weight, [1, 1, 1, 1])
        else:
            bias_identity_expanded = torch.nn.Parameter(torch.zeros_like(rbr_1x1_bias))
            weight_identity_expanded = torch.nn.Parameter(
                torch.zeros_like(weight_1x1_expanded))

        self.rbr_dense.weight = torch.nn.Parameter(
            self.rbr_dense.weight + weight_1x1_expanded + weight_identity_expanded)
        self.rbr_dense.bias = torch.nn.Parameter(
            self.rbr_dense.bias + rbr_1x1_bias + bias_identity_expanded)

        self.rbr_reparam = self.rbr_dense
        self.deploy = True

        # Drop the training-time branches; the attributes are kept as None.
        if self.rbr_identity is not None:
            del self.rbr_identity
            self.rbr_identity = None

        if self.rbr_1x1 is not None:
            del self.rbr_1x1
            self.rbr_1x1 = None

        if self.rbr_dense is not None:
            del self.rbr_dense
            self.rbr_dense = None


def fuse_conv_and_bn(conv, bn):
    """Return a biased Conv2d (``requires_grad=False``) equal to ``bn(conv(x))``.

    Args:
        conv: Convolution whose output feeds ``bn``.
        bn: BatchNorm layer whose running statistics are folded in.
    """
    fusedconv = nn.Conv2d(conv.in_channels,
                          conv.out_channels,
                          kernel_size=conv.kernel_size,
                          stride=conv.stride,
                          padding=conv.padding,
                          groups=conv.groups,
                          bias=True).requires_grad_(False).to(conv.weight.device)

    # Fold the BN scale into the convolution weights.
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))

    # Fold the BN shift (and any existing conv bias) into the bias.
    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
    return fusedconv


# ---------------------------------------------------#
#   yolo_body
# ---------------------------------------------------#
class YoloBody(nn.Module):
    """YoloV7 detector: backbone, FPN/PAN neck and three detection heads.

    Args:
        anchors_mask: Three sequences, one per head, whose lengths give the
            number of anchors predicted by that head (index 0 is used for the
            P5 head, 1 for P4, 2 for P3).
        num_classes: Number of object classes.
        phi: Model variant, ``'l'`` or ``'x'``.
        pretrained: Forwarded to ``Backbone`` to load pretrained weights.

    The forward pass returns ``[out0, out1, out2]``, the raw head outputs for
    P5, P4 and P3, each with ``len(anchors_mask[i]) * (5 + num_classes)``
    channels.
    """

    def __init__(self, anchors_mask, num_classes, phi, pretrained=False):
        super(YoloBody, self).__init__()
        # -----------------------------------------------#
        #   Per-variant hyper-parameters
        # -----------------------------------------------#
        transition_channels = {'l': 32, 'x': 40}[phi]
        block_channels = 32
        panet_channels = {'l': 32, 'x': 64}[phi]
        e = {'l': 2, 'x': 1}[phi]
        n = {'l': 4, 'x': 6}[phi]
        ids = {'l': [-1, -2, -3, -4, -5, -6], 'x': [-1, -3, -5, -7, -8]}[phi]
        conv = {'l': RepConv, 'x': Conv}[phi]
        # ---------------------------------------------------#
        #   Build the backbone. For a 640, 640, 3 input and
        #   phi='l' it yields three feature layers shaped:
        #   80, 80, 512
        #   40, 40, 1024
        #   20, 20, 1024
        # ---------------------------------------------------#
        self.backbone = Backbone(transition_channels, block_channels, n, phi, pretrained=pretrained)

        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        self.sppcspc = SPPCSPC(transition_channels * 32, transition_channels * 16)
        self.conv_for_P5 = Conv(transition_channels * 16, transition_channels * 8)
        self.conv_for_feat2 = Conv(transition_channels * 32, transition_channels * 8)
        self.conv3_for_upsample1 = Multi_Concat_Block(
            transition_channels * 16, panet_channels * 4,
            transition_channels * 8, e=e, n=n, ids=ids)

        self.conv_for_P4 = Conv(transition_channels * 8, transition_channels * 4)
        self.conv_for_feat1 = Conv(transition_channels * 16, transition_channels * 4)
        self.conv3_for_upsample2 = Multi_Concat_Block(
            transition_channels * 8, panet_channels * 2,
            transition_channels * 4, e=e, n=n, ids=ids)

        self.down_sample1 = Transition_Block(transition_channels * 4, transition_channels * 4)
        self.conv3_for_downsample1 = Multi_Concat_Block(
            transition_channels * 16, panet_channels * 4,
            transition_channels * 8, e=e, n=n, ids=ids)

        self.down_sample2 = Transition_Block(transition_channels * 8, transition_channels * 8)
        self.conv3_for_downsample2 = Multi_Concat_Block(
            transition_channels * 32, panet_channels * 8,
            transition_channels * 16, e=e, n=n, ids=ids)

        self.rep_conv_1 = conv(transition_channels * 4, transition_channels * 8, 3, 1)
        self.rep_conv_2 = conv(transition_channels * 8, transition_channels * 16, 3, 1)
        self.rep_conv_3 = conv(transition_channels * 16, transition_channels * 32, 3, 1)

        self.yolo_head_P3 = nn.Conv2d(transition_channels * 8, len(anchors_mask[2]) * (5 + num_classes), 1)
        self.yolo_head_P4 = nn.Conv2d(transition_channels * 16, len(anchors_mask[1]) * (5 + num_classes), 1)
        self.yolo_head_P5 = nn.Conv2d(transition_channels * 32, len(anchors_mask[0]) * (5 + num_classes), 1)

    def fuse(self):
        """Fold BatchNorm into the convolutions in place and return ``self``."""
        print('Fusing layers... ')
        for m in self.modules():
            if isinstance(m, RepConv):
                m.fuse_repvgg_block()
            elif type(m) is Conv and hasattr(m, 'bn'):
                # Exact-type check on purpose: only plain Conv modules are
                # rewired to the BN-free forward.
                m.conv = fuse_conv_and_bn(m.conv, m.bn)
                delattr(m, 'bn')
                m.forward = m.fuseforward
        return self

    def forward(self, x):
        # Backbone features. Calling the module (rather than .forward) keeps
        # any registered hooks working.
        feat1, feat2, feat3 = self.backbone(x)

        # Top-down path: P5 -> P4 -> P3.
        P5 = self.sppcspc(feat3)
        P5_conv = self.conv_for_P5(P5)
        P5_upsample = self.upsample(P5_conv)
        P4 = torch.cat([self.conv_for_feat2(feat2), P5_upsample], 1)
        P4 = self.conv3_for_upsample1(P4)

        P4_conv = self.conv_for_P4(P4)
        P4_upsample = self.upsample(P4_conv)
        P3 = torch.cat([self.conv_for_feat1(feat1), P4_upsample], 1)
        P3 = self.conv3_for_upsample2(P3)

        # Bottom-up path: P3 -> P4 -> P5.
        P3_downsample = self.down_sample1(P3)
        P4 = torch.cat([P3_downsample, P4], 1)
        P4 = self.conv3_for_downsample1(P4)

        P4_downsample = self.down_sample2(P4)
        P5 = torch.cat([P4_downsample, P5], 1)
        P5 = self.conv3_for_downsample2(P5)

        P3 = self.rep_conv_1(P3)
        P4 = self.rep_conv_2(P4)
        P5 = self.rep_conv_3(P5)
        # ---------------------------------------------------#
        #   Third feature layer
        #   y3=(batch_size, 75, 80, 80) for 20 classes, 640 input
        # ---------------------------------------------------#
        out2 = self.yolo_head_P3(P3)
        # ---------------------------------------------------#
        #   Second feature layer
        #   y2=(batch_size, 75, 40, 40)
        # ---------------------------------------------------#
        out1 = self.yolo_head_P4(P4)
        # ---------------------------------------------------#
        #   First feature layer
        #   y1=(batch_size, 75, 20, 20)
        # ---------------------------------------------------#
        out0 = self.yolo_head_P5(P5)

        return [out0, out1, out2]


if __name__ == '__main__':
    x = torch.randn(16, 3, 640, 640)
    print("x.shape:", x.shape)
    # anchors_mask holds anchor *indices* per head; only the number of
    # entries per head matters to YoloBody (three anchors each).
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    out = YoloBody(anchors_mask, 20, 'l')(x)
    for item in out:
        print(item.shape)
输出:
x.shape: torch.Size([16, 3, 640, 640])
torch.Size([16, 75, 20, 20])
torch.Size([16, 75, 40, 40])
torch.Size([16, 75, 80, 80])
二、预测结果的解码
1、 获得预测框、置信度、种类的数值
代码:
def decode_box(self, inputs):
    """Decode raw head outputs into normalised boxes, objectness and class scores.

    Reads ``self.input_shape`` (network input height, width), ``self.anchors``
    (indexed with each entry of ``self.anchors_mask``; each selected row is a
    ``(width, height)`` pair), ``self.bbox_attrs`` and ``self.num_classes``.

    Args:
        inputs: Sequence of head outputs, one per entry of
            ``self.anchors_mask``, each shaped
            ``(batch, num_anchors * bbox_attrs, height, width)``. For a
            640x640 input and 80 classes these are
            ``(batch, 255, 20, 20)``, ``(batch, 255, 40, 40)`` and
            ``(batch, 255, 80, 80)``.

    Returns:
        A list with one detached tensor per head, shaped
        ``(batch, num_anchors * height * width, 4 + 1 + num_classes)``. The
        first four columns are ``cx, cy, w, h`` divided by the feature-map
        width/height; then objectness; then per-class scores.
    """
    outputs = []
    for i, feature in enumerate(inputs):
        batch_size = feature.size(0)
        input_height = feature.size(2)
        input_width = feature.size(3)
        num_anchors = len(self.anchors_mask[i])

        # -----------------------------------------------#
        #   For a 640x640 input: stride_h = stride_w = 32, 16, 8
        # -----------------------------------------------#
        stride_h = self.input_shape[0] / input_height
        stride_w = self.input_shape[1] / input_width
        # -------------------------------------------------#
        #   Anchor sizes expressed in feature-map cells
        # -------------------------------------------------#
        scaled_anchors = [
            (anchor_width / stride_w, anchor_height / stride_h)
            for anchor_width, anchor_height in self.anchors[self.anchors_mask[i]]
        ]

        # -----------------------------------------------#
        #   (batch, A * attrs, H, W) -> (batch, A, H, W, attrs)
        # -----------------------------------------------#
        prediction = feature.view(
            batch_size, num_anchors, self.bbox_attrs, input_height, input_width
        ).permute(0, 1, 3, 4, 2).contiguous()

        # Centre offsets, width/height factors, objectness and class scores
        # all go through a sigmoid.
        x = torch.sigmoid(prediction[..., 0])
        y = torch.sigmoid(prediction[..., 1])
        w = torch.sigmoid(prediction[..., 2])
        h = torch.sigmoid(prediction[..., 3])
        conf = torch.sigmoid(prediction[..., 4])
        pred_cls = torch.sigmoid(prediction[..., 5:])

        # Helper tensors are created directly on the prediction's device (the
        # legacy torch.cuda.FloatTensor constructors use the *current* CUDA
        # device instead) and kept in float32.
        device = prediction.device
        # ----------------------------------------------------------#
        #   Cell coordinates (top-left corner of every grid cell),
        #   broadcast against (batch, A, H, W)
        # ----------------------------------------------------------#
        grid_x = torch.arange(
            input_width, dtype=torch.float32, device=device
        ).view(1, 1, 1, input_width)
        grid_y = torch.arange(
            input_height, dtype=torch.float32, device=device
        ).view(1, 1, input_height, 1)
        # ----------------------------------------------------------#
        #   Per-anchor width/height, broadcast against (batch, A, H, W)
        # ----------------------------------------------------------#
        anchor_wh = torch.tensor(scaled_anchors, dtype=torch.float32, device=device)
        anchor_w = anchor_wh[:, 0].view(1, num_anchors, 1, 1)
        anchor_h = anchor_wh[:, 1].view(1, num_anchors, 1, 1)

        # ----------------------------------------------------------#
        #   Adjust the anchors with the predictions: shift the centre
        #   by (2 * sigmoid - 0.5) from the cell corner, and scale the
        #   anchor size by (2 * sigmoid) ** 2.
        # ----------------------------------------------------------#
        pred_boxes = torch.stack(
            (
                x * 2. - 0.5 + grid_x,
                y * 2. - 0.5 + grid_y,
                (w * 2) ** 2 * anchor_w,
                (h * 2) ** 2 * anchor_h,
            ),
            dim=-1,
        )

        # ----------------------------------------------------------#
        #   Normalise the boxes by the feature-map size
        # ----------------------------------------------------------#
        _scale = torch.tensor(
            [input_width, input_height, input_width, input_height],
            dtype=torch.float32, device=device,
        )
        output = torch.cat(
            (
                pred_boxes.reshape(batch_size, -1, 4) / _scale,
                conf.reshape(batch_size, -1, 1),
                pred_cls.reshape(batch_size, -1, self.num_classes),
            ),
            -1,
        )
        outputs.append(output.detach())
    return outputs
2、得分筛选与非极大抑制
代码:
def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5,
                        nms_thres=0.4):
    """Filter decoded predictions by score and apply per-class NMS.

    Args:
        prediction: Tensor shaped ``(batch, num_anchors, 5 + num_classes)``
            holding ``cx, cy, w, h, obj_conf, class scores...``. It is not
            modified.
        num_classes: Number of class-score columns to consider.
        input_shape, image_shape, letterbox_image: Passed through unchanged to
            ``self.yolo_correct_boxes``.
        conf_thres: Minimum ``obj_conf * class_conf`` for a box to be kept.
        nms_thres: IoU threshold handed to ``nms``.

    Returns:
        A list with one entry per image: ``None`` when nothing passes the
        score filter, otherwise a NumPy array with seven columns. Columns 4-6
        are ``obj_conf, class_conf, class_pred``; the first four hold whatever
        ``self.yolo_correct_boxes`` returns for the box centres and sizes.
    """
    # ----------------------------------------------------------#
    #   Convert centre/size boxes to top-left / bottom-right
    #   corners on a copy, so the caller's tensor stays untouched.
    # ----------------------------------------------------------#
    boxes = prediction.clone()
    half_wh = prediction[:, :, 2:4] / 2
    boxes[:, :, 0:2] = prediction[:, :, 0:2] - half_wh
    boxes[:, :, 2:4] = prediction[:, :, 0:2] + half_wh

    output = [None for _ in range(len(boxes))]
    for i, image_pred in enumerate(boxes):
        # ----------------------------------------------------------#
        #   Best class per box (max over the class columns).
        #   class_conf  [num_anchors, 1]    class confidence
        #   class_pred  [num_anchors, 1]    class index
        # ----------------------------------------------------------#
        class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)

        # ----------------------------------------------------------#
        #   First filtering round by score. The mask is already 1-D;
        #   squeezing it would turn a single-box mask into a 0-dim
        #   tensor and change the shape of the indexed result.
        # ----------------------------------------------------------#
        conf_mask = image_pred[:, 4] * class_conf[:, 0] >= conf_thres

        image_pred = image_pred[conf_mask]
        class_conf = class_conf[conf_mask]
        class_pred = class_pred[conf_mask]
        if not image_pred.size(0):
            continue
        # -------------------------------------------------------------------------#
        #   detections  [num_kept, 7]
        #   columns: x1, y1, x2, y2, obj_conf, class_conf, class_pred
        # -------------------------------------------------------------------------#
        detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)

        # ------------------------------------------#
        #   All classes present among the kept boxes
        #   (computed on the detections' own device)
        # ------------------------------------------#
        unique_labels = detections[:, -1].unique()

        for c in unique_labels:
            # ------------------------------------------#
            #   Kept boxes belonging to this class
            # ------------------------------------------#
            detections_class = detections[detections[:, -1] == c]

            # ------------------------------------------#
            #   The library NMS is faster than a hand-written
            #   sort-and-suppress loop.
            # ------------------------------------------#
            keep = nms(
                detections_class[:, :4],
                detections_class[:, 4] * detections_class[:, 5],
                nms_thres
            )
            max_detections = detections_class[keep]

            # Add max detections to outputs
            output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))

        if output[i] is not None:
            output[i] = output[i].cpu().numpy()
            # Back to centre/size form before mapping to the original image.
            box_xy = (output[i][:, 0:2] + output[i][:, 2:4]) / 2
            box_wh = output[i][:, 2:4] - output[i][:, 0:2]
            output[i][:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
    return output
三、训练过程
1、正样本匹配过程
1.1 匹配anchor与特征点
代码:
def find_3_positive(self, predictions, targets):
    """Match ground-truth boxes to anchors and up to three grid cells each.

    Reads ``self.anchors_mask`` (only the length of its first entry),
    ``self.anchors[i]`` (a NumPy array of anchor sizes for layer ``i``),
    ``self.stride[i]`` and ``self.threshold``.

    Args:
        predictions: One tensor per layer; dimensions 2 and 3 are read as the
            feature-map height and width.
        targets: Tensor shaped ``(num_gt, 6)`` with columns
            ``image, class, x, y, w, h``; the box columns are multiplied by the
            feature-map size, i.e. they are treated as normalised.

    Returns:
        ``(indices, anchors)``; both lists have one entry per layer.
        ``indices[i]`` is ``(image, anchor, grid_y, grid_x)`` as Long tensors
        and ``anchors[i]`` holds the matched anchor sizes in feature-map
        cells.
    """
    # ------------------------------------#
    #   Anchors per layer and number of ground-truth boxes
    # ------------------------------------#
    num_anchor, num_gt = len(self.anchors_mask[0]), targets.shape[0]
    indices, anchors = [], []
    # ------------------------------------#
    #   Seven ones; entries 2:6 are later set to the
    #   feature-map width/height, the rest stay 1.
    # ------------------------------------#
    gain = torch.ones(7, device=targets.device)
    # ------------------------------------#
    #   ai      [num_anchor, num_gt]
    #   targets [num_gt, 6] => [num_anchor, num_gt, 7]
    # ------------------------------------#
    ai = torch.arange(num_anchor, device=targets.device).float().view(num_anchor, 1).repeat(1, num_gt)
    targets = torch.cat((targets.repeat(num_anchor, 1, 1), ai[:, :, None]), 2)  # append anchor indices

    g = 0.5  # offset magnitude, in cells
    off = torch.tensor(
        [
            [0, 0],
            [1, 0], [0, 1], [-1, 0], [0, -1],
        ],
        device=targets.device,
    ).float() * g

    for i in range(len(predictions)):
        # ----------------------------------------------------#
        #   Anchors divided by the stride: sizes in feature-map cells.
        #   anchors_i [num_anchor, 2]
        # ----------------------------------------------------#
        anchors_i = torch.from_numpy(self.anchors[i] / self.stride[i]).type_as(predictions[i])
        # -------------------------------------------#
        #   Feature-map size of this layer
        # -------------------------------------------#
        height, width = predictions[i].shape[2], predictions[i].shape[3]
        gain[2:6] = torch.tensor(
            [width, height, width, height], dtype=gain.dtype, device=gain.device
        )
        # -------------------------------------------#
        #   Map the ground-truth boxes onto the feature map
        # -------------------------------------------#
        t = targets * gain
        if num_gt:
            # -------------------------------------------#
            #   Keep (anchor, box) pairs whose width and height ratios
            #   both stay below the threshold.
            #   r [num_anchor, num_gt, 2]
            #   t [num_anchor, num_gt, 7] => [num_matched, 7]
            # -------------------------------------------#
            r = t[:, :, 4:6] / anchors_i[:, None]
            matched = torch.max(r, 1. / r).max(2)[0] < self.threshold
            t = t[matched]

            # -------------------------------------------#
            #   gxy: box centres measured from the top-left corner
            #   gxi: the same centres measured from the bottom-right
            # -------------------------------------------#
            gxy = t[:, 2:4]
            gxi = gain[[2, 3]] - gxy
            x_low, y_low = ((gxy % 1. < g) & (gxy > 1.)).T
            x_high, y_high = ((gxi % 1. < g) & (gxi > 1.)).T
            # -------------------------------------------#
            #   Five rows, one per offset in ``off``: the cell itself
            #   plus whichever neighbours the centre lies close to.
            # -------------------------------------------#
            select = torch.stack((torch.ones_like(x_low), x_low, y_low, x_high, y_high))
            t = t.repeat((5, 1, 1))[select]
            offsets = (torch.zeros_like(gxy)[None] + off[:, None])[select]
        else:
            t = targets[0]
            offsets = 0

        # -------------------------------------------#
        #   b   image index of each match
        #   gij grid cell assigned to each match
        # -------------------------------------------#
        b = t[:, 0].long()
        gxy = t[:, 2:4]
        gij = (gxy - offsets).long()
        gi, gj = gij.T  # grid x, y indices

        # -------------------------------------------#
        #   a: anchor index. Grid indices are clamped to the feature
        #   map using plain Python ints; a float-tensor bound cannot be
        #   applied in place to a Long tensor on recent PyTorch.
        # -------------------------------------------#
        a = t[:, 6].long()
        indices.append((b, a, gj.clamp_(0, height - 1), gi.clamp_(0, width - 1)))  # image, anchor, grid indices
        anchors.append(anchors_i[a])

    return indices, anchors
1.2 SimOTA自适应匹配
代码:
def build_targets(self, predictions, targets, imgs):
    """Pick the final positive samples with a SimOTA-style dynamic matching.

    Candidates come from ``self.find_3_positive``. For every image the
    candidates of all layers are decoded into boxes, a cost made of a
    classification term and an IoU term is computed against each ground
    truth, and every ground truth keeps its ``dynamic_k`` cheapest
    candidates. A candidate claimed by several ground truths is given to the
    cheapest one.

    Args:
        predictions: list of per-layer tensors indexed as
            ``prediction[image, anchor, grid_y, grid_x]`` with a last
            dimension laid out as (x, y, w, h, obj, classes...).
        targets: tensor [num_gt, 6]; column 0 is the image index, column 1
            the class id, columns 2:6 the normalised (x, y, w, h).
        imgs: batch of images; only ``imgs[i].shape[1]`` is read.

    Returns:
        Six lists with one tensor per layer: image indices, anchor indices,
        grid y, grid x, matched target rows and matched anchor sizes.
    """
    indices, anch = self.find_3_positive(predictions, targets)

    num_layer = len(predictions)
    matching_bs = [[] for _ in range(num_layer)]
    matching_as = [[] for _ in range(num_layer)]
    matching_gjs = [[] for _ in range(num_layer)]
    matching_gis = [[] for _ in range(num_layer)]
    matching_targets = [[] for _ in range(num_layer)]
    matching_anchs = [[] for _ in range(num_layer)]

    # Matching is done image by image; layers are merged inside each image.
    for batch_idx in range(predictions[0].shape[0]):
        this_target = targets[targets[:, 0] == batch_idx]
        if this_target.shape[0] == 0:
            continue

        # Scale the normalised boxes to pixels. Both axes use shape[1] of
        # the image tensor. NOTE(review): this presumably relies on square
        # inputs; confirm against the data pipeline.
        txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1]
        txyxy = self.xywh2xyxy(txywh)

        pxyxys, p_cls, p_obj = [], [], []
        from_which_layer = []
        all_b, all_a, all_gj, all_gi, all_anch = [], [], [], [], []

        for i, prediction in enumerate(predictions):
            # b: image index, a: anchor index, gj: grid y, gi: grid x
            b, a, gj, gi = indices[i]
            idx = b == batch_idx
            b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx]

            all_b.append(b)
            all_a.append(a)
            all_gj.append(gj)
            all_gi.append(gi)
            all_anch.append(anch[i][idx])
            # Created on the same device as the indices so that the boolean
            # mask applied further down can index it.
            from_which_layer.append(torch.full((len(b),), i, dtype=torch.long, device=b.device))

            # Raw predictions at the candidate positions.
            fg_pred = prediction[b, a, gj, gi]
            p_obj.append(fg_pred[:, 4:5])
            p_cls.append(fg_pred[:, 5:])

            # Decode the candidates into pixel-space boxes.
            grid = torch.stack([gi, gj], dim=1).type_as(fg_pred)
            pxy = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i]
            pwh = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i]
            pxywh = torch.cat([pxy, pwh], dim=-1)
            pxyxys.append(self.xywh2xyxy(pxywh))

        # Skip the image when no candidate exists at all.
        pxyxys = torch.cat(pxyxys, dim=0)
        if pxyxys.shape[0] == 0:
            continue

        p_obj = torch.cat(p_obj, dim=0)
        p_cls = torch.cat(p_cls, dim=0)
        from_which_layer = torch.cat(from_which_layer, dim=0)
        all_b = torch.cat(all_b, dim=0)
        all_a = torch.cat(all_a, dim=0)
        all_gj = torch.cat(all_gj, dim=0)
        all_gi = torch.cat(all_gi, dim=0)
        all_anch = torch.cat(all_anch, dim=0)

        # IoU between every ground truth and every candidate. -log(iou)
        # shrinks as the overlap grows, so good overlaps are cheap.
        pair_wise_iou = self.box_iou(txyxy, pxyxys)
        pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8)

        # dynamic_k: sum of the (at most 20) best IoUs per ground truth,
        # truncated to an int and never below 1.
        top_k, _ = torch.topk(pair_wise_iou, min(20, pair_wise_iou.shape[1]), dim=1)
        dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1)

        # One-hot class target repeated for every candidate.
        gt_cls_per_image = (
            F.one_hot(this_target[:, 1].to(torch.int64), self.num_classes)
            .float()
            .unsqueeze(1)
            .repeat(1, pxyxys.shape[0], 1)
        )

        # Class score = sqrt(sigmoid(cls) * sigmoid(obj)); it is converted
        # back to a logit so the BCE-with-logits helper can be reused.
        num_gt = this_target.shape[0]
        cls_preds_ = (
            p_cls.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            * p_obj.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
        )
        y = cls_preds_.sqrt_()
        pair_wise_cls_loss = F.binary_cross_entropy_with_logits(
            torch.log(y / (1 - y)), gt_cls_per_image, reduction="none"
        ).sum(-1)

        cost = pair_wise_cls_loss + 3.0 * pair_wise_iou_loss

        # Every ground truth takes its dynamic_k cheapest candidates.
        matching_matrix = torch.zeros_like(cost)
        for gt_idx in range(num_gt):
            _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
            matching_matrix[gt_idx][pos_idx] = 1.0

        # A candidate claimed by several ground truths keeps only the
        # cheapest assignment.
        anchor_matching_gt = matching_matrix.sum(0)
        if (anchor_matching_gt > 1).sum() > 0:
            _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
            matching_matrix[:, anchor_matching_gt > 1] *= 0.0
            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
        fg_mask_inboxes = matching_matrix.sum(0) > 0.0
        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)

        # Keep the selected candidates only.
        from_which_layer = from_which_layer[fg_mask_inboxes]
        all_b = all_b[fg_mask_inboxes]
        all_a = all_a[fg_mask_inboxes]
        all_gj = all_gj[fg_mask_inboxes]
        all_gi = all_gi[fg_mask_inboxes]
        all_anch = all_anch[fg_mask_inboxes]
        this_target = this_target[matched_gt_inds]

        # Hand the matches back to the layer they came from.
        for i in range(num_layer):
            layer_idx = from_which_layer == i
            matching_bs[i].append(all_b[layer_idx])
            matching_as[i].append(all_a[layer_idx])
            matching_gjs[i].append(all_gj[layer_idx])
            matching_gis[i].append(all_gi[layer_idx])
            matching_targets[i].append(this_target[layer_idx])
            matching_anchs[i].append(all_anch[layer_idx])

    def _merge(chunks):
        # Concatenate the per-image pieces; an empty tensor on the target
        # device stands in when no image produced a match.
        if chunks:
            return torch.cat(chunks, dim=0)
        return torch.empty(0, device=targets.device)

    for i in range(num_layer):
        matching_bs[i] = _merge(matching_bs[i])
        matching_as[i] = _merge(matching_as[i])
        matching_gjs[i] = _merge(matching_gjs[i])
        matching_gis[i] = _merge(matching_gis[i])
        matching_targets[i] = _merge(matching_targets[i])
        matching_anchs[i] = _merge(matching_anchs[i])

    return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs
2、Loss的组成
代码:
def smooth_BCE(eps=0.1):
    """Return the (positive, negative) BCE targets for label smoothing.

    See https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
    """
    return 1.0 - 0.5 * eps, 0.5 * eps


class YOLOLoss(nn.Module):
    """Box + objectness + classification loss with SimOTA-style matching."""

    def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0):
        """Store the anchors and the loss weights.

        Args:
            anchors: array indexed with each entry of ``anchors_mask``; it is
                later divided by the stride and passed to
                ``torch.from_numpy``, so a NumPy array is expected.
            num_classes: number of object classes.
            input_shape: (height, width) of the network input.
            anchors_mask: anchor indices used by each detection layer, in the
                same order as ``self.stride`` ([32, 16, 8]).
            label_smoothing: ``eps`` forwarded to ``smooth_BCE``.
        """
        super(YOLOLoss, self).__init__()
        # Copy the mask so the shared default list is never aliased.
        anchors_mask = [list(mask) for mask in anchors_mask]
        # One anchor group per layer. Per the original comment: the coarsest
        # layer uses [142,110],[192,243],[459,401], the middle layer
        # [36,75],[76,55],[72,146] and the finest [12,16],[19,36],[40,28].
        self.anchors = [anchors[mask] for mask in anchors_mask]
        self.num_classes = num_classes
        self.input_shape = input_shape
        self.anchors_mask = anchors_mask

        self.balance = [0.4, 1.0, 4]     # per-layer objectness weights
        self.stride = [32, 16, 8]

        self.box_ratio = 0.05
        self.obj_ratio = 1 * (input_shape[0] * input_shape[1]) / (640 ** 2)
        self.cls_ratio = 0.5 * (num_classes / 80)
        self.threshold = 4               # max box/anchor size ratio

        self.cp, self.cn = smooth_BCE(eps=label_smoothing)
        self.BCEcls, self.BCEobj, self.gr = nn.BCEWithLogitsLoss(), nn.BCEWithLogitsLoss(), 1

    def bbox_iou(self, box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
        """Element-wise IoU (or GIoU / DIoU / CIoU) between two box sets.

        ``box1`` is indexed along its first dimension (4 x n); ``box2`` is
        given as n x 4 and transposed here. With ``x1y1x2y2=False`` the
        boxes are read as (cx, cy, w, h).
        """
        box2 = box2.T

        if x1y1x2y2:
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
        else:
            b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
            b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
            b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
            b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

        inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
                (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

        w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
        w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
        union = w1 * h1 + w2 * h2 - inter + eps

        iou = inter / union
        if not (GIoU or DIoU or CIoU):
            return iou

        # Width and height of the smallest box enclosing both boxes.
        cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)
        ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)
        if CIoU or DIoU:  # https://arxiv.org/abs/1911.08287v1
            c2 = cw ** 2 + ch ** 2 + eps  # enclosing diagonal squared
            rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
                    (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # centre distance squared
            if DIoU:
                return iou - rho2 / c2
            # CIoU: adds an aspect-ratio consistency term.
            v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
            with torch.no_grad():
                alpha = v / (v - iou + (1 + eps))
            return iou - (rho2 / c2 + v * alpha)
        # GIoU https://arxiv.org/pdf/1902.09630.pdf
        c_area = cw * ch + eps
        return iou - (c_area - union) / c_area

    def forward(self, predictions, targets, imgs):
        """Compute the total loss for one batch.

        Args:
            predictions: list of raw head outputs [bs, A * C, h, w], where A
                is the number of anchors of the layer.
            targets: tensor [num_gt, 6]: image index, class id and
                normalised (x, y, w, h).
            imgs: image batch, forwarded to ``build_targets``.

        Returns:
            Tensor of shape (1,) holding the weighted sum of the box,
            objectness and classification losses.
        """
        # [bs, A * C, h, w] => [bs, A, h, w, C]. A new list is built so the
        # caller's list is left untouched.
        reshaped = []
        for i, prediction in enumerate(predictions):
            batch, _, h, w = prediction.size()
            reshaped.append(
                prediction.view(batch, len(self.anchors_mask[i]), -1, h, w)
                .permute(0, 1, 3, 4, 2)
                .contiguous()
            )
        predictions = reshaped

        device = targets.device
        cls_loss = torch.zeros(1, device=device)
        box_loss = torch.zeros(1, device=device)
        obj_loss = torch.zeros(1, device=device)

        # Positive sample assignment.
        bs, as_, gjs, gis, targets, anchors = self.build_targets(predictions, targets, imgs)

        # (w, h, w, h) of every feature map, used to scale the targets.
        feature_map_sizes = [
            torch.tensor(prediction.shape, device=device)[[3, 2, 3, 2]].type_as(prediction)
            for prediction in predictions
        ]

        for i, prediction in enumerate(predictions):
            # image, anchor, grid y, grid x of the positives of this layer
            b, a, gj, gi = bs[i], as_[i], gjs[i], gis[i]
            tobj = torch.zeros_like(prediction[..., 0], device=device)  # objectness target

            n = b.shape[0]
            if n:
                prediction_pos = prediction[b, a, gj, gi]

                # Decode the predicted boxes relative to their grid cell.
                grid = torch.stack([gi, gj], dim=1)
                xy = prediction_pos[:, :2].sigmoid() * 2. - 0.5
                wh = (prediction_pos[:, 2:4].sigmoid() * 2) ** 2 * anchors[i]
                box = torch.cat((xy, wh), 1)

                # Ground truth in grid units, relative to the same cell.
                selected_tbox = targets[i][:, 2:6] * feature_map_sizes[i]
                selected_tbox[:, :2] -= grid.type_as(prediction)

                # Regression loss.
                iou = self.bbox_iou(box.T, selected_tbox, x1y1x2y2=False, CIoU=True)
                box_loss += (1.0 - iou).mean()

                # The objectness target is the (clamped) IoU of the match.
                tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * iou.detach().clamp(0).type(tobj.dtype)

                # Classification loss on the positives.
                selected_tcls = targets[i][:, 1].long()
                t = torch.full_like(prediction_pos[:, 5:], self.cn, device=device)
                t[range(n), selected_tcls] = self.cp
                cls_loss += self.BCEcls(prediction_pos[:, 5:], t)

            # Objectness loss over the whole layer, weighted per layer.
            obj_loss += self.BCEobj(prediction[..., 4], tobj) * self.balance[i]

        box_loss *= self.box_ratio
        obj_loss *= self.obj_ratio
        cls_loss *= self.cls_ratio

        loss = box_loss + obj_loss + cls_loss
        return loss

    def xywh2xyxy(self, x):
        """Convert n x 4 boxes from (cx, cy, w, h) to (x1, y1, x2, y2)."""
        y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
        y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
        y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
        y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
        y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
        return y

    def box_iou(self, box1, box2):
        """Return the pairwise IoU matrix of two (x1, y1, x2, y2) box sets.

        Adapted from
        https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py

        Arguments:
            box1 (Tensor[N, 4])
            box2 (Tensor[M, 4])
        Returns:
            iou (Tensor[N, M]): pairwise IoU of every box in ``box1`` with
            every box in ``box2``.
        """
        def box_area(box):
            # box = 4 x n
            return (box[2] - box[0]) * (box[3] - box[1])

        area1 = box_area(box1.T)
        area2 = box_area(box2.T)

        # inter(N, M) = (rb(N, M, 2) - lt(N, M, 2)).clamp(0).prod(2)
        inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
        return inter / (area1[:, None] + area2 - inter)

    def build_targets(self, predictions, targets, imgs):
        """Pick the final positive samples with a SimOTA-style matching.

        Candidates come from ``find_3_positive``. For every image the
        candidates of all layers are decoded into boxes, a cost made of a
        classification term and an IoU term is computed against each ground
        truth, and every ground truth keeps its ``dynamic_k`` cheapest
        candidates. A candidate claimed by several ground truths is given to
        the cheapest one.

        Returns:
            Six lists with one tensor per layer: image indices, anchor
            indices, grid y, grid x, matched target rows and matched anchor
            sizes.
        """
        indices, anch = self.find_3_positive(predictions, targets)

        num_layer = len(predictions)
        matching_bs = [[] for _ in range(num_layer)]
        matching_as = [[] for _ in range(num_layer)]
        matching_gjs = [[] for _ in range(num_layer)]
        matching_gis = [[] for _ in range(num_layer)]
        matching_targets = [[] for _ in range(num_layer)]
        matching_anchs = [[] for _ in range(num_layer)]

        # Matching is done image by image; layers are merged per image.
        for batch_idx in range(predictions[0].shape[0]):
            this_target = targets[targets[:, 0] == batch_idx]
            if this_target.shape[0] == 0:
                continue

            # Scale the normalised boxes to pixels. Both axes use shape[1]
            # of the image tensor. NOTE(review): this presumably relies on
            # square inputs; confirm against the data pipeline.
            txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1]
            txyxy = self.xywh2xyxy(txywh)

            pxyxys, p_cls, p_obj = [], [], []
            from_which_layer = []
            all_b, all_a, all_gj, all_gi, all_anch = [], [], [], [], []

            for i, prediction in enumerate(predictions):
                # b: image index, a: anchor index, gj: grid y, gi: grid x
                b, a, gj, gi = indices[i]
                idx = b == batch_idx
                b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx]

                all_b.append(b)
                all_a.append(a)
                all_gj.append(gj)
                all_gi.append(gi)
                all_anch.append(anch[i][idx])
                # Same device as the indices, so the boolean mask applied
                # further down can index it.
                from_which_layer.append(torch.full((len(b),), i, dtype=torch.long, device=b.device))

                # Raw predictions at the candidate positions.
                fg_pred = prediction[b, a, gj, gi]
                p_obj.append(fg_pred[:, 4:5])
                p_cls.append(fg_pred[:, 5:])

                # Decode the candidates into pixel-space boxes.
                grid = torch.stack([gi, gj], dim=1).type_as(fg_pred)
                pxy = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i]
                pwh = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i]
                pxywh = torch.cat([pxy, pwh], dim=-1)
                pxyxys.append(self.xywh2xyxy(pxywh))

            # Skip the image when no candidate exists at all.
            pxyxys = torch.cat(pxyxys, dim=0)
            if pxyxys.shape[0] == 0:
                continue

            p_obj = torch.cat(p_obj, dim=0)
            p_cls = torch.cat(p_cls, dim=0)
            from_which_layer = torch.cat(from_which_layer, dim=0)
            all_b = torch.cat(all_b, dim=0)
            all_a = torch.cat(all_a, dim=0)
            all_gj = torch.cat(all_gj, dim=0)
            all_gi = torch.cat(all_gi, dim=0)
            all_anch = torch.cat(all_anch, dim=0)

            # IoU between every ground truth and every candidate. -log(iou)
            # shrinks as the overlap grows, so good overlaps are cheap.
            pair_wise_iou = self.box_iou(txyxy, pxyxys)
            pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8)

            # dynamic_k: sum of the (at most 20) best IoUs per ground truth,
            # truncated to an int and never below 1.
            top_k, _ = torch.topk(pair_wise_iou, min(20, pair_wise_iou.shape[1]), dim=1)
            dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1)

            # One-hot class target repeated for every candidate.
            gt_cls_per_image = (
                F.one_hot(this_target[:, 1].to(torch.int64), self.num_classes)
                .float()
                .unsqueeze(1)
                .repeat(1, pxyxys.shape[0], 1)
            )

            # Class score = sqrt(sigmoid(cls) * sigmoid(obj)); converted back
            # to a logit so the BCE-with-logits helper can be reused.
            num_gt = this_target.shape[0]
            cls_preds_ = (
                p_cls.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
                * p_obj.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            )
            y = cls_preds_.sqrt_()
            pair_wise_cls_loss = F.binary_cross_entropy_with_logits(
                torch.log(y / (1 - y)), gt_cls_per_image, reduction="none"
            ).sum(-1)

            cost = pair_wise_cls_loss + 3.0 * pair_wise_iou_loss

            # Every ground truth takes its dynamic_k cheapest candidates.
            matching_matrix = torch.zeros_like(cost)
            for gt_idx in range(num_gt):
                _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
                matching_matrix[gt_idx][pos_idx] = 1.0

            # A candidate claimed by several ground truths keeps only the
            # cheapest assignment.
            anchor_matching_gt = matching_matrix.sum(0)
            if (anchor_matching_gt > 1).sum() > 0:
                _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
                matching_matrix[:, anchor_matching_gt > 1] *= 0.0
                matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
            fg_mask_inboxes = matching_matrix.sum(0) > 0.0
            matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)

            # Keep the selected candidates only.
            from_which_layer = from_which_layer[fg_mask_inboxes]
            all_b = all_b[fg_mask_inboxes]
            all_a = all_a[fg_mask_inboxes]
            all_gj = all_gj[fg_mask_inboxes]
            all_gi = all_gi[fg_mask_inboxes]
            all_anch = all_anch[fg_mask_inboxes]
            this_target = this_target[matched_gt_inds]

            # Hand the matches back to the layer they came from.
            for i in range(num_layer):
                layer_idx = from_which_layer == i
                matching_bs[i].append(all_b[layer_idx])
                matching_as[i].append(all_a[layer_idx])
                matching_gjs[i].append(all_gj[layer_idx])
                matching_gis[i].append(all_gi[layer_idx])
                matching_targets[i].append(this_target[layer_idx])
                matching_anchs[i].append(all_anch[layer_idx])

        def _merge(chunks):
            # Concatenate the per-image pieces; an empty tensor on the
            # target device stands in when no image produced a match.
            if chunks:
                return torch.cat(chunks, dim=0)
            return torch.empty(0, device=targets.device)

        for i in range(num_layer):
            matching_bs[i] = _merge(matching_bs[i])
            matching_as[i] = _merge(matching_as[i])
            matching_gjs[i] = _merge(matching_gjs[i])
            matching_gis[i] = _merge(matching_gis[i])
            matching_targets[i] = _merge(matching_targets[i])
            matching_anchs[i] = _merge(matching_anchs[i])

        return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs

    def find_3_positive(self, predictions, targets):
        """Collect candidate positive samples for every detection layer.

        A ground-truth box is kept for an anchor when the width/height ratio
        between box and anchor (in either direction) stays below
        ``self.threshold``. Each kept box is assigned to the grid cell
        holding its centre and, when the centre lies in the matching half of
        the cell, to the horizontally and vertically adjacent cells as well.

        Returns:
            (indices, anchors): two lists with one entry per layer. Each
            ``indices`` entry is a tuple ``(image, anchor, grid_y, grid_x)``
            of long tensors, each ``anchors`` entry holds the matching
            anchor sizes expressed in grid units.
        """
        # Number of anchors per layer and number of ground-truth boxes.
        num_anchor, num_gt = len(self.anchors_mask[0]), targets.shape[0]
        indices, anchors = [], []

        # Seven scale factors: entries 2:6 are overwritten per layer with
        # the feature-map size, the remaining entries stay 1.
        gain = torch.ones(7, device=targets.device)

        # ai      [num_anchor, num_gt]
        # targets [num_gt, 6] => [num_anchor, num_gt, 7]
        ai = torch.arange(num_anchor, device=targets.device).float().view(num_anchor, 1).repeat(1, num_gt)
        targets = torch.cat((targets.repeat(num_anchor, 1, 1), ai[:, :, None]), 2)

        g = 0.5  # half-cell offset
        off = torch.tensor(
            [[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]],
            device=targets.device,
        ).float() * g

        for i, prediction in enumerate(predictions):
            # Anchors divided by the stride, i.e. expressed in grid units.
            # anchors_i [num_anchor, 2]
            anchors_i = torch.from_numpy(self.anchors[i] / self.stride[i]).type_as(prediction)

            # Feature-map size as plain ints; reused for clamping below.
            grid_h, grid_w = int(prediction.shape[2]), int(prediction.shape[3])
            gain[2:6] = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=targets.device)

            # Map the normalised boxes onto this feature map.
            t = targets * gain
            if num_gt:
                # r [num_anchor, num_gt, 2]: box / anchor size ratio.
                # t [num_anchor, num_gt, 7] => [num_matched, 7]
                r = t[:, :, 4:6] / anchors_i[:, None]
                keep = torch.max(r, 1. / r).max(2)[0] < self.threshold
                t = t[keep]

                # gxy: centre measured from the top-left corner of the map,
                # gxi: the same centre measured from the bottom-right corner.
                gxy = t[:, 2:4]
                gxi = gain[[2, 3]] - gxy
                left, top = ((gxy % 1. < g) & (gxy > 1.)).T
                right, bottom = ((gxi % 1. < g) & (gxi > 1.)).T
                # Five rows, one per entry of `off`: the cell itself plus
                # the four neighbours that qualify.
                select = torch.stack((torch.ones_like(left), left, top, right, bottom))

                t = t.repeat((5, 1, 1))[select]
                offsets = (torch.zeros_like(gxy)[None] + off[:, None])[select]
            else:
                t = targets[0]
                offsets = 0

            b = t[:, 0].long()                 # image index
            gxy = t[:, 2:4]                    # centre in grid units
            gij = (gxy - offsets).long()       # cell that owns the sample
            gi, gj = gij.T                     # grid x / grid y
            a = t[:, 6].long()                 # anchor index

            # Clamp with Python ints: a float tensor bound cannot be cast
            # back into the long index tensors by the in-place clamp on
            # recent PyTorch.
            indices.append((b, a, gj.clamp_(0, grid_h - 1), gi.clamp_(0, grid_w - 1)))
            anchors.append(anchors_i[a])

        return indices, anchors
YoloV7目标检测(Pytorch版)【详解】相关推荐
- 深度篇——目标检测史(七) 细说 YOLO-V3目标检测 之 代码详解
返回主目录 返回 目标检测史 目录 上一章:深度篇--目标检测史(六) 细说 YOLO-V3目标检测 下一章:深度篇--目标检测史(八) 细说 CornerNet-Lite 目标检测 论文地址:< ...
- window10下拯救者笔记本RTX3060laptop配置CUDA11.0 pytorch版详解
cuda版本11.0 torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 可以直接用命令: pip install torch== ...
- 目标检测 RCNN算法详解
原文:http://blog.csdn.net/shenxiaolu1984/article/details/51066975 [目标检测]RCNN算法详解 Girshick, Ross, et al ...
- 目标检测算法——SSD详解
目录 一. 背景(基本介绍) 二. 网络结构 三. 具体过程 1. default box 匹配 2. 损失函数 3. 数据增广 4. Atrous Algorithm 5. NMS(非极大值抑制) ...
- 目标检测算法YOLO-V2详解
❝ 上期我们一起学习了YOLO-V1算法的框架原来和损失函数等知识,如下: 目标检测算法YOLO-V1算法详解 目标检测模型YOLO-V1损失函数详解 [文末领福利] ❞ 今天,我们一起学习下YOLO ...
- 目标检测指标mAP详解
前言 相信刚刚接触目标检测的小伙伴也是有点疑惑吧,目标检测的知识点和模型属实有点多,想要工作找CV的话,目标检测是必须掌握的方向了.我记得在找实习的时候,面试官就问到了我目标检测的指标是什么,答:mA ...
- 【三维目标检测】Complex-Yolov4详解(二):模型结构
Complex-Yolo网络模型的核心思想是用鸟瞰图BEV替换Yolo网络输入的RGB图像.因此,在完成BEV处理之后,模型的训练和推理过程基本和Yolo完全一致.Yolov4中输入的RGB图片的尺寸 ...
- 【三维目标检测】Complex-Yolov4详解(一): 数据处理
前面分别介绍了基于点云的三维深度学习算法PointNet.PointNet++,和基于体素的三维深度学习算法VoxelNet.本节将开始介绍基于投影的三维深度学习算法Complex-Yolov4.三维 ...
- Face Paper: 目标检测RSSD论文详解
转载: http://blog.csdn.net/u014380165/article/details/77130922 论文:Enhancement of SSD by concatenating ...
- 强烈推荐 | 基于飞桨的五大目标检测模型实战详解
机器视觉领域的核心问题之一就是目标检测(object detection),它的任务是找出图像当中所有感兴趣的目标(物体),确定其位置和大小.对于人类来说,目标检测是一个非常简单的任务.然而,计算机能 ...
最新文章
- 记录:自制的小说推荐程序(一)
- springboot集成shiro无法加载样式status302
- winform TreeView 节点选择
- MobileNet论文阅读笔记
- linux内核优化脚本,linux内核高级优化脚本
- javacript中的mvc设计模式
- VSCode打开多个项目文件夹的解决方法
- Java面试的基础题20190301
- Java进阶03 IO基础
- 通俗易懂讲解Java线程安全
- Himall商城LinqHelper帮助类(3)
- 网易云接口获取音乐(转载练习)
- 数字后端设计流程小结
- 5S管理卫生考评办法
- 上海数据分析师面试经历
- Appium+python自动化(二十一)- 让猴子按你指令大闹手机,让我们都成为耍猴高手(超详解)...
- 【Java数据结构与算法】Java如何实现环形队列
- Android 项目集成有米 SDK 添加广告
- usrp b210 参数记录
- python基础_字典_列表_元组考试
热门文章
- 让两个 mysql 自动同步_实现两个Mysql数据库之间同步的方案
- JRE的安装及环境变量配置
- [DP] [贪心] [Vijos P1417] 魔法塔防 (mtower)
- 利用通达信科建站系统通用型漏洞,Get Shell多家高校全过程
- 学习Python的三种境界
- ZBrush:PolyGroup基础
- 【Python plotly】零基础也能轻松掌握的学习路线与参考资料
- VB网络编程(webbrowser应用及Inet抓包封包)
- rebase 用法小结
- 明码标价:谈待遇的要领和禁忌