First, the reference code for this article comes from author one and author two.

A brief overview of the YOLO v1 principle

Compared with some other object detection algorithms, the principle behind YOLO v1 is relatively simple: YOLO uses a single CNN model to perform end-to-end object detection.

Detection framework

The framework divides the input image into an S×S grid. Each grid cell predicts two bounding boxes, and each bbox consists of five predicted values: x, y, w, h, c. The first four are normalized coordinates; c is the confidence, which reflects whether the predicted box contains an object and how accurate the box is. Each cell additionally predicts probabilities for 20 assumed classes, so the final network output is S×S×(2×5+20) = S×S×30. The paper divides the image into a 7×7 grid, so S=7.

The confidence c is computed as

c = Pr(Object) × IOU(pred, truth)

i.e. the probability that the cell contains an object, multiplied by the IoU between the predicted box and the ground truth.

Each bbox has a corresponding confidence score. If the current grid cell contains no object, the confidence should be 0; if it does, the confidence score equals the IoU between the predicted box and the ground truth. So how do we decide whether a grid cell contains an object? The author does it as follows: if the center of an object's ground-truth box falls inside a grid cell, that cell is considered to contain the object and is responsible for predicting it. (This also means each grid cell can only predict one object, so for densely packed objects missed detections are very likely, because the centers of several objects may fall into the same cell.)
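
As a quick standalone illustration of this assignment rule (the numbers below are made up, not taken from the project's code):

# A standalone sketch of the "which cell is responsible" rule (illustrative values only).
S = 7
img_w, img_h = 448, 448

gt = [100, 150, 300, 350]          # ground-truth box in pixels: [x1, y1, x2, y2]
cx = (gt[0] + gt[2]) / 2 / img_w   # normalized center x
cy = (gt[1] + gt[3]) / 2 / img_h   # normalized center y

col = int(cx * S)                  # grid column containing the center, 0..6
row = int(cy * S)                  # grid row containing the center, 0..6
print(row, col)                    # only this cell is responsible for predicting the object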

Network structure

In terms of model structure, the paper essentially follows GoogLeNet.


To make it convenient to use PyTorch's pretrained weights, ResNet-50 is used here instead.

import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import torch
import torchvision.models as models
from torch.autograd import Variable

__all__ = ['ResNet', 'resnet50']

model_urls = {'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth'}


def conv3x3(in_planes, out_planes, stride=1):
    "3x3 convolution with padding"
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class detnet_bottleneck(nn.Module):
    # no expansion
    # dilation = 2
    # type B use 1x1 conv
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, block_type='A'):
        super(detnet_bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=2, bias=False, dilation=2)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)
        self.downsample = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes or block_type == 'B':
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.downsample(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1470):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # self.layer5 = self._make_layer(block, 512, layers[3], stride=2)
        self.layer5 = self._make_detnet_layer(in_channels=2048)
        self.avgpool = nn.AvgPool2d(2)  # fit 448 input size
        # self.fc = nn.Linear(512 * block.expansion, num_classes)
        self.conv_end = nn.Conv2d(256, 30, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn_end = nn.BatchNorm2d(30)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def _make_detnet_layer(self, in_channels):
        layers = []
        layers.append(detnet_bottleneck(in_planes=in_channels, planes=256, block_type='B'))
        layers.append(detnet_bottleneck(in_planes=256, planes=256, block_type='A'))
        layers.append(detnet_bottleneck(in_planes=256, planes=256, block_type='A'))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.avgpool(x)
        # x = x.view(x.size(0), -1)
        # x = self.fc(x)
        x = self.conv_end(x)
        x = self.bn_end(x)
        x = torch.sigmoid(x)
        # x = x.view(-1,7,7,30)
        x = x.permute(0, 2, 3, 1)  # (-1,7,7,30)
        return x


def resnet50(pretrained=False):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3])
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

As the network above shows, the final output has 30 values per grid cell, matching the discussion earlier: 8 values are the coordinates of the two boxes, 2 are the box confidences, and the remaining 20 are class probabilities. The coordinates x, y are normalized to 0-1 as offsets relative to the grid cell, while w, h are normalized to 0-1 by the image width and height.
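
To make this parameterization concrete, here is a small standalone sketch (made-up values, not the project's decoder) that turns one cell's predicted (x, y, w, h) back into a pixel-space box:

# Decode one cell's prediction back to pixel coordinates (illustrative values only).
S = 7
img_w, img_h = 448, 448
row, col = 3, 3                    # the grid cell this prediction belongs to
x, y, w, h = 0.4, 0.6, 0.5, 0.3    # network outputs, all in [0, 1]

cell = 1.0 / S
cx = (col + x) * cell * img_w      # x, y are offsets inside the cell
cy = (row + y) * cell * img_h
bw = w * img_w                     # w, h are fractions of the whole image
bh = h * img_h

x1, y1 = cx - bw / 2, cy - bh / 2
x2, y2 = cx + bw / 2, cy + bh / 2
print([x1, y1, x2, y2])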

Loss function

When implementing the network, the most important part is designing the loss function. To balance the three components (localization, confidence, and classification), the author takes the simple, brute-force approach of using sum-squared error loss for all of them.
However, this approach has several problems:
First, treating the 8-dimensional localization error and the 20-dimensional classification error as equally important is clearly unreasonable.
Second, if a grid cell contains no object (and most cells in an image don't), its box confidences are pushed toward 0; because such cells vastly outnumber the cells that do contain objects, this can make the network unstable or even cause training to diverge.
The author proposes the following remedies:

  1. Put more emphasis on the 8-dimensional coordinate predictions by giving their loss a larger weight, denoted l_coord (λcoord in the paper), set to 5 for Pascal VOC training.

  2. For the confidence loss of boxes that contain no object, use a smaller loss weight, denoted l_noobj (λnoobj in the paper), set to 0.5 for Pascal VOC training.

  3. The confidence loss of boxes that do contain an object and the classification loss keep the normal weight of 1.

  4. For boxes of different sizes, the same small deviation is far less tolerable for a small box than for a large one, yet sum-squared error assigns the same loss to the same absolute offset.
    To soften this, the author uses a trick: predict the square root of the box width and height instead of the width and height themselves. On the square-root curve a small box sits where the slope is steep, so the same offset in w or h produces a larger change after the square root than it does for a large box. For example, with the same 0.05 error in width, a box with w = 0.1 contributes (√0.15 − √0.10)² ≈ 0.0051 to the loss, while a box with w = 0.8 contributes only (√0.85 − √0.80)² ≈ 0.0008.

  5. Each grid cell predicts multiple boxes, but we want each box predictor to specialize in a particular object. Concretely, whichever predicted box has the larger IoU with the ground-truth box is made responsible for it. This is called the specialization of the box predictors.

  6. Putting everything together, the full loss function is shown below.
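
Written out explicitly (reconstructed here from the YOLO v1 paper, with λcoord and λnoobj playing the roles of l_coord and l_noobj above), the loss is:

\mathcal{L} = \lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(x_i-\hat{x}_i)^2+(y_i-\hat{y}_i)^2\right]
            + \lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(\sqrt{w_i}-\sqrt{\hat{w}_i})^2+(\sqrt{h_i}-\sqrt{\hat{h}_i})^2\right]
            + \sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}(C_i-\hat{C}_i)^2
            + \lambda_{noobj}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{noobj}(C_i-\hat{C}_i)^2
            + \sum_{i=0}^{S^2}\mathbb{1}_{i}^{obj}\sum_{c\in\text{classes}}(p_i(c)-\hat{p}_i(c))^2

where \mathbb{1}_{ij}^{obj} is 1 when the j-th predictor in cell i is responsible for an object, and \hat{C}_i is the target confidence (the IoU with the ground truth).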

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable  # needed for the Variable calls below; missing in the original snippet


class yoloLoss(nn.Module):
    def __init__(self, S, B, l_coord, l_noobj):
        # To emphasize the 8-dimensional coordinate predictions, give them a larger loss
        # weight l_coord (5 for Pascal VOC). The confidence loss of boxes without objects
        # gets a smaller weight l_noobj (0.5 for Pascal VOC). The confidence loss of boxes
        # with objects and the classification loss keep a normal weight of 1.
        super(yoloLoss, self).__init__()
        self.S = S
        self.B = B
        self.l_coord = l_coord
        self.l_noobj = l_noobj

    # IoU is the measure behind duplicate suppression: when one object ends up with several
    # boxes, keep the highest-scoring one as the prediction, sort the rest, and drop any box
    # whose IoU with the kept box exceeds a threshold. Essentially every detection algorithm
    # uses this idea; here IoU is also used to pick the "responsible" predictor.
    def compute_iou(self, box1, box2):
        '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2].
        Args:
          box1: (tensor) bounding boxes, sized [N,4].
          box2: (tensor) bounding boxes, sized [M,4].
        Return:
          (tensor) iou, sized [N,M].
        '''
        N = box1.size(0)
        M = box2.size(0)

        lt = torch.max(
            # [N,2] -> [N,1,2] -> [N,M,2]
            box1[:, :2].unsqueeze(1).expand(N, M, 2),
            # [M,2] -> [1,M,2] -> [N,M,2]
            box2[:, :2].unsqueeze(0).expand(N, M, 2),
        )
        rb = torch.min(
            # [N,2] -> [N,1,2] -> [N,M,2]
            box1[:, 2:].unsqueeze(1).expand(N, M, 2),
            # [M,2] -> [1,M,2] -> [N,M,2]
            box2[:, 2:].unsqueeze(0).expand(N, M, 2),
        )

        wh = rb - lt  # [N,M,2]
        wh[wh < 0] = 0  # clip at 0
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

        area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])  # [N,]
        area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]

        iou = inter / (area1 + area2 - inter)
        return iou

    def forward(self, pred_tensor, target_tensor):
        '''
        pred_tensor: (tensor) size(batchsize,S,S,Bx5+20=30) [x,y,w,h,c] --- format of the predictions
        target_tensor: (tensor) size(batchsize,S,S,30) --- format of the encoded labels
        '''
        N = pred_tensor.size()[0]
        coo_mask = target_tensor[:, :, :, 4] > 0   # cells that contain an object
        noo_mask = target_tensor[:, :, :, 4] == 0  # cells that contain no object
        # expand the masks so they select the full 30-dim vectors
        coo_mask = coo_mask.unsqueeze(-1).expand_as(target_tensor)
        noo_mask = noo_mask.unsqueeze(-1).expand_as(target_tensor)

        coo_pred = pred_tensor[coo_mask].view(-1, 30)
        box_pred = coo_pred[:, :10].contiguous().view(-1, 5)  # the two boxes, one row of [x,y,w,h,c] each
        class_pred = coo_pred[:, 10:]                         # the 20 class scores
        coo_target = target_tensor[coo_mask].view(-1, 30)
        box_target = coo_target[:, :10].contiguous().view(-1, 5)
        class_target = coo_target[:, 10:]

        # compute not contain obj loss
        noo_pred = pred_tensor[noo_mask].view(-1, 30)
        noo_target = target_tensor[noo_mask].view(-1, 30)
        noo_pred_mask = torch.cuda.ByteTensor(noo_pred.size())
        noo_pred_mask.zero_()
        noo_pred_mask[:, 4] = 1
        noo_pred_mask[:, 9] = 1
        noo_pred_c = noo_pred[noo_pred_mask]  # for empty cells only the confidence c is penalized, size [-1,2]
        noo_target_c = noo_target[noo_pred_mask]
        nooobj_loss = F.mse_loss(noo_pred_c, noo_target_c, size_average=False)  # MSE on the corresponding positions

        # compute contain obj loss
        coo_response_mask = torch.cuda.ByteTensor(box_target.size())
        coo_response_mask.zero_()
        coo_not_response_mask = torch.cuda.ByteTensor(box_target.size())
        coo_not_response_mask.zero_()
        box_target_iou = torch.zeros(box_target.size()).cuda()
        # for each cell keep only the predicted box with the larger IoU; the other one is "not responsible"
        for i in range(0, box_target.size()[0], 2):  # choose the best iou box; box1 is the prediction, box2 is the ground truth
            box1 = box_pred[i:i + 2]
            box1_xyxy = Variable(torch.FloatTensor(box1.size()))
            box1_xyxy[:, :2] = box1[:, :2] / 14. - 0.5 * box1[:, 2:4]
            box1_xyxy[:, 2:4] = box1[:, :2] / 14. + 0.5 * box1[:, 2:4]
            box2 = box_target[i].view(-1, 5)
            box2_xyxy = Variable(torch.FloatTensor(box2.size()))
            box2_xyxy[:, :2] = box2[:, :2] / 14. - 0.5 * box2[:, 2:4]
            box2_xyxy[:, 2:4] = box2[:, :2] / 14. + 0.5 * box2[:, 2:4]
            iou = self.compute_iou(box1_xyxy[:, :4], box2_xyxy[:, :4])  # [2,1]
            max_iou, max_index = iou.max(0)
            max_index = max_index.data.cuda()
            coo_response_mask[i + max_index] = 1
            coo_not_response_mask[i + 1 - max_index] = 1
            #####
            # we want the confidence score to equal the
            # intersection over union (IOU) between the predicted box
            # and the ground truth
            #####
            box_target_iou[i + max_index, torch.LongTensor([4]).cuda()] = max_iou.data.cuda()
        box_target_iou = Variable(box_target_iou).cuda()

        # 1. response loss: the responsible boxes (larger IoU)
        box_pred_response = box_pred[coo_response_mask].view(-1, 5)
        box_target_response_iou = box_target_iou[coo_response_mask].view(-1, 5)
        box_target_response = box_target[coo_response_mask].view(-1, 5)
        contain_loss = F.mse_loss(box_pred_response[:, 4], box_target_response_iou[:, 4], size_average=False)
        loc_loss = (F.mse_loss(box_pred_response[:, :2], box_target_response[:, :2], size_average=False)
                    + F.mse_loss(torch.sqrt(box_pred_response[:, 2:4]), torch.sqrt(box_target_response[:, 2:4]), size_average=False))

        # 2. not response loss: the non-responsible boxes
        box_pred_not_response = box_pred[coo_not_response_mask].view(-1, 5)
        box_target_not_response = box_target[coo_not_response_mask].view(-1, 5)
        box_target_not_response[:, 4] = 0
        # not_contain_loss = F.mse_loss(box_pred_response[:,4],box_target_response[:,4],size_average=False)
        # I believe this bug is simply a typo
        not_contain_loss = F.mse_loss(box_pred_not_response[:, 4], box_target_not_response[:, 4], size_average=False)

        # 3. class loss
        class_loss = F.mse_loss(class_pred, class_target, size_average=False)

        return (self.l_coord * loc_loss + 2 * contain_loss +
                not_contain_loss + self.l_noobj * nooobj_loss + class_loss) / N
Dataset and data loading

For object detection, the two main public datasets are PASCAL VOC and COCO. Since this article is meant as review and practice, the VOC dataset is sufficient. VOC is a public object detection dataset that also contains annotations for human actions and for segmentation. The data comes from the public challenges held between 2007 and 2012; the competitions are over, but the data can still be downloaded:
http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar

The folder structure inside is very clear. Two folders matter most for object detection:

  1. VOCdevkit/VOC2012/Annotations
  2. VOCdevkit/VOC2012/JPEGImages

Annotations stores the annotation files, and JPEGImages stores the images. Since this is just practice, we do not need to bother with the txt files under ImageSets; we can split the dataset manually however we like. The Annotations folder already contains all the annotation information we need.
Inside the Annotations folder, each image has a corresponding xml file:

<annotation>
    <filename>2012_004331.jpg</filename>
    <folder>VOC2012</folder>
    <object>
        <name>person</name>
        <actions>
            <jumping>1</jumping>
            <other>0</other>
            <phoning>0</phoning>
            <playinginstrument>0</playinginstrument>
            <reading>0</reading>
            <ridingbike>0</ridingbike>
            <ridinghorse>0</ridinghorse>
            <running>0</running>
            <takingphoto>0</takingphoto>
            <usingcomputer>0</usingcomputer>
            <walking>0</walking>
        </actions>
        <bndbox>
            <xmax>208</xmax>
            <xmin>102</xmin>
            <ymax>230</ymax>
            <ymin>25</ymin>
        </bndbox>
        <difficult>0</difficult>
        <pose>Unspecified</pose>
        <point>
            <x>155</x>
            <y>119</y>
        </point>
    </object>
    <segmented>0</segmented>
    <size>
        <depth>3</depth>
        <height>375</height>
        <width>500</width>
    </size>
    <source>
        <annotation>PASCAL VOC2012</annotation>
        <database>The VOC2012 Database</database>
        <image>flickr</image>
    </source>
</annotation>

YOLO only needs each object's bounding box and its class, so it is enough to read the bndbox and name fields from the xml file. The code is as follows:

import xml.etree.ElementTree as ET
import os

VOC_CLASSES = (    # always index 0
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')


def parse_rec(filename):
    """ Parse a PASCAL VOC xml file """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        obj_struct = {}
        difficult = int(obj.find('difficult').text)
        if difficult == 1:
            # print(filename)
            continue
        obj_struct['name'] = obj.find('name').text
        # obj_struct['pose'] = obj.find('pose').text
        # obj_struct['truncated'] = int(obj.find('truncated').text)
        # obj_struct['difficult'] = int(obj.find('difficult').text)
        bbox = obj.find('bndbox')
        obj_struct['bbox'] = [int(float(bbox.find('xmin').text)),
                              int(float(bbox.find('ymin').text)),
                              int(float(bbox.find('xmax').text)),
                              int(float(bbox.find('ymax').text))]
        objects.append(obj_struct)
    return objects


txt_file = open('voc2007test.txt', 'w')
test_file = open('voc07testimg.txt', 'r')
lines = test_file.readlines()
lines = [x[:-1] for x in lines]
print(lines)

Annotations = 'F:/pytorch-YOLO-v1-master/data/VOCdevkit/VOC2007/Annotations/'
xml_files = os.listdir(Annotations)

count = 0
for xml_file in xml_files:
    count += 1
    if xml_file.split('.')[0] not in lines:
        # print(xml_file.split('.')[0])
        continue
    image_path = xml_file.split('.')[0] + '.jpg'
    results = parse_rec(Annotations + xml_file)
    if len(results) == 0:
        print(xml_file)
        continue
    txt_file.write(image_path)
    # num_obj = len(results)
    # txt_file.write(str(num_obj)+' ')
    for result in results:
        class_name = result['name']
        bbox = result['bbox']
        class_name = VOC_CLASSES.index(class_name)
        txt_file.write(' ' + str(bbox[0]) +
                       ' ' + str(bbox[1]) +
                       ' ' + str(bbox[2]) +
                       ' ' + str(bbox[3]) +
                       ' ' + str(class_name))
    txt_file.write('\n')
    # if count == 10:
    #    break
txt_file.close()
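
Each line of the generated voc2007test.txt then has the form (the values below are made up for illustration):

000005.jpg 263 211 324 339 8 165 264 253 372 8

i.e. the image file name followed by one group of x1 y1 x2 y2 class_index per object in the image.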

Some augmentations, such as rotation and random cropping, change the bbox coordinates of the objects in the image, so torchvision's transforms package cannot be applied directly for data augmentation (https://github.com/DuanYiqun/pytorch_implementation_of_Yolov1).
That repo notes that, after some searching, a large share of the implementations on GitHub skip this kind of data augmentation and instead rely on pretrained base-network parameters to provide a baseline accuracy, while most of the rest implement the augmentation with OpenCV. I do not fully understand the first half of that remark yet. Below we look at the code directly; the coordinate-transformation parts are worth reading carefully, and even if they are not fully clear now they can still be reused later.
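
For intuition, here is a minimal standalone example (not the dataset class below) of an augmentation that keeps the boxes consistent, a horizontal flip implemented with numpy; the same idea extends to shifting, scaling and cropping:

import numpy as np

def hflip_with_boxes(img, boxes):
    # img:   numpy array of shape (H, W, C)
    # boxes: numpy array of shape (N, 4), each row [x1, y1, x2, y2] in pixels
    h, w, _ = img.shape
    flipped = np.fliplr(img).copy()
    boxes = boxes.copy()
    # after the flip, the new x1 is w - old x2 and the new x2 is w - old x1
    boxes[:, [0, 2]] = w - boxes[:, [2, 0]]
    return flipped, boxes

img = np.zeros((375, 500, 3), dtype=np.uint8)
boxes = np.array([[102, 25, 208, 230]], dtype=np.float32)
print(hflip_with_boxes(img, boxes)[1])   # [[292.  25. 398. 230.]]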

import torch
import torch.utils.data as data
import torchvision.transforms as transforms
from PIL import Image
import cv2
import os
import sys
import os.path
import random
import numpy as npclass yoloDataset(data.Dataset):image_size = 448def __init__(self, root, list_file, train, transform):print('loading annotations')self.root = rootself.train = trainself.transform = transformself.fnames = []self.boxes = []self.labels = []self.S = 7  # grid number 7*7 normallyself.B = 2  # bounding box number in each gridself.C = 20  # how many classesself.mean = (123, 117, 104)  # RGBif isinstance(list_file, list):# Cat multiple list files together.# This is especially useful for voc07/voc12 combination.# 将voc2007和voc2012两个数据集的标签整合为一tmp_file = '/tmp/listfile.txt'os.system('cat %s > %s' % (' '.join(list_file), tmp_file))list_file = tmp_filewith open(list_file) as f:lines = f.readlines()for line in lines:splited = line.strip().split()self.fnames.append(splited[0])         # 存储图片的名字num_boxes = (len(splited) - 1) // 5    # 每一幅图片里面有多少个bboxbox = []label = []for i in range(num_boxes):x = float(splited[1 + 5 * i])y = float(splited[2 + 5 * i])x2 = float(splited[3 + 5 * i])y2 = float(splited[4 + 5 * i])c = splited[5 + 5 * i]            # 代表物体的类别,即是20种物体里面的哪一种box.append([x, y, x2, y2])label.append(int(c))self.boxes.append(torch.Tensor(box))self.labels.append(torch.LongTensor(label))self.num_samples = len(self.boxes)def __getitem__(self, idx):fname = self.fnames[idx]img = cv2.imread(os.path.join(self.root + fname))boxes = self.boxes[idx].clone()labels = self.labels[idx].clone()if self.train:  # 数据增强里面的各种变换用torch自带的transform是做不到的,因为对图片进行旋转、随即裁剪等会造成bbox的坐标也会发生变化,所以需要自己来定义数据增强img, boxes = self.random_flip(img, boxes)img, boxes = self.randomScale(img, boxes)img = self.randomBlur(img)img = self.RandomBrightness(img)img = self.RandomHue(img)img = self.RandomSaturation(img)img, boxes, labels = self.randomShift(img, boxes, labels)img, boxes, labels = self.randomCrop(img, boxes, labels)h, w, _ = img.shapeboxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)         # 坐标归一化处理,为了方便训练img = self.BGR2RGB(img)                                                                           # because pytorch pretrained model use RGBimg = self.subMean(img, self.mean)                          # 减去均值img = cv2.resize(img, (self.image_size, self.image_size))   # 将所有图片都resize到指定大小target = self.encoder(boxes, labels)                        # 将图片标签编码到7x7*30的向量for t in self.transform:img = t(img)return img, targetdef __len__(self):return self.num_samplesdef encoder(self, boxes, labels):'''boxes (tensor) [[x1,y1,x2,y2],[]]labels (tensor) [...]return 7x7x30'''grid_num = 7target = torch.zeros((grid_num, grid_num, 30))cell_size = 1. 
/ grid_num                         # 每个格子的大小# 右下坐标        左上坐标# x2,y2           x1,y1wh = boxes[:, 2:] - boxes[:, :2]# 物体中心坐标集合cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2for i in range(cxcy.size()[0]):# 物体中心坐标cxcy_sample = cxcy[i]# 指示落在那网格,如[0,0]ij = (cxcy_sample / cell_size).ceil() - 1  # 中心点对应格子的坐标#    0 1    2 3   4      5 6   7 8   9# [中心坐标,长宽,置信度,中心坐标,长宽,置信度, 20个类别] x 7x7   因为一个框预测两个物体# 第一个框的置信度target[int(ij[1]), int(ij[0]), 4] = 1# 第二个框的置信度target[int(ij[1]), int(ij[0]), 9] = 1target[int(ij[1]), int(ij[0]), int(labels[i]) + 9] = 1      # 类别# xy为归一化后网格的左上坐标---->相对整张图xy = ij * cell_size# 物体中心相对左上的坐标 ---> 坐标x,y代表了预测的bounding# box的中心与栅格边界的相对值delta_xy = (cxcy_sample - xy) / cell_size  # 其实就是offset# (1) 每个小格会对应B(2)个边界框,边界框的宽高范围为全图,表示以该小格为中心寻找物体的边界框位置。# (2) 每个边界框对应一个分值,代表该处是否有物体及定位准确度# (3) 每个小格会对应C个概率值,找出最大概率对应的类别P(Class|object),并认为小格中包含该物体或者该物体的一部分。# 坐标w,h代表了预测的bounding box的width、height相对于整幅图像width,height的比例target[int(ij[1]), int(ij[0]), 2:4] = wh[i]target[int(ij[1]), int(ij[0]), :2] = delta_xy# 每一个网格有两个边框target[int(ij[1]), int(ij[0]), 7:9] = wh[i]           # 长宽# 中心坐标偏移# 由此可得其实返回的中心坐标其实是相对左上角顶点的偏移,因此在进行预测的时候还需要进行解码target[int(ij[1]), int(ij[0]), 5:7] = delta_xyreturn target"""这一部分其实也可以写成这1、先将bbox的坐标[x1, y1, x2, y2]的形式转换成[x_center, y_center, w, h]2、然后将labels(list)[0, 1, 4, 2, ...]和bboxes(list)[[x_center, y_center, width, height], ...]转换成[self.S, self.S, self.B*5+self.C]即[7, 7, 30],代码如下:def change_box_to_center_axes(self, bboxes):rebboxes = []for bbox in bboxes:x_center, y_center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2width, height = (bbox[0] - bbox[2]), (bbox[1] - bbox[3])rebboxes.append([x_center, y_center, width, height])return rebboxesdef make_target(self, labels, bboxes):bboxes = self.change_box_to_center_axes(bboxes)num_elements = self.B * 5 + self.Cnum_bboxes = len(bboxes)# for excetion: num of bboxes is zeroif num_bboxes == 0:return np.zeros((self.S, self.S, num_elements))labels = np.array(labels, dtype=np.int)bboxes = np.array(bboxes, dtype=np.float)np_target = np.zeros((self.S, self.S, num_elements))np_class = np.zeros((num_bboxes, self.C))for i in range(num_bboxes):np_class[i, labels[i]] = 1        x_center = bboxes[:, 0].reshape(-1, 1)y_center = bboxes[:, 1].reshape(-1, 1)w = bboxes[:, 2].reshape(-1, 1)h = bboxes[:, 3].reshape(-1, 1)x_idx = np.ceil(x_center * self.S) - 1  # 看这个bounding box 在哪个grid 里面y_idx = np.ceil(y_center * self.S) - 1# for exception 0, ceil(0)-1 = -1x_idx[x_idx < 0] = 0y_idx[y_idx < 0] = 0# calc offset of x_center, y_centerx_center = x_center - x_idx / self.S - 1 / (2 * self.S)y_center = y_center - y_idx / self.S - 1 / (2 * self.S)conf = np.ones_like(x_center)temp = np.concatenate([x_center, y_center, w, h, conf], axis=1)temp = np.repeat(temp, self.B, axis=0).reshape(num_bboxes, -1)temp = np.concatenate([temp, np_class], axis=1)for i in range(num_bboxes):np_target[int(y_idx[i]), int(x_idx[i])] = temp[i]return np_target"""def BGR2RGB(self, img):return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)def BGR2HSV(self, img):return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)def HSV2BGR(self, img):return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)def RandomBrightness(self, bgr):if random.random() < 0.5:hsv = self.BGR2HSV(bgr)h, s, v = cv2.split(hsv)adjust = random.choice([0.5, 1.5])v = v * adjustv = np.clip(v, 0, 255).astype(hsv.dtype)hsv = cv2.merge((h, s, v))bgr = self.HSV2BGR(hsv)return bgrdef RandomSaturation(self, bgr):if random.random() < 0.5:hsv = self.BGR2HSV(bgr)h, s, v = cv2.split(hsv)adjust = random.choice([0.5, 1.5])s = s * adjusts = np.clip(s, 0, 255).astype(hsv.dtype)hsv 
= cv2.merge((h, s, v))bgr = self.HSV2BGR(hsv)return bgrdef RandomHue(self, bgr):if random.random() < 0.5:hsv = self.BGR2HSV(bgr)h, s, v = cv2.split(hsv)adjust = random.choice([0.5, 1.5])h = h * adjusth = np.clip(h, 0, 255).astype(hsv.dtype)hsv = cv2.merge((h, s, v))bgr = self.HSV2BGR(hsv)return bgrdef randomBlur(self, bgr):if random.random() < 0.5:bgr = cv2.blur(bgr, (5, 5))return bgrdef randomShift(self, bgr, boxes, labels):# 平移变换center = (boxes[:, 2:] + boxes[:, :2]) / 2if random.random() < 0.5:height, width, c = bgr.shapeafter_shfit_image = np.zeros((height, width, c), dtype=bgr.dtype)after_shfit_image[:, :, :] = (104, 117, 123)  # bgrshift_x = random.uniform(-width * 0.2, width * 0.2)shift_y = random.uniform(-height * 0.2, height * 0.2)# print(bgr.shape,shift_x,shift_y)# 原图像的平移if shift_x >= 0 and shift_y >= 0:after_shfit_image[int(shift_y):,int(shift_x):,:] = bgr[:height - int(shift_y),:width - int(shift_x),:]elif shift_x >= 0 and shift_y < 0:after_shfit_image[:height + int(shift_y),int(shift_x):,:] = bgr[-int(shift_y):,:width - int(shift_x),:]elif shift_x < 0 and shift_y >= 0:after_shfit_image[int(shift_y):, :width +int(shift_x), :] = bgr[:height -int(shift_y), -int(shift_x):, :]elif shift_x < 0 and shift_y < 0:after_shfit_image[:height + int(shift_y), :width + int(shift_x), :] = bgr[-int(shift_y):, -int(shift_x):, :]shift_xy = torch.FloatTensor([[int(shift_x), int(shift_y)]]).expand_as(center)center = center + shift_xymask1 = (center[:, 0] > 0) & (center[:, 0] < width)mask2 = (center[:, 1] > 0) & (center[:, 1] < height)mask = (mask1 & mask2).view(-1, 1)boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)if len(boxes_in) == 0:return bgr, boxes, labelsbox_shift = torch.FloatTensor([[int(shift_x), int(shift_y), int(shift_x), int(shift_y)]]).expand_as(boxes_in)boxes_in = boxes_in + box_shiftlabels_in = labels[mask.view(-1)]return after_shfit_image, boxes_in, labels_inreturn bgr, boxes, labelsdef randomScale(self, bgr, boxes):# 固定住高度,以0.8-1.2伸缩宽度,做图像形变if random.random() < 0.5:scale = random.uniform(0.8, 1.2)height, width, c = bgr.shapebgr = cv2.resize(bgr, (int(width * scale), height))scale_tensor = torch.FloatTensor([[scale, 1, scale, 1]]).expand_as(boxes)boxes = boxes * scale_tensorreturn bgr, boxesreturn bgr, boxesdef randomCrop(self, bgr, boxes, labels):if random.random() < 0.5:center = (boxes[:, 2:] + boxes[:, :2]) / 2height, width, c = bgr.shapeh = random.uniform(0.6 * height, height)w = random.uniform(0.6 * width, width)x = random.uniform(0, width - w)y = random.uniform(0, height - h)x, y, h, w = int(x), int(y), int(h), int(w)center = center - torch.FloatTensor([[x, y]]).expand_as(center)mask1 = (center[:, 0] > 0) & (center[:, 0] < w)mask2 = (center[:, 1] > 0) & (center[:, 1] < h)mask = (mask1 & mask2).view(-1, 1)boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)if(len(boxes_in) == 0):return bgr, boxes, labelsbox_shift = torch.FloatTensor([[x, y, x, y]]).expand_as(boxes_in)boxes_in = boxes_in - box_shiftboxes_in[:, 0] = boxes_in[:, 0].clamp_(min=0, max=w)boxes_in[:, 2] = boxes_in[:, 2].clamp_(min=0, max=w)boxes_in[:, 1] = boxes_in[:, 1].clamp_(min=0, max=h)boxes_in[:, 3] = boxes_in[:, 3].clamp_(min=0, max=h)labels_in = labels[mask.view(-1)]img_croped = bgr[y:y + h, x:x + w, :]return img_croped, boxes_in, labels_inreturn bgr, boxes, labelsdef subMean(self, bgr, mean):mean = np.array(mean, dtype=np.float32)bgr = bgr - meanreturn bgrdef random_flip(self, im, boxes):if random.random() < 0.5:im_lr = np.fliplr(im).copy()h, w, _ = im.shapexmin = w - boxes[:, 2]xmax = w - boxes[:, 
0]boxes[:, 0] = xminboxes[:, 2] = xmaxreturn im_lr, boxesreturn im, boxesdef random_bright(self, im, delta=16):alpha = random.random()if alpha > 0.3:im = im * alpha + random.randrange(-delta, delta)im = im.clip(min=0, max=255).astype(np.uint8)return imdef main():from torch.utils.data import DataLoaderimport torchvision.transforms as transformsfile_root = 'F:/face_datas/VOC/VOCdevkit/VOC2012/JPEGImages/'train_dataset = yoloDataset(root=file_root,list_file='./voc2012.txt',train=True,transform=[transforms.ToTensor()])train_loader = DataLoader(train_dataset,batch_size=2,shuffle=False,num_workers=0)train_iter = iter(train_loader)for i in range(100):img, target = next(train_iter)print(img, target)if __name__ == '__main__':main()
Training
import numpy as np
from dataset import yoloDataset
from yoloLoss import yoloLoss
from yolo_v1_net import resnet50
from torch.autograd import Variable
from torchvision import models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import os
import argparse

device = 'cuda' if torch.cuda.is_available() else 'cpu'
file_root = 'F:/face_datas/VOC/VOCdevkit/VOC2012/JPEGImages/'
batch_size = 2
learning_rate = 0.001
num_epochs = 30

train_dataset = yoloDataset(root=file_root, list_file='voc2012.txt', train=True, transform=[transforms.ToTensor()])
train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=0)
test_dataset = yoloDataset(root=file_root,list_file='voc2007test.txt',train=False,transform=[transforms.ToTensor()])
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False,num_workers=0)
print('the dataset has %d images' % (len(train_dataset)))
print('the batch_size is %d' % (batch_size))
print('loading network structure...')
net = resnet50()
net = net.to(device)
# print(net)
print('load pre_trained model...')
resnet = models.resnet50(pretrained=True)
new_state_dict = resnet.state_dict()
op = net.state_dict()
for k in new_state_dict.keys():
    print(k)
    # startswith() checks whether the string begins with the given prefix; skip the fc layer,
    # whose shape does not match the detection head
    if k in op.keys() and not k.startswith('fc'):
        print('yes')
        op[k] = new_state_dict[k]
net.load_state_dict(op)

if False:
    net.load_state_dict(torch.load('best.pth'))
print('testing the cuda device here')
print('cuda', torch.cuda.current_device(), torch.cuda.device_count())

criterion = yoloLoss(7, 2, 5, 0.5)
net.train()
# different learning rate
params = []
params_dict = dict(net.named_parameters())
for key, value in params_dict.items():
    if key.startswith('features'):
        # note: no parameter name in this ResNet starts with 'features', so the learning
        # rate here never actually differs from the default
        params += [{'params': [value], 'lr': learning_rate * 1}]
    else:
        params += [{'params': [value], 'lr': learning_rate}]

optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=5e-4)

torch.multiprocessing.freeze_support()
best_test_loss = np.inf  # missing in the original snippet; needed before the comparison below

for epoch in range(num_epochs):
    net.train()
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    total_loss = 0.
    for i, (images, target) in enumerate(train_loader):
        images, target = images.cuda(), target.cuda()
        pred = net(images)
        loss = criterion(pred, target)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f' % (
                epoch + 1, num_epochs, i + 1, len(train_loader), loss.item(), total_loss / (i + 1)))

    validation_loss = 0.0
    net.eval()
    for i, (images, target) in enumerate(test_loader):
        images, target = images.cuda(), target.cuda()
        pred = net(images)
        loss = criterion(pred, target)
        validation_loss += loss.item()
    validation_loss /= len(test_loader)

    if best_test_loss > validation_loss:
        best_test_loss = validation_loss
        print('get best test loss %.5f' % best_test_loss)
        torch.save(net.state_dict(), 'best.pth')
    torch.save(net.state_dict(), 'yolo.pth')

At this point training can start. However, while training, my computer's fans were spinning so fast that I worried about wrecking the machine and not being able to do anything else over the holiday, so I stopped after training only part of the way. The goal here is mainly to learn the ideas and the code, as preparation in case I get the chance to work in this field.

Prediction
import torch
from torch.autograd import Variable
from resnet_yolo import resnet50
import torchvision.transforms as transforms
import cv2
import numpy as npVOC_CLASSES = (    # always index 0'aeroplane', 'bicycle', 'bird', 'boat','bottle', 'bus', 'car', 'cat', 'chair','cow', 'diningtable', 'dog', 'horse','motorbike', 'person', 'pottedplant','sheep', 'sofa', 'train', 'tvmonitor')Color = [[0, 0, 0],[128, 0, 0],[0, 128, 0],[128, 128, 0],[0, 0, 128],[128, 0, 128],[0, 128, 128],[128, 128, 128],[64, 0, 0],[192, 0, 0],[64, 128, 0],[192, 128, 0],[64, 0, 128],[192, 0, 128],[64, 128, 128],[192, 128, 128],[0, 64, 0],[128, 64, 0],[0, 192, 0],[128, 192, 0],[0, 64, 128]]def decoder(pred):  # 在前面数据提取和加载部分我们有写过编码部分,是为了方便网络的训练,但是在最后的预测阶段我们需要框出图中的物体因此需要再转换成bbox[x1,y1,x2,y2]的格式'''pred (tensor) 1x7x7x30return (tensor) box[[x1,y1,x2,y2]] label[...]'''grid_num = 7boxes = []cls_indexs = []probs = []cell_size = 1. / grid_numpred = pred.datapred = pred.squeeze(0)  # 7x7x30contain1 = pred[:, :, 4].unsqueeze(2)contain2 = pred[:, :, 9].unsqueeze(2)contain = torch.cat((contain1, contain2), 2)mask1 = contain > 0.1  # 大于阈值# we always select the best contain_prob what ever it>0.9mask2 = (contain == contain.max())mask = (mask1 + mask2).gt(0)# min_score,min_index = torch.min(contain,2) #每个cell只选最大概率的那个预测框for i in range(grid_num):for j in range(grid_num):for b in range(2):# index = min_index[i,j]# mask[i,j,index] = 0if mask[i, j, b] == 1:# print(i,j,b)box = pred[i, j, b * 5:b * 5 + 4]contain_prob = torch.FloatTensor([pred[i, j, b * 5 + 4]])# cell左上角  up left of cellxy = torch.FloatTensor([j, i]) * cell_size# return cxcy relative to imagebox[:2] = box[:2] * cell_size + xy# 转换成xy形式    convert[cx,cy,w,h] to [x1,xy1,x2,y2]box_xy = torch.FloatTensor(box.size())box_xy[:2] = box[:2] - 0.5 * box[2:]box_xy[2:] = box[:2] + 0.5 * box[2:]max_prob, cls_index = torch.max(pred[i, j, 10:], 0)if float((contain_prob * max_prob)[0]) > 0.1:boxes.append(box_xy.view(1, 4))cls_indexs.append(cls_index)probs.append(contain_prob * max_prob)if len(boxes) == 0:boxes = torch.zeros((1, 4))probs = torch.zeros(1)cls_indexs = torch.zeros(1)else:boxes = torch.cat(boxes, 0)  # (n,4)probs = torch.cat(probs, 0)  # (n,)cls_indexs = torch.cat(cls_indexs, 0)  # (n,)keep = nms(boxes, probs)return boxes[keep], cls_indexs[keep], probs[keep]def nms(bboxes, scores, threshold=0.5):'''bboxes(tensor) [N,4]scores(tensor) [N,]'''x1 = bboxes[:, 0]y1 = bboxes[:, 1]x2 = bboxes[:, 2]y2 = bboxes[:, 3]areas = (x2 - x1) * (y2 - y1)_, order = scores.sort(0, descending=True)keep = []while order.numel() > 0:i = order[0]keep.append(i)if order.numel() == 1:breakxx1 = x1[order[1:]].clamp(min=x1[i])yy1 = y1[order[1:]].clamp(min=y1[i])xx2 = x2[order[1:]].clamp(max=x2[i])yy2 = y2[order[1:]].clamp(max=y2[i])w = (xx2 - xx1).clamp(min=0)h = (yy2 - yy1).clamp(min=0)inter = w * hovr = inter / (areas[i] + areas[order[1:]] - inter)ids = (ovr <= threshold).nonzero().squeeze()if ids.numel() == 0:breakorder = order[ids + 1]return torch.LongTensor(keep)
#
# start predict one image
#def predict_gpu(model, image_name, root_path=''):result = []image = cv2.imread(root_path + image_name)h, w, _ = image.shapeimg = cv2.resize(image, (448, 448))img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)mean = (123, 117, 104)  # RGBimg = img - np.array(mean, dtype=np.float32)transform = transforms.Compose([transforms.ToTensor(), ])img = transform(img)img = Variable(img[None, :, :, :], volatile=True)img = img.cuda()pred = model(img)  # 1x7x7x30pred = pred.cpu()boxes, cls_indexs, probs = decoder(pred)for i, box in enumerate(boxes):x1 = int(box[0] * w)x2 = int(box[2] * w)y1 = int(box[1] * h)y2 = int(box[3] * h)cls_index = cls_indexs[i]cls_index = int(cls_index)  # convert LongTensor to intprob = probs[i]prob = float(prob)result.append([(x1, y1), (x2, y2), VOC_CLASSES[cls_index], image_name, prob])return resultif __name__ == '__main__':model = resnet50()print('load model...')model.load_state_dict(torch.load('best.pth'))model.eval()model.cuda()image_name = 'dog.jpg'image = cv2.imread(image_name)print('predicting...')result = predict_gpu(model, image_name)for left_up, right_bottom, class_name, _, prob in result:color = Color[VOC_CLASSES.index(class_name)]cv2.rectangle(image, left_up, right_bottom, color, 2)label = class_name + str(round(prob, 2))text_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)p1 = (left_up[0], left_up[1] - text_size[1])cv2.rectangle(image, (p1[0] -2 //2, p1[1] -2 -baseline), (p1[0] +text_size[0], p1[1] +text_size[1]), color, -1)cv2.putText(image,label,(p1[0],p1[1] +baseline),cv2.FONT_HERSHEY_SIMPLEX,0.4,(255,255,255),1,8)cv2.imwrite('result.jpg', image)

This is a fairly simple implementation of YOLO v1, meant to help understand the ideas behind YOLO. Some extras, such as visualizing the training data, are not covered here.
