Object detection tricks (based on detectron2)

Attempts that worked

Cropping

Because the objects are small relative to the whole image, the data is cropped into tiles. Besides the crop size, the overlap size also matters: make the overlap somewhat large so that, as far as possible, every object appears intact in some tile and is not destroyed by the crop boundary. Here the crop size is set to 512 and the overlap to 256.

Reference code: DOTA_devkit
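A minimal sketch of the sliding-window cropping described above (512 crop, 256 overlap); the function name, the border handling, and the cv2-based usage are illustrative assumptions rather than DOTA_devkit's actual implementation.

import cv2

def crop_with_overlap(image, crop_size=512, overlap=256):
    """Yield (x0, y0, tile) windows of size crop_size with the given overlap."""
    stride = crop_size - overlap
    h, w = image.shape[:2]
    ys = list(range(0, max(h - crop_size, 0) + 1, stride))
    xs = list(range(0, max(w - crop_size, 0) + 1, stride))
    # make sure the bottom/right borders are still covered by a last window
    if ys[-1] + crop_size < h:
        ys.append(h - crop_size)
    if xs[-1] + crop_size < w:
        xs.append(w - crop_size)
    for y0 in ys:
        for x0 in xs:
            yield x0, y0, image[y0:y0 + crop_size, x0:x0 + crop_size]

# usage: for x0, y0, tile in crop_with_overlap(cv2.imread("scene.png")): ...

Ground-truth boxes then need to be shifted by (-x0, -y0) into each tile's coordinates, which is where the larger overlap keeps most objects intact in at least one tile.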

Changing the anchor sizes and aspect_ratios

Because the objects in this data are small, detectron2's default anchor sizes and aspect_ratios need to be changed:

cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[35], [68], [87], [130], [149]]
cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[1.1], [1.2], [1.4], [1.8], [2.7]]

Approach: collect the areas and aspect ratios of the ground-truth boxes and cluster each with k-means. Both sklearn's built-in KMeans and a hand-written k-means implementation found online were tried and compared; the clusters from sklearn's KMeans cover the overall data distribution better and fit the anchor requirements of this detection task better.
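A minimal sketch of that statistics step, assuming the ground-truth (w, h) pairs have already been extracted from the annotations; it relies on detectron2's convention that the anchor "size" is the square root of the anchor area and the aspect ratio is h/w.

import numpy as np
from sklearn.cluster import KMeans

def cluster_anchor_params(widths, heights, n_clusters=5):
    """Cluster box areas and h/w ratios separately with k-means."""
    widths = np.asarray(widths, dtype=np.float64)
    heights = np.asarray(heights, dtype=np.float64)

    areas = (widths * heights).reshape(-1, 1)
    ratios = (heights / widths).reshape(-1, 1)

    size_centers = KMeans(n_clusters=n_clusters, random_state=0).fit(areas).cluster_centers_
    ratio_centers = KMeans(n_clusters=n_clusters, random_state=0).fit(ratios).cluster_centers_

    # anchor "size" = sqrt(area), one value per cluster, sorted small to large
    sizes = sorted(float(np.sqrt(c)) for c in size_centers.ravel())
    aspect_ratios = sorted(float(c) for c in ratio_centers.ravel())
    return sizes, aspect_ratios

# sizes, ratios = cluster_anchor_params(gt_widths, gt_heights)
# cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[round(s)] for s in sizes]
# cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[round(r, 1)] for r in ratios]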

Adding TTA

Test-time augmentation (TTA) applies data augmentation to the test set: several augmented copies of each test image are created, the model predicts on every copy, and the predictions are then aggregated.

cfg.TEST.AUG.ENABLED = True
cfg.TEST.AUG.MIN_SIZES = (400, 500, 512, 600, 700, 800)
cfg.TEST.AUG.MAX_SIZE = 1000
cfg.TEST.AUG.FLIP = True

Because detection here uses the five-parameter rotated-box format (x, y, w, h, θ), detectron2's built-in TTA needs a few changes: mainly replacing apply_box with apply_rotated_box and using fast_rcnn_inference_single_image_rotated.

# imports needed by this class (module paths follow detectron2's test_time_augmentation.py)
import copy
from contextlib import contextmanager
from itertools import count

import numpy as np
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from fvcore.transforms import HFlipTransform

from detectron2.data.detection_utils import read_image
from detectron2.modeling import GeneralizedRCNN
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.rotated_fast_rcnn import fast_rcnn_inference_single_image_rotated
from detectron2.modeling.test_time_augmentation import DatasetMapperTTA
from detectron2.structures import Boxes, Instances


class GeneralizedRCNNWithTTA(nn.Module):
    """
    A GeneralizedRCNN with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
        """
        Args:
            cfg (CfgNode):
            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            model = model.module
        assert isinstance(model, GeneralizedRCNN), \
            "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
        self.cfg = cfg.clone()
        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
        assert (
            not self.cfg.MODEL.LOAD_PROPOSALS
        ), "TTA for pre-computed proposals is not supported yet"

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg.TEST.AUG.MIN_SIZES, cfg.TEST.AUG.MAX_SIZE, cfg.TEST.AUG.FLIP)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    @contextmanager
    def _turn_off_roi_heads(self, attrs):
        """
        Open a context where some heads in `model.roi_heads` are temporarily turned off.
        Args:
            attr (list[str]): the attribute in `model.roi_heads` which can be used
                to turn off a specific head, e.g., "mask_on", "keypoint_on".
        """
        roi_heads = self.model.roi_heads
        old = {}
        for attr in attrs:
            try:
                old[attr] = getattr(roi_heads, attr)
            except AttributeError:
                # The head may not be implemented in certain ROIHeads
                pass

        if len(old.keys()) == 0:
            yield
        else:
            for attr in old.keys():
                setattr(roi_heads, attr, False)
            yield
            for attr in old.keys():
                setattr(roi_heads, attr, old[attr])

    def _batch_inference(self, batched_inputs, detected_instances=None):
        """
        Execute inference on a list of inputs,
        using batch size = self.batch_size, instead of the length of the list.
        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
        """
        if detected_instances is None:
            detected_instances = [None] * len(batched_inputs)

        outputs = []
        inputs, instances = [], []
        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
            inputs.append(input)
            instances.append(instance)
            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
                outputs.extend(
                    self.model.inference(
                        inputs,
                        instances if instances[0] is not None else None,
                        do_postprocess=False,
                    )
                )
                inputs, instances = [], []
        return outputs

    def __call__(self, batched_inputs):
        """
        Same input/output format as :meth:`GeneralizedRCNN.forward`
        """

        def _maybe_read_image(dataset_dict):
            ret = copy.copy(dataset_dict)
            if "image" not in ret:
                image = read_image(ret.pop("file_name"), self.model.input_format)
                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
                ret["image"] = image
            if "height" not in ret and "width" not in ret:
                ret["height"] = image.shape[1]
                ret["width"] = image.shape[2]
            return ret

        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]

    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor
        Returns:
            dict: one output dict
        """
        orig_shape = (input["height"], input["width"])
        augmented_inputs, tfms = self._get_augmented_inputs(input)
        # Detect boxes from all augmented versions
        with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
            # temporarily disable roi heads
            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
        # merge all detected boxes to obtain final predictions for boxes
        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)

        if self.cfg.MODEL.MASK_ON:
            # Use the detected boxes to obtain masks
            augmented_instances = self._rescale_detected_boxes(
                augmented_inputs, merged_instances, tfms
            )
            # run forward on the detected boxes
            outputs = self._batch_inference(augmented_inputs, augmented_instances)
            # Delete now useless variables to avoid being out of memory
            del augmented_inputs, augmented_instances
            # average the predictions
            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
            merged_instances = detector_postprocess(merged_instances, *orig_shape)
            return {"instances": merged_instances}
        else:
            return {"instances": merged_instances}

    def _get_augmented_inputs(self, input):
        augmented_inputs = self.tta_mapper(input)
        tfms = [x.pop("transforms") for x in augmented_inputs]
        return augmented_inputs, tfms

    def _get_augmented_boxes(self, augmented_inputs, tfms):
        # 1: forward with all augmented images
        outputs = self._batch_inference(augmented_inputs)
        # 2: union the results
        all_boxes = []
        all_scores = []
        all_classes = []
        for output, tfm in zip(outputs, tfms):
            # Need to inverse the transforms on boxes, to obtain results on original image
            pred_boxes = output.pred_boxes.tensor
            original_pred_boxes = tfm.inverse().apply_rotated_box(pred_boxes.cpu().numpy())
            all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))

            all_scores.extend(output.scores)
            all_classes.extend(output.pred_classes)
        all_boxes = torch.cat(all_boxes, dim=0)
        return all_boxes, all_scores, all_classes

    def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
        # select from the union of all results
        num_boxes = len(all_boxes)
        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
        # +1 because fast_rcnn_inference expects background scores as well
        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
        for idx, cls, score in zip(count(), all_classes, all_scores):
            all_scores_2d[idx, cls] = score

        merged_instances, _ = fast_rcnn_inference_single_image_rotated(
            all_boxes,
            all_scores_2d,
            shape_hw,
            1e-8,
            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            self.cfg.TEST.DETECTIONS_PER_IMAGE,
        )
        return merged_instances

    def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms):
        augmented_instances = []
        for input, tfm in zip(augmented_inputs, tfms):
            # Transform the target box to the augmented image's coordinate space
            pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy()
            pred_boxes = torch.from_numpy(tfm.apply_rotated_box(pred_boxes))

            aug_instances = Instances(
                image_size=input["image"].shape[1:3],
                pred_boxes=Boxes(pred_boxes),
                pred_classes=merged_instances.pred_classes,
                scores=merged_instances.scores,
            )
            augmented_instances.append(aug_instances)
        return augmented_instances

    def _reduce_pred_masks(self, outputs, tfms):
        # Should apply inverse transforms on masks.
        # We assume only resize & flip are used. pred_masks is a scale-invariant
        # representation, so we handle flip specially
        for output, tfm in zip(outputs, tfms):
            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                output.pred_masks = output.pred_masks.flip(dims=[3])
        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
        return avg_pred_masks
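A minimal usage sketch for the wrapper above, assuming a trained rotated-box model and a registered test split named "val_crops" (the dataset name and the choice of RotatedCOCOEvaluator are placeholders for whatever the project actually registers):

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import RotatedCOCOEvaluator, inference_on_dataset
from detectron2.modeling import build_model

cfg.TEST.AUG.ENABLED = True
model = build_model(cfg)
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
model.eval()

# wrap the plain model with the rotated-box TTA class defined above
tta_model = GeneralizedRCNNWithTTA(cfg, model)

evaluator = RotatedCOCOEvaluator("val_crops", output_dir="./output_tta")
val_loader = build_detection_test_loader(cfg, "val_crops")
print(inference_on_dataset(tta_model, val_loader, evaluator))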

Some hyperparameter adjustments

Set the learning rate BASE_LR to 0.01, MAX_ITER to 100000, and the learning-rate decay STEPS to (50000, 75000); the training minimum size is also sampled as a range over (512, 832):

cfg.SOLVER.BASE_LR = 0.01
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = (50000,75000)
cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = 'range'
cfg.INPUT.MIN_SIZE_TRAIN = (512, 832)

Wrong / ineffective / failed attempts

Dehazing

At first, without looking at the data carefully, the images looked hazy, so dehazing was attempted. But most of the cover is thick cloud, which common dehazing code does not handle well, and it later turned out there are no annotated objects under the cloud anyway.

(Two dehazing algorithms that still gave reasonably good results in the experiments:)

FFA-Net, GCANet

Changing the anchor size

The widths and heights of the ground-truth boxes were collected, k-means clustering was run on (width, height), and the square root of each cluster's area was used as the anchor size. The mistake came from being unfamiliar with detectron2: its anchor generator takes sizes and aspect ratios rather than widths and heights directly, and this step simply followed the online write-ups on clustering anchors for YOLO.

(A blog post on that process that still seems worth reading: YOLOV3中k-means聚类获得anchor boxes过程详解, a walkthrough of obtaining anchor boxes via k-means clustering in YOLOv3.)
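For comparison, a minimal sketch of that YOLO-style step, assuming the (w, h) pairs are already extracted: widths and heights are clustered jointly and each center is collapsed to sqrt(w·h), which discards the shape information that detectron2 expects to receive separately as an aspect ratio.

import numpy as np
from sklearn.cluster import KMeans

def yolo_style_anchor_sizes(widths, heights, n_clusters=5):
    """Cluster (w, h) pairs jointly, then reduce each center to sqrt(w * h)."""
    wh = np.stack([np.asarray(widths), np.asarray(heights)], axis=1).astype(np.float64)
    centers = KMeans(n_clusters=n_clusters, random_state=0).fit(wh).cluster_centers_
    # each cluster center is a (w, h) pair; only its area survives here
    return sorted(float(np.sqrt(w * h)) for w, h in centers)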

Data augmentation

In the end, data augmentation for rotated boxes never worked out, but the trial-and-error process is recorded here anyway.

  • Using detectron2's built-in data augmentation directly: its augmentation strategies are not all applicable to rotated-box detection, and it cannot augment a single class selectively.

  • Duplicating the rare-class samples offline and augmenting them with other libraries: no augmentation library suitable for rotated-box detection was found.

  • Duplicating the rare-class samples offline, augmenting them with detectron2's built-in data augmentation, and then converting the augmented annotations to COCO data in XYWHA_ABS format for training: after augmentation it was unclear where to insert the coordinate conversion (a conversion sketch is given after this list).
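A minimal sketch, for the third bullet above, of converting an augmented 4-point polygon back to a rotated box in detectron2's XYWHA_ABS convention via cv2.minAreaRect. The angle handling is the fragile part: OpenCV reports a clockwise-positive angle while detectron2's XYWHA_ABS angle is counter-clockwise-positive in degrees, and the exact OpenCV angle range depends on the OpenCV version, so treat the sign flip below as an assumption to verify rather than a finished converter.

import cv2
import numpy as np

def polygon_to_xywha_abs(points):
    """points: (4, 2) array of polygon corners after augmentation.
    Returns [cx, cy, w, h, angle] for a BoxMode.XYWHA_ABS annotation."""
    (cx, cy), (w, h), angle_cv = cv2.minAreaRect(np.asarray(points, dtype=np.float32))
    # OpenCV: clockwise-positive angle; detectron2: counter-clockwise-positive
    return [float(cx), float(cy), float(w), float(h), -float(angle_cv)]

This would run on the transforms.apply_coords(...) output for each instance, right before the augmented sample is written back into the COCO-style JSON.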

Attempts not yet completed

Adding Mosaic augmentation

YOLOv4's mosaic augmentation builds on the CutMix idea and can be seen as an improved CutMix: four images are stitched together into one new image. The code for applying mosaic to the images is written, but for various reasons it has not been trained and validated yet.

The code is pasted below. The input annotations are read from JSON and the output is written as TXT (it has not been changed to also output JSON yet).

from PIL import Image, ImageDraw
import numpy as np
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
import math
import os
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
import pandas as pds
import json
import cv2


def rand(a=0, b=1):
    return np.random.rand() * (b - a) + a


def merge_bboxes(bboxes, cutx, cuty):
    merge_bbox = []
    for i in range(len(bboxes)):
        for box in bboxes[i]:
            tmp_box = []
            x1, y1, x2, y2, x3, y3, x4, y4 = box[0], box[1], box[2], box[3], box[4], box[5], box[6], box[7]
            if i == 0:
                if np.min(box[1::2]) > cuty or np.min(box[::2]) > cutx:
                    continue
                if np.max(box[1::2]) >= cuty and np.min(box[1::2]) <= cuty:
                    box[1::2][np.argmax(box[1::2])] = cuty
                    if cuty - np.min(box[1::2]) < 5:
                        continue
                if np.max(box[::2]) >= cutx and x1 <= cutx:
                    box[::2][np.argmax(box[::2])] = cutx
                    if cutx - x1 < 5:
                        continue
            if i == 1:
                if y2 < cuty or x1 > cutx:
                    continue
                if y2 >= cuty and y1 <= cuty:
                    y1 = cuty
                    if y2 - y1 < 5:
                        continue
                if x2 >= cutx and x1 <= cutx:
                    x2 = cutx
                    if x2 - x1 < 5:
                        continue
            if i == 2:
                if y2 < cuty or x2 < cutx:
                    continue
                if y2 >= cuty and y1 <= cuty:
                    y1 = cuty
                    if y2 - y1 < 5:
                        continue
                if x2 >= cutx and x1 <= cutx:
                    x1 = cutx
                    if x2 - x1 < 5:
                        continue
            if i == 3:
                if y1 > cuty or x2 < cutx:
                    continue
                if y2 >= cuty and y1 <= cuty:
                    y2 = cuty
                    if y2 - y1 < 5:
                        continue
                if x2 >= cutx and x1 <= cutx:
                    x1 = cutx
                    if x2 - x1 < 5:
                        continue
            tmp_box.append(x1)
            tmp_box.append(y1)
            tmp_box.append(x2)
            tmp_box.append(y2)
            tmp_box.append(box[-1])
            merge_bbox.append(tmp_box)
    return merge_bbox


def get_random_data(image_file, annotation_line, input_shape):
    '''random preprocessing for real-time data augmentation'''
    h, w = input_shape
    box_datas = []
    cls_datas = []
    index = 0
    place_x = [0, 0, 256, 256]
    place_y = [0, 256, 0, 256]
    new_image = Image.new('RGB', (w, h), (128, 128, 128))
    for line in annotation_line:
        # open the image referenced by this annotation
        path = os.path.join(image_file, line['imagePath'])
        image = utils.read_image(path, format='BGR')
        r = np.random.rand(2)
        augs = T.AugmentationList([
            T.RandomFlip(prob=0.5),
            T.RandomFlip(prob=0.5, vertical=True, horizontal=False),
            T.RandomApply(T.RandomBrightness(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomSaturation(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomContrast(0.9, 1.1), prob=0.3),
            T.RandomApply(T.ColorTransform(lambda x: x * r[0] + r[1] * 10), prob=0.3),
        ])
        image, transforms = T.apply_transform_gens([augs], image)
        dx = place_x[index]
        dy = place_y[index]
        image = image[:, :, ::-1]
        new_image.paste(Image.fromarray(np.uint8(image)), (dx, dy))
        # cv2.imshow('new_image', new_image)
        # cv2.imshow('image', Image.fromarray(np.uint8(image)))
        index += 1
        iw, ih = image.shape[:2]

        # collect the polygon points and labels of this image
        box = []
        cls = []
        for shape in line['shapes']:
            bbox = []
            for point in shape['points']:
                bbox.append(point[0])
                bbox.append(point[1])
            box.append(bbox)
            cls.append(shape['label'])
        box = np.array(box)
        # box = np.array([np.array(list(map(float, box.split()[1]))) for box in line['shapes'][0:]])
        # cls = [cls.split()[-2:] for cls in line['shapes']['label']]
        if box.shape[-1] == 0:
            continue
        box = transforms.apply_coords(box.reshape(-1, 2)).clip(min=0)

        # if index == 0:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=(cuty, cutx))], image)
        #     box = transforms.apply_coords(box).clip(min=0)
        # if index == 1:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), cutx))], image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[0, :] += cutx
        # if index == 3:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=(cuty, (w - cutx)))], image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[1, :] += cuty
        # if index == 2:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), (w - cutx)))], image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[0, :] += cutx
        #     box[1, :] += cuty

        # shift the boxes to this tile's paste offset (place_x/place_y above)
        if index == 2:
            box[:, 1] += 256
        elif index == 3:
            box[:, 0] += 256
        elif index == 4:
            box[:, 0] += 256
            box[:, 1] += 256
        box_datas.append(box)
        cls_datas.extend(cls)

    if len(box_datas) == 0:
        return new_image, []
    box_datas = np.concatenate(box_datas, axis=0)

    # vis box
    box_line = box_datas.reshape(-1, 8)
    # for line in box_line:
    #     x1, y1, x2, y2, x3, y3, x4, y4 = line
    #     draw = ImageDraw.Draw(new_image)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')

    pd = pds.DataFrame(box_line)
    pd2 = pds.DataFrame(cls_datas)
    pd = pds.concat([pd, pd2], axis=1)
    return new_image, pd


def normal_(annotation_line, input_shape):
    '''random preprocessing for real-time data augmentation'''
    line = annotation_line.split()
    image = Image.open(line[0])
    box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
    iw, ih = image.size
    image = image.transpose(Image.FLIP_LEFT_RIGHT)
    box[:, [0, 2]] = iw - box[:, [2, 0]]
    return image, box


def get_json(json_path):
    info_group = []
    for root, dirs, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as f:
                    info = json.load(f)
                    # info = ",".join(info)
                    info_group.append(info)
    return info_group


if __name__ == "__main__":
    json_path = './train'
    output_path = './train_mosaic'
    json_group = get_json(json_path)
    for ind in range(0, len(json_group) - 4, 4):
        line = json_group[ind:ind + 4]
        image_data, box_data = get_random_data(json_path, line, [512, 512])
        if len(box_data) == 0:
            continue
        json_output_path = os.path.join(output_path, str(ind) + '.txt')
        img_output_path = os.path.join(output_path, str(ind) + '.png')
        js = box_data.to_json
        # box_data.to_json(json_output_path)
        box_data.to_csv(json_output_path, sep=' ', index=False, header=None, mode='w')
        image_data.save(img_output_path)
        print(ind)
    print("finished")

    # img = Image.fromarray((image_data * 255).astype(np.uint8))
    # for j in range(len(box_data)):
    #     x1, y1, x2, y2, x3, y3, x4, y4 = box_data[j][0:8]
    #     draw = ImageDraw.Draw(img)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')
    #     # thickness = 3
    #     # left, top, right, bottom = box_data[j][0:4]
    #     # draw = ImageDraw.Draw(img)
    #     # for i in range(thickness):
    #     #     draw.rectangle([left + i, top + i, right - i, bottom - i], outline=(255, 255, 255))
    # img.show()
    # img.save("box_all.jpg")

Changing angle offsets to coordinate offsets

That is, going from RoI Transformer to Gliding Vertex.

Official GitHub code for both: RoI Transformer, Gliding Vertex
