数据增广这个知识点放在炼丹手册里说,总感觉不太搭,不过看到bag of freebies这个词,心想也算是提高训练精度的一种手段,那就和其他knowledge躺在一起吧。


  • 几何增强
  • 色彩增强
  • 遮挡增强
  • 混叠增强
  • 域迁移





def random_horizontal_flip(image, bboxes):
    """
    Mirror the image left-right with probability 0.5 and remap the boxes.

    :param image: image array indexed as [height, width, channel]
    :param bboxes: array of shape [num, 4]; columns 0 and 2 are used as the
        x_min / x_max coordinates
    :return: (image, bboxes). When the flip is applied both are freshly
        allocated arrays and the inputs are left untouched; otherwise the
        inputs are returned unchanged.
    """
    if random.random() < 0.5:
        width = image.shape[1]
        # A reversed slice is a negative-stride view; materialise it so that
        # downstream consumers needing contiguous memory accept the result.
        image = np.ascontiguousarray(image[:, ::-1, :])
        # Work on a copy so the caller's annotations are not rewritten.
        bboxes = bboxes.copy()
        # The old x_max becomes the new x_min (and vice versa) after mirroring.
        bboxes[:, [0, 2]] = width - bboxes[:, [2, 0]]
    return image, bboxes




def random_vertical_flip(image, bboxes):
    """
    Mirror the image top-bottom with probability 0.5 and remap the boxes.

    :param image: image array indexed as [height, width, channel]
    :param bboxes: array of shape [num, 4]; columns 1 and 3 are used as the
        y_min / y_max coordinates
    :return: (image, bboxes). When the flip is applied both are freshly
        allocated arrays and the inputs are left untouched; otherwise the
        inputs are returned unchanged.
    """
    if random.random() < 0.5:
        height = image.shape[0]
        # A reversed slice is a negative-stride view; materialise it so that
        # downstream consumers needing contiguous memory accept the result.
        image = np.ascontiguousarray(image[::-1, :, :])
        # Work on a copy so the caller's annotations are not rewritten.
        bboxes = bboxes.copy()
        # The old y_max becomes the new y_min (and vice versa) after mirroring.
        bboxes[:, [1, 3]] = height - bboxes[:, [3, 1]]
    return image, bboxes




def random_rotate(image, bboxes, angle=5, scale=1.):
    """
    Rotate the image by a fixed angle on an enlarged canvas and rotate the boxes.

    Despite the name, nothing here is random: the rotation is always applied
    with the given ``angle``.

    :param image: image array indexed as [height, width, channel]
    :param bboxes: iterable of boxes; elements 0..3 of each box are read as
        x_min, y_min, x_max, y_max
    :param angle: rotation angle in degrees (passed to cv2.getRotationMatrix2D)
    :param scale: scale factor applied together with the rotation
    :return: (rotated image, list of [x_min, y_min, x_max, y_max] integer boxes)
    """
    src_h, src_w = image.shape[0], image.shape[1]

    # Size of the canvas that fully contains the rotated image.
    theta = np.deg2rad(angle)
    dst_w = (abs(np.sin(theta) * src_h) + abs(np.cos(theta) * src_w)) * scale
    dst_h = (abs(np.cos(theta) * src_h) + abs(np.sin(theta) * src_w)) * scale

    # Rotate about the centre of the new canvas, then shift by the rotated
    # half-difference between the new and the old canvas size.
    rot_mat = cv2.getRotationMatrix2D((dst_w * 0.5, dst_h * 0.5), angle, scale)
    shift = np.dot(rot_mat, np.array([(dst_w - src_w) * 0.5, (dst_h - src_h) * 0.5, 0]))
    rot_mat[0, 2] += shift[0]
    rot_mat[1, 2] += shift[1]

    canvas_size = (int(math.ceil(dst_w)), int(math.ceil(dst_h)))
    rot_image = cv2.warpAffine(image, rot_mat, canvas_size, flags=cv2.INTER_LANCZOS4)

    def _rotate_box(box):
        """Return the axis-aligned rectangle around the rotated edge midpoints."""
        x_lo, y_lo, x_hi, y_hi = box[0], box[1], box[2], box[3]
        # Midpoints of the top, right, bottom and left edges (homogeneous).
        midpoints = (
            [(x_lo + x_hi) / 2, y_lo, 1],
            [x_hi, (y_lo + y_hi) / 2, 1],
            [(x_lo + x_hi) / 2, y_hi, 1],
            [x_lo, (y_lo + y_hi) / 2, 1],
        )
        warped = np.vstack([np.dot(rot_mat, np.array(point)) for point in midpoints])
        left, top, rect_w, rect_h = cv2.boundingRect(warped.astype(np.int32))
        return [left, top, left + rect_w, top + rect_h]

    rot_bboxes = [_rotate_box(box) for box in bboxes]
    return rot_image, rot_bboxes




def random_affine(image, bboxes, degrees=10, translate=.1, scale=.1, shear=10, border=(0, 0)):
    """
    Apply a random rotation / scale / translation / shear and warp the boxes.

    :param image: image array indexed as [height, width, channel]
    :param bboxes: array whose first four columns are x1, y1, x2, y2
    :param degrees: rotation angle is drawn uniformly from [-degrees, degrees]
    :param translate: translation is drawn uniformly from [-translate, translate]
        as a fraction of the image width / height
    :param scale: scale factor is drawn uniformly from [1 - scale, 1 + scale]
    :param shear: each shear angle (degrees) is drawn uniformly from [-shear, shear]
    :param border: (vertical, horizontal) padding added on each side of the output
    :return: (warped image, boxes that survived the size / area / aspect filter)
    """
    img_h, img_w = image.shape[0], image.shape[1]
    out_h = img_h + border[0] * 2
    out_w = img_w + border[1] * 2

    # Rotation and scale about the image centre.
    rotation = np.eye(3)
    angle = random.uniform(-degrees, degrees)
    zoom = random.uniform(1 - scale, 1 + scale)
    rotation[:2] = cv2.getRotationMatrix2D(angle=angle, center=(img_w / 2, img_h / 2), scale=zoom)

    # Translation in pixels (x first, then y).
    translation = np.eye(3)
    translation[0, 2] = random.uniform(-translate, translate) * img_w + border[1]
    translation[1, 2] = random.uniform(-translate, translate) * img_h + border[0]

    # Shear (x first, then y), angles given in degrees.
    shearing = np.eye(3)
    shearing[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)
    shearing[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)

    # The composition order matters: rotate, then translate, then shear.
    M = shearing @ translation @ rotation
    needs_warp = (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any()
    if needs_warp:
        image = cv2.warpAffine(image, M[:2], dsize=(out_w, out_h), flags=cv2.INTER_LINEAR,
                               borderValue=(114, 114, 114))

    count = len(bboxes)
    if not count:
        return image, bboxes

    # Warp all four corners of every box: x1y1, x2y2, x1y2, x2y1.
    corners = np.ones((count * 4, 3))
    corners[:, :2] = bboxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(count * 4, 2)
    warped = (corners @ M.T)[:, :2].reshape(count, 8)

    # Axis-aligned envelope of the warped corners, clipped to the output canvas.
    xs = warped[:, [0, 2, 4, 6]]
    ys = warped[:, [1, 3, 5, 7]]
    boxes = np.stack((xs.min(1), ys.min(1), xs.max(1), ys.max(1)), axis=1)
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, out_w)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, out_h)

    # Keep boxes that are still large enough, retain enough of their original
    # area, and are not extremely elongated.
    box_w = boxes[:, 2] - boxes[:, 0]
    box_h = boxes[:, 3] - boxes[:, 1]
    new_area = box_w * box_h
    old_area = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    aspect = np.maximum(box_w / (box_h + 1e-16), box_h / (box_w + 1e-16))
    keep = (box_w > 2) & (box_h > 2) & (new_area / (old_area * zoom + 1e-16) > 0.2) & (aspect < 20)

    bboxes = bboxes[keep]
    bboxes[:, 0:4] = boxes[keep]
    return image, bboxes



向外缩放时,最终图像尺寸将大于原始图像尺寸,然后从新图像中剪切出一个部分,其大小等于原始图像。向内缩放时,因为会缩小图像大小,迫使我们对超出边界的内容做出假设。考虑到直接resize会造成目标的特征扭曲,这里没有直接使用resize操作,实现上改为letterbox resize(等比例缩放后再填充),以保持原始图像的宽高比。


def letterbox_resize(image, target_size, bboxes, interp=0):
    """
    Resize the image while keeping its aspect ratio, pad it to the target
    size with grey (128), and move the boxes accordingly.

    :param image: image array indexed as [height, width, channel]; the padded
        canvas is allocated with 3 channels
    :param target_size: (height, width) of the output
    :param bboxes: array of shape [num, 4] laid out as x_min, y_min, x_max,
        y_max, or None when there are no annotations to adjust
    :param interp: interpolation flag forwarded to cv2.resize
    :return: the padded image when ``bboxes`` is None, otherwise the tuple
        (padded image, adjusted copy of bboxes). The input boxes are not
        modified.
    """
    origin_height, origin_width = image.shape[:2]
    input_height, input_width = target_size

    # One ratio for both axes so the content is not distorted.
    resize_ratio = min(input_width / origin_width, input_height / origin_height)
    resize_width = int(resize_ratio * origin_width)
    resize_height = int(resize_ratio * origin_height)

    image_resized = cv2.resize(image, (resize_width, resize_height), interpolation=interp)
    image_padded = np.full((input_height, input_width, 3), 128, np.uint8)

    # Centre the resized image on the canvas.
    dw = int((input_width - resize_width) / 2)
    dh = int((input_height - resize_height) / 2)
    image_padded[dh:resize_height + dh, dw:resize_width + dw, :] = image_resized

    if bboxes is None:
        return image_padded

    # Copy first: callers commonly pass cached annotations, and rescaling them
    # in place would compound on every call.
    bboxes = bboxes.copy()
    # Columns 0/2 are x coordinates, columns 1/3 are y coordinates.
    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * resize_ratio + dw
    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * resize_ratio + dh
    return image_padded, bboxes




def random_crop(image, bboxes):
    """
    With probability 0.5, crop the image to a random window that still
    contains every box, and shift the boxes into the cropped frame.

    :param image: image array indexed as [height, width, ...]
    :param bboxes: array of shape [num, 4] laid out as x_min, y_min, x_max, y_max
    :return: (image, bboxes). When the crop is applied the boxes are a shifted
        copy and the image is a slice of the input; otherwise the inputs are
        returned unchanged. Empty ``bboxes`` are returned unchanged as well,
        since there is no box envelope to preserve.
    """
    if random.random() < 0.5 and len(bboxes) > 0:
        h, w = image.shape[:2]

        # Envelope of all boxes: [x_min, y_min, x_max, y_max].
        max_bbox = np.concatenate(
            [np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

        # Free margin between the envelope and each image border.
        max_l_trans = max_bbox[0]
        max_u_trans = max_bbox[1]
        max_r_trans = w - max_bbox[2]
        max_d_trans = h - max_bbox[3]

        crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))
        crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))
        # The far edges must be clamped with min(): using max() pins them to
        # the image border, so the right/bottom side would never be cropped.
        # Round up so a fractional envelope edge is never cut off.
        crop_xmax = min(w, int(np.ceil(max_bbox[2] + random.uniform(0, max_r_trans))))
        crop_ymax = min(h, int(np.ceil(max_bbox[3] + random.uniform(0, max_d_trans))))

        image = image[crop_ymin: crop_ymax, crop_xmin: crop_xmax]

        # Shift a copy so the caller's annotations are not rewritten.
        bboxes = bboxes.copy()
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin
    return image, bboxes




def random_translate(image, bboxes):
    """
    With probability 0.5, shift the image by a random offset that keeps every
    box inside the frame, and shift the boxes by the same offset.

    :param image: image array indexed as [height, width, ...]
    :param bboxes: array of shape [num, 4] laid out as x_min, y_min, x_max, y_max
    :return: (image, bboxes). When the shift is applied the image is the
        cv2.warpAffine output and the boxes are a shifted copy; otherwise the
        inputs are returned unchanged. Empty ``bboxes`` are returned unchanged
        as well, since the offset range is derived from the box envelope.
    """
    if random.random() < 0.5 and len(bboxes) > 0:
        h, w = image.shape[:2]

        # Envelope of all boxes: [x_min, y_min, x_max, y_max].
        max_bbox = np.concatenate(
            [np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

        # Free margin between the envelope and each image border.
        max_l_trans = max_bbox[0]
        max_u_trans = max_bbox[1]
        max_r_trans = w - max_bbox[2]
        max_d_trans = h - max_bbox[3]

        # Leave one pixel of slack on every side.
        tx = random.uniform(-(max_l_trans - 1), (max_r_trans - 1))
        ty = random.uniform(-(max_u_trans - 1), (max_d_trans - 1))

        M = np.array([[1, 0, tx], [0, 1, ty]])
        image = cv2.warpAffine(image, M, (w, h))

        # Shift a copy so the caller's annotations are not rewritten.
        bboxes = bboxes.copy()
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty
    return image, bboxes




def random_noise(image):
    """
    Add standard-normal noise to the image and return it as uint8.

    One noise value is drawn per pixel position and shared by all channels.
    The sum is computed in floating point and clipped to [0, 255] before the
    cast, so values near the ends of the range saturate instead of wrapping.

    :param image: image array indexed as [height, width] or
        [height, width, channel]
    :return: new uint8 array with the same shape as ``image``
    """
    noise = np.random.normal(size=image.shape[:2])
    if image.ndim == 3:
        # Broadcast the same per-pixel noise over every channel.
        noise = noise[:, :, np.newaxis]
    noisy = image.astype(np.float64) + noise
    return np.clip(noisy, 0, 255).astype('uint8')





def random_color_distort(image, brightness=32, hue=18, saturation=0.5, value=0.5):
    """
    Randomly jitter brightness, hue, saturation and value of a BGR image.

    Each of the four adjustments is applied independently with probability
    0.5. Brightness is applied first on the BGR image; the three HSV
    adjustments follow in one of two randomly chosen orders.

    :param image: BGR image array indexed as [height, width, channel]
    :param brightness: brightness offset is drawn from [-brightness, brightness)
        and truncated to an integer
    :param hue: hue offset is an integer drawn from [-hue, hue)
    :param saturation: saturation is multiplied by 1 + U(-saturation, saturation)
    :param value: value is multiplied by 1 + U(-value, value)
    :return: distorted BGR image as uint8
    """
    def _coin_flip():
        return random.random() < 0.5

    def _jitter_hue(hsv):
        if _coin_flip():
            offset = np.random.randint(-hue, hue)
            # The hue channel is treated as cyclic with period 180.
            hsv[:, :, 0] = (hsv[:, :, 0] + offset) % 180
        return hsv

    def _jitter_saturation(hsv):
        if _coin_flip():
            hsv[:, :, 1] *= 1 + np.random.uniform(-saturation, saturation)
        return hsv

    def _jitter_value(hsv):
        if _coin_flip():
            hsv[:, :, 2] *= 1 + np.random.uniform(-value, value)
        return hsv

    # Brightness: additive integer offset in BGR space, saturated to [0, 255].
    if _coin_flip():
        offset = int(np.random.uniform(-brightness, brightness))
        image = image.astype(np.float32) + offset
    image = np.clip(image, 0, 255).astype(np.uint8)

    # Colour jitter in HSV space, in one of two orders.
    image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV).astype(np.float32)
    if np.random.randint(0, 2):
        steps = (_jitter_value, _jitter_saturation, _jitter_hue)
    else:
        steps = (_jitter_saturation, _jitter_hue, _jitter_value)
    for step in steps:
        image_hsv = step(image_hsv)

    image_hsv = np.clip(image_hsv, 0, 255).astype(np.uint8)
    return cv2.cvtColor(image_hsv, cv2.COLOR_HSV2BGR)



随机擦除(Random Erasing)是用来模拟目标遮挡,试想把物体遮挡一部分后依然能够分类正确,那么肯定会迫使网络利用局部未遮挡的数据进行识别,加大了训练难度,一定程度会提高泛化能力。具体操作是随机选择一个区域,然后采用随机值进行覆盖,模拟遮挡场景。对于目标检测任务有三种随机擦除方法:IRE、ORE、I+ORE

  • IRE:对整张图像使用随机擦除
  • ORE:对图像中的每一个目标框区域都使用随机擦除
  • I+ORE:IRE和ORE的结合。


def random_erasing(image, s_min=0.02, s_max=0.4, ratio=0.3):
    """
    With probability 0.5, overwrite one random rectangle of the image with
    uniformly random pixel values (Random Erasing).

    :param image: image array indexed as [height, width, channel]
    :param s_min: minimum erased area as a fraction of the image area
    :param s_max: maximum erased area as a fraction of the image area
    :param ratio: the aspect ratio (height / width) of the erased rectangle is
        drawn uniformly from [ratio, 1 / ratio]
    :return: the input image unchanged when erasing is skipped, otherwise a
        copy with the rectangle erased (the input is not modified)
    """
    assert len(image.shape) == 3, 'image should be a 3 dimension numpy array'
    if random.random() >= 0.5:
        return image

    img_h, img_w, channels = image.shape
    # Rejection sampling: redraw until the rectangle fits inside the image.
    while True:
        area = random.uniform(s_min, s_max) * img_h * img_w
        aspect = random.uniform(ratio, 1 / ratio)
        erase_h = int(round(math.sqrt(area * aspect)))
        erase_w = int(round(math.sqrt(area / aspect)))
        left = random.randint(0, img_w)
        top = random.randint(0, img_h)
        if left + erase_w <= img_w and top + erase_h <= img_h:
            erased = image.copy()
            # `high` is exclusive, so 256 is needed to cover the value 255.
            erased[top: top + erase_h, left: left + erase_w, :] = np.random.randint(
                low=0, high=256, size=(erase_h, erase_w, channels))
            return erased


随机切断(Random CutOut)类似于随机擦除(Random Erasing),也是模拟遮挡,但是实现比Random Erasing简单:随机选择一个固定大小的正方形区域,将该区域内的所有像素值设置为0或者其他纯色值。为了避免填充0值对训练的影响,应该先对数据做中心归一化(norm到0均值)。原文指出区域的大小比形状更重要,所以cutout的区域只要是正方形就行。


def CutOut(image, hole_num=2, max_size=(100, 100), min_size=(20, 20), fill_value_mode='zero'):
    """
    With probability 0.5, mask out ``hole_num`` random rectangles of the image.

    :param image: image array indexed as [height, width, channel]
    :param hole_num: number of rectangles to cut out
    :param max_size: (height, width) upper bound of a rectangle, inclusive
    :param min_size: (height, width) lower bound of a rectangle, inclusive
    :param fill_value_mode: 'zero' fills the holes with 0, 'one' fills them
        with 1, any other value fills them with uniform noise in [0, 255)
    :return: uint8 image; a new array when the cut-out is applied
    """
    if random.random() < 0.5:
        height, width, channels = image.shape

        # True where the original pixel is kept, False inside a hole.
        keep = np.ones((height, width, channels), dtype=bool)
        for _ in range(hole_num):
            # Draw a centre and a size; the hole is clipped at the borders.
            cy = np.random.randint(height)
            cx = np.random.randint(width)
            hole_h = np.random.randint(min_size[0], max_size[0] + 1)
            hole_w = np.random.randint(min_size[1], max_size[1] + 1)
            top = np.clip(cy - hole_h // 2, 0, height)
            bottom = np.clip(cy + hole_h // 2, 0, height)
            left = np.clip(cx - hole_w // 2, 0, width)
            right = np.clip(cx + hole_w // 2, 0, width)
            keep[top: bottom, left: right, :] = False

        # The fill is generated after the holes so that the order of random
        # draws stays the same for every mode.
        fill_shape = (height, width, channels)
        if fill_value_mode == 'zero':
            fill = np.zeros(fill_shape)
        elif fill_value_mode == 'one':
            fill = np.ones(fill_shape)
        else:
            fill = np.random.uniform(low=0, high=255, size=fill_shape)

        image = np.where(keep, image, fill)
    return np.uint8(image)




def random_gridmask(image, mode=1, rotate=1, r_ratio=0.5, d_ratio=1):
    """
    With probability 0.5, multiply the image by a randomly placed grid mask.

    A 0/1 grid is drawn on a canvas 1.5x the image size, optionally rotated,
    centre-cropped back to the image size and multiplied into the image.

    :param image: image array indexed as [height, width, channel]; the mask is
        tiled over 3 channels
    :param mode: when 1 the mask is inverted (``1 - mask``) before it is applied
    :param rotate: exclusive upper bound of the random rotation angle handed to
        PIL; when it equals 1 the stripe width is drawn at random instead of
        being derived from ``r_ratio``
    :param r_ratio: stripe width as a fraction of the grid period (used only
        when ``rotate`` is not 1)
    :param d_ratio: not used by this implementation
    :return: masked image, or the input unchanged when the mask is skipped
    """
    if random.random() >= 0.5:
        return image

    img_h, img_w = image.shape[0], image.shape[1]
    # Oversized canvas so the centre crop taken after rotation stays covered.
    canvas_h = int(1.5 * img_h)
    canvas_w = int(1.5 * img_w)

    # Grid period, then stripe width.
    period = np.random.randint(2, min(img_h, img_w))
    if rotate == 1:
        stripe = np.random.randint(1, period)
    else:
        stripe = min(max(int(period * r_ratio + 0.5), 1), period - 1)

    grid = np.ones((canvas_h, canvas_w), np.float32)
    offset_h = np.random.randint(period)
    offset_w = np.random.randint(period)

    # Zero out horizontal stripes, then vertical stripes.
    for k in range(canvas_h // period):
        start = period * k + offset_h
        grid[start:min(start + stripe, canvas_h), :] = 0
    for k in range(canvas_w // period):
        start = period * k + offset_w
        grid[:, start:min(start + stripe, canvas_w)] = 0

    # Rotate through PIL, then crop the centre back to the image size.
    turn = np.random.randint(rotate)
    grid = np.asarray(Image.fromarray(np.uint8(grid)).rotate(turn))
    top = (canvas_h - img_h) // 2
    left = (canvas_w - img_w) // 2
    grid = grid[top:top + img_h, left:left + img_w]

    if mode == 1:
        grid = 1 - grid

    grid = np.tile(np.expand_dims(grid.astype(np.uint8), axis=2), [1, 1, 3])
    return image * grid




  • Mixup
  • Cutmix



def mix_up(image_1, image_2, bbox_1, bbox_2):
    """
    Blend two images with a Beta(1.5, 1.5) weight and merge their boxes.

    Both images are aligned at the top-left corner of a canvas large enough
    to hold either of them; areas covered by only one image keep that image
    scaled by its own weight.

    :param image_1: BGR image array indexed as [height, width, channel]
    :param image_2: BGR image array indexed as [height, width, channel]
    :param bbox_1: boxes of image_1, array of shape [num, k]
    :param bbox_2: boxes of image_2, array of shape [num, k]
    :return: (mixed uint8 image, float array of shape [num_1 + num_2, k + 1]).
        The input columns are truncated to whole numbers; the extra last
        column holds the mix-up weight of the image each box came from and is
        kept as a float so it is not truncated to zero.
    """
    height = max(image_1.shape[0], image_2.shape[0])
    width = max(image_1.shape[1], image_2.shape[1])
    mix_image = np.zeros(shape=(height, width, 3), dtype='float32')

    rand_num = np.random.beta(1.5, 1.5)
    rand_num = max(0, min(1, rand_num))

    mix_image[:image_1.shape[0], :image_1.shape[1], :] = image_1.astype('float32') * rand_num
    mix_image[:image_2.shape[0], :image_2.shape[1], :] += image_2.astype('float32') * (1. - rand_num)
    mix_image = mix_image.astype('uint8')

    # The last element of the second dimension is the mix-up weight.
    bbox_1 = np.concatenate(
        (bbox_1, np.full(shape=(bbox_1.shape[0], 1), fill_value=rand_num)), axis=-1)
    bbox_2 = np.concatenate(
        (bbox_2, np.full(shape=(bbox_2.shape[0], 1), fill_value=1. - rand_num)), axis=-1)
    mix_bbox = np.concatenate((bbox_1, bbox_2), axis=0).astype(np.float64)

    # Keep whole-number box values, but leave the fractional weight intact:
    # casting the whole array to an integer type would zero the weights.
    mix_bbox[:, :-1] = np.trunc(mix_bbox[:, :-1])
    return mix_image, mix_bbox




def rand_bbox(shape, lam):
    """
    Draw a random rectangle covering roughly ``1 - lam`` of the image area.

    :param shape: image shape; element 0 is the height and element 1 the width
    :param lam: fraction of the image that should remain uncovered
    :return: (x1, y1, x2, y2) of the rectangle, clipped to the image bounds
    """
    height = shape[0]
    width = shape[1]

    cut_ratio = np.sqrt(1. - lam)
    # The builtin int replaces the `np.int` alias removed from NumPy.
    cut_height = int(height * cut_ratio)
    cut_width = int(width * cut_ratio)

    # Uniformly sampled centre; the rectangle is clipped at the borders.
    cx = np.random.randint(width)
    cy = np.random.randint(height)

    bbx1 = np.clip(cx - cut_width // 2, 0, width)
    bby1 = np.clip(cy - cut_height // 2, 0, height)
    bbx2 = np.clip(cx + cut_width // 2, 0, width)
    bby2 = np.clip(cy + cut_height // 2, 0, height)
    return bbx1, bby1, bbx2, bby2


def cut_mix(image_1, image_2, bboxes_1, bboxes_2, beta=1.0):
    """
    Paste a random rectangle of image_2 onto image_1 and merge the boxes.

    Boxes of image_1 are trimmed when exactly one side sticks out of the
    pasted rectangle, set to zeros when they lie strictly inside it, and left
    unchanged otherwise. Boxes of image_2 are clipped to the pasted rectangle
    and set to zeros when nothing of them remains.

    :param image_1: base image indexed as [height, width, channel]
    :param image_2: image that provides the patch; it must cover the pasted
        rectangle, i.e. be at least as large as image_1
    :param bboxes_1: boxes of image_1, array with x1, y1, x2, y2 in columns 0..3
    :param bboxes_2: boxes of image_2, array with x1, y1, x2, y2 in columns 0..3
    :param beta: both parameters of the Beta distribution the area ratio is
        drawn from (1.0 gives a uniform distribution)
    :return: (mixed image, boxes of image_1 followed by boxes of image_2).
        The input arrays are not modified.
    """
    lam = np.random.beta(beta, beta)
    image_cutmix = image_1.copy()
    bbx1, bby1, bbx2, bby2 = rand_bbox(image_cutmix.shape, lam)
    image_cutmix[bby1:bby2, bbx1:bbx2, :] = image_2[bby1:bby2, bbx1:bbx2, :]

    # Work on copies so the caller's annotations are not rewritten.
    bboxes_1 = bboxes_1.copy()
    bboxes_2 = bboxes_2.copy()

    for box in bboxes_1:  # each row is a view into the copy
        x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]
        x_min_in = bbx1 < x_min < bbx2
        y_min_in = bby1 < y_min < bby2
        x_max_in = bbx1 < x_max < bbx2
        y_max_in = bby1 < y_max < bby2

        if x_min_in and y_min_in and x_max_in and y_max > bby2:
            # Only the bottom sticks out: keep the part below the patch.
            box[1] = bby2
        elif x_min_in and y_min < bby1 and x_max_in and y_max_in:
            # Only the top sticks out: keep the part above the patch.
            box[3] = bby1
        elif x_min_in and y_min_in and x_max > bbx2 and y_max_in:
            # Only the right side sticks out: keep the part right of the patch.
            box[0] = bbx2
        elif x_min < bbx1 and y_min_in and x_max_in and y_max_in:
            # Only the left side sticks out: keep the part left of the patch.
            box[2] = bbx1
        elif x_min_in and y_min_in and x_max_in and y_max_in:
            # Completely hidden by the patch.
            box[:4] = 0

    # Boxes of image_2 only survive inside the pasted rectangle.
    bboxes_2[:, 0] = np.maximum(bbx1, bboxes_2[:, 0])
    bboxes_2[:, 1] = np.maximum(bby1, bboxes_2[:, 1])
    bboxes_2[:, 2] = np.minimum(bbx2, bboxes_2[:, 2])
    bboxes_2[:, 3] = np.minimum(bby2, bboxes_2[:, 3])
    vanished = (bboxes_2[:, 0] > bboxes_2[:, 2]) | (bboxes_2[:, 1] > bboxes_2[:, 3])
    bboxes_2[vanished, :4] = 0

    bboxes_cutmix = np.concatenate([bboxes_1, bboxes_2], axis=0)
    return image_cutmix, bboxes_cutmix



  • Cutout:随机的将样本中的部分区域cut掉,并且填充0像素值,分类的结果不变;
  • Mixup:将随机的两张样本按比例混合,分类的结果按比例分配;
  • Cutmix:就是将一部分区域cut掉但不填充0像素而是随机填充训练集中的其他数据的区域像素值,分类结果按一定的比例分配




《Modeling Visual Context is Key to Augmenting Object Detection Datasets》:使用目标类实例分割标注的数据进行数据增广的上下文建模

《Adversarial Learning of General Transformations for Data Augmentation》:将仿射变换学习的全局变换与编码器结构学习的局部变换结合

《On Feature Normalization and Data Augmentation》:从特征归一化的角度做数据增强


