I have been working on a face-related task recently, so I played around with MTCNN and wrote part of it up as a blog post to share. The content is fairly basic, but it gets the job done. Please forgive any shortcomings. Thanks!

Contents

Implementation workflow for the TensorFlow version of the MTCNN face detection algorithm

  1. Setting up the environment
  2. Preparing the datasets
  3. Training the MTCNN detection model
  4. Testing the face detection model and modifying the video detection code
  5. Video face detection demo

Setting up the environment
Hardware: Intel i7-8700, two GTX 2070s, 32 GB RAM
Software: NVIDIA driver 430 + CUDA 9.0 + cuDNN 7.3 + Anaconda3 + tensorflow-gpu==1.12 (for the setup procedure, see my earlier posts; the steps are identical, only the version numbers need changing)
[Notice: NVIDIA 20-series cards seemingly support only CUDA 10 out of the box. To get CUDA 9.0 and tensorflow-gpu 1.12 running on this card, download and install the four corresponding patches (Patch 1, Patch 2, Patch 3, Patch 4) from the NVIDIA CUDA 9.0 archive: https://developer.nvidia.com/cuda-90-download-archive]
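
Before touching the MTCNN code, it is worth verifying the install. A minimal sanity check using standard TF 1.x calls (nothing project-specific):

# Quick sanity check for the TF 1.12 + CUDA 9.0 setup.
import tensorflow as tf
from tensorflow.python.client import device_lib

print(tf.__version__)              # expect 1.12.x
print(tf.test.is_gpu_available())  # True if CUDA/cuDNN are wired up correctly
# both GTX 2070s should show up, each as /device:GPU:n
print([d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU'])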

Data preparation
Clone the source code into your local project directory:

git clone https://github.com/BobLiu20/mtcnn_tf

The experiments use two datasets: the WIDER FACE dataset (WIDER_train.zip)
and the facial-landmark dataset (train.zip).

Dataset download links: http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/
http://mmlab.ie.cuhk.edu.hk/archive/CNN_FacePoint.htm
Once downloaded, extract both archives into the project's 'dataset' directory.
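
A tiny sketch to confirm the archives extracted where the training scripts expect them. The subfolder names below are my assumption, so check them against the repo's README:

# Sanity check that the archives landed under dataset/.
# 'WIDER_train' and 'train' are assumed names -- adjust to match the repo's README.
import os

dataset_root = 'dataset'
for name in ['WIDER_train', 'train']:
    path = os.path.join(dataset_root, name)
    print(path, 'OK' if os.path.isdir(path) else 'MISSING')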

Training the face detection model
Following the author's tutorial, converting the data format and training the model only requires running two scripts in order.
From the project directory, run:

./clearAll.sh
./runAll.sh  (this script covers both the data conversion and the training)

Once training is underway, be patient: it takes roughly two days to complete.
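
Training saves per-stage checkpoints under tmp/model/pnet, tmp/model/rnet, and tmp/model/onet, which is the layout the test script reads. A small sketch, mirroring the test script's checkpoint lookup, to confirm each stage finished and to see its latest epoch:

# Check that each stage saved checkpoints; mirrors the lookup logic in test_images.py.
import os

for stage in ['pnet', 'rnet', 'onet']:
    modelDir = os.path.join('tmp/model', stage)
    epochs = [int(f[len(stage) + 1:-6]) for f in os.listdir(modelDir)
              if f.startswith(stage + '-') and f.endswith('.index')]
    print(stage, '-> latest epoch:', max(epochs) if epochs else 'no checkpoint found')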

Testing the model and modifying the video detection code
With a trained model in hand, we can test it. Of course, a demo trained on these public datasets should not perform badly.

  1. Copy the photos you want to test into the project's 'testing/images' directory
  2. From the testing directory, run the test script:
    python test_images.py --stage=onet
    The results are saved in the result_onet directory;
    The detection results are shown below (test photos sourced from Baidu image search):

Real-time face detection in video with MTCNN

  1. Modify the code in 'detection/MtcnnDetector.py' in the project directory
import cv2
import time
import numpy as np
import sys
import os
rootPath = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
sys.path.insert(0, rootPath)
from detection.nms import py_nms
from training.mtcnn_config import config


class MtcnnDetector(object):
    def __init__(self,
                 detectors,
                 min_face_size=24,
                 stride=2,
                 threshold=[0.6, 0.7, 0.7],
                 scale_factor=0.79):
        self.pnet_detector = detectors[0]
        self.rnet_detector = detectors[1]
        self.onet_detector = detectors[2]
        self.min_face_size = min_face_size
        self.stride = stride
        self.thresh = threshold
        self.scale_factor = scale_factor

    def convert_to_square(self, bbox):
        """convert bbox to square
        Parameters:
        ----------
        bbox: numpy array, shape n x 5
            input bbox
        Returns:
        -------
        square bbox
        """
        square_bbox = bbox.copy()
        h = bbox[:, 3] - bbox[:, 1] + 1
        w = bbox[:, 2] - bbox[:, 0] + 1
        max_side = np.maximum(h, w)
        square_bbox[:, 0] = bbox[:, 0] + w * 0.5 - max_side * 0.5
        square_bbox[:, 1] = bbox[:, 1] + h * 0.5 - max_side * 0.5
        square_bbox[:, 2] = square_bbox[:, 0] + max_side - 1
        square_bbox[:, 3] = square_bbox[:, 1] + max_side - 1
        return square_bbox

    def calibrate_box(self, bbox, reg):
        """calibrate bboxes
        Parameters:
        ----------
        bbox: numpy array, shape n x 5
            input bboxes
        reg: numpy array, shape n x 4
            bbox adjustments
        Returns:
        -------
        bboxes after refinement
        """
        bbox_c = bbox.copy()
        w = bbox[:, 2] - bbox[:, 0] + 1
        w = np.expand_dims(w, 1)
        h = bbox[:, 3] - bbox[:, 1] + 1
        h = np.expand_dims(h, 1)
        reg_m = np.hstack([w, h, w, h])
        aug = reg_m * reg
        bbox_c[:, 0:4] = bbox_c[:, 0:4] + aug
        return bbox_c

    def generate_bbox(self, cls_map, reg, scale, threshold):
        """generate bboxes from the feature cls_map
        Parameters:
        ----------
        cls_map: numpy array, n x m
            detection score for each position
        reg: numpy array, n x m x 4
            bbox regression offsets
        scale: float number
            scale of this detection
        threshold: float number
            detection threshold
        Returns:
        -------
        bbox array
        """
        cellsize = 12
        t_index = np.where(cls_map > threshold)
        # found nothing
        if t_index[0].size == 0:
            return np.array([])
        # offsets
        dx1, dy1, dx2, dy2 = [reg[t_index[0], t_index[1], i] for i in range(4)]
        reg = np.array([dx1, dy1, dx2, dy2])
        score = cls_map[t_index[0], t_index[1]]
        boundingbox = np.vstack([np.round((self.stride * t_index[1]) / scale),
                                 np.round((self.stride * t_index[0]) / scale),
                                 np.round((self.stride * t_index[1] + cellsize) / scale),
                                 np.round((self.stride * t_index[0] + cellsize) / scale),
                                 score,
                                 reg])
        return boundingbox.T

    def processed_image(self, img, scale):
        height, width, channels = img.shape
        new_height = int(height * scale)  # resized new height
        new_width = int(width * scale)  # resized new width
        new_dim = (new_width, new_height)
        img_resized = cv2.resize(img, new_dim, interpolation=cv2.INTER_LINEAR)  # resized image
        img_resized = (img_resized - 127.5) / 128
        return img_resized

    def pad(self, bboxes, w, h):
        """pad the bboxes, also restrict their size
        Parameters:
        ----------
        bboxes: numpy array, n x 5
            input bboxes
        w: float number
            width of the input image
        h: float number
            height of the input image
        Returns:
        -------
        dy, dx : numpy array, n x 1
            start point of the bbox in the target image
        edy, edx : numpy array, n x 1
            end point of the bbox in the target image
        y, x : numpy array, n x 1
            start point of the bbox in the original image
        ey, ex : numpy array, n x 1
            end point of the bbox in the original image
        tmph, tmpw: numpy array, n x 1
            height and width of the bbox
        """
        tmpw, tmph = bboxes[:, 2] - bboxes[:, 0] + 1, bboxes[:, 3] - bboxes[:, 1] + 1
        num_box = bboxes.shape[0]
        dx, dy = np.zeros((num_box,)), np.zeros((num_box,))
        edx, edy = tmpw.copy() - 1, tmph.copy() - 1
        x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
        tmp_index = np.where(ex > w - 1)
        edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index]
        ex[tmp_index] = w - 1
        tmp_index = np.where(ey > h - 1)
        edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index]
        ey[tmp_index] = h - 1
        tmp_index = np.where(x < 0)
        dx[tmp_index] = 0 - x[tmp_index]
        x[tmp_index] = 0
        tmp_index = np.where(y < 0)
        dy[tmp_index] = 0 - y[tmp_index]
        y[tmp_index] = 0
        return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph]
        return_list = [item.astype(np.int32) for item in return_list]
        return return_list

    def detect_pnet(self, im):
        """Get face candidates through pnet
        Parameters:
        ----------
        im: numpy array
            input image array
        Returns:
        -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        """
        h, w, c = im.shape
        net_size = 12
        current_scale = float(net_size) / self.min_face_size  # find initial scale
        im_resized = self.processed_image(im, current_scale)
        current_height, current_width, _ = im_resized.shape
        # fully convolutional, so loop over an image pyramid
        all_boxes = list()
        while min(current_height, current_width) > net_size:
            # results predicted by pnet
            # cls_cls_map : H*w*2
            # reg: H*w*4
            cls_cls_map, reg = self.pnet_detector.predict(im_resized)
            # boxes: num*9 (x1,y1,x2,y2,score,x1_offset,y1_offset,x2_offset,y2_offset)
            boxes = self.generate_bbox(cls_cls_map[:, :, 1], reg, current_scale, self.thresh[0])
            current_scale *= self.scale_factor
            im_resized = self.processed_image(im, current_scale)
            current_height, current_width, _ = im_resized.shape
            if boxes.size == 0:
                continue
            keep = py_nms(boxes[:, :5], 0.5, 'Union')
            boxes = boxes[keep]
            all_boxes.append(boxes)
        if len(all_boxes) == 0:
            return None, None, None
        all_boxes = np.vstack(all_boxes)
        # merge the detections from the first stage
        keep = py_nms(all_boxes[:, 0:5], 0.7, 'Union')
        all_boxes = all_boxes[keep]
        boxes = all_boxes[:, :5]
        bbw = all_boxes[:, 2] - all_boxes[:, 0] + 1
        bbh = all_boxes[:, 3] - all_boxes[:, 1] + 1
        # refine the boxes
        boxes_c = np.vstack([all_boxes[:, 0] + all_boxes[:, 5] * bbw,
                             all_boxes[:, 1] + all_boxes[:, 6] * bbh,
                             all_boxes[:, 2] + all_boxes[:, 7] * bbw,
                             all_boxes[:, 3] + all_boxes[:, 8] * bbh,
                             all_boxes[:, 4]])
        boxes_c = boxes_c.T
        return boxes, boxes_c, None

    def detect_rnet(self, im, dets):
        """Get face candidates using rnet
        Parameters:
        ----------
        im: numpy array
            input image array
        dets: numpy array
            detection results of pnet
        Returns:
        -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        """
        h, w, c = im.shape
        dets = self.convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)
        num_boxes = dets.shape[0]
        cropped_ims = np.zeros((num_boxes, 24, 24, 3), dtype=np.float32)
        for i in range(num_boxes):
            tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
            tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
            cropped_ims[i, :, :, :] = (cv2.resize(tmp, (24, 24)) - 127.5) / 128
        # cls_scores : num_data*2
        # reg: num_data*4
        # landmark: num_data*10
        cls_scores, reg, _ = self.rnet_detector.predict(cropped_ims)
        cls_scores = cls_scores[:, 1]
        keep_inds = np.where(cls_scores > self.thresh[1])[0]
        if len(keep_inds) > 0:
            boxes = dets[keep_inds]
            boxes[:, 4] = cls_scores[keep_inds]
            reg = reg[keep_inds]
        else:
            return None, None, None
        keep = py_nms(boxes, 0.6)
        boxes = boxes[keep]
        boxes_c = self.calibrate_box(boxes, reg[keep])
        return boxes, boxes_c, None

    def detect_onet(self, im, dets):
        """Get face candidates using onet
        Parameters:
        ----------
        im: numpy array
            input image array
        dets: numpy array
            detection results of rnet
        Returns:
        -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        """
        h, w, c = im.shape
        dets = self.convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)
        num_boxes = dets.shape[0]
        cropped_ims = np.zeros((num_boxes, 48, 48, 3), dtype=np.float32)
        for i in range(num_boxes):
            tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
            tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
            cropped_ims[i, :, :, :] = (cv2.resize(tmp, (48, 48)) - 127.5) / 128
        cls_scores, reg, landmark = self.onet_detector.predict(cropped_ims)
        # probability of belonging to a face
        cls_scores = cls_scores[:, 1]
        keep_inds = np.where(cls_scores > self.thresh[2])[0]
        if len(keep_inds) > 0:
            # pick out the filtered boxes
            boxes = dets[keep_inds]
            boxes[:, 4] = cls_scores[keep_inds]
            reg = reg[keep_inds]
            landmark = landmark[keep_inds]
        else:
            return None, None, None
        # width
        w = boxes[:, 2] - boxes[:, 0] + 1
        # height
        h = boxes[:, 3] - boxes[:, 1] + 1
        landmark[:, 0::2] = (np.tile(w, (5, 1)) * landmark[:, 0::2].T + np.tile(boxes[:, 0], (5, 1)) - 1).T
        landmark[:, 1::2] = (np.tile(h, (5, 1)) * landmark[:, 1::2].T + np.tile(boxes[:, 1], (5, 1)) - 1).T
        boxes_c = self.calibrate_box(boxes, reg)
        boxes = boxes[py_nms(boxes, 0.6, "Minimum")]
        keep = py_nms(boxes_c, 0.6, "Minimum")
        boxes_c = boxes_c[keep]
        landmark = landmark[keep]
        return boxes, boxes_c, landmark

    # this method is used for face detection on video frames
    def detect_video(self, img):
        """Detect faces over one image/frame"""
        boxes = None
        t = time.time()
        # pnet
        t1 = 0
        if self.pnet_detector:
            boxes, boxes_c, _ = self.detect_pnet(img)
            if boxes_c is None:
                return np.array([]), np.array([])
            t1 = time.time() - t
            t = time.time()
        # rnet
        t2 = 0
        if self.rnet_detector:
            boxes, boxes_c, _ = self.detect_rnet(img, boxes_c)
            if boxes_c is None:
                return np.array([]), np.array([])
            t2 = time.time() - t
            t = time.time()
        # onet
        t3 = 0
        if self.onet_detector:
            boxes, boxes_c, landmark = self.detect_onet(img, boxes_c)
            if boxes_c is None:
                return np.array([]), np.array([])
            t3 = time.time() - t
            t = time.time()
        # print("time cost " + '{:.3f}'.format(t1 + t2 + t3) +
        #       '  pnet {:.3f}  rnet {:.3f}  onet {:.3f}'.format(t1, t2, t3))
        return boxes_c, landmark

    def detect_face(self, test_data):
        all_boxes = []  # save each image's bboxes
        landmarks = []
        batch_idx = 0
        for databatch in test_data:
            # print progress
            printStr = "\rDone images: {}\n".format(batch_idx)
            sys.stdout.write(printStr)
            sys.stdout.flush()
            batch_idx += 1
            im = databatch
            # pnet
            if self.pnet_detector:
                # ignore landmark
                boxes, boxes_c, landmark = self.detect_pnet(im)
                if boxes_c is None:
                    all_boxes.append(np.array([]))
                    landmarks.append(np.array([]))
                    continue
            # rnet
            if self.rnet_detector:
                # ignore landmark
                boxes, boxes_c, landmark = self.detect_rnet(im, boxes_c)
                if boxes_c is None:
                    all_boxes.append(np.array([]))
                    landmarks.append(np.array([]))
                    continue
            # onet
            if self.onet_detector:
                boxes, boxes_c, landmark = self.detect_onet(im, boxes_c)
                if boxes_c is None:
                    all_boxes.append(np.array([]))
                    landmarks.append(np.array([]))
                    continue
            all_boxes.append(boxes_c)
            landmarks.append(landmark)
        return all_boxes, landmarks
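
A quick note on how the cascade runs before we wire it up: with min_face_size=24, detect_pnet starts the image pyramid at scale 12/24 = 0.5 and multiplies the scale by scale_factor=0.79 each round until the shorter side drops below 12 px; R-Net and O-Net then refine the surviving candidates on 24x24 and 48x48 crops. A minimal usage sketch for a single image follows (the net() helper that loads the checkpoints is defined in the test script below; 'sample.jpg' is a hypothetical file name):

# Minimal usage sketch: run the full P-Net -> R-Net -> O-Net cascade on one image.
import cv2
from detection.MtcnnDetector import MtcnnDetector

detectors = net('onet')  # [FcnDetector(P_Net), Detector(R_Net), Detector(O_Net)], see the test script below
detector = MtcnnDetector(detectors=detectors, min_face_size=24, threshold=[0.9, 0.6, 0.7])
img = cv2.imread('testing/images/sample.jpg')  # hypothetical test image
boxes_c, landmarks = detector.detect_video(img)  # boxes_c rows: [x1, y1, x2, y2, score]
for x1, y1, x2, y2, score in boxes_c:
    print('face at ({:.0f},{:.0f})-({:.0f},{:.0f}), score {:.2f}'.format(x1, y1, x2, y2, score))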

2. Modify the test code testing/test_images.py (this code is for reference only; my coding skills are limited, it could be optimized further, and there are still a couple of bugs in it, haha)

#coding:utf-8
#author: AIBC-hxy
'''
Run MTCNN face detection on a video stream.
'''
import tensorflow as tf
import numpy as np
import os
import sys
import cv2

rootPath = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
sys.path.insert(0, rootPath)
from training.mtcnn_model import P_Net, R_Net, O_Net
from tools.loader import TestLoader
from detection.MtcnnDetector import MtcnnDetector
from detection.detector import Detector
from detection.fcn_detector import FcnDetector
import time
import gc
from multiprocessing import Process, Manager


def net(stage):
    detectors = [None, None, None]
    if stage in ['pnet', 'rnet', 'onet']:
        modelPath = os.path.join(rootPath, 'tmp/model/pnet/')
        a = [b[5:-6] for b in os.listdir(modelPath) if b.startswith('pnet-') and b.endswith('.index')]
        maxEpoch = max(map(int, a))  # auto match the max-epoch model
        modelPath = os.path.join(modelPath, "pnet-%d" % (maxEpoch))
        print("Use PNet model: %s" % (modelPath))
        detectors[0] = FcnDetector(P_Net, modelPath)
    if stage in ['rnet', 'onet']:
        modelPath = os.path.join(rootPath, 'tmp/model/rnet/')
        a = [b[5:-6] for b in os.listdir(modelPath) if b.startswith('rnet-') and b.endswith('.index')]
        maxEpoch = max(map(int, a))
        modelPath = os.path.join(modelPath, "rnet-%d" % (maxEpoch))
        print("Use RNet model: %s" % (modelPath))
        detectors[1] = Detector(R_Net, 24, 1, modelPath)
    if stage in ['onet']:
        modelPath = os.path.join(rootPath, 'tmp/model/onet/')
        a = [b[5:-6] for b in os.listdir(modelPath) if b.startswith('onet-') and b.endswith('.index')]
        maxEpoch = max(map(int, a))
        modelPath = os.path.join(modelPath, "onet-%d" % (maxEpoch))
        print("Use ONet model: %s" % (modelPath))
        detectors[2] = Detector(O_Net, 48, 1, modelPath)
    return detectors


'''
Python multiprocessing: one process grabs camera frames, the other runs detection.
'''
def receive(stack):
    top = 100
    cap = cv2.VideoCapture(0)
    ret, frame = cap.read()
    while True:
        ret, frame = cap.read()
        if ret:
            stack.append(frame)
            # drop the buffer once it grows too large, so detection stays close to real time
            if len(stack) >= top:
                del stack[:]
                gc.collect()


def realse(stack):
    print('Begin to get frame......')
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    detectors = net('onet')
    mtcnnDetector = MtcnnDetector(detectors=detectors, min_face_size=24, threshold=[0.9, 0.6, 0.7])
    while True:
        if len(stack) > 0:
            image = stack.pop()
            image = cv2.resize(image, (int(image.shape[1] / 3), int(image.shape[0] / 3)))
            image = np.array(image)
            boxes_c, _ = mtcnnDetector.detect_video(image)
            for bbox in boxes_c:
                x1 = int(bbox[0])
                y1 = int(bbox[1])
                x2 = int(bbox[2])
                y2 = int(bbox[3])
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255))
                print('detected face: ({},{}), ({},{})'.format(x1, y1, x2, y2))
            cv2.imshow("Detected", image)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    cv2.destroyAllWindows()


if __name__ == '__main__':
    t = Manager().list()
    t1 = Process(target=receive, args=(t,))
    t2 = Process(target=realse, args=(t,))
    t1.start()
    t2.start()
    t1.join()   # receive() loops forever, so this blocks; one of the script's known rough edges
    t2.terminate()
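
A few words on the design: the two processes form a simple producer/consumer pair. receive() keeps appending camera frames to a shared Manager().list(), while the detection process takes the most recent frame with stack.pop(), so detection always runs on the freshest frame rather than a backlog; wiping the buffer once it hits 100 frames keeps latency bounded when detection is slower than the camera, at the cost of occasionally dropping frames. Note that receive() never returns, so t1.join() blocks indefinitely and the program does not exit cleanly; pressing 'q' only breaks out of the detection loop.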

Once the code is modified, just run it:

python test_images.py

Screenshots of the video detection results are shown below (I adjusted the size of the detection boxes; please also ignore my big face!!!):

This post is for reference and study only! Exchanges are welcome! Let's learn from each other! Likes are welcome! Please forgive any shortcomings!
