The network model used here for face detection is facenet. If you did not install this model during the initial installation, run download-models.sh from the tools directory and install facenet first.
<tools/download-models.sh>
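If you built jetson-inference from source, the model downloader can be re-run at any time. A typical invocation (paths assume the default repository layout) looks like this; the script presents a menu of downloadable models, from which you can select facenet:

$ cd jetson-inference/tools
$ ./download-models.sh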
Detecting faces in the image using detectNet
#!/usr/bin/python
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import jetson.inference
import jetson.utils

import argparse
import sys
import time
import numpy as np
import cv2

# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in an image using an object detection DNN.",
                                 formatter_class=argparse.RawTextHelpFormatter,
                                 epilog=jetson.inference.detectNet.Usage())

parser.add_argument("file_in", type=str, help="filename of the input image to process")
parser.add_argument("file_out", type=str, default=None, nargs='?', help="filename of the output image to save")
parser.add_argument("--network", type=str, default="facenet", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are: 'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use")

try:
    opt = parser.parse_known_args()[0]
except:
    print("")
    parser.print_help()
    sys.exit(0)

'''
extra_rate = 1.0 ~ 1.X to enlarge the mask size
'''
def do_masking(detection, extra_rate=1.0):
    global orgimg
    bear = mask.copy()
    oh, ow, _ = orgimg.shape
    w = min(int(detection.Width * extra_rate), ow)
    h = min(int(detection.Height * extra_rate), oh)
    x = max(int(detection.Left - (w * (extra_rate - 1.0) / 2)), 0)
    y = max(int(detection.Top - (h * (extra_rate - 1.0) / 2)), 0)
    dim = (w, h)
    bear = cv2.resize(bear, dim, interpolation=cv2.INTER_AREA)
    bg = orgimg[y:y+h, x:x+w]
    print(bear.shape)
    for i in range(0, h):
        for j in range(0, w):
            B = bear[i][j][0]
            G = bear[i][j][1]
            R = bear[i][j][2]
            if (int(B) + int(G) + int(R)):
                bg[i][j][0] = B
                bg[i][j][1] = G
                bg[i][j][2] = R
    orgimg[y:y+h, x:x+w] = bg

# load an image (into shared CPU/GPU memory)
print('input image:%s output image:%s' % (opt.file_in, opt.file_out))
img, width, height = jetson.utils.loadImageRGBA(opt.file_in)
orgimg = jetson.utils.cudaToNumpy(img, width, height, 4)  # CUDA img is float type
orgimg = cv2.cvtColor(orgimg.astype(np.uint8), cv2.COLOR_RGBA2BGR)
mask = cv2.imread('./sbear.png', cv2.IMREAD_UNCHANGED)

# load the object detection network
net = jetson.inference.detectNet(opt.network, sys.argv, opt.threshold)

t = time.time()
# detect objects in the image (with overlay)
detections = net.Detect(img, width, height, opt.overlay)
elapsed = time.time() - t

# print the detections
print("detected {:d} objects in image".format(len(detections)))
print("FPS:%f" % (1.0 / elapsed))
for detection in detections:
    print(detection)

# print out timing info
net.PrintProfilerTimes()

jetson.utils.cudaDeviceSynchronize()
arr = jetson.utils.cudaToNumpy(img, width, height, 4)  # CUDA img is float type
arr = cv2.cvtColor(arr.astype(np.uint8), cv2.COLOR_RGBA2BGR)

for detection in detections:
    do_masking(detection, extra_rate=1.4)

if opt.file_out is not None:
    cv2.imwrite("/tmp/facedetect.jpg", arr)
    cv2.imwrite("/tmp/facemask.jpg", orgimg)
The mask image is a bear icon (sbear.png).
After finding the position and size of the face from the detection object, the bear icon is resized to fit the face. The corresponding area of the original image is then replaced with the resized bear image.
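The nested per-pixel loop in do_masking is easy to read but slow in Python. For reference, the same "copy every non-black bear pixel" rule can be written as a single vectorized NumPy operation. This is a sketch, not the original code; it assumes bg and bear are uint8 arrays of identical height and width, with bear possibly carrying an extra alpha channel that is ignored just as in the loop version:

import numpy as np

def paste_mask_vectorized(bg, bear):
    # True wherever at least one of the B, G, R channels is non-zero;
    # summing as int32 avoids uint8 overflow.
    nonblack = bear[:, :, :3].sum(axis=2, dtype=np.int32) > 0
    # copy only the non-black bear pixels over the background region
    bg[nonblack] = bear[:, :, :3][nonblack]
    return bg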
Let's run the code.
jetson-inference/python/examples# python3 detectnet_face_console.py ../../data/images/humans_1.jpg /tmp/face.jpg --network=facenet --threshold=0.4
jetson.inference.__init__.py
jetson.inference -- initializing Python 3.6 bindings...
jetson.inference -- registering module types...
jetson.inference -- done registering module types
jetson.inference -- done Python 3.6 binding initialization
..............
detected 5 objects in image
FPS:5.688665
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.596609
   -- Left: 144.198
   -- Top: 31.1747
   -- Right: 176.455
   -- Bottom: 61.5546
   -- Width: 32.2563
   -- Height: 30.3799
   -- Area: 979.941
   -- Center: (160.327, 46.3647)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.551483
   -- Left: 224.628
   -- Top: 31.1006
   -- Right: 255.552
   -- Bottom: 62.7267
   -- Width: 30.9232
   -- Height: 31.626
   -- Area: 977.98
   -- Center: (240.09, 46.9136)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.603172
   -- Left: 414.558
   -- Top: 27.3126
   -- Right: 450.389
   -- Bottom: 56.2331
   -- Width: 35.8312
   -- Height: 28.9204
   -- Area: 1036.26
   -- Center: (432.473, 41.7728)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.727007
   -- Left: 295.354
   -- Top: 46.752
   -- Right: 321.892
   -- Bottom: 77.4316
   -- Width: 26.5383
   -- Height: 30.6796
   -- Area: 814.185
   -- Center: (308.623, 62.0918)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.853913
   -- Left: 346.816
   -- Top: 56.7357
   -- Right: 377.782
   -- Bottom: 79.769
   -- Width: 30.9664
   -- Height: 23.0333
   -- Area: 713.258
   -- Center: (362.299, 68.2524)
The result is saved as two images. The first image shows the detection result with bounding boxes overlaid.
The second image shows the detected faces masked with the bear icon.
Now let's automatically mask faces in a video file.
Detecting faces in the video using detectNet
The important parts of the source code are already in the example above, so with OpenCV's video processing functions, masking faces in a video is also easy. OpenCV's cap.read function returns a Mat object that is compatible with a NumPy array. However, the returned Mat resides in CPU memory, so you must copy it to GPU CUDA memory with the jetson.utils.cudaFromNumpy function; the short sketch below shows this round trip in isolation, followed by the full listing.
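As a minimal sketch (not part of the original script), the per-frame CPU-to-GPU conversion used in the listing below can be isolated into a small helper; frame is assumed to be the BGR uint8 image returned by cap.read:

import numpy as np
import cv2
import jetson.utils

def frame_to_cuda(frame):
    # cap.read() returns a BGR uint8 Mat in CPU memory; detectNet expects
    # an RGBA image in shared CUDA memory, so convert the color layout
    # and copy the array to the GPU.
    rgba = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_BGR2RGBA)
    return jetson.utils.cudaFromNumpy(rgba)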
#!/usr/bin/python
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# (MIT-style license text identical to the header of the previous listing)
#

import jetson.inference
import jetson.utils

import argparse
import sys
import time
import numpy as np
import cv2

# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in an image using an object detection DNN.",
                                 formatter_class=argparse.RawTextHelpFormatter,
                                 epilog=jetson.inference.detectNet.Usage())

parser.add_argument("--video_in", type=str, help="filename of the input video to process")
parser.add_argument("--video_out", type=str, default=None, nargs='?', help="filename of the output video to save")
parser.add_argument("--network", type=str, help="pre-trained model to load (facenet)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are: 'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use")

try:
    opt = parser.parse_known_args()[0]
except:
    print("")
    parser.print_help()
    sys.exit(0)

'''
extra_rate = 1.0 ~ 1.X to enlarge the mask size
'''
def do_masking(detection, arr, extra_rate=1.0):
    bear = mask.copy()
    oh, ow, _ = arr.shape
    w = min(int(detection.Width * extra_rate), ow)
    h = min(int(detection.Height * extra_rate), oh)
    x = max(int(detection.Left - (w * (extra_rate - 1.0) / 2)), 0)
    y = max(int(detection.Top - (h * (extra_rate - 1.0) / 2)), 0)
    dim = (w, h)
    bear = cv2.resize(bear, dim, interpolation=cv2.INTER_AREA)
    bg = arr[y:y+h, x:x+w]
    print(bear.shape)
    try:
        for i in range(0, h):
            for j in range(0, w):
                B = bear[i][j][0]
                G = bear[i][j][1]
                R = bear[i][j][2]
                if (int(B) + int(G) + int(R)):
                    bg[i][j][0] = B
                    bg[i][j][1] = G
                    bg[i][j][2] = R
        arr[y:y+h, x:x+w] = bg
    except IndexError:
        print(' index Error')
        return None
    return arr

def process_frame(img):
    global out_video
    height, width, _ = img.shape
    cudaimg = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGBA)
    cuda_mem = jetson.utils.cudaFromNumpy(cudaimg)
    detections = net.Detect(cuda_mem, width, height, opt.overlay)
    jetson.utils.cudaDeviceSynchronize()
    # arr = jetson.utils.cudaToNumpy(cuda_mem, width, height, 4)  # CUDA img is float type
    # arr = cv2.cvtColor(arr.astype(np.uint8), cv2.COLOR_RGBA2BGR)
    for detection in detections:
        img = do_masking(detection, img, extra_rate=1.2)
        if img is None:
            return False
    out_video.write(img)
    elapsed = time.time() - t
    # print the detections
    print("Frame[%d] FPS:%f" % (count, 1.0 / elapsed))
    return True

cap = cv2.VideoCapture(opt.video_in)
ret, img = cap.read()
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out_video = cv2.VideoWriter(opt.video_out, fourcc, cap.get(cv2.CAP_PROP_FPS), (img.shape[1], img.shape[0]))
mask = cv2.imread('./sbear.png', cv2.IMREAD_UNCHANGED)
net = jetson.inference.detectNet(opt.network, sys.argv, opt.threshold)

count = 0
while cap.isOpened():
    count += 1
    t = time.time()
    ret, img = cap.read()
    if ret == False:
        break
    if False == process_frame(img):
        break

out_video.release()
cap.release()
Let's run the code.
jetson-inference/python/examples# python3 detectnet_face_video.py --video_in=./01.mp4 --video_out=./01_mask.mp4 --network=facenet --threshold=0.3
jetson.inference.__init__.py
jetson.inference -- initializing Python 3.6 bindings...
jetson.inference -- registering module types...
jetson.inference -- done registering module types
jetson.inference -- done Python 3.6 binding initialization
jetson.utils.__init__.py
...............
Frame[711] FPS:1.783029
jetson.utils -- freeing CUDA mapped memory
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
jetson.utils -- cudaFromNumpy() ndarray dim 0 = 540
jetson.utils -- cudaFromNumpy() ndarray dim 1 = 960
jetson.utils -- cudaFromNumpy() ndarray dim 2 = 4
(127, 112, 4)
(180, 168, 4)
Frame[712] FPS:1.807839
jetson.utils -- freeing CUDA mapped memory
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
jetson.utils -- cudaFromNumpy() ndarray dim 0 = 540
jetson.utils -- cudaFromNumpy() ndarray dim 1 = 960
jetson.utils -- cudaFromNumpy() ndarray dim 2 = 4
(119, 110, 4)
(183, 171, 4)
 index Error
jetson.utils -- freeing CUDA mapped memory
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
This video clip is part of the converted video. In some frames where face detection fails, the faces remain visible.