Sunday, December 22, 2019

JetsonNano - Hello AI World (NVIDIA DNN vision library) - 3. Detecting faces with detectNet

In the previous article, I showed how to detect objects in an image using detectNet. There are many detectNet models, and one of them is a face detection network (facenet-120). I'll continue with detectNet, using this model to find faces in an image. Because privacy protection is being strengthened these days, exposing someone's face in an image or video carries a risk of violating the law. So I'll make a useful program that automatically detects faces in video frames and masks them with a specific image.

The network model for face detection is facenet. If you did not install this model during the initial installation, run download-models.sh from the tools directory to install facenet first.
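If facenet was skipped during the initial setup, it can be fetched from the tools directory in the same way as the other models (the script opens an interactive selection menu where the facenet model can be checked):

jetson-inference/tools# ./download-models.sh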

<tools/download-models.sh>


Detecting faces in an image using detectNet




#!/usr/bin/python
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import jetson.inference
import jetson.utils

import argparse
import sys
import time
import numpy as np
import cv2
# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in an image using an object detection DNN.", 
         formatter_class=argparse.RawTextHelpFormatter, epilog=jetson.inference.detectNet.Usage())

parser.add_argument("file_in", type=str, help="filename of the input image to process")
parser.add_argument("file_out", type=str, default=None, nargs='?', help="filename of the output image to save")
parser.add_argument("--network", type=str, default="facenet", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are:  'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use")

try:
 opt = parser.parse_known_args()[0]
except:
 print("")
 parser.print_help()
 sys.exit(0)

'''
extra_rate = 1.0 ~ 1.X to enlarge the mask size
'''
def do_masking(detection, extra_rate = 1.0):
    global orgimg
    bear = mask.copy()
    oh, ow, _ = orgimg.shape
    w = min(int(detection.Width * extra_rate) , ow)
    h = min(int(detection.Height * extra_rate), oh)
    x = max(int(detection.Left - (w * (extra_rate - 1.0 ) / 2)) , 0)
    y = max(int(detection.Top  - (h * (extra_rate - 1.0 ) / 2)) , 0)

    dim = (w, h) 
    bear = cv2.resize(bear, dim, interpolation = cv2.INTER_AREA)
    bg = orgimg[y:y+h, x:x+w]
    print(bear.shape)
    for i in range(0, h):
        for j in range(0, w):
            B = bear[i][j][0]
            G = bear[i][j][1]
            R = bear[i][j][2]
            if (int(B) + int(G) + int(R)):
                bg[i][j][0] = B
                bg[i][j][1] = G
                bg[i][j][2] = R
    orgimg[y:y+h, x:x+w] = bg


# load an image (into shared CPU/GPU memory)
print('input image:%s  output image:%s'%(opt.file_in, opt.file_out))
img, width, height = jetson.utils.loadImageRGBA(opt.file_in)
orgimg = jetson.utils.cudaToNumpy(img, width, height, 4)      #CUDA img is float type
orgimg = cv2.cvtColor (orgimg.astype(np.uint8), cv2.COLOR_RGBA2BGR)
mask = cv2.imread('./sbear.png', cv2.IMREAD_UNCHANGED)

# load the object detection network
net = jetson.inference.detectNet(opt.network, sys.argv, opt.threshold)
t = time.time()
# detect objects in the image (with overlay)
detections = net.Detect(img, width, height, opt.overlay)
elapsed = time.time() - t
# print the detections
print("detected {:d} objects in image".format(len(detections)))
print("FPS:%f"%(1.0 / elapsed))

for detection in detections:
 print(detection)


# print out timing info
net.PrintProfilerTimes()

jetson.utils.cudaDeviceSynchronize ()
arr = jetson.utils.cudaToNumpy(img, width, height, 4)      #CUDA img is float type
arr = cv2.cvtColor (arr.astype(np.uint8), cv2.COLOR_RGBA2BGR)
for detection in detections:
    do_masking(detection, extra_rate = 1.4)

if opt.file_out is not None:
    cv2.imwrite("/tmp/facedetect.jpg", arr)
    cv2.imwrite("/tmp/facemask.jpg", orgimg)


I used a bear icon as the mask image.


After finding the position and size of the face from the detection object, resize the bear icon to fit the face size. Then replace the corresponding area of the original image with the bear image.
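For reference, the per-pixel copy loop in do_masking() can also be written in vectorized numpy form. This is only a sketch under the same assumptions as the script above (mask holds the bear image, orgimg holds the BGR frame, and the resized box stays inside the image):

def paste_mask(orgimg, mask_img, x, y, w, h):
    # resize the bear icon to the (enlarged) face box
    bear = cv2.resize(mask_img, (w, h), interpolation=cv2.INTER_AREA)
    region = orgimg[y:y+h, x:x+w]               # view into the original frame
    # pixels whose B+G+R sum is zero are treated as transparent, as in the loop above
    opaque = bear[:, :, :3].sum(axis=2) > 0
    region[opaque] = bear[:, :, :3][opaque]     # writes through the view, so orgimg is updated in place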
Let's run the code.



jetson-inference/python/examples# python3 detectnet_face_console.py ../../data/images/humans_1.jpg /tmp/face.jpg --network=facenet --threshold=0.4
jetson.inference.__init__.py
jetson.inference -- initializing Python 3.6 bindings...
jetson.inference -- registering module types...
jetson.inference -- done registering module types
jetson.inference -- done Python 3.6 binding initialization

..............

detected 5 objects in image
FPS:5.688665
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.596609
   -- Left:    144.198
   -- Top:     31.1747
   -- Right:   176.455
   -- Bottom:  61.5546
   -- Width:   32.2563
   -- Height:  30.3799
   -- Area:    979.941
   -- Center:  (160.327, 46.3647)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.551483
   -- Left:    224.628
   -- Top:     31.1006
   -- Right:   255.552
   -- Bottom:  62.7267
   -- Width:   30.9232
   -- Height:  31.626
   -- Area:    977.98
   -- Center:  (240.09, 46.9136)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.603172
   -- Left:    414.558
   -- Top:     27.3126
   -- Right:   450.389
   -- Bottom:  56.2331
   -- Width:   35.8312
   -- Height:  28.9204
   -- Area:    1036.26
   -- Center:  (432.473, 41.7728)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.727007
   -- Left:    295.354
   -- Top:     46.752
   -- Right:   321.892
   -- Bottom:  77.4316
   -- Width:   26.5383
   -- Height:  30.6796
   -- Area:    814.185
   -- Center:  (308.623, 62.0918)
<detectNet.Detection object>
   -- ClassID: 0
   -- Confidence: 0.853913
   -- Left:    346.816
   -- Top:     56.7357
   -- Right:   377.782
   -- Bottom:  79.769
   -- Width:   30.9664
   -- Height:  23.0333
   -- Area:    713.258
   -- Center:  (362.299, 68.2524)



The result is saved as two images. This image shows the detection result drawn on the screen.



This image shows the detected faces masked with the bear icon.



Now let's automatically mask the video file.

Detecting faces in a video using detectNet

The important source code is already included in the example above, so face masking in a video can also be implemented easily using OpenCV's video processing functions.
OpenCV's cap.read() function returns a Mat object that is compatible with a numpy array. However, the returned Mat lives in CPU memory, so you must copy it to GPU (CUDA) memory with the jetson.utils.cudaFromNumpy function before calling Detect().
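In isolation, the conversion step looks roughly like this (a minimal sketch with assumed names frame, net, width and height; the full script below shows it in context):

frame_rgba = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)    # OpenCV frames are BGR, detectNet expects RGBA
cuda_mem = jetson.utils.cudaFromNumpy(frame_rgba)       # copy the CPU numpy array into CUDA mapped memory
detections = net.Detect(cuda_mem, width, height)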



#!/usr/bin/python
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import jetson.inference
import jetson.utils

import argparse
import sys
import time
import numpy as np
import cv2
# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in an image using an object detection DNN.", 
         formatter_class=argparse.RawTextHelpFormatter, epilog=jetson.inference.detectNet.Usage())

parser.add_argument("--video_in", type=str, help="filename of the input video to process")
parser.add_argument("--video_out", type=str, default=None, nargs='?', help="filename of the output video to save")
parser.add_argument("--network", type=str, help="pre-trained model to load (facenet)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are:  'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use")

try:
 opt = parser.parse_known_args()[0]
except:
 print("")
 parser.print_help()
 sys.exit(0)

'''
extra_rate = 1.0 ~ 1.X to enlarge the mask size
'''
def do_masking(detection, arr, extra_rate = 1.0):
    bear = mask.copy()
    oh, ow, _ = arr.shape
    w = min(int(detection.Width * extra_rate) , ow)
    h = min(int(detection.Height * extra_rate), oh)
    x = max(int(detection.Left - (w * (extra_rate - 1.0 ) / 2)) , 0)
    y = max(int(detection.Top  - (h * (extra_rate - 1.0 ) / 2)) , 0)

    dim = (w, h) 
    bear = cv2.resize(bear, dim, interpolation = cv2.INTER_AREA)
    bg = arr[y:y+h, x:x+w]
    print(bear.shape)
    try:
        for i in range(0, h):
            for j in range(0, w):
                B = bear[i][j][0]
                G = bear[i][j][1]
                R = bear[i][j][2]
                if (int(B) + int(G) + int(R)):
                    bg[i][j][0] = B
                    bg[i][j][1] = G
                    bg[i][j][2] = R
        arr[y:y+h, x:x+w] = bg
    except IndexError:
        print(' index Error')
        return None
    
    return arr


def process_frame(img):
    global out_video
    height, width, _ = img.shape
    cudaimg = cv2.cvtColor (img.astype(np.uint8), cv2.COLOR_BGR2RGBA)
    cuda_mem = jetson.utils.cudaFromNumpy(cudaimg)
    detections = net.Detect(cuda_mem, width, height, opt.overlay)
    jetson.utils.cudaDeviceSynchronize ()
    # arr = jetson.utils.cudaToNumpy(cuda_mem, width, height, 4)      #CUDA img is float type
    # arr = cv2.cvtColor (arr.astype(np.uint8), cv2.COLOR_RGBA2BGR)
    for detection in detections:
        img = do_masking(detection, img, extra_rate = 1.2)
        if img is None:
            return False
    out_video.write(img)
    elapsed = time.time() - t
    # print the detections
    print("Frame[%d] FPS:%f"%(count, 1.0 / elapsed))
    return True


cap = cv2.VideoCapture(opt.video_in)
ret, img = cap.read()
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out_video = cv2.VideoWriter(opt.video_out, fourcc, cap.get(cv2.CAP_PROP_FPS), (img.shape[1], img.shape[0]))
mask = cv2.imread('./sbear.png', cv2.IMREAD_UNCHANGED)
net = jetson.inference.detectNet(opt.network, sys.argv, opt.threshold)

count = 0
while cap.isOpened():
    count += 1
    t = time.time()
    ret, img = cap.read()
    if ret == False:
        break
    if False == process_frame(img):
        break

out_video.release()
cap.release()


Let's run the code.


jetson-inference/python/examples# python3 detectnet_face_video.py --video_in=./01.mp4 --video_out=./01_mask.mp4 --network=facenet --threshold=0.3
jetson.inference.__init__.py
jetson.inference -- initializing Python 3.6 bindings...
jetson.inference -- registering module types...
jetson.inference -- done registering module types
jetson.inference -- done Python 3.6 binding initialization
jetson.utils.__init__.py

...............

Frame[711] FPS:1.783029
jetson.utils -- freeing CUDA mapped memory
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
jetson.utils -- cudaFromNumpy()  ndarray dim 0 = 540
jetson.utils -- cudaFromNumpy()  ndarray dim 1 = 960
jetson.utils -- cudaFromNumpy()  ndarray dim 2 = 4
(127, 112, 4)
(180, 168, 4)
Frame[712] FPS:1.807839
jetson.utils -- freeing CUDA mapped memory
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()
jetson.utils -- cudaFromNumpy()  ndarray dim 0 = 540
jetson.utils -- cudaFromNumpy()  ndarray dim 1 = 960
jetson.utils -- cudaFromNumpy()  ndarray dim 2 = 4
(119, 110, 4)
(183, 171, 4)
 index Error
jetson.utils -- freeing CUDA mapped memory
jetson.inference -- PyDetection_Dealloc()
jetson.inference -- PyDetection_Dealloc()

This video clip is part of the converted video. Faces are visible in some frames where the face detection fails.



Wrapping up

If you want more sophisticated face recognition and face identification, for example so that a particular person's face is not masked, it may be better to use dlib. See this page for an example of face recognition with dlib.







Wednesday, December 18, 2019

JetsonNano - Hello AI World (NVIDIA DNN vision library) - 3. Detecting objects with detectNet

This article is based on the contents of https://github.com/dusty-nv/jetson-inference/blob/master/docs/detectnet-console-2.md.

The previous recognition examples output class probabilities representing the entire input image. Next we're going to focus on object detection, and finding where in the frame various objects are located by extracting their bounding boxes. Unlike image classification, object detection networks are capable of detecting many different objects per frame.



We built the detectNet object for C++ and Python3 in the previous post.

Testing detectNet


Recall that the imageNet object accepts an input image and outputs the probability for each class. Having been trained on the ImageNet ILSVRC dataset of 1000 objects, the GoogleNet and ResNet-18 models were automatically downloaded during the build step. See below for other classification models that can be downloaded and used as well.
imageNet accepts an image and outputs only class probabilities that describe the entire input image, whereas detectNet locates multiple objects in the frame and returns their bounding boxes.
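The difference is easy to see in code. This is only a rough sketch (the file name is an assumption, and the imageNet calls follow the earlier imagenet-console example):

import jetson.inference
import jetson.utils

img, width, height = jetson.utils.loadImageRGBA("test.jpg")

# image classification: one (class, confidence) pair describing the whole image
cls_net = jetson.inference.imageNet("googlenet")
class_idx, confidence = cls_net.Classify(img, width, height)

# object detection: a list of detections, each with its own bounding box
det_net = jetson.inference.detectNet("ssd-mobilenet-v2")
detections = det_net.Detect(img, width, height)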





detectNet Network models summary

Network                  CLI argument       NetworkType enum    Object classes
SSD-Mobilenet-v1         ssd-mobilenet-v1   SSD_MOBILENET_V1    91 (COCO classes)
SSD-Mobilenet-v2         ssd-mobilenet-v2   SSD_MOBILENET_V2    91 (COCO classes)
SSD-Inception-v2         ssd-inception-v2   SSD_INCEPTION_V2    91 (COCO classes)
DetectNet-COCO-Dog       coco-dog           COCO_DOG            dogs
DetectNet-COCO-Bottle    coco-bottle        COCO_BOTTLE         bottles
DetectNet-COCO-Chair     coco-chair         COCO_CHAIR          chairs
DetectNet-COCO-Airplane  coco-airplane      COCO_AIRPLANE       airplanes
ped-100                  pednet             PEDNET              pedestrians
multiped-500             multiped           PEDNET_MULTI        pedestrians, luggage
facenet-120              facenet            FACENET             faces

SSD-Mobilenet-v1:

detectNet - for object detection

detectNet is the object detection DNN class.

detectNet object constructor

The first parameter is the model name and the second parameter is a list of command-line arguments.



__init__(...)
     Loads an object detection model.
 
     Parameters:
       network (string) -- name of a built-in network to use
                           see below for available options.
 
        argv (strings) -- command line arguments passed to detectNet,
                         see below for available options.
 
       threshold (float) -- minimum detection threshold.
                            default value is 0.5
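For example, the scripts in this post create the network like this; passing sys.argv lets the library pick up extra options such as --network and --threshold from the command line:

# model name, command-line arguments, detection threshold
net = jetson.inference.detectNet("ssd-mobilenet-v2", sys.argv, 0.5)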


detectNet arguments:

These arguments are passed in through the command-line argv parameter.



detectNet arguments: 
  --network NETWORK     pre-trained model to load, one of the following:
                            * pednet (default)
                            * multiped
                            * facenet
                            * ssd-mobilenet-v1
                            * ssd-mobilenet-v2
                            * ssd-inception-v2
                            * coco-airplane
                            * coco-bottle
                            * coco-chair
                            * coco-dog
  --model MODEL         path to custom model to load (.caffemodel, .uff, or .onnx)
  --prototxt PROTOTXT   path to custom prototxt to load (for .caffemodel only)
  --class_labels LABELS path to text file containing the labels for each class
  --threshold THRESHOLD minimum threshold for detection (default is 0.5)
  --input_blob INPUT    name of the input layer (default is 'data')
  --output_cvg COVERAGE name of the coverge output layer (default is 'coverage')
  --output_bbox BOXES   name of the bounding output layer (default is 'bboxes')
  --mean_pixel PIXEL    mean pixel value to subtract from input (default is 0.0)
  --batch_size BATCH    maximum batch size (default is 1)


Methods

Detect(...)
Detect objects in an RGBA image and return a list of detections.

Parameters:
  image   (capsule) -- CUDA memory capsule
  width   (int)  -- width of the image (in pixels)
  height  (int)  -- height of the image (in pixels)
  overlay (bool) -- true to overlay the bounding boxes (default is true)

Returns:
  [Detections] -- list containing the detected objects (see detectNet.Detection)
GetClassDesc(...)
Return the class description for the given object class.

Parameters:
  (int) -- index of the class, between [0, GetNumClasses()]

Returns:
  (string) -- the text description of the object class
GetClassSynset(...)
Return the synset data category string for the given class.
The synset generally maps to the class training data folder.

Parameters:
  (int) -- index of the class, between [0, GetNumClasses()]

Returns:
  (string) -- the synset of the class, typically 9 characters long
GetNumClasses(...)
Return the number of object classes that this network model is able to detect.

Parameters:  (none)

Returns:
  (int) -- number of object classes that the model supports
GetThreshold(...)
Return the minimum detection threshold.

Parameters:  (none)

Returns:
  (float) -- the threshold for detection
SetThreshold(...)
Set the minimum detection threshold.

Parameters:
  (float) -- detection threshold

Returns:  (none)
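As a short sketch of how these methods combine with the detection fields (assuming net is a loaded detectNet and img, width and height come from jetson.utils.loadImageRGBA, as in the examples below):

detections = net.Detect(img, width, height)
for d in detections:
    # ClassID, Confidence, Left, Top, Width, Height are fields of detectNet.Detection
    print(net.GetClassDesc(d.ClassID), d.Confidence, int(d.Left), int(d.Top), int(d.Width), int(d.Height))
print("classes:%d  threshold:%.2f" % (net.GetNumClasses(), net.GetThreshold()))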



segNet - for segmentation

segNet is the semantic segmentation DNN class.

segNet object constructor

The first parameter is the model name and the second parameter is a list of command-line arguments.



__init__(...)
     Loads an semantic segmentation model.
 
     Parameters:
       network (string) -- name of a built-in network to use,
                           see below for available options.
 
       argv (strings) -- command line arguments passed to segNet,
                         see below for available options.


segNet arguments:

These arguments are passed in through the command-line argv parameter.



segNet arguments: 
  --network NETWORK    pre-trained model to load, one of the following:
                           * fcn-resnet18-cityscapes-512x256
                           * fcn-resnet18-cityscapes-1024x512
                           * fcn-resnet18-cityscapes-2048x1024
                           * fcn-resnet18-deepscene-576x320
                           * fcn-resnet18-deepscene-864x480
                           * fcn-resnet18-mhp-512x320
                           * fcn-resnet18-mhp-640x360
                           * fcn-resnet18-voc-320x320 (default)
                           * fcn-resnet18-voc-512x320
                           * fcn-resnet18-sun-512x400
                           * fcn-resnet18-sun-640x512
  --model MODEL        path to custom model to load (caffemodel, uff, or onnx)
  --prototxt PROTOTXT  path to custom prototxt to load (for .caffemodel only)
  --labels LABELS      path to text file containing the labels for each class
  --colors COLORS      path to text file containing the colors for each class
  --input_blob INPUT   name of the input layer (default: 'data')
  --output_blob OUTPUT name of the output layer (default: 'score_fr_21classes')
  --batch_size BATCH   maximum batch size (default is 1)
  --alpha ALPHA        overlay alpha blending value, range 0-255 (default: 120)
  --profile            enable layer profiling in TensorRT


Methods

GetClassDesc(...)
Return the class description for the given object class.

Parameters:
  (int) -- index of the class, between [0, GetNumClasses()]

Returns:
  (string) -- the text description of the object class
GetNetworkName(...)
Return the name of the built-in network used by the model.

Parameters:  (none)

Returns:
  (string) -- name of the network (e.g. 'FCN_ResNet18', 'FCN_Alexnet')
              or 'custom' if using a custom-loaded model
GetNumClasses(...)
Return the number of object classes that this network model is able to classify.

Parameters:  (none)

Returns:
  (int) -- number of object classes that the model supports
Mask(...)
Produce a colorized RGBA segmentation mask of the output.

Parameters:
  image  (capsule) -- output CUDA memory capsule
  width  (int) -- width of the image (in pixels)
  height (int) -- height of the image (in pixels)
  filter_mode (string) -- optional string indicating the filter mode, 'point' or 'linear' (default: 'linear')
Returns:  (none)
Overlay(...)
Produce the segmentation overlay alpha blended on top of the original image.

Parameters:
  image  (capsule) -- output CUDA memory capsule
  width  (int) -- width of the image (in pixels)
  height (int) -- height of the image (in pixels)
  filter_mode (string) -- optional string indicating the filter mode, 'point' or 'linear' (default: 'linear')
Returns:  (none)
Process(...)
Perform the initial inferencing processing of the segmentation.
The results can then be visualized using the Overlay() and Mask() functions.

Parameters:
  image  (capsule) -- CUDA memory capsule
  width  (int) -- width of the image (in pixels)
  height (int) -- height of the image (in pixels)
  ignore_class (string) -- optional label name of class to ignore in the classification (default: 'void')
Returns:  (none)
SetOverlayAlpha(...)
Set the alpha blending value used during overlay visualization for all classes

Parameters:
  alpha (float) -- desired alpha value, between 0.0 and 255.0
  explicit_exempt (optional, bool) -- if True, the global alpha doesn't apply to classes that have an alpha value explicitly set in the colors file (default: True)
Returns:  (none)
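As a rough, hedged sketch of how segNet's Process() and Overlay() fit together (the output buffer here is made by round-tripping a numpy array through jetson.utils.cudaFromNumpy instead of the cudaAllocMapped call used by the official segnet-console.py sample, and the file names are assumptions):

import numpy as np
import jetson.inference
import jetson.utils

img, width, height = jetson.utils.loadImageRGBA("scene.jpg")
seg_net = jetson.inference.segNet("fcn-resnet18-voc-320x320")

# allocate an RGBA float buffer to receive the colorized overlay
overlay = jetson.utils.cudaFromNumpy(np.zeros((height, width, 4), dtype=np.float32))

seg_net.Process(img, width, height)       # run segmentation on the input image
seg_net.Overlay(overlay, width, height)   # blend the class colors into the overlay buffer
jetson.utils.cudaDeviceSynchronize()
jetson.utils.saveImageRGBA("/tmp/seg_overlay.jpg", overlay, width, height)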

Testing detectNet

Now let's run object detection on a sample image with the detectnet-console example below and print the bounding boxes of the detected objects.



#!/usr/bin/python
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import jetson.inference
import jetson.utils

import argparse
import sys
import time
# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in an image using an object detection DNN.", 
         formatter_class=argparse.RawTextHelpFormatter, epilog=jetson.inference.detectNet.Usage())

parser.add_argument("file_in", type=str, help="filename of the input image to process")
parser.add_argument("file_out", type=str, default=None, nargs='?', help="filename of the output image to save")
parser.add_argument("--network", type=str, default="ssd-mobilenet-v2", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are:  'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use")

try:
 opt = parser.parse_known_args()[0]
except:
 print("")
 parser.print_help()
 sys.exit(0)

# load an image (into shared CPU/GPU memory)
print('input image:%s  output image:%s'%(opt.file_in, opt.file_out))
img, width, height = jetson.utils.loadImageRGBA(opt.file_in)

# load the object detection network
net = jetson.inference.detectNet(opt.network, sys.argv, opt.threshold)
t = time.time()
# detect objects in the image (with overlay)
detections = net.Detect(img, width, height, opt.overlay)
elapsed = time.time() - t
# print the detections
print("detected {:d} objects in image".format(len(detections)))
print("FPS:%f"%(1.0 / elapsed))

for detection in detections:
 print(detection)

# print out timing info
net.PrintProfilerTimes()

# save the output image with the bounding box overlays
if opt.file_out is not None:
 jetson.utils.saveImageRGBA(opt.file_out, img, width, height)

I only modified the code to add an FPS measurement. Let's run the code.


jetson-inference/python/examples# python3 detectnet-console.py /usr/local/src/jetson-inference/data/images/bottle_0.jpg /tmp/detectnet.jpg
jetson.inference.__init__.py
jetson.inference -- initializing Python 3.6 bindings...
jetson.inference -- registering module types...
jetson.inference -- done registering module types
jetson.inference -- done Python 3.6 binding initialization
..........
detectNet -- maximum bounding boxes:  100
detectNet -- loaded 91 class info entries
detectNet -- number of object classes:  91
detected 2 objects in image
FPS:9.936001
<detectNet.Detection object>
   -- ClassID: 17
   -- Confidence: 0.986311
   -- Left:    94.026
   -- Top:     36.5977
   -- Right:   480.097
   -- Bottom:  470.884
   -- Width:   386.071
   -- Height:  434.286
   -- Area:    167665
   -- Center:  (287.062, 253.741)
<detectNet.Detection object>
   -- ClassID: 44
   -- Confidence: 0.993281
   -- Left:    20.8567
   -- Top:     231.989
   -- Right:   147.972
   -- Bottom:  583.631
   -- Width:   127.116
   -- Height:  351.642
   -- Area:    44699.2
   -- Center:  (84.4145, 407.81)
   ..........

And this is the result image.


Two objects (a cat and a bottle) were detected successfully, and the FPS value is 9.93.

Tip: Running the same Python program repeatedly will increase the FPS value to about 13~15.



Testing detectNet with Webcam

Now let's test detectNet with a webcam and check the FPS.
As in my previous post, I'll use OpenCV here too.
And I'm going to change the default camera to "/dev/video0" because I mainly use a USB webcam on the Jetson Nano.



#!/usr/bin/python
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import jetson.inference
import jetson.utils

import argparse
import sys, time
import numpy as np
import cv2
# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in a live camera stream using an object detection DNN.", 
         formatter_class=argparse.RawTextHelpFormatter, epilog=jetson.inference.detectNet.Usage())

parser.add_argument("--network", type=str, default="ssd-mobilenet-v2", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are:  'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use") 
parser.add_argument("--camera", type=str, default="/dev/video0", help="index of the MIPI CSI camera to use (e.g. CSI camera 0)\nor for VL42 cameras, the /dev/video device to use.\nby default, MIPI CSI camera 0 will be used.")
parser.add_argument("--width", type=int, default=640, help="desired width of camera stream (default is 640 pixels)")
parser.add_argument("--height", type=int, default=480, help="desired height of camera stream (default is 480 pixels)")

try:
 opt = parser.parse_known_args()[0]
except:
 print("")
 parser.print_help()
 sys.exit(0)

# load the object detection network
net = jetson.inference.detectNet(opt.network, sys.argv, opt.threshold)

font = jetson.utils.cudaFont()
# create the camera and display
camera = jetson.utils.gstCamera(opt.width, opt.height, opt.camera)
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out_video = cv2.VideoWriter('/tmp/detect.mp4', fourcc, 25, (640, 480))

count = 0
img, width, height = camera.CaptureRGBA(zeroCopy=1)
print("========== Capture Width:%d Height:%d ==========="%(width, height))
# process frames until user exits
t = time.time()
while count < 500:
    # capture the image
    img, width, height = camera.CaptureRGBA(zeroCopy=1)

    # detect objects in the image (with overlay)
    detections = net.Detect(img, width, height, opt.overlay)

    # print the detections
    print("detected {:d} objects in image".format(len(detections)))
    fps = 1.0 / ( time.time() - t)
    for detection in detections:
        print(detection)
    font.OverlayText(img, width, height, "FPS:%5.2f"%(fps), 5, 30, font.White, font.Gray40)
    t = time.time()
    #for numpy conversion, wait for synchronizing
    jetson.utils.cudaDeviceSynchronize ()
    arr = jetson.utils.cudaToNumpy(img, width, height, 4)      #CUDA img is float type
    arr1 = cv2.cvtColor (arr.astype(np.uint8), cv2.COLOR_RGBA2BGR)
    if(count % 100 == 0):
        cv2.imwrite("/tmp/detect-" + str(count)+ ".jpg", arr1)
    out_video.write(arr1)
    # cv2.imshow('imageNet', arr1)
    # cv2.waitKey(1)
    print("==== FPS:%f ====="%(fps))

    # print out performance info
    # net.PrintProfilerTimes()
    count += 1

# release the video writer so the output .mp4 file is finalized properly
out_video.release()

These images are extracted from the video output tested with the ssd-mobilenet-v2 model.
Oh, my ballpoint pen was mistaken for a baseball bat!

Wrapping up


There are many other models such as SSD-Inception-v2, DetectNet-COCO-Dog, multiped-500, facenet-120, and so on. Please test them yourself. As I said in my previous post, you can get very good FPS values with the jetson.inference objects. Once again, I strongly recommend using these models.