I used a Jetson TX2 running the official Ubuntu 18.04 image with the root account, and I always use Python 3. The source code introduced in this article can be downloaded here.
First read the page "JetsonNano - Human Pose estimation using TensorRT". Since that page gives the important explanation, including the JetPack 4.3 setup, this article only tests performance with the TX2's built-in camera.
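Assuming the environment described on that page is already prepared (PyTorch, torch2trt, and trt_pose installed), a quick import check such as the minimal sketch below confirms that the packages and CUDA are visible before moving on to the camera test. This check is not part of the original script.

# Minimal environment sanity check (a sketch, not part of the original script)
import torch
import torch2trt
import trt_pose.models

print('CUDA available :', torch.cuda.is_available())   # should print True on the TX2
print('PyTorch version:', torch.__version__)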
Get Keypoints from video
In order to use the TX2's built-in camera instead of the webcam used in the Jetson Nano example, only the OpenCV camera-capture part of the code was changed (that change is isolated in the sketch after the full listing).

import json
import trt_pose.coco
import trt_pose.models
import torch
import torch2trt
from torch2trt import TRTModule
import time, sys
import cv2
import torchvision.transforms as transforms
import PIL.Image
from trt_pose.draw_objects import DrawObjects
from trt_pose.parse_objects import ParseObjects
import argparse
import os.path

'''
hnum: 0 based human index
kpoint : keypoints (float type range : 0.0 ~ 1.0) ==> later multiply by image width, height
'''
def get_keypoint(humans, hnum, peaks):
    #check invalid human index
    kpoint = []
    human = humans[0][hnum]
    C = human.shape[0]
    for j in range(C):
        k = int(human[j])
        if k >= 0:
            peak = peaks[0][j][k]   # peak[1]:width, peak[0]:height
            peak = (j, float(peak[0]), float(peak[1]))
            kpoint.append(peak)
            #print('index:%d : success [%5.3f, %5.3f]'%(j, peak[1], peak[2]) )
        else:
            peak = (j, None, None)
            kpoint.append(peak)
            #print('index:%d : None %d'%(j, k) )
    return kpoint

parser = argparse.ArgumentParser(description='TensorRT pose estimation run')
parser.add_argument('--model', type=str, default='resnet', help='resnet or densenet')
args = parser.parse_args()

with open('human_pose.json', 'r') as f:
    human_pose = json.load(f)

topology = trt_pose.coco.coco_category_to_topology(human_pose)
num_parts = len(human_pose['keypoints'])
num_links = len(human_pose['skeleton'])

if 'resnet' in args.model:
    print('------ model = resnet--------')
    MODEL_WEIGHTS = 'resnet18_baseline_att_224x224_A_epoch_249.pth'
    OPTIMIZED_MODEL = 'resnet18_baseline_att_224x224_A_epoch_249_trt.pth'
    model = trt_pose.models.resnet18_baseline_att(num_parts, 2 * num_links).cuda().eval()
    WIDTH = 224
    HEIGHT = 224
else:
    print('------ model = densenet--------')
    MODEL_WEIGHTS = 'densenet121_baseline_att_256x256_B_epoch_160.pth'
    OPTIMIZED_MODEL = 'densenet121_baseline_att_256x256_B_epoch_160_trt.pth'
    model = trt_pose.models.densenet121_baseline_att(num_parts, 2 * num_links).cuda().eval()
    WIDTH = 256
    HEIGHT = 256

data = torch.zeros((1, 3, HEIGHT, WIDTH)).cuda()
if os.path.exists(OPTIMIZED_MODEL) == False:
    model.load_state_dict(torch.load(MODEL_WEIGHTS))
    model_trt = torch2trt.torch2trt(model, [data], fp16_mode=True, max_workspace_size=1<<25)
    torch.save(model_trt.state_dict(), OPTIMIZED_MODEL)

model_trt = TRTModule()
model_trt.load_state_dict(torch.load(OPTIMIZED_MODEL))

t0 = time.time()
torch.cuda.current_stream().synchronize()
for i in range(50):
    y = model_trt(data)
torch.cuda.current_stream().synchronize()
t1 = time.time()
print(50.0 / (t1 - t0))

mean = torch.Tensor([0.485, 0.456, 0.406]).cuda()
std = torch.Tensor([0.229, 0.224, 0.225]).cuda()
device = torch.device('cuda')

def preprocess(image):
    global device
    device = torch.device('cuda')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = PIL.Image.fromarray(image)
    image = transforms.functional.to_tensor(image).to(device)
    image.sub_(mean[:, None, None]).div_(std[:, None, None])
    return image[None, ...]

def execute(img, src, t):
    color = (0, 255, 0)
    data = preprocess(img)
    cmap, paf = model_trt(data)
    cmap, paf = cmap.detach().cpu(), paf.detach().cpu()
    counts, objects, peaks = parse_objects(cmap, paf)  #, cmap_threshold=0.15, link_threshold=0.15
    fps = 1.0 / (time.time() - t)
    for i in range(counts[0]):
        keypoints = get_keypoint(objects, i, peaks)
        for j in range(len(keypoints)):
            if keypoints[j][1]:
                x = round(keypoints[j][2] * WIDTH * X_compress)
                y = round(keypoints[j][1] * HEIGHT * Y_compress)
                cv2.circle(src, (x, y), 3, color, 2)
                cv2.putText(src, "%d" % int(keypoints[j][0]), (x + 5, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 1)
    print("FPS:%f "%(fps))
    #draw_objects(img, counts, objects, peaks)
    cv2.putText(src, "FPS: %f" % (fps), (20, 20), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
    out_video.write(src)

cap_str = "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)640, height=(int)480, format=(string)NV12, framerate=(fraction)24/1 ! nvvidconv flip-method=0 ! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink"
cap = cv2.VideoCapture(cap_str)
#cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
#cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
if cap.isOpened() == False:
    print("Camera Open Error")
    sys.exit(0)

ret_val, img = cap.read()
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out_video = cv2.VideoWriter('/tmp/output.mp4', fourcc, cap.get(cv2.CAP_PROP_FPS), (640, 480))
count = 0

X_compress = 640.0 / WIDTH * 1.0
Y_compress = 480.0 / HEIGHT * 1.0

parse_objects = ParseObjects(topology)
draw_objects = DrawObjects(topology)

while cap.isOpened() and count < 500:
    t = time.time()
    ret_val, dst = cap.read()
    if ret_val == False:
        print("Camera read Error")
        break
    img = cv2.resize(dst, dsize=(WIDTH, HEIGHT), interpolation=cv2.INTER_AREA)
    execute(img, dst, t)
    count += 1

cv2.destroyAllWindows()
out_video.release()
cap.release()
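For reference, the only part that differs from the Jetson Nano example is the capture setup: the USB webcam index is replaced with a GStreamer pipeline that reads the onboard CSI camera through nvarguscamerasrc. The minimal sketch below isolates just that change, reusing the same pipeline string that the full listing stores in cap_str (GST_STR is only an illustrative name here, and the sketch assumes OpenCV was built with GStreamer support).

# Camera-open change only: TX2 onboard CSI camera instead of a USB webcam (a sketch)
import cv2

# Jetson Nano example: USB webcam opened by device index.
# cap = cv2.VideoCapture(0)

# TX2 built-in camera: GStreamer pipeline ending in appsink so OpenCV receives BGR frames.
GST_STR = ("nvarguscamerasrc ! "
           "video/x-raw(memory:NVMM), width=(int)640, height=(int)480, "
           "format=(string)NV12, framerate=(fraction)24/1 ! "
           "nvvidconv flip-method=0 ! "
           "video/x-raw, format=(string)BGRx ! "
           "videoconvert ! video/x-raw, format=(string)BGR ! appsink")

cap = cv2.VideoCapture(GST_STR)
if not cap.isOpened():
    print("Camera Open Error")
else:
    ret, frame = cap.read()   # 640x480 BGR frame, ready to be resized and passed to execute()
    cap.release()

Everything downstream of cap.read() (preprocessing, TensorRT inference, keypoint parsing, drawing, and video writing) is unchanged from the Nano version.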
Run the code.
root@spytx-desktop:/work/src/trt_pose/tasks/human_pose# python3 detect_camera.py
------ model = resnet--------
51.2064258063634
nvbuf_utils: Could not get EGL display connection
GST_ARGUS: Creating output stream
CONSUMER: Waiting until producer is connected...
GST_ARGUS: Available Sensor modes :
GST_ARGUS: 2592 x 1944 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 16.000000; Exposure Range min 34000, max 550385000;
GST_ARGUS: 2592 x 1458 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 16.000000; Exposure Range min 34000, max 550385000;
GST_ARGUS: 1280 x 720 FR = 120.000005 fps Duration = 8333333 ; Analog Gain range min 1.000000, max 16.000000; Exposure Range min 22000, max 358733000;
GST_ARGUS: Running with following settings:
   Camera index = 0
   Camera mode  = 2
   Output Stream W = 1280 H = 720
   seconds to Run    = 0
   Frame Rate = 120.000005
GST_ARGUS: PowerService: requested_clock_Hz=3225600
GST_ARGUS: Setup Complete, Starting captures for 0 seconds
GST_ARGUS: Starting repeat capture requests.
CONSUMER: Producer has connected; continuing.
[ WARN:0] global /work/src/opencv-4.1.1/modules/videoio/src/cap_gstreamer.cpp (933) open OpenCV | GStreamer warning: Cannot query video position: status=0, value=-1, duration=-1
FPS:16.402773
FPS:24.641213
FPS:26.656905
FPS:26.783037
FPS:27.759016
FPS:28.158195
FPS:27.907143
FPS:28.684690
FPS:27.349757
FPS:27.770962
FPS:28.226034
FPS:29.051658
FPS:27.624638
FPS:29.532153
FPS:29.811746
FPS:28.997228
FPS:29.378872
FPS:29.567127
FPS:29.001438
FPS:31.355297
FPS:28.274745
FPS:28.858169
FPS:29.077437
FPS:28.063993
FPS:28.832380
FPS:30.299535
Wrapping up
Very satisfactory results were obtained. FPS values of 28 ~ 30 are good enough for actual projects. Again, I recommend this model for human pose estimation; at the present time, it is the best model available for the Jetson series. I also recommend visiting https://github.com/NVIDIA-AI-IOT/trt_pose frequently, since newly trained models or updated code may be released there.