Jetson 多模型并行推理:GPU/DLA/CPU 协同

Jetson 多模型并行推理:GPU/DLA/CPU 协同

1. 多模型并行架构

```
Jetson Orin NX 计算资源
├── GPU:1024 CUDA + 32 Tensor Core
├── DLA 0:专用 AI 加速器(独立于 GPU)
├── DLA 1:第二路 AI 加速器
└── CPU:8 核 ARM(后处理、调度)

并行策略
├── GPU + DLA0 + DLA1:3 路模型并行
├── 多 CUDA Stream:GPU 内多模型流水线
└── 异步推理:采集/推理/后处理重叠
```

2. GPU + DLA 双路并行

```python
#!/usr/bin/env python3
"""gpu_dla_parallel.py"""
import tensorrt as trt
import threading
import time


class ParallelInference:
    def __init__(self, model_gpu_path, model_dla_path):
        self.gpu_engine = self._load_engine(model_gpu_path)
        self.dla_engine = self._load_engine(model_dla_path)

    def _load_engine(self, path):
        logger = trt.Logger(trt.Logger.WARNING)
        runtime = trt.Runtime(logger)
        with open(path, "rb") as f:
            return runtime.deserialize_cuda_engine(f.read())

    def gpu_infer(self, input_data):
        context = self.gpu_engine.create_execution_context()
        return self._run(context, input_data)

    def dla_infer(self, input_data):
        context = self.dla_engine.create_execution_context()
        return self._run(context, input_data)

    def _run(self, context, input_data):
        # 推理执行
        pass

    def parallel_infer(self, frame_a, frame_b):
        result = [None, None]
        t1 = threading.Thread(target=lambda: result.__setitem__(0, self.gpu_infer(frame_a)))
        t2 = threading.Thread(target=lambda: result.__setitem__(1, self.dla_infer(frame_b)))
        start = time.time()
        t1.start(); t2.start()
        t1.join(); t2.join()
        print(f"并行推理:{(time.time()-start)*1000:.1f}ms")
        return result
```

3. 构建 DLA 引擎

```python
def build_dla_engine(onnx_path, engine_path, dla_core=0):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        parser.parse(f.read())
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = dla_core
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
    config.set_flag(trt.BuilderFlag.FP16)
    engine = builder.build_serialized_network(network, config)
    with open(engine_path, "wb") as f:
        f.write(engine)
    print(f"DLA{dla_core}引擎已保存:{engine_path}")
```

4. 
异步推理管线

```python
#!/usr/bin/env python3
"""async_pipeline.py"""
import threading, queue, time, cv2


class AsyncPipeline:
    def __init__(self, model, max_queue_size=3):
        self.model = model
        self.input_queue = queue.Queue(maxsize=max_queue_size)
        self.output_queue = queue.Queue(maxsize=max_queue_size)
        self.running = False

    def start(self, camera_id=0):
        self.running = True
        self._threads = [
            threading.Thread(target=self._capture_loop, args=(camera_id,), daemon=True),
            threading.Thread(target=self._infer_loop, daemon=True),
            threading.Thread(target=self._display_loop, daemon=True),
        ]
        for t in self._threads:
            t.start()

    def _capture_loop(self, camera_id):
        pipeline = (
            f"nvarguscamerasrc sensor-id={camera_id} ! "
            f"video/x-raw(memory:NVMM), width=1280, height=720, "
            f"format=NV12, framerate=30/1 ! "
            f"nvvidconv ! video/x-raw, format=BGRx ! videoconvert ! "
            f"video/x-raw, format=BGR ! appsink"
        )
        cap = cv2.VideoCapture(pipeline, cv2.CAP_GSTREAMER)
        while self.running:
            ret, frame = cap.read()
            if ret:
                if self.input_queue.full():
                    self.input_queue.get()
                self.input_queue.put(frame)
        cap.release()

    def _infer_loop(self):
        while self.running:
            try:
                frame = self.input_queue.get(timeout=0.1)
                result = self.model.detect(frame)
                if self.output_queue.full():
                    self.output_queue.get()
                self.output_queue.put((frame, result))
            except queue.Empty:
                continue

    def _display_loop(self):
        fps_count, fps_start = 0, time.time()
        while self.running:
            try:
                frame, dets = self.output_queue.get(timeout=0.1)
                canvas = self.model.draw_detections(frame, dets)
                fps_count += 1
                if time.time() - fps_start >= 1.0:
                    cv2.putText(canvas, f"FPS:{fps_count}", (10, 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    fps_count = 0
                    fps_start = time.time()
                cv2.imshow("Pipeline", canvas)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    self.running = False
                    break
            except queue.Empty:
                continue
        cv2.destroyAllWindows()
```

5. 
性能基准

多模型并行性能(Orin NX 16GB)

```
┌──────────────────────┬──────────┬──────────┬──────────┐
│ 配置                 │ 模型 A   │ 模型 B   │ 总吞吐   │
├──────────────────────┼──────────┼──────────┼──────────┤
│ GPU only             │ 50 FPS   │ -        │ 50 FPS   │
│ GPU + DLA0           │ 50 FPS   │ 40 FPS   │ 90 FPS   │
│ GPU + DLA0 + DLA1    │ 50 FPS   │ 40 FPS   │ 130 FPS  │
│ 3x CUDA Stream       │ 25 FPS   │ 25 FPS   │ 75 FPS   │
└──────────────────────┴──────────┴──────────┴──────────┘
```

注:后两行的总吞吐大于所列两列之和,推测还包含未单独列出的第三路模型(50 + 40 + 40 = 130,25 × 3 = 75)。

总结

核心要点:

- DLA 独立:DLA 与 GPU 完全独立,可并行运行不同模型
- GPU_FALLBACK:DLA 不支持的层自动回退到 GPU
- 异步管线:采集/推理/后处理三阶段重叠,最大化吞吐
- 队列管理:满队列丢弃旧帧,保证实时性