diff --git a/README.md b/README.md index 5ed60a1e..8a5bdf6e 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ NVIDIA DeepStream SDK 8.0 / 7.1 / 7.0 / 6.4 / 6.3 / 6.2 / 6.1.1 / 6.1 / 6.0.1 / * [YOLOv9 usage](docs/YOLOv9.md) * [YOLOv10 usage](docs/YOLOv10.md) * [YOLO11 usage](docs/YOLO11.md) +* [YOLO11-OBB usage](docs/YOLO11-OBB.md) * [YOLOv12 usage](docs/YOLOv12.md) * [YOLOv13 usage](docs/YOLOv13.md) * [YOLOR usage](docs/YOLOR.md) @@ -244,6 +245,7 @@ NVIDIA DeepStream SDK 8.0 / 7.1 / 7.0 / 6.4 / 6.3 / 6.2 / 6.1.1 / 6.1 / 6.0.1 / * [YOLOv9](https://github.com/WongKinYiu/yolov9) * [YOLOv10](https://github.com/THU-MIG/yolov10) * [YOLO11](https://github.com/ultralytics/ultralytics) +* [YOLO11-OBB](https://github.com/ultralytics/ultralytics) * [YOLOv12](https://github.com/sunsmarterjie/yolov12) * [YOLOv13](https://github.com/iMoonLab/yolov13) * [YOLOR](https://github.com/WongKinYiu/yolor) diff --git a/config_infer_primary_yolo11_obb.txt b/config_infer_primary_yolo11_obb.txt new file mode 100644 index 00000000..5c7686f7 --- /dev/null +++ b/config_infer_primary_yolo11_obb.txt @@ -0,0 +1,32 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +onnx-file=yolo11n-obb.onnx +model-engine-file=model_b1_gpu0_fp32.engine +#int8-calib-file=calib.table +labelfile-path=labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=15 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +symmetric-padding=1 +#workspace-size=2000 +parse-bbox-func-name=NvDsInferParseYoloOBB +#parse-bbox-func-name=NvDsInferParseYoloOBBCuda +custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet +# Uncomment the line below to expose raw OBB output tensors downstream via NvDsInferTensorMeta. +# The raw tensor contains [x, y, w, h, class_probs..., angle] for each detection. +# Access it in a GStreamer pad probe using NVDSINFER_TENSOR_OUTPUT_META. 
See docs/YOLO11-OBB.md for details. +#output-tensor-meta=1 + +[class-attrs-all] +nms-iou-threshold=0.45 +pre-cluster-threshold=0.25 +topk=300 diff --git a/docs/YOLO11-OBB.md b/docs/YOLO11-OBB.md new file mode 100644 index 00000000..5720f1cf --- /dev/null +++ b/docs/YOLO11-OBB.md @@ -0,0 +1,306 @@ +# YOLO11-OBB usage + +**NOTE**: YOLO11-OBB (Oriented Bounding Box) models are used for detecting rotated objects. The OBB parser converts oriented boxes to axis-aligned bounding boxes (AABB) for DeepStream visualization. + +* [Convert model](#convert-model) +* [Compile the lib](#compile-the-lib) +* [Edit the config_infer_primary_yolo11_obb file](#edit-the-config_infer_primary_yolo11_obb-file) +* [Edit the deepstream_app_config file](#edit-the-deepstream_app_config-file) +* [Testing the model](#testing-the-model) + +## + +### Convert model + +#### 1. Download the YOLO11 repo and install the requirements + +``` +git clone https://github.com/ultralytics/ultralytics.git +cd ultralytics +pip3 install -e . +pip3 install onnx onnxslim onnxruntime +``` + +**NOTE**: It is recommended to use Python virtualenv. + +#### 2. Copy conversor + +Copy the `export_yolo11_obb.py` file from `DeepStream-Yolo/utils` directory to the `ultralytics` folder. + +#### 3. Download the model + +Download the `pt` file from [YOLO11-OBB](https://github.com/ultralytics/assets/releases/) releases (example for YOLO11n-OBB) + +``` +wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-obb.pt +``` + +**NOTE**: You can use your custom OBB model trained on datasets like DOTAv1, DOTAv1.5, or DOTAv2. + +#### 4. 
Convert model + +Generate the ONNX model file (example for YOLO11n-OBB) + +``` +python3 export_yolo11_obb.py -w yolo11n-obb.pt --dynamic +``` + +**NOTE**: To change the inference size (default: 640) + +``` +-s SIZE +--size SIZE +-s HEIGHT WIDTH +--size HEIGHT WIDTH +``` + +Example for 1024 + +``` +-s 1024 +``` + +or + +``` +-s 1024 1024 +``` + +**NOTE**: To simplify the ONNX model (DeepStream >= 6.0) + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size (DeepStream >= 6.1) + +``` +--dynamic +``` + +**NOTE**: To use static batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + +**NOTE**: If you are using the DeepStream 5.1, remove the `--dynamic` arg and use opset 12 or lower. The default opset is 17. + +``` +--opset 12 +``` + +#### 5. Copy generated files + +Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo` folder. + +## + +### Compile the lib + +1. Open the `DeepStream-Yolo` folder and compile the lib + +2. Set the `CUDA_VER` according to your DeepStream version + +``` +export CUDA_VER=XY.Z +``` + +* x86 platform + + ``` + DeepStream 8.0 = 12.8 + DeepStream 7.1 = 12.6 + DeepStream 7.0 / 6.4 = 12.2 + DeepStream 6.3 = 12.1 + DeepStream 6.2 = 11.8 + DeepStream 6.1.1 = 11.7 + DeepStream 6.1 = 11.6 + DeepStream 6.0.1 / 6.0 = 11.4 + DeepStream 5.1 = 11.1 + ``` + +* Jetson platform + + ``` + DeepStream 8.0 = 13.0 + DeepStream 7.1 = 12.6 + DeepStream 7.0 / 6.4 = 12.2 + DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 + DeepStream 6.0.1 / 6.0 / 5.1 = 10.2 + ``` + +3. Make the lib + +``` +make -C nvdsinfer_custom_impl_Yolo clean && make -C nvdsinfer_custom_impl_Yolo +``` + +## + +### Edit the config_infer_primary_yolo11_obb file + +Edit the `config_infer_primary_yolo11_obb.txt` file according to your model (example for YOLO11n-OBB with 15 classes) + +``` +[property] +... +onnx-file=yolo11n-obb.onnx +... +num-detected-classes=15 +... +parse-bbox-func-name=NvDsInferParseYoloOBB +... 
+``` + +**NOTE**: For GPU-accelerated parsing (recommended for better performance), use: + +``` +[property] +... +parse-bbox-func-name=NvDsInferParseYoloOBBCuda +... +``` + +**NOTE**: **YOLO11-OBB** models expect the input to be resized with center (symmetric) padding. For better accuracy, use: + +``` +[property] +... +maintain-aspect-ratio=1 +symmetric-padding=1 +... +``` + +**NOTE**: OBB models output oriented bounding boxes with rotation angles. The parser converts these to axis-aligned bounding boxes (AABB) that fully enclose the rotated objects for visualization in DeepStream. The original angle information is lost in this conversion. + +## + +### Edit the deepstream_app_config file + +``` +... +[primary-gie] +... +config-file=config_infer_primary_yolo11_obb.txt +``` + +## + +### Testing the model + +``` +deepstream-app -c deepstream_app_config.txt +``` + +**NOTE**: The TensorRT engine file may take a very long time to generate (sometimes more than 10 minutes). + +**NOTE**: For more information about custom models configuration (`batch-size`, `network-mode`, etc), please check the [`docs/customModels.md`](customModels.md) file. + +## + +### Understanding OBB Output Format + +YOLO11-OBB models output the following values per detection, in this order: + +- **x_center, y_center**: Center coordinates of the oriented box +- **width, height**: Dimensions of the oriented box +- **class_probabilities**: Probability for each class (DOTAv1 has 15 classes) +- **angle**: Rotation angle in radians, exactly as emitted by the OBB head (-π/4 to 3π/4 in current Ultralytics releases; it is not normalized to 0 to π/2 in the exported tensor) + +The DeepStream parser (`NvDsInferParseYoloOBB` or `NvDsInferParseYoloOBBCuda`) converts each oriented box to an axis-aligned bounding box using the formula: + +``` +half_aabb_w = (width * |cos(angle)| + height * |sin(angle)|) / 2 +half_aabb_h = (width * |sin(angle)| + height * |cos(angle)|) / 2 +``` + +Because the formula uses the absolute values of the sine and cosine, the axis-aligned box fully encloses the rotated object for any angle. 
+ +## + +### Common OBB Datasets + +YOLO11-OBB models are typically trained on: + +- **DOTAv1**: 15 classes (plane, ship, storage-tank, baseball-diamond, tennis-court, basketball-court, ground-track-field, harbor, bridge, large-vehicle, small-vehicle, helicopter, roundabout, soccer-ball-field, swimming-pool) +- **DOTAv1.5**: 16 classes (adds container-crane) +- **DOTAv2**: 18 classes (adds airport and helipad) + +Make sure `num-detected-classes` matches your model's training dataset. + +## + +### OBB Geometry and DeepStream Metadata + +#### Why Axis-Aligned Bounding Boxes? + +The DeepStream inference API defines a fixed structure for parsed detections: + +```cpp +struct NvDsInferParseObjectInfo { + float left, top, width, height; // Axis-aligned box only + float detectionConfidence; + unsigned int classId; +}; +``` + +The custom bbox parser callback (`NvDsInferParseYoloOBB`) **must return** `std::vector`. There is no mechanism in this interface to attach additional fields like angle or corner points. This is a DeepStream API constraint, not a limitation of this implementation. + +The AABB returned by the parser is computed using the tightest-fit formula: +``` +half_aabb_w = (obb_width × |cos(angle)| + obb_height × |sin(angle)|) / 2 +half_aabb_h = (obb_width × |sin(angle)| + obb_height × |cos(angle)|) / 2 +``` + +This ensures the axis-aligned box **fully encloses** the rotated object, which is required for DeepStream's NMS, OSD rendering, and object tracking components to function correctly. + +#### Accessing Full OBB Geometry (Including Angle) + +If your application needs the original rotation angle or corner points, you can access them using DeepStream's **raw tensor metadata** feature: + +**Step 1:** Enable `output-tensor-meta` in your config file: +```ini +[property] +... +output-tensor-meta=1 +... +``` + +**Step 2:** Write a GStreamer pad probe to read `NvDsInferTensorMeta` from the buffer. 
The raw output tensor contains: +``` +[x_center, y_center, width, height, class_prob_0, class_prob_1, ..., angle] +``` + +**Example probe structure (C++):** +```cpp +static GstPadProbeReturn +osd_sink_pad_buffer_probe(GstPad *pad, GstPadProbeInfo *info, gpointer u_data) +{ + GstBuffer *buf = (GstBuffer *) info->data; + NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta(buf); + + for (NvDsMetaList *l_frame = batch_meta->frame_meta_list; l_frame; l_frame = l_frame->next) { + NvDsFrameMeta *frame_meta = (NvDsFrameMeta *)(l_frame->data); + + // Access tensor metadata + for (NvDsMetaList *l_user = frame_meta->frame_user_meta_list; l_user; l_user = l_user->next) { + NvDsUserMeta *user_meta = (NvDsUserMeta *)(l_user->data); + if (user_meta->base_meta.meta_type == NVDSINFER_TENSOR_OUTPUT_META) { + NvDsInferTensorMeta *tensor_meta = (NvDsInferTensorMeta *)user_meta->user_meta_data; + // Read raw OBB tensor here - contains angle information + // Tensor format: [num_detections, 4+num_classes+1] + } + } + } + return GST_PAD_PROBE_OK; +} +``` + +**Step 3:** Parse the raw tensor to extract angle and compute corner points if needed. + +For Python examples, see the [DeepStream Python Apps](https://github.com/NVIDIA-AI-IOT/deepstream_python_apps) repository. 
+ +**Summary:** +- **Standard pipeline**: OBB → AABB (works with all DeepStream components) +- **Advanced users**: OBB → AABB + raw tensor metadata (enables custom angle-aware post-processing) diff --git a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp index b3f15557..d1cdcaf4 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp @@ -26,11 +26,16 @@ #include "nvdsinfer_custom_impl.h" #include "utils.h" +#include extern "C" bool NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList); +extern "C" bool +NvDsInferParseYoloOBB(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList); + static NvDsInferParseObjectInfo convertBBox(const float& bx1, const float& by1, const float& bx2, const float& by2, const uint& netW, const uint& netH) { @@ -94,6 +99,50 @@ decodeTensorYolo(const float* output, const uint& outputSize, const uint& netW, return binfo; } +static std::vector +decodeTensorYoloOBB(const float* output, const uint& outputSize, const uint& netW, const uint& netH, + const std::vector& preclusterThreshold, const uint& numClasses) +{ + std::vector binfo; + + for (uint b = 0; b < outputSize; ++b) { + float x_center = output[b * (4 + numClasses + 1) + 0]; + float y_center = output[b * (4 + numClasses + 1) + 1]; + float width = output[b * (4 + numClasses + 1) + 2]; + float height = output[b * (4 + numClasses + 1) + 3]; + + float maxProb = 0.0f; + int maxIndex = 0; + for (uint c = 0; c < numClasses; ++c) { + float prob = output[b * (4 + numClasses + 1) + 4 + c]; + if (prob > maxProb) { + maxProb = prob; + maxIndex = c; + } + } + + float angle = output[b * (4 + numClasses + 1) + 4 + numClasses]; + + if (maxProb < 
preclusterThreshold[maxIndex]) { + continue; + } + + float cos_a = fabsf(cosf(angle)); + float sin_a = fabsf(sinf(angle)); + float half_aabb_w = (width * cos_a + height * sin_a) / 2.0f; + float half_aabb_h = (width * sin_a + height * cos_a) / 2.0f; + + float bx1 = x_center - half_aabb_w; + float by1 = y_center - half_aabb_h; + float bx2 = x_center + half_aabb_w; + float by2 = y_center + half_aabb_h; + + addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo); + } + + return binfo; +} + static bool NvDsInferParseCustomYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, @@ -119,6 +168,32 @@ NvDsInferParseCustomYolo(std::vector const& outputLayersInfo return true; } +static bool +NvDsInferParseCustomYoloOBB(std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList) +{ + if (outputLayersInfo.empty()) { + std::cerr << "ERROR: Could not find output layer in bbox parsing" << std::endl; + return false; + } + + std::vector objects; + + const NvDsInferLayerInfo& output = outputLayersInfo[0]; + const uint outputSize = output.inferDims.d[0]; + const uint numClasses = detectionParams.numClassesConfigured; + + std::vector outObjs = decodeTensorYoloOBB((const float*) (output.buffer), outputSize, + networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold, numClasses); + + objects.insert(objects.end(), outObjs.begin(), outObjs.end()); + + objectList = objects; + + return true; +} + extern "C" bool NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) @@ -126,4 +201,12 @@ NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDs return NvDsInferParseCustomYolo(outputLayersInfo, networkInfo, detectionParams, objectList); } +extern "C" 
/**
 * Kernel: decode one YOLO11-OBB tensor row per thread into an axis-aligned detection.
 *
 * Each row holds [x_center, y_center, width, height, class scores..., angle]
 * (stride 4 + numClasses + 1). Thread `x_id` writes binfo[x_id]; threads past
 * `outputSize` exit immediately. Rows whose best class score is below that class'
 * threshold only get detectionConfidence = 0 so the caller can discard them later.
 * Box corners are clamped to [0, netW] x [0, netH].
 */
__global__ void decodeTensorYoloOBBCuda(NvDsInferParseObjectInfo *binfo, const float* output, const uint outputSize,
    const uint netW, const uint netH, const float* preclusterThreshold, const uint numClasses)
{
  // Unsigned index: avoids the signed/unsigned comparison against outputSize.
  const uint x_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (x_id >= outputSize) {
    return;
  }

  const uint stride = 4 + numClasses + 1;
  // Widen before multiplying so the row offset cannot wrap in 32-bit arithmetic.
  const float* row = output + static_cast<size_t>(x_id) * stride;

  const float x_center = row[0];
  const float y_center = row[1];
  const float width = row[2];
  const float height = row[3];

  // Arg-max over the class scores; ties keep the lowest class index.
  float maxProb = 0.0f;
  int maxIndex = 0;
  for (uint c = 0; c < numClasses; ++c) {
    const float prob = row[4 + c];
    if (prob > maxProb) {
      maxProb = prob;
      maxIndex = c;
    }
  }

  if (maxProb < preclusterThreshold[maxIndex]) {
    binfo[x_id].detectionConfidence = 0.0;
    return;
  }

  // The angle is stored after the class scores, as the last value of the row.
  const float angle = row[4 + numClasses];

  // Enclosing axis-aligned box of the rotated rectangle; abs() makes it valid for any angle.
  const float cos_a = fabsf(cosf(angle));
  const float sin_a = fabsf(sinf(angle));
  const float half_aabb_w = (width * cos_a + height * sin_a) / 2.0f;
  const float half_aabb_h = (width * sin_a + height * cos_a) / 2.0f;

  float bx1 = x_center - half_aabb_w;
  float by1 = y_center - half_aabb_h;
  float bx2 = x_center + half_aabb_w;
  float by2 = y_center + half_aabb_h;

  bx1 = fminf(float(netW), fmaxf(float(0.0), bx1));
  by1 = fminf(float(netH), fmaxf(float(0.0), by1));
  bx2 = fminf(float(netW), fmaxf(float(0.0), bx2));
  by2 = fminf(float(netH), fmaxf(float(0.0), by2));

  binfo[x_id].left = bx1;
  binfo[x_id].top = by1;
  binfo[x_id].width = fminf(float(netW), fmaxf(float(0.0), bx2 - bx1));
  binfo[x_id].height = fminf(float(netH), fmaxf(float(0.0), by2 - by1));
  binfo[x_id].detectionConfidence = maxProb;
  binfo[x_id].classId = maxIndex;
}

/**
 * GPU bbox parser for YOLO11-OBB models.
 *
 * Uses the first output layer only, expected as [num_detections, 4 + numClassesConfigured + 1].
 * The layout is verified before launching the kernel so that a wrong `num-detected-classes`
 * is reported instead of mis-striding the buffer. objectList receives one entry per tensor
 * row; rejected rows carry detectionConfidence == 0.
 * NOTE(review): output.buffer is passed straight to the kernel, so it is assumed to be
 * device-accessible memory - confirm against the nvinfer configuration in use.
 *
 * @return true on success; false on missing/unexpected output layer or a failed kernel launch.
 */
static bool NvDsInferParseCustomYoloOBBCuda(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
  if (outputLayersInfo.empty()) {
    std::cerr << "ERROR: Could not find output layer in bbox parsing" << std::endl;
    return false;
  }

  const NvDsInferLayerInfo& output = outputLayersInfo[0];
  const uint numClasses = detectionParams.numClassesConfigured;
  const uint stride = 4 + numClasses + 1;

  if (output.buffer == nullptr) {
    std::cerr << "ERROR: Output layer buffer is null in OBB bbox parsing" << std::endl;
    return false;
  }

  if (output.inferDims.numDims < 2 || output.inferDims.d[1] != stride) {
    std::cerr << "ERROR: Unexpected OBB output layout; expected [num_detections, " << stride
        << "] for num-detected-classes=" << numClasses << std::endl;
    return false;
  }

  if (detectionParams.perClassPreclusterThreshold.size() < numClasses) {
    std::cerr << "ERROR: Missing per-class pre-cluster thresholds in OBB bbox parsing" << std::endl;
    return false;
  }

  const uint outputSize = output.inferDims.d[0];

  // Nothing to decode; also avoids launching a kernel with an empty grid.
  if (outputSize == 0) {
    objectList.clear();
    return true;
  }

  thrust::device_vector<float> perClassPreclusterThreshold = detectionParams.perClassPreclusterThreshold;

  thrust::device_vector<NvDsInferParseObjectInfo> objects(outputSize);

  const uint threads_per_block = 1024;
  // Ceiling division: exactly enough blocks to cover every row.
  const uint number_of_blocks = (outputSize + threads_per_block - 1) / threads_per_block;

  decodeTensorYoloOBBCuda<<<number_of_blocks, threads_per_block>>>(
      thrust::raw_pointer_cast(objects.data()), (const float*) (output.buffer), outputSize, networkInfo.width,
      networkInfo.height, thrust::raw_pointer_cast(perClassPreclusterThreshold.data()), numClasses);

  // A failed launch would otherwise go unnoticed and yield an all-zero result.
  const cudaError_t launchStatus = cudaGetLastError();
  if (launchStatus != cudaSuccess) {
    std::cerr << "ERROR: OBB decode kernel launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
    return false;
  }

  objectList.resize(outputSize);
  thrust::copy(objects.begin(), objects.end(), objectList.begin());

  return true;
}
class DeepStreamOBBOutput(nn.Module):
    """Re-layout the YOLO11-OBB head output into the row form read by the DeepStream parsers.

    The wrapped model is expected to emit a channels-first tensor of shape
    ``[batch, 4 + num_classes + 1, num_predictions]`` whose channels are
    ``x_center, y_center, width, height``, then one score per class, then the
    rotation angle in radians. This module swaps the last two axes so that each
    prediction becomes one contiguous row
    ``[x_center, y_center, width, height, cls_0, ..., cls_{n-1}, angle]``.

    Args:
        num_classes: Number of class scores per prediction; used to verify the
            channel count of the incoming tensor.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

    def forward(self, x):
        """Return ``x`` as ``[batch, num_predictions, 4 + num_classes + 1]``.

        Raises:
            ValueError: If ``x`` is not 3-D or its channel count is not
                ``4 + num_classes + 1``. The parsers stride rows by exactly that
                amount, so any other layout would be decoded incorrectly.
        """
        expected_channels = 4 + self.num_classes + 1
        if x.dim() != 3 or x.shape[1] != expected_channels:
            raise ValueError(
                f"Unexpected OBB head output shape {tuple(x.shape)}; "
                f"expected [batch, {expected_channels}, num_predictions]"
            )
        return x.transpose(1, 2)
def parse_args():
    """Parse and validate the command-line options of the YOLO11-OBB exporter.

    Options are read from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with ``weights``, ``size`` (a list of one or
        two ints), ``opset``, ``simplify``, ``dynamic`` and ``batch``.

    Raises:
        RuntimeError: If the weights file does not exist, ``--size`` has more
            than two values or a non-positive value, ``--batch`` is below 1, or
            ``--dynamic`` is combined with a static batch-size above 1.
    """
    import argparse

    parser = argparse.ArgumentParser(description="DeepStream YOLO11-OBB conversion")
    parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)")
    parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])")
    parser.add_argument("--opset", type=int, default=17, help="ONNX opset version")
    parser.add_argument("--simplify", action="store_true", help="ONNX simplify model")
    parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size")
    parser.add_argument("--batch", type=int, default=1, help="Static batch-size")
    args = parser.parse_args()

    if not os.path.isfile(args.weights):
        raise RuntimeError("Invalid weights file")
    # The size is expanded to (H, W) by the caller; a third value would add an
    # extra dimension to the dummy export input.
    if len(args.size) > 2:
        raise RuntimeError("Inference size must be one value (square) or two values (height width)")
    if any(dim <= 0 for dim in args.size):
        raise RuntimeError("Inference size values must be positive")
    if args.batch < 1:
        raise RuntimeError("Static batch-size must be at least 1")
    if args.dynamic and args.batch > 1:
        raise RuntimeError("Cannot set dynamic batch-size and static batch-size at same time")
    return args
args = parse_args() + main(args)