Poor YOLO performance

After having success in Optimiziation warnings meaning - #9 by ade I took a break for about two months. Now that I’ve gone back to converting YOLO11 models in exactly the same way I did before, the performance is very poor, basically broken compared to the same models converted earlier. A YOLO11m example:

$ hailortcli run detector.hef --batch-size 8
Running streaming inference (1.hef):
  Transform data: true
    Type:      auto
    Quantized: true
Network drone_detector/drone_detector: 100% | 80 | FPS: 15.97 | ETA: 00:00:00
> Inference result:
 Network group: drone_detector
    Frames count: 80
    FPS: 15.97
    Send Rate: 156.97 Mbit/s
    Recv Rate: 155.99 Mbit/s

When doing real inference it’s even worse than that, and the whole pipeline quickly grinds to a halt. I’ve made sure my PCIe link is Gen 3 and generally changed nothing, except that everything now runs on a fresh Debian installation on the RPi and I compiled with a slightly newer version of the Dataflow Compiler (3.31.0 instead of 3.30.0; I did try 3.30.0 again just in case, with the same result).
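
As a side note, the negotiated link speed can be read from sysfs; here is a minimal sketch of the kind of check I mean (the 0x1e60 Hailo PCI vendor ID is an assumption, and the exact speed string depends on the kernel version):

# Sketch: print the negotiated PCIe link speed of the Hailo device via sysfs.
# Assumes the Hailo PCI vendor ID is 0x1e60; adjust if your device reports differently.
import glob

for dev in glob.glob("/sys/bus/pci/devices/*"):
    try:
        with open(f"{dev}/vendor") as f:
            if f.read().strip() != "0x1e60":  # assumed Hailo vendor ID
                continue
        with open(f"{dev}/current_link_speed") as f:
            print(dev, "->", f.read().strip())  # a Gen 3 link reports 8.0 GT/s
    except OSError:
        pass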

I’ve tried this on multiple RPi5 devices and multiple AI HATs (same result on both the 13 and 26 TOPS versions).

For reference, here are the conversion steps I use to produce a HEF file:

# import torch

# # Load our model into our environment
# checkpoint = torch.load('best.pt', weights_only=False)
# model = checkpoint['model']
# model = model.float()
# model.eval()

# # Dummy input in FP32
# dummy_input = torch.randn(16, 3, 640, 640, dtype=torch.float)

# # Export to ONNX
# torch.onnx.export(
#     model,
#     dummy_input,
#     "best.onnx",
#     export_params=True,
#     opset_version=11,  # Adjust opset version if needed
#     do_constant_folding=True,
#     input_names=['input'],
#     output_names=['output'])

# print("ONNX model exported successfully!")

# ===============================================

# import onnx
# import onnxruntime as ort
# import torch

# # Load the ONNX model
# onnx_model = onnx.load("best.onnx")
# onnx.checker.check_model(onnx_model)
# print("ONNX model is valid!")

# # Test the ONNX model with ONNX Runtime
# dummy_input = torch.randn(16, 3, 640, 640).numpy()
# ort_session = ort.InferenceSession("best.onnx")
# outputs = ort_session.run(None, {"input": dummy_input})
# print(outputs[0])

# ===============================================

# from hailo_sdk_client import ClientRunner

# onnx_path = "best.onnx"
# onnx_model_name = "detector"
# chosen_hw_arch = "hailo8"

# # Initialize the ClientRunner
# runner = ClientRunner(hw_arch=chosen_hw_arch)

# end_node_names = [
#     "/model.23/cv3.0/cv3.0.2/Conv", 
#     "/model.23/cv2.0/cv2.0.2/Conv", 
#     "/model.23/cv3.1/cv3.1.2/Conv", 
#     "/model.23/cv2.1/cv2.1.2/Conv", 
#     "/model.23/cv2.2/cv2.2.2/Conv", 
#     "/model.23/cv3.2/cv3.2.2/Conv",
# ]

# net_input_shapes={"input": [16, 3, 640, 640]}

# try:
#     hn, npz = runner.translate_onnx_model(
#         onnx_path,
#         onnx_model_name,
#         end_node_names=end_node_names,
#         net_input_shapes=net_input_shapes,
#     )
#     print("Model translation successful.")
# except Exception as e:
#     print(f"Error during model translation: {e}")
#     raise

# hailo_model_har_name = f"{onnx_model_name}.har"

# try:
#     runner.save_har(hailo_model_har_name)
#     print(f"HAR file saved as: {hailo_model_har_name}")
# except Exception as e:
#     print(f"Error saving HAR file: {e}")

# # ===============================================

# from hailo_sdk_client import ClientRunner
# har_path = "detector.har"
# runner = ClientRunner(har=har_path)
# from pprint import pprint

# try:
#     hn_dict = runner.get_hn()  # Or use runner._hn if get_hn() is unavailable
#     print("Inspecting layers from HailoNet (OrderedDict):")

#     for key, value in hn_dict.items():
#         print(f"Key: {key}")
#         pprint(value)
#         print("\n" + "="*80 + "\n")

# except Exception as e:
#     print(f"Error while inspecting hn_dict: {e}")

# # ===============================================

# import numpy as np
# from PIL import Image
# import os

# # Paths to directories and files
# image_dir = 'calibration'

# # File paths for saving calibration data
# calibration_data_path = os.path.join("calibration_data.npy")
# processed_data_path = os.path.join("processed_calibration_data.npy")

# # Initialize an empty list for calibration data
# calib_data = []

# # Process all image files in the directory
# for img_name in os.listdir(image_dir):
#     img_path = os.path.join(image_dir, img_name)
#     if img_name.lower().endswith(('.jpg', '.jpeg', '.png')):
#         img = Image.open(img_path).convert("RGB").resize((640, 640))
#         img_array = np.array(img) / 255.0  # Normalize to [0, 1]
#         calib_data.append(img_array)

# # Convert the calibration data to a NumPy array
# calib_data = np.array(calib_data)

# # Save the normalized calibration data
# np.save(calibration_data_path, calib_data)
# print(f"Normalized calibration dataset saved with shape: {calib_data.shape} to {calibration_data_path}")

# # Scale the normalized data back to [0, 255]
# processed_calibration_data = calib_data * 255.0

# # Save the processed calibration data
# np.save(processed_data_path, processed_calibration_data)
# print(f"Processed calibration dataset saved with shape: {processed_calibration_data.shape} to {processed_data_path}")

# # ===============================================

# import json
# import os

# nms_layer_config = {
#     "nms_scores_th": 0.2,
#     "nms_iou_th": 0.7,
#     "image_dims": [
#         640,
#         640
#     ],
#     "max_proposals_per_class": 100,
#     "classes": 80,
#     "regression_length": 16,
#     "background_removal": False,
#     "background_removal_index": 0,
#     "bbox_decoders": [
#         {
#             "name": "bbox_decoder71",
#             "stride": 8,
#             "reg_layer": "conv71",
#             "cls_layer": "conv74"
#         },
#         {
#             "name": "bbox_decoder87",
#             "stride": 16,
#             "reg_layer": "conv87",
#             "cls_layer": "conv90"
#         },
#         {
#             "name": "bbox_decoder102",
#             "stride": 32,
#             "reg_layer": "conv102",
#             "cls_layer": "conv105"
#         }
#     ]
# }

# # Path to save the updated JSON configuration
# output_path = os.path.join("nms_layer_config.json")

# # Save the updated configuration as a JSON file
# with open(output_path, "w") as json_file:
#     json.dump(nms_layer_config, json_file, indent=4)

# print(f"NMS layer configuration saved to {output_path}")

# # ===============================================

# import os
# from hailo_sdk_client import ClientRunner

# # Define your model's HAR file name
# model_name = "detector"
# hailo_model_har_name = f"{model_name}.har"

# # Ensure the HAR file exists
# assert os.path.isfile(hailo_model_har_name), "Please provide a valid path for the HAR file"

# # Initialize the ClientRunner with the HAR file
# runner = ClientRunner(har=hailo_model_har_name)

# # Define the model script to add a normalization layer
# # Normalization for [0, 1] range
# alls = """
# normalization1 = normalization([0.0, 0.0, 0.0], [255.0, 255.0, 255.0])
# change_output_activation(conv74, sigmoid)
# change_output_activation(conv90, sigmoid)
# change_output_activation(conv105, sigmoid)

# model_optimization_config(calibration, batch_size=8)
# model_optimization_flavor(optimization_level=1, compression_level=0, batch_size=8)
# nms_postprocess("nms_layer_config.json", meta_arch=yolov8, engine=cpu)
# performance_param(compiler_optimization_level=0)
# """

# # Load the model script into the ClientRunner
# runner.load_model_script(alls)

# # Define a calibration dataset
# # Replace 'calib_dataset' with the actual dataset you're using for calibration
# # For example, if it's a directory of images, prepare the dataset accordingly
# calib_dataset = "processed_calibration_data.npy"

# # Perform optimization with the calibration dataset
# runner.optimize(calib_dataset)

# # Save the optimized model to a new Quantized HAR file
# quantized_model_har_path = f"{model_name}_quantized_model.har"
# runner.save_har(quantized_model_har_path)

# print(f"Quantized HAR file saved to: {quantized_model_har_path}")

# # # # ===============================================

from hailo_sdk_client import ClientRunner

# Define the quantized model HAR file
model_name = "detector"
quantized_model_har_path = f"{model_name}_quantized_model.har"

# Initialize the ClientRunner with the HAR file
runner = ClientRunner(har=quantized_model_har_path)
print("[info] ClientRunner initialized successfully.")

# Compile the model
try:
    hef = runner.compile()
    print("[info] Compilation completed successfully.")
except Exception as e:
    print(f"[error] Failed to compile the model: {e}")
    raise
file_name = f"{model_name}.hef"
with open(file_name, "wb") as f:
    f.write(hef)

Another thing to add: running something like this shows good performance:

$ hailortcli run hailo-rpi5-examples/resources/yolov11n.hef --batch-size 8
Running streaming inference (hailo-rpi5-examples/resources/yolov11n.hef):
  Transform data: true
    Type:      auto
    Quantized: true
Network yolov11n/yolov11n: 100% | 960 | FPS: 191.71 | ETA: 00:00:00
> Inference result:
 Network group: yolov11n
    Frames count: 960
    FPS: 191.71
    Send Rate: 1884.61 Mbit/s
    Recv Rate: 1872.83 Mbit/s

So it could be something about my conversion process, but once again, I haven’t changed anything in the code since the time my models worked well.

Hey @Anton_Kumaigorodskyi ,

Looks like you’re hitting a classic bottleneck - you’re doing both input transforms and NMS on the host CPU, which is absolutely killing your performance even though the actual neural inference is screaming fast on the chip.

1. Your NMS is running on CPU
In your model script you’ve got:

nms_postprocess("nms_layer_config.json", meta_arch=yolov8, engine=cpu)

That engine=cpu is forcing all the YOLO post-processing (bbox decoding, score filtering, NMS) onto the host. This can eat up tens or hundreds of milliseconds per batch, totally negating your Hailo speedup.

Fix: Switch to on-chip NMS:

nms_postprocess(meta_arch="yolov8", engine="nn_core")

This pushes the work onto the neural core where Hailo’s specialized NMS lives (a combined model-script sketch follows at the end of this reply).

2. Move your input preprocessing on-chip
Your hailortcli log shows:

Transform data:  true
  Type:      auto

So you’re doing colorspace conversion, normalization etc. on the host before every batch. Even basic mean/std normalization crawls in pure Python.

Make sure you’re using on-chip normalization/input_conversion:

# normalize 0-255 to 0-1 on-chip
norm_layers = normalization([0.0,0.0,0.0], [255.0,255.0,255.0])

# on-chip resize to 640x640
resize1 = resize(resize_shapes=[640,640])
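
With the normalization on-chip, the host only needs to hand the device raw uint8 RGB frames, roughly like this (a sketch; the path and size are illustrative):

# Host-side preprocessing sketch once normalization runs on-chip:
# keep frames as uint8 in the 0-255 range and let the normalization layer do the scaling.
import numpy as np
from PIL import Image

def load_frame(path, size=(640, 640)):
    img = Image.open(path).convert("RGB").resize(size)
    return np.asarray(img, dtype=np.uint8)  # no /255.0 on the host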

3. Bump up compiler optimization
You’re using:

performance_param(compiler_optimization_level=0)

Level 0 is “get it working” mode. You’ll usually see way better FPS if you let the compiler work harder:

performance_param(compiler_optimization_level="max")

These changes should significantly boost your performance. In our tests, the compiled YOLOv11m runs at about 50 FPS on batch size 1 and roughly 100 FPS on batch size 8, so you should see performance in that range—or even a bit higher.
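
Taken together, only two lines of the model script from your conversion code actually need to change (your normalization is already on-chip). A sketch, reusing the conv74/conv90/conv105 layer names and nms_layer_config.json from your post, which will differ for other exports:

# Sketch: the alls model script from the original post with the NMS engine
# and compiler optimization level changed.
alls = """
normalization1 = normalization([0.0, 0.0, 0.0], [255.0, 255.0, 255.0])
change_output_activation(conv74, sigmoid)
change_output_activation(conv90, sigmoid)
change_output_activation(conv105, sigmoid)

model_optimization_config(calibration, batch_size=8)
model_optimization_flavor(optimization_level=1, compression_level=0, batch_size=8)
nms_postprocess("nms_layer_config.json", meta_arch=yolov8, engine=nn_core)
performance_param(compiler_optimization_level="max")
"""
runner.load_model_script(alls)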

Hi @omria,

Are you sure that yolov8 post-processing is supported on-chip? I tried that and got the following error:

  File "//convert.py", line 65, in <module>
    runner.optimize(
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_client/runner/client_runner.py", line 2201, in optimize
    result = self._optimize(
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_client/runner/client_runner.py", line 2003, in _optimize
    self._sdk_backend.optimize_full_precision(data_continer)
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_client/sdk_backend/sdk_backend.py", line 1722, in optimize_full_precision
    model, params = self._apply_model_modification_commands(model, params, update_model_and_params)
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_client/sdk_backend/sdk_backend.py", line 1607, in _apply_model_modification_commands
    command.validate_command([layer.name for layer in model])
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_client/sdk_backend/script_parser/nms_postprocess_command.py", line 316, in validate_command
    self._validate_nms_engine()
  File "/usr/local/lib/python3.10/dist-packages/hailo_sdk_client/sdk_backend/script_parser/nms_postprocess_command.py", line 348, in _validate_nms_engine
    raise UnsupportedMetaArchError(f"The specified meta architecture {meta_arch.value} cannot be run on chip.")
hailo_sdk_client.tools.core_postprocess.nms_postprocess.UnsupportedMetaArchError: The specified meta architecture yolov8 cannot be run on chip.