Strong Performance Degradation after conversion from HAR to HEF

I created a CNN with LSTM layers and a single output y ∈ {True, False} for a computer vision project and trained it with PyTorch Lightning. The accuracy is ~88%.

I converted the PyTorch checkpoint to ONNX, then to a .har, and then quantized it.
Running inference with the quantized .har gives me 87% accuracy and an F1 Score of 75%.

However, when I convert it to .hef and run inference on the Hailo-8L on the RPi 5, I only get 76% accuracy. (This is the worst possible result: ~76% of the data is False, and the model outputs only False for that same data.) Fiddling with the sigmoid threshold, I still get a maximum of 76%. So the model is giving me useless results. I have been stuck for days; is there anything I am missing?

This is the conversion from quantized .har to .hef:

quantized_model_har_path = f"{models_root}/{model_name}_quantized.har"

runner = ClientRunner(har=quantized_model_har_path, hw_arch='hailo8l')

model_script = """




hef = runner.compile()

file_name = f"{quantized_model_har_path.replace('.har', '.hef')}"

with open(file_name, "wb") as f:

And here the inference script:

#!/usr/bin/env python3
import torch
import os
import numpy as np
from hailo_platform import VDevice, HailoSchedulingAlgorithm
from pathlib import Path
from PIL import Image
from torchvision import transforms
from import DataLoader
from pinno_cv_utils.datasets.labelstudio import FrameTagDataset
from sklearn.metrics import accuracy_score, f1_score
import time

# Paths and HEF setup
data_root = Path(__file__).parents[2] / "hibeas" / "data"
models_dir = Path(__file__).parents[2] / "models"
model_name = "best_checkpoint_model_quantized"
hef_model_path = models_dir / f"{model_name}.hef"

# Check if HEF file exists
if not hef_model_path.exists():
    raise FileNotFoundError("HEF file not found!")

# DataLoader setup
test_set = FrameTagDataset(
    dataset_path=data_root / "dataset_15_05_2024",
    labels_path=data_root / "dataset_15_05_2024/annotations_hibeam_dataset_15_05_2024.json",

# Optionally select a subset for testing
test_indices = np.random.choice(len(test_set), 2000, replace=False)
test_dataloader = DataLoader(, test_indices), batch_size=1

# Main inference function
if __name__ == "__main__":
    start_time = time.time()
    timeout_ms = 1000
    labels_list, predictions_list = [], []

    # Create VDevice with parameters
    params = VDevice.create_params()
    params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

    with VDevice(params) as vdevice:
        # Create an infer model from the HEF
        infer_model = vdevice.create_infer_model(str(hef_model_path))

        # Retrieve quantization information
        output_stream = infer_model.output()
        quant_infos = output_stream.quant_infos
        if len(quant_infos) == 1:
            quant_info = quant_infos[0]
            scale = quant_info.qp_scale
            zero_point = quant_info.qp_zp
            raise ValueError("Multiple quant_infos detected. Ensure the model uses a single quantization scheme.")

        # Configure the infer model
        with infer_model.configure() as configured_infer_model:
            # Create bindings
            bindings = configured_infer_model.create_bindings()

            # Prepare output buffer
            output_buffer = np.empty(infer_model.output().shape, dtype=np.uint8)

            # Inference loop
            for images, labels, _, _ in test_dataloader:
                image = images[0].type(torch.uint8).numpy()

                # Set input and output buffers

                # Run synchronous inference
      [bindings], timeout_ms)

                # Get raw output
                raw_output = bindings.output().get_buffer()

                # Dequantize the output
                dequantized_output = (raw_output.astype(np.float32) - zero_point) * scale

                # Apply sigmoid
                output_tensor = torch.tensor(dequantized_output)
                sigmoid_output = torch.sigmoid(output_tensor)

                # Get prediction
                output_bool = sigmoid_output > 0.5
                prediction = output_bool.item()

                labels_list.extend(labels[:, 1].numpy())

                # Optionally print prediction
                # print(f"Raw: {raw_output.item()} - Dequant: {dequantized_output.item()} "
                #       f"- Sigmoid: {np.round(sigmoid_output.item(), 5)} - Prediction: {prediction}")

    # Compute and display inference time and accuracy
    end_time = time.time()
    acc = accuracy_score(labels_list, predictions_list)
    f1 = f1_score(labels_list, predictions_list)
    print(f" Infer Time:      {end_time - start_time:.3f} sec")
    print(f" Average FPS:     {len(test_dataloader) / (end_time - start_time):.3f}")
    print(f" Accuracy:        {acc * 100:.2f}%")
    print(f"F1 Score:         {f1:.4f}")
    print(f"Labels: True: {labels_list.count(True)} - False: {labels_list.count(False)}")
    print(f"Predictions: True: {predictions_list.count(True)} - False: {predictions_list.count(False)}")

This gives me an accuracy of 76%, but as said, this result is worthless.
F1 Score is 0.

So the problem must be somewhere in the conversion from HAR to HEF? Or is it in the inference?

The HAR inference script is here (it works with the quantized HAR and gives 87% accuracy):

import os
import numpy as np
import torch
from torchvision.transforms import Normalize, Resize, ToTensor, Compose
from import DataLoader
from pinno_cv_utils.datasets.labelstudio import FrameTagDataset
from hailo_sdk_client import ClientRunner, InferenceContext
import pathlib
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Step 1: Set up paths and load the HAR file
models_dir = pathlib.Path(__file__).parents[2] / "models"
#model_name = "best_checkpoint_model"
model_name = "best_checkpoint_model_quantized"
har_model_path = f'{models_dir}/{model_name}.har'


# Step 2: Initialize the Hailo ClientRunner and load the HAR model
runner = ClientRunner(har=har_model_path)

# Step 3: Define dataset paths
data_root = pathlib.Path(__file__).parents[2] / "hibeas" / "data"
assert data_root.exists()

# Dataset for Test
test_set = FrameTagDataset(
    dataset_path=data_root / "dataset_15_05_2024",
    / "dataset_15_05_2024/annotations_hibeam_dataset_15_05_2024.json",
    transform=Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),

# only use 300 random samples
test_set =, np.random.choice(len(test_set), 300))

# Test dataloader
test_dataloader = DataLoader(
    test_set, batch_size=1, shuffle=False, num_workers=2, pin_memory=False

# Preprocessing transforms
transform = Compose(
        Resize((224, 224)),  # Resize to the required input size for your model
        ToTensor(),  # Convert to tensor
        Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # Normalize

# Step 4: Create inference context (SDK_NATIVE or SDK_QUANTIZED based on model type)
with runner.infer_context(InferenceContext.SDK_NATIVE) as context:

    def run_inference_on_test_set(runner, context, dataloader):
        """Run Hailo inference over a dataloader and collect labels and predictions.

        Args:
            runner: Object exposing ``infer(context, images)``.
            context: Inference context handed through to ``runner.infer``.
            dataloader: Iterable yielding 4-tuples whose first two items are
                ``images`` and ``labels``; the last two items are ignored.

        Returns:
            A tuple ``(all_labels, all_preds)`` of lists, filled batch by batch
            so that entries at the same index belong to the same sample.
        """
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Running Hailo Inference"):
                images, labels, _, _ = batch

                # Preprocess images if not already a tensor
                if not isinstance(images, torch.Tensor):
                    images = torch.stack([transform(image) for image in images])

                # Transpose images from NCHW to NHWC format for Hailo inference
                images = images.permute(0, 2, 3, 1).numpy()  # Convert to NHWC

                # Run Hailo inference on the batch
                outputs = runner.infer(context, images)

                # Process outputs: sigmoid on the raw output, then threshold at 0.5
                output_tensor = torch.tensor(outputs).sigmoid()
                output_bool = output_tensor > 0.5

                # Append batch predictions and labels.
                # Without this the returned prediction list stays empty and the
                # metric functions receive inputs of different lengths.
                # NOTE(review): flattening assumes one output value per sample — confirm.
                all_preds.extend(output_bool.reshape(-1).numpy())
                all_labels.extend(labels[:, 1].numpy())  # Assuming you're using the second label in multi-label format

                print(outputs, output_tensor, output_bool)

        return all_labels, all_preds

    # Execute inference
    print("Running Hailo inference on test set...")

    true_labels, predictions = run_inference_on_test_set(runner, context, test_dataloader)

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

please help.
thanks in advance!

Hi @armin.sams,

It looks like there’s some degradation happening during the transition from .har to .hef. Let’s try to narrow down the cause:

  • Model Optimization: Check that the script used for compiling .har to .hef is optimized for the Hailo8L. Using performance_param(compiler_optimization_level=max) can help as a general optimization setting.

  • Output Configuration: Since your output relies on a sigmoid activation with a 0.5 threshold, confirm that the output mappings and tensor configurations in the .hef file align with expectations. Mismatches here could lead to issues.

  • VDevice Scheduling: Test out different scheduling algorithms for the VDevice, such as ROUND_ROBIN, to see if this impacts performance.

  • Compare Outputs: Load the .hef model and test it with a subset of your data, then compare the raw tensor outputs from the .har model (run locally) and the .hef model (on the Hailo8L). This will help determine if the issue stems from model execution or preprocessing.

If none of these steps resolve the issue, please share the .har, .hef, and ONNX files, and I’ll be happy to take a closer look!

thank you.
i tried your steps but nothing works.

here are the normal har and hef and also the quantized ones

thanks in advance.

one further question:

i normalize my data the same way as i normalized it at training

# Dataset for Test
test_set = FrameTagDataset(
    dataset_path=data_root / "dataset_15_05_2024",
    / "dataset_15_05_2024/annotations_hibeam_dataset_15_05_2024.json",
    transform=Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),

# get a subset of 1024 random images
test_subset = np.random.choice(len(test_set), ITEMS, replace=False)
calib_dataset = np.zeros((len(test_subset), 224, 224, 3))

for idx, img in enumerate(test_subset):
    calib_dataset[idx, :, :, :] = img

# Step 1: Set up paths and load the HAR file
models_root = data_root / ".." / ".." / "models"
model_name = "best_checkpoint_model"
har_model_path = f'{models_root}/{model_name}.har'
assert os.path.isfile(har_model_path), "Please provide valid path for HAR file"

runner = ClientRunner(har=har_model_path, hw_arch='hailo8l')

model_script = """
normalization0 = normalization([123.675, 116.28, 103.53], [58.395, 57.12, 57.375])

Why do I have to use this ImageNet normalization in the model script for the runner?
If I use my own normalization, I get an error:

hailo_model_optimization.acceleras.utils.acceleras_exceptions.NegativeSlopeExponentNonFixable: Quantization failed in layer HibeamNetConvLSTM/ew_mult1 due to unsupported required slope. Desired shift is 9.0, but op has only 8 data bits. This error raises when the data or weight range are not balanced. Mostly happens when using random calibration-set/weights, the calibration-set is not normalized properly or batch-normalization was not used during training.

Okay, it seems I have to select the SDK_QUANTIZED inference context explicitly, even though I am loading a quantized HAR.

once i activate this, the performance also degrades to the same unusable state as with the HEF.

import os
import numpy as np
import torch
from torchvision import transforms
from PIL import Image
from pathlib import Path
from hailo_sdk_client import ClientRunner, InferenceContext

# Step 1: Set up paths and load the HAR file
models_dir = Path(__file__).parents[2] / "models"
model_name = "best_checkpoint_model_quantized"
har_model_path = f'{models_dir}/{model_name}.har'
images_dir = Path(__file__).parents[2] / "hibeas" / "data" / "subset_for_onnx_testing"

# Step 2: Initialize the Hailo ClientRunner and load the HAR model
runner = ClientRunner(har=har_model_path)

# Step 4: Set the path to your images
files = sorted(os.listdir(images_dir))

transform = {
    "train": transforms.Compose(
            transforms.Resize((224, 224)),
            # transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    "val": transforms.Compose(
            transforms.Resize((224, 224)),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
    "test": transforms.Compose(
            transforms.Resize((224, 224)),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]

outputs = []

# Step 3: Create inference context (SDK_NATIVE or SDK_QUANTIZED based on model type)
with runner.infer_context(InferenceContext.SDK_QUANTIZED) as context:

    # Step 5: Run inference on each image in the folder
    for file in files:
        # Load and preprocess the image (convert to RGB to ensure compatibility)
        image =, file)).convert('RGB')
        input_tensor = transform["test"](image)
        input_tensor = input_tensor.permute(1, 2, 0)

        input_data = input_tensor.unsqueeze(0).numpy()  # Add Batch Dimensions

        # Step 6: Run inference using Hailo ClientRunner
        output = runner.infer(context, input_data)

        # Step 7: If multi-dimensional output, reduce it to a single value
        output_tensor = torch.tensor(output).view(-1)  # Flatten the output if necessary
        output_value = output_tensor.sigmoid().item()  # Apply sigmoid and convert to scalar
        # Step 8: Determine the prediction
        predicted_label = "TRUE" if output_value > 0.5 else "FALSE"

        # Get the label based on filename convention (True/False)
        label_bool = "TRUE" if "True" in file else "FALSE"

        # Print the label and prediction result
        print(f"Prediction - File: {file} - Data: {output.item()} - Sigmoid: {output_value} - Bool: {predicted_label}")

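I found the error.

The problem was the additional normalization defined in the model script: the input data was already normalized in preprocessing (mean 0.5, std 0.5), so the normalization layer added by the model script normalized it a second time.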

