Hello Hailo Community
I am compiling part of the self-attention block in a LightGlue model, and in order to have a high SNR I also wanted to use 16-bit inputs. However, with that change (adding “quantization_param({*input*}, precision_mode=a16_w16)” to the alls script) the compilation crashes with the message:
Exception: Compilation failed: vector::_M_range_check: __n (which is 1) >= this->size() (which is 1)
Hailo Dataflow Compiler v3.33.0
Here is a script to reproduce the error:
import numpy as np
from hailo_sdk_client import ClientRunner
# ----------------------------------------------------------
# Configuration
# ----------------------------------------------------------
# Path of the ONNX model shared with Hailo for the reproduction.
ONNX_FILE = "selfattn_layer0_part1.onnx"
# Name assigned to the network when it is translated by the SDK.
NET_NAME = "bug_repro_net"
def reproduce_crash():
    """Translate the ONNX model, force 16-bit precision via the ALLS
    script, then run optimization and compilation to trigger the
    allocator's ``vector::_M_range_check`` crash.
    """
    print(f"1. Loading ONNX Model: {ONNX_FILE}")
    runner = ClientRunner(hw_arch="hailo8")
    runner.translate_onnx_model(ONNX_FILE, NET_NAME)

    print("2. Generating Dummy Calibration Data...")
    # Random calibration tensors shaped to match each network input.
    calibration_data = {}
    for input_layer in runner.get_hn_model().get_input_layers():
        dims = list(input_layer.output_shape)
        dims[0] = 4  # dummy calibration batch size of 4
        calibration_data[input_layer.name] = np.random.randn(*dims).astype(np.float32)

    print("3. Applying ALLS Configuration...")
    # The exact ALLS directives that trigger the allocator crash.
    alls_lines = [
        "post_quantization_optimization(finetune, policy=enabled, batch_size=4)",
        "pre_quantization_optimization(ew_add_fusing, policy=disabled)",
        "model_optimization_flavor(optimization_level=2, compression_level=0)",
        "quantization_param({*output*}, precision_mode=a16_w16)",
        "quantization_param({*input*}, precision_mode=a16_w16)",
        "quantization_param({*conv*}, precision_mode=a16_w16)",
        "quantization_param({*ew*}, precision_mode=a16_w16)",
        # Maximum compiler search effort (Performance Flow)
        "performance_param(compiler_optimization_level=2)",
        # Explicitly allow host RAM usage and context merging
        "context_switch_param(allow_auto_merge_in_multicontext=True)",
        "allocator_param(automatic_ddr=True)",
    ]

    # Force 16-bit natively on every mathematical node found in the HN graph.
    quantizable_types = ('conv', 'matmul', 'fullyconnected', 'ew_add', 'ew_mult', 'ew_sub')
    dynamic_lines = []
    for layer_name, layer_info in runner.get_hn_dict()['layers'].items():
        layer_type = layer_info.get('type', '').lower()
        if any(kind in layer_type for kind in quantizable_types):
            dynamic_lines.append(
                f"quantization_param({{{layer_name}}}, precision_mode=a16_w16)")

    runner.load_model_script("\n".join(alls_lines + dynamic_lines))

    print("4. Running QAFT Optimization...")
    runner.optimize(calibration_data)

    print("5. Running Compilation (Expecting C++ crash here)...")
    try:
        runner.compile()
        print("\n[UNEXPECTED] Compilation succeeded? The bug did not trigger.")
    except Exception as err:
        # Broad catch is deliberate: this script exists to surface the
        # compiler's exception, whatever concrete type it carries.
        print(f"\n{'='*50}")
        print("[CRASH REPRODUCED SUCCESSFULLY]")
        print(f"Exception: {err}")
        print(f"{'='*50}\n")
if __name__ == "__main__":
reproduce_crash()
And at this link you can find the ONNX model used in the script:
Thanks in advance