Hello Hailo Community
I am compiling part of the self-attention block in a LightGlue model, and in order to have a high SNR I also wanted to use 16-bit inputs. However, with that change (adding “quantization_param({*input*}, precision_mode=a16_w16)” to the alls script) the compilation crashes with the message:
Exception: Compilation failed: vector::_M_range_check: __n (which is 1) >= this->size() (which is 1)
Hailo Dataflow Compiler v3.33.0
Here is a script to reproduce the error:
import numpy as np
from hailo_sdk_client import ClientRunner
# ----------------------------------------------------------
# Configuration
# ----------------------------------------------------------
# Path of the ONNX model shared with Hailo for the reproduction.
ONNX_FILE = "selfattn_layer0_part1.onnx"
# Name assigned to the network when it is translated by the SDK.
NET_NAME = "bug_repro_net"
def reproduce_crash():
    """Translate the ONNX model, force 16-bit precision via the ALLS
    script, then run optimization and compilation to trigger the
    allocator's ``vector::_M_range_check`` crash.
    """
    print(f"1. Loading ONNX Model: {ONNX_FILE}")
    runner = ClientRunner(hw_arch="hailo8")
    runner.translate_onnx_model(ONNX_FILE, NET_NAME)

    print("2. Generating Dummy Calibration Data...")
    # Random calibration tensors shaped to match each network input.
    calibration_data = {}
    for input_layer in runner.get_hn_model().get_input_layers():
        dims = list(input_layer.output_shape)
        dims[0] = 4  # dummy calibration batch size of 4
        calibration_data[input_layer.name] = np.random.randn(*dims).astype(np.float32)

    print("3. Applying ALLS Configuration...")
    # The exact ALLS directives that trigger the allocator crash.
    alls_lines = [
        "post_quantization_optimization(finetune, policy=enabled, batch_size=4)",
        "pre_quantization_optimization(ew_add_fusing, policy=disabled)",
        "model_optimization_flavor(optimization_level=2, compression_level=0)",
        "quantization_param({*output*}, precision_mode=a16_w16)",
        "quantization_param({*input*}, precision_mode=a16_w16)",
        "quantization_param({*conv*}, precision_mode=a16_w16)",
        "quantization_param({*ew*}, precision_mode=a16_w16)",
        # Maximum compiler search effort (Performance Flow)
        "performance_param(compiler_optimization_level=2)",
        # Explicitly allow host RAM usage and context merging
        "context_switch_param(allow_auto_merge_in_multicontext=True)",
        "allocator_param(automatic_ddr=True)",
    ]

    # Force 16-bit natively on every mathematical node found in the HN graph.
    quantizable_types = ('conv', 'matmul', 'fullyconnected', 'ew_add', 'ew_mult', 'ew_sub')
    dynamic_lines = []
    for layer_name, layer_info in runner.get_hn_dict()['layers'].items():
        layer_type = layer_info.get('type', '').lower()
        if any(kind in layer_type for kind in quantizable_types):
            dynamic_lines.append(
                f"quantization_param({{{layer_name}}}, precision_mode=a16_w16)")

    runner.load_model_script("\n".join(alls_lines + dynamic_lines))

    print("4. Running QAFT Optimization...")
    runner.optimize(calibration_data)

    print("5. Running Compilation (Expecting C++ crash here)...")
    try:
        runner.compile()
        print("\n[UNEXPECTED] Compilation succeeded? The bug did not trigger.")
    except Exception as err:
        # Broad catch is deliberate: this script exists to surface the
        # compiler's exception, whatever concrete type it carries.
        print(f"\n{'='*50}")
        print("[CRASH REPRODUCED SUCCESSFULLY]")
        print(f"Exception: {err}")
        print(f"{'='*50}\n")
if __name__ == "__main__":
reproduce_crash()
And at this link you can find the ONNX model used in the script:
Thanks in advance