Hello Hailo team,
I am experiencing a severe quantization/emulation anomaly when compiling a LightGlue Self-Attention block for the Hailo-10H using DFC v5.3, which does not occur on the Hailo-8 (DFC v3.3).
I have written a reproduction script that takes a calibration data file, parses and quantizes the ONNX model, and directly compares the SDK_NATIVE and SDK_QUANTIZED outputs using SNR and cosine similarity.
For the attention block, I am heavily utilizing the .alls script to force 16-bit precision (a16_w16) on the Softmax, Normalization, and output layers.
The Issue:
On the Hailo-8, the emulator perfectly matches the compiler's analyze-noise report (~28 dB).
On the Hailo-10H, the compiler reports ~30 dB during optimization, but the SDK_QUANTIZED output collapses to 5.6 dB. The Cosine Similarity drops to ~0.86, indicating some mathematical corruption rather than a simple scale or reordering of dimensions.
This is the script:
import argparse
import numpy as np
from hailo_sdk_client import ClientRunner, InferenceContext
# ==========================================================
# MATH DIAGNOSTICS
# ==========================================================
def compute_snr(reference: np.ndarray, physical: np.ndarray) -> float:
    """Compute the signal-to-noise ratio of ``physical`` against ``reference`` in dB.

    Both inputs are promoted to float64 before any arithmetic. Without the
    promotion, integer arrays (e.g. uint8 / int16 tensors) are squared and
    subtracted in their own dtype and silently wrap around, which yields a
    meaningless SNR.

    Args:
        reference: Ground-truth values; their power is the "signal".
        physical: Values under test; must have the same shape as ``reference``.

    Returns:
        ``10 * log10(sum(ref**2) / sum((ref - phys)**2))``. ``inf`` when the
        two arrays are identical, ``-inf`` when the reference has zero power
        but the arrays differ.

    Raises:
        ValueError: If the two arrays do not have the same shape.
    """
    reference = np.asarray(reference)
    physical = np.asarray(physical)
    if reference.shape != physical.shape:
        raise ValueError(f"Shape mismatch: {reference.shape} vs {physical.shape}")

    # Promote first, then do the math: avoids integer overflow / unsigned wrap.
    ref = reference.astype(np.float64).ravel()
    phys = physical.astype(np.float64).ravel()

    signal_power = float(np.sum(ref**2))
    noise_power = float(np.sum((ref - phys) ** 2))

    if noise_power == 0:
        return float("inf")
    if signal_power == 0:
        # Same value log10(0) would give, but without the RuntimeWarning.
        return float("-inf")
    return float(10 * np.log10(signal_power / noise_power))
def compute_cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute the cosine similarity between two arrays after flattening them.

    Inputs are promoted to float64 before multiplying. Otherwise the
    element-wise product of narrow integer arrays can overflow while the
    norms (which NumPy computes in floating point) do not, producing an
    inconsistent ratio.

    Args:
        a: First array (any shape).
        b: Second array (any shape; flattened independently of ``a``).

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a Python float, or ``0.0`` when
        either array has zero norm.
    """
    a_flat = np.asarray(a, dtype=np.float64).ravel()
    b_flat = np.asarray(b, dtype=np.float64).ravel()

    norm_a = np.linalg.norm(a_flat)
    norm_b = np.linalg.norm(b_flat)
    if norm_a == 0 or norm_b == 0:
        # Direction is undefined for a zero vector; report "no similarity".
        return 0.0

    dot_product = np.sum(a_flat * b_flat)
    return float(dot_product / (norm_a * norm_b))
# ==========================================================
# HAILO GRAPH MAPPING UTILS
# ==========================================================
def map_numpy_to_hailo_inputs(runner: "ClientRunner", np_data_dict: dict, num_samples: int) -> dict:
    """Build a ``{hailo_input_layer_name: array}`` dict from user-named arrays.

    For every input layer reported by ``runner.get_hn_model()``, the first
    entry of ``layer.original_names`` is matched against the keys of
    ``np_data_dict``. An exact key match wins; otherwise the longest key that
    is a substring of the original name is used. Preferring the longest key
    prevents a short key such as ``"desc"`` from capturing an input called
    ``"desc0"`` when a ``"desc0"`` key also exists.

    Args:
        runner: Object exposing ``get_hn_model().get_input_layers()``; each
            layer must provide ``name`` and ``original_names``.
        np_data_dict: Mapping (plain dict or loaded ``.npz``) from a
            user-chosen key to a NumPy array.
        num_samples: Maximum number of leading entries (axis 0) to keep.

    Returns:
        Dict keyed by Hailo layer name. 2-D arrays get a leading axis of
        length 1 inserted before being truncated to ``num_samples``.

    Raises:
        KeyError: If no key of ``np_data_dict`` matches an input layer.
    """
    data_keys = list(np_data_dict.keys())
    dataset_dict = {}

    for layer in runner.get_hn_model().get_input_layers():
        onnx_name = layer.original_names[0]

        if onnx_name in data_keys:
            key = onnx_name
        else:
            candidates = [k for k in data_keys if k in onnx_name]
            if not candidates:
                raise KeyError(
                    f"CRITICAL: Expected ONNX input containing '{onnx_name}' not found!\n"
                    f"Available data keys: {data_keys}"
                )
            # Most specific (longest) key wins; ties keep the mapping's order.
            key = max(candidates, key=len)

        arr = np_data_dict[key]
        # Ensure a batch dimension exists.
        if arr.ndim == 2:
            arr = np.expand_dims(arr, axis=0)
        dataset_dict[layer.name] = arr[:num_samples]

    return dataset_dict
# ==========================================================
# MAIN EXECUTION
# ==========================================================
def _results_to_dict(results, out_layers):
    """Normalize an inference result to ``{output_layer_name: array}``.

    Lists are paired positionally with ``out_layers``; a bare ndarray is
    attributed to the first output layer; anything else is returned as-is.
    """
    if isinstance(results, list):
        return {layer.name: res for layer, res in zip(out_layers, results)}
    if isinstance(results, np.ndarray):
        return {out_layers[0].name: results}
    return results


def run_reproduction(args):
    """Run the native-vs-quantized comparison and print a per-output report.

    Steps: translate the ONNX model, load calibration data, run SDK_NATIVE
    inference on the first sample, optimize/quantize with the embedded model
    script, run SDK_QUANTIZED inference on the same sample, then print SNR
    and cosine similarity for every output.

    Status messages use plain ASCII markers so they survive copy/paste and
    non-UTF-8 consoles (the previous glyphs were mis-encoded and one of them
    split a string literal across two lines, which is a syntax error).

    Args:
        args: Namespace with ``onnx_path``, ``init_npz``, ``target_device``
            and ``samples`` attributes.
    """
    print(f"\nSTARTING REPRODUCTION SCRIPT FOR: {args.target_device.upper()}")
    print("-" * 60)

    # 1. Initialize Runner & Parse ONNX
    print(f"Translating ONNX model: {args.onnx_path}")
    runner = ClientRunner(hw_arch=args.target_device)
    runner.translate_onnx_model(args.onnx_path, "test_model")

    # 2. Load Calibration Data
    print(f"Loading real calibration data from: {args.init_npz}")
    # Arrays are materialized when indexed inside the mapping helper, so the
    # archive handle can be closed as soon as the mapping is built.
    with np.load(args.init_npz) as raw_npz_data:
        dataset_dict = map_numpy_to_hailo_inputs(runner, raw_npz_data, args.samples)

    # Only the first sample is used for the 1-to-1 inference comparison.
    infer_dict = {layer_name: data[:1] for layer_name, data in dataset_dict.items()}

    # 3. Run Native Inference (Golden Reference)
    print("Running SDK_NATIVE inference (Golden FP32 Reference)...")
    with runner.infer_context(InferenceContext.SDK_NATIVE) as ctx:
        native_results = runner.infer(ctx, infer_dict)

    out_layers = runner.get_hn_model().get_output_layers()
    native_results = _results_to_dict(native_results, out_layers)

    # 4. Optimize / Quantize
    print(f"Optimizing and Quantizing model ({args.samples} samples)...")

    # Model script (ALLS) forcing 16-bit precision on selected layers.
    script_lines = [
        "post_quantization_optimization(finetune, policy=enabled, batch_size=4)",
        "pre_quantization_optimization(ew_add_fusing, policy=disabled)",
        "model_optimization_flavor(optimization_level=2, compression_level=0)",
        "allocator_param(automatic_ddr=True)",
        "pre_quantization_optimization(matmul_correction, layers={*matmul*}, correction_type=zp_comp_block)",
        "performance_param(compiler_optimization_level=2)",
        "context_switch_param(allow_auto_merge_in_multicontext=True)",
        "quantization_param({*softmax*}, precision_mode=a16_w16)",
        "quantization_param({*normalization*}, precision_mode=a16_w16)",
        "quantization_param({*conv8*}, precision_mode=a16_w16)",
        "quantization_param({*output*}, precision_mode=a16_w16)",
    ]
    alls_script = "\n".join(script_lines) + "\n"
    print("Loaded ALLS Script:")
    print(alls_script)

    runner.load_model_script(alls_script)
    runner.optimize(dataset_dict)

    # 5. Run Quantized Inference
    print("Running SDK_QUANTIZED inference...")
    with runner.infer_context(InferenceContext.SDK_QUANTIZED) as ctx:
        quant_results = runner.infer(ctx, infer_dict)
    quant_results = _results_to_dict(quant_results, out_layers)

    # 6. Analyze and Report
    print("\n" + "=" * 60)
    print(f"EVALUATION REPORT: {args.target_device.upper()}")
    print("=" * 60)

    for layer_name, native_arr in native_results.items():
        quant_arr = quant_results[layer_name]

        print(f"Output Node: {layer_name}")
        print(f"Shape: {native_arr.shape}")

        # Check SNR
        snr = compute_snr(native_arr, quant_arr)
        alert = " [!] (SUSPICIOUS)" if snr < 15 else " [OK]"
        print(f"SNR: {snr:6.2f} dB {alert}")

        # Check Cosine Similarity (helps diagnose memory shuffling/transposing)
        cos_sim = compute_cosine_similarity(native_arr, quant_arr)
        print(f"Cosine Sim: {cos_sim:6.4f}")

        # Diagnostics
        if snr < 15 and cos_sim > 0.99:
            print(" -> DIAGNOSIS: High Cosine Sim but low SNR indicates a global scale/magnitude error.")
        elif snr < 15 and cos_sim < 0.5:
            print(" -> DIAGNOSIS: Low Cosine Sim indicates scrambled data (e.g., transposed memory strides) or catastrophic precision loss.")
        elif snr < 15:
            # Previously the 0.5..0.99 band produced no diagnosis at all.
            print(" -> DIAGNOSIS: Intermediate Cosine Sim: neither a pure scale error nor fully scrambled data.")
        print("-" * 60)
if __name__ == "__main__":
    # Command-line entry point: declare the options as data, register them
    # in one loop, then hand the parsed namespace to run_reproduction().
    parser = argparse.ArgumentParser(
        description="MRE for Hailo SDK Quantization SNR Drop with Real Data"
    )
    cli_options = (
        (
            "--onnx-path",
            {"type": str, "required": True, "help": "Path to the ONNX model to test."},
        ),
        (
            "--init-npz",
            {"type": str, "required": True, "help": "Path to the real calibration .npz file."},
        ),
        (
            "--target-device",
            {
                "type": str,
                "choices": ["hailo8", "hailo10h"],
                "required": True,
                "help": "Target Hailo device architecture.",
            },
        ),
        (
            "--samples",
            {"type": int, "default": 64, "help": "Number of samples to use for calibration."},
        ),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    run_reproduction(args)
Running for Hailo10H:
...
[info] Output layers signal-to-noise ratio (SNR): measures the quantization noise (higher is better)
[info] test_model/output_layer1 SNR: 30.45 dB
[info] Model Optimization is done
π€ Running SDK_QUANTIZED inference...
[info] Using 1 GPU for inference
Processed: 1images [00:15, 15.76s/images]
============================================================
π EVALUATION REPORT: HAILO10H
============================================================
Output Node: test_model/output_layer1
Shape: (1, 1, 500, 256)
SNR: 5.60 dB β (SUSPICIOUS)
Cosine Sim: 0.8594
------------------------------------------------------------
Running for Hailo8:
[info] Output layers signal-to-noise ratio (SNR): measures the quantization noise (higher is better)
[info] test_model/output_layer1 SNR: 28.31 dB
[info] Model Optimization is done
π€ Running SDK_QUANTIZED inference...
[info] Using 1 GPU for inference
Processed: 1images [00:16, 16.90s/images]
============================================================
π EVALUATION REPORT: HAILO8
============================================================
Output Node: test_model/output_layer1
Shape: (1, 1, 500, 256)
SNR: 28.21 dB β
Cosine Sim: 0.9992
------------------------------------------------------------
Do you have any idea what could be causing this? When I compared SDK_NATIVE inference against the actual PyTorch model, I got around 66 dB in both cases. I have also run inference on the actual Hailo-10H chip, and there I got an even lower SNR, around 3 dB.
Thanks a lot in advance!