Hi Hailo Community
I am working on trying to run LightGlue model with Hailo 8 accelerators.
I had to leave aside the positional encoding and some final layers related to filtering matches, but I managed to parse the model after changing a bit how the keypoint encodings are passed.
However, when I compared the parsed HAR model with the associated ONNX model, the outputs differ quite a lot for the same input when I load the pre-trained weights; when the model is randomly initialized, the outputs are closer.
I isolated the problem to the self-attention block of lightglue and below is the code to reproduce these output discrepancies.
Environment:
HailoRT v4.23.0
Hailo Dataflow Compiler v3.33.0
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import onnxruntime as ort
from hailo_sdk_client import ClientRunner, InferenceContext
# ==========================================
# Model Definitions (Unchanged Logic)
# ==========================================
class Attention(nn.Module):
    """Thin module wrapper around PyTorch's fused scaled-dot-product attention."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, q, k, v) -> torch.Tensor:
        """Return softmax(q @ k^T / sqrt(d)) @ v using the fused kernel."""
        attended = F.scaled_dot_product_attention(q, k, v)
        return attended
class SelfBlock(nn.Module):
    """LightGlue self-attention block with rotary positional encoding.

    The rotary sine/cosine terms are passed in as explicit forward inputs
    (rather than computed inside the block) so the exported graph can be
    parsed by the Hailo compiler.
    """

    def __init__(self, embed_dim: int, num_heads: int, bias: bool = True) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Per-head channel width; assumes embed_dim is divisible by num_heads.
        self.head_dim = embed_dim // num_heads
        # Single projection emitting q, k and v in one matmul.
        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
        self.inner_attn = Attention()
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        # Feed-forward applied to the concatenation [x, message] (2x width in).
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )

    def forward(self, x: torch.Tensor, sines: torch.Tensor, cosines: torch.Tensor) -> torch.Tensor:
        """Export-friendly entry point.

        x: (batch, seq, embed_dim) descriptors (see main()).
        sines / cosines: (batch, seq, head_dim) rotary terms (see main()).
        """
        # Insert a broadcast dimension over heads: (batch, 1, seq, head_dim).
        cosines = cosines.unsqueeze(1)
        sines = sines.unsqueeze(1)
        # encodings[0] = cos, encodings[1] = sin; shape (2, batch, 1, seq, head_dim).
        encodings = torch.stack([cosines, sines], 0)
        return self.forward_original(x, encodings)

    def forward_original(self, x: torch.Tensor, encodings: torch.Tensor) -> torch.Tensor:
        """Original LightGlue self-block forward, given precomputed encodings."""
        batch = x.shape[0]
        qkv: torch.Tensor = self.Wqkv(x)
        # (batch, seq, num_heads, head_dim, 3); last axis indexes q / k / v.
        qkv = qkv.reshape(batch, -1, self.num_heads, self.head_dim, 3)
        # -> (batch, num_heads, seq, head_dim, 3)
        qkv = qkv.transpose(1, 2)
        q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
        # Rotary position encoding is applied to queries and keys only.
        q = self.apply_cached_rotary_emb(encodings, q)
        k = self.apply_cached_rotary_emb(encodings, k)
        context = self.inner_attn(q, k, v)
        # Merge heads back to (batch, seq, embed_dim).
        context = context.transpose(1, 2)
        context = context.reshape(batch, -1, self.embed_dim)
        message = self.out_proj(context)
        # Residual update through the feed-forward network.
        return x + self.ffn(torch.cat((x, message), -1))

    def rotate_half(self, t: torch.Tensor) -> torch.Tensor:
        """Rotate channel pairs: (a, b) -> (-b, a) across head_dim // 2 pairs."""
        batch = t.shape[0]
        # View the head channels as (head_dim // 2) interleaved pairs.
        t = t.reshape(batch, self.num_heads, -1, self.head_dim // 2, 2)
        t = torch.stack((-t[..., 1], t[..., 0]), dim=-1)
        t = t.reshape(batch, self.num_heads, -1, self.head_dim)
        return t

    def apply_cached_rotary_emb(
        self, freqs: torch.Tensor, t: torch.Tensor
    ) -> torch.Tensor:
        """Apply rotary embedding: t * cos + rotate_half(t) * sin."""
        return (t * freqs[0]) + (self.rotate_half(t) * freqs[1])
# ==========================================
# Helper Functions
# ==========================================
def load_lightglue_weights(model: nn.Module) -> None:
    """Download LightGlue weights and load the first SelfBlock layer into *model*.

    Only checkpoint entries under the ``self_attn.0.`` prefix are kept; the
    prefix is stripped so the keys match a stand-alone SelfBlock, and loading
    is non-strict because the full checkpoint contains many unrelated layers.
    """
    url = "https://github.com/cvg/LightGlue/releases/download/{}/{}_lightglue.pth"
    version = "v0.1_arxiv"
    features = "superpoint"
    # Sanitize dots only inside the version tag. The previous code applied
    # .replace(".", "-") to the whole string, which also mangled the ".pth"
    # extension, caching the file as "...superpoint-pth".
    fname = f"lightglue_{version.replace('.', '-')}_{features}.pth"
    state_dict = torch.hub.load_state_dict_from_url(
        url.format(version, features), file_name=fname
    )
    # Keep only the first self-attention block and re-root its keys.
    prefix = "self_attn.0."
    self_block_state_dict = {
        k[len(prefix):]: v
        for k, v in state_dict.items()
        if k.startswith(prefix)
    }
    load_info = model.load_state_dict(self_block_state_dict, strict=False)
    print("\n=== LOAD SUMMARY FOR SELFBLOCK ===")
    print(f"Total layers in SelfBlock: {len(model.state_dict())}")
    print(f"Successfully loaded: {len(model.state_dict()) - len(load_info.missing_keys)}")
    print(f"Missing keys: {len(load_info.missing_keys)}")
    print("==================================\n")
# ==========================================
# Main Execution
# ==========================================
def main():
    """Export SelfBlock to ONNX, translate to a Hailo HAR, and compare outputs
    of ONNX Runtime vs. the Hailo SDK-native emulation on the same random input."""
    # --- Configurations ---
    load_weights = True
    embed_dim = 256
    num_heads = 4
    head_dim = embed_dim // num_heads
    batch_size = 3
    seq_len = 200
    onnx_path = "self_attn.onnx"
    hailo_model_har_name = "self_attn.har"
    model_name = "self_attn"
    input_names = ["desc", "sines", "cosines"]
    output_names = ["new_desc"]
    # --- Initialize & Load Model ---
    self_attn = SelfBlock(embed_dim, num_heads)
    if load_weights:
        load_lightglue_weights(self_attn)
    self_attn.eval()
    # --- Generate Dummy Data ---
    desc = torch.randn((batch_size, seq_len, embed_dim))
    # Uniform in [-1, 1]. NOTE: real rotary terms satisfy sin^2 + cos^2 = 1;
    # these stand-ins are only meant for the parsing/accuracy comparison.
    sines = torch.rand((batch_size, seq_len, head_dim)) * 2 - 1
    cosines = torch.rand((batch_size, seq_len, head_dim)) * 2 - 1
    # --- Export to ONNX ---
    torch.onnx.export(
        self_attn,
        (desc, sines, cosines),
        onnx_path,
        opset_version=17,
        do_constant_folding=True,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=None,  # static shapes, as required by the Hailo parser
    )
    # --- Hailo Translation ---
    runner = ClientRunner(hw_arch="hailo8")
    hn, npz = runner.translate_onnx_model(
        onnx_path,
        model_name,
        start_node_names=input_names,
        end_node_names=output_names,
    )
    output_names = ["new_desc"]  # the translate_onnx_model has overwritten it, so need to set it again
    runner.save_har(hailo_model_har_name)
    # --- ONNX Runtime Inference ---
    onnx_input_data = {
        "desc": desc.numpy(),
        "sines": sines.numpy(),
        "cosines": cosines.numpy(),
    }
    session = ort.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])
    onnx_outputs = session.run(output_names, onnx_input_data)
    # --- Hailo Map Inputs ---
    # Map each Hailo input layer back to its ONNX tensor via original_names.
    hn_model = runner.get_hn_model()
    hailo_input_data = {}
    for layer in hn_model.get_input_layers():
        onnx_name = layer.original_names[0]
        if onnx_name in onnx_input_data:
            # Hailo expects an extra dimension for spatial data
            hailo_input_data[layer.name] = np.expand_dims(onnx_input_data[onnx_name], axis=1)
            print(f"Mapped ONNX '{onnx_name}' -> Hailo '{layer.name}'")
        else:
            print(f"WARNING: Unknown input requirement: {onnx_name}")
    # --- Hailo Inference ---
    # SDK_NATIVE emulates the parsed (float) graph, i.e. before quantization.
    with runner.infer_context(InferenceContext.SDK_NATIVE) as ctx:
        hailo_outputs = runner.infer(ctx, hailo_input_data)
    # --- Comparison ---
    # runner.infer returns a list of outputs; we compare the first output array
    # NOTE(review): .squeeze(1) assumes the Hailo output keeps the extra
    # spatial dim added above and that a single output array is returned —
    # confirm against the SDK version in use.
    error = np.abs(hailo_outputs.squeeze(1) - onnx_outputs[0])
    print(f"\nError Max: {np.max(error):.6f}")
    print(f"Error Mean: {np.mean(error):.6f}")


if __name__ == "__main__":
    main()
When I don’t load the weights, I get these discrepancies:
Error Max: 0.016217
Error Mean: 0.002340
And when I load them, these ones:
Error Max: 4.696563
Error Mean: 0.490858
Interestingly, when I change the number of heads from 4 to 1, I can still load the weights without problems and the discrepancies are nearly zero.
What is the reason for these discrepancies? Is the math of the parsed graph different from the original one? Or is it just a matter of some approximated functions that can be mitigated later with calibration data?
Any help that allows me to run this LightGlue model on Hailo-8 or newer accelerators will be very much appreciated. Thanks in advance!