LightGlue self-attention block - parsed model produces different outputs

Hi Hailo Community

I am working on trying to run LightGlue model with Hailo 8 accelerators.

I had to leave aside the positional encoding and some final layers related to filtering matches, but I managed to parse the model after slightly changing how the keypoint encodings are passed.

However, when I compared the parsed HAR model with the associated ONNX model, the outputs differ quite a lot for the same input when I load the pre-trained weights; when the model is randomly initialized the outputs are closer.

I isolated the problem to the self-attention block of lightglue and below is the code to reproduce these output discrepancies.

Environment:

HailoRT v4.23.0
Hailo Dataflow Compiler v3.33.0

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import onnxruntime as ort
from hailo_sdk_client import ClientRunner, InferenceContext

# ==========================================
# Model Definitions (Unchanged Logic)
# ==========================================

class Attention(nn.Module):
    """Thin wrapper around PyTorch's fused scaled dot-product attention.

    Kept as a separate module so the attention op appears as a distinct
    node when the block is exported to ONNX.
    """

    def __init__(self) -> None:
        super().__init__()

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        """Return softmax(q @ k^T / sqrt(d)) @ v using the fused kernel."""
        attended = F.scaled_dot_product_attention(q, k, v)
        return attended

class SelfBlock(nn.Module):
    """LightGlue self-attention block, isolated for ONNX-vs-HAR comparison.

    Multi-head attention with cached rotary positional encodings applied to
    queries and keys, followed by a residual feed-forward update that
    consumes the input and the attention message concatenated channel-wise.
    """

    def __init__(self, embed_dim: int, num_heads: int, bias: bool = True) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Per-head channel width; assumes embed_dim is divisible by num_heads.
        self.head_dim = embed_dim // num_heads
        # Fused projection producing q, k and v in a single matmul.
        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
        self.inner_attn = Attention()
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        # FFN input is [x, message] concatenated on the channel axis (2*embed_dim).
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )

    def forward(self, x: torch.Tensor, sines: torch.Tensor, cosines: torch.Tensor) -> torch.Tensor:
        """Adapter entry point used for export: packs the separate sine and
        cosine tensors into the stacked layout `forward_original` expects.

        Shapes (as produced by `main`; confirm for other callers):
        x is (B, N, embed_dim); sines/cosines are (B, N, head_dim).
        """
        # Insert a broadcast dim for the heads axis: (B, N, D) -> (B, 1, N, D).
        cosines = cosines.unsqueeze(1)
        sines = sines.unsqueeze(1)
        # encodings[0] = cosines, encodings[1] = sines; shape (2, B, 1, N, D).
        encodings = torch.stack([cosines, sines], 0)
        return self.forward_original(x, encodings)

    def forward_original(self, x: torch.Tensor, encodings: torch.Tensor) -> torch.Tensor:
        """Original LightGlue self-block computation.

        Returns x + ffn([x, out_proj(attn(rot(q), rot(k), v))]).
        """
        batch = x.shape[0]
        qkv: torch.Tensor = self.Wqkv(x)
        # (B, N, 3*embed_dim) -> (B, N, H, head_dim, 3): the fused projection's
        # output channels are interpreted as (head, head_dim, qkv-index).
        qkv = qkv.reshape(batch, -1, self.num_heads, self.head_dim, 3)
        # -> (B, H, N, head_dim, 3): heads become a batch-like axis for attention.
        qkv = qkv.transpose(1, 2)
        q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
        # Rotary positional encoding is applied to queries and keys only.
        q = self.apply_cached_rotary_emb(encodings, q)
        k = self.apply_cached_rotary_emb(encodings, k)
        context = self.inner_attn(q, k, v)
        # (B, H, N, head_dim) -> (B, N, H, head_dim) -> (B, N, embed_dim).
        context = context.transpose(1, 2)
        context = context.reshape(batch, -1, self.embed_dim)
        message = self.out_proj(context)
        # Residual update through the concat-FFN.
        return x + self.ffn(torch.cat((x, message), -1))

    def rotate_half(self, t: torch.Tensor) -> torch.Tensor:
        """Rotate adjacent channel pairs: (a, b) -> (-b, a).

        Views the last axis as head_dim // 2 two-element pairs, which is the
        90-degree rotation used by rotary embeddings.
        """
        batch = t.shape[0]
        t = t.reshape(batch, self.num_heads, -1, self.head_dim // 2, 2)
        t = torch.stack((-t[..., 1], t[..., 0]), dim=-1)
        t = t.reshape(batch, self.num_heads, -1, self.head_dim)
        return t

    def apply_cached_rotary_emb(
        self, freqs: torch.Tensor, t: torch.Tensor
    ) -> torch.Tensor:
        """Apply precomputed rotary embedding: t*cos + rotate_half(t)*sin.

        freqs[0] holds the cosines and freqs[1] the sines, as stacked in
        `forward`; both broadcast over the heads axis.
        """
        return (t * freqs[0]) + (self.rotate_half(t) * freqs[1])

# ==========================================
# Helper Functions
# ==========================================

def load_lightglue_weights(model: nn.Module) -> None:
    """Download LightGlue (superpoint, v0.1_arxiv) weights and load the
    first SelfBlock layer ("self_attn.0.*") into *model*.

    Loading is non-strict: instead of raising on architecture differences,
    a summary of matched/missing keys is printed.
    """
    url = "https://github.com/cvg/LightGlue/releases/download/{}/{}_lightglue.pth"
    version = "v0.1_arxiv"
    features = "superpoint"
    # Sanitize dots in the version string only. The previous code applied
    # .replace(".", "-") to the whole filename, which clobbered the ".pth"
    # extension (yielding "...superpoint-pth" in the hub cache).
    fname = f"lightglue_{version.replace('.', '-')}_{features}.pth"

    state_dict = torch.hub.load_state_dict_from_url(
        url.format(version, features), file_name=fname
    )

    # Keep only the tensors belonging to the first self-attention block and
    # strip the prefix so keys match SelfBlock's own parameter names.
    prefix = "self_attn.0."
    self_block_state_dict = {
        k[len(prefix):]: v
        for k, v in state_dict.items()
        if k.startswith(prefix)
    }

    load_info = model.load_state_dict(self_block_state_dict, strict=False)

    print("\n=== LOAD SUMMARY FOR SELFBLOCK ===")
    print(f"Total layers in SelfBlock: {len(model.state_dict())}")
    print(f"Successfully loaded:       {len(model.state_dict()) - len(load_info.missing_keys)}")
    print(f"Missing keys:              {len(load_info.missing_keys)}")
    # Surface leftover checkpoint keys too — silently ignoring them can hide
    # a prefix mismatch that would make every tensor load as "missing".
    if load_info.unexpected_keys:
        print(f"Unexpected keys:           {len(load_info.unexpected_keys)}")
    print("==================================\n")

# ==========================================
# Main Execution
# ==========================================

def main() -> None:
    """Reproduce the ONNX-vs-Hailo output discrepancy for a single SelfBlock.

    Builds the block (optionally with pretrained LightGlue weights), exports
    it to ONNX, parses it with the Hailo SDK, runs both backends on the same
    random inputs, and prints the absolute error between the outputs.
    """
    # --- Configurations ---
    load_weights = True  # toggle to compare pretrained vs randomly-initialized weights
    embed_dim = 256
    num_heads = 4
    head_dim = embed_dim // num_heads
    batch_size = 3
    seq_len = 200
    
    onnx_path = "self_attn.onnx"
    hailo_model_har_name = "self_attn.har"
    model_name = "self_attn"
    input_names = ["desc", "sines", "cosines"]
    output_names = ["new_desc"]

    # --- Initialize & Load Model ---
    self_attn = SelfBlock(embed_dim, num_heads)
    if load_weights:
        load_lightglue_weights(self_attn)
    self_attn.eval()

    # --- Generate Dummy Data ---
    # NOTE(review): no manual seed is set, so each run compares on different
    # random data — error magnitudes are not reproducible across runs.
    desc = torch.randn((batch_size, seq_len, embed_dim))
    # Uniform in [-1, 1); real rotary encodings satisfy sin^2 + cos^2 = 1,
    # but that constraint is not needed for a numerical comparison.
    sines = torch.rand((batch_size, seq_len, head_dim)) * 2 - 1
    cosines = torch.rand((batch_size, seq_len, head_dim)) * 2 - 1

    # --- Export to ONNX ---
    torch.onnx.export(
        self_attn,
        (desc, sines, cosines),
        onnx_path,
        opset_version=17,
        do_constant_folding=True,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=None,  # fixed shapes — Hailo parsing does not use dynamic axes
    )

    # --- Hailo Translation ---
    runner = ClientRunner(hw_arch="hailo8")
    hn, npz = runner.translate_onnx_model(
        onnx_path,
        model_name,
        start_node_names=input_names,
        end_node_names=output_names,
    )
    # translate_onnx_model mutates the output_names list in place, so restore
    # it before it is reused for the ONNX Runtime session below.
    output_names = ["new_desc"]
    runner.save_har(hailo_model_har_name)

    # --- ONNX Runtime Inference ---
    onnx_input_data = {
        "desc": desc.numpy(),
        "sines": sines.numpy(),
        "cosines": cosines.numpy(),
    }
    
    session = ort.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])
    onnx_outputs = session.run(output_names, onnx_input_data)

    # --- Hailo Map Inputs ---
    # Match each Hailo input layer back to its originating ONNX input name.
    hn_model = runner.get_hn_model()
    hailo_input_data = {}
    
    for layer in hn_model.get_input_layers():
        onnx_name = layer.original_names[0]
        if onnx_name in onnx_input_data:
            # Hailo expects an extra (height-like) dimension for spatial data,
            # inserted here as axis 1 and squeezed back out after inference.
            hailo_input_data[layer.name] = np.expand_dims(onnx_input_data[onnx_name], axis=1)
            print(f"Mapped ONNX '{onnx_name}' -> Hailo '{layer.name}'")
        else:
            print(f"WARNING: Unknown input requirement: {onnx_name}")

    # --- Hailo Inference ---
    # SDK_NATIVE runs the parsed graph in float, i.e. before any quantization.
    with runner.infer_context(InferenceContext.SDK_NATIVE) as ctx:
        hailo_outputs = runner.infer(ctx, hailo_input_data)

    # --- Comparison ---
    # NOTE(review): squeeze(1) is called on hailo_outputs directly, which works
    # when infer returns a single ndarray for a single-output model — confirm
    # against the SDK version if it returns a list instead.
    error = np.abs(hailo_outputs.squeeze(1) - onnx_outputs[0])
    
    print(f"\nError Max:  {np.max(error):.6f}")
    print(f"Error Mean: {np.mean(error):.6f}")

# Script entry point.
if __name__ == "__main__":
    main()

When I don’t load the weights I got these discrepancies:

Error Max: 0.016217
Error Mean: 0.002340

And when I load them, these ones:
Error Max: 4.696563
Error Mean: 0.490858

Interestingly, when I change the number of heads from 4 to 1, I still can load the weights without problem and the discrepancies are nearly 0.

What is the reason for these discrepancies? Is the math of the parsed graph different from the original one? Or is it just a matter of some approximated functions that can be mitigated later with calibration data?
Any help that allows me to run this LightGlue model on Hailo 8 or newer accelerators will be very much appreciated. Thanks in advance!

Hi, Alex here.

This is very likely a solvable pre-processing or similar consistency issue. Ideally, for simplicity, split the ONNX into pre/post/neural processing parts and parse end-to-end (w/o start-end nodes) so that HAR corresponds directly to a specific ONNX.

Please verify that:

(A) The HAR has same structure and I/O as the ONNX it replaces (viewing in Netron)

(B) Input is the same up to appropriate transpose (onnx BCHW, hailo BHWC)

(C) Normalize the error by the signal for fair comparison

1 Like