[Error] parsing area-attention module

Hello, I am trying to parse an area-attention module with the Hailo Dataflow Compiler, but parsing fails on several layers. I would appreciate any advice on the errors shown below.

Environment:

dataflow: 3.30.0
torch: 2.4.1
python: 3.10.12
hailo chip: Hailo8

Error:

hailo_sdk_client.model_translator.exceptions.ParsingWithRecommendationException: Parsing failed. The errors found in the graph are:
 UnsupportedShuffleLayerError in op /Transpose_5: Failed to determine type of layer to create in node /Transpose_5
 UnsupportedShuffleLayerError in op /Transpose_4: Failed to determine type of layer to create in node /Transpose_4
 UnsupportedShuffleLayerError in op /Reshape_4: Failed to determine type of layer to create in node /Reshape_4
 UnsupportedReduceMaxLayerError in op /ReduceMax: Failed to create reduce max layer at vertex /ReduceMax. Reduce max is only supported on the features axis, and with keepdim=True
 UnsupportedShuffleLayerError in op /Reshape_3: Failed to determine type of layer to create in node /Reshape_3
 UnsupportedShuffleLayerError in op /Reshape_6: Failed to determine type of layer to create in node /Reshape_6
 UnsupportedShuffleLayerError in op /Reshape_5: Failed to determine type of layer to create in node /Reshape_5
 UnsupportedShuffleLayerError in op /Transpose_7: Failed to determine type of layer to create in node /Transpose_7
 UnsupportedShuffleLayerError in op /Transpose_6: Failed to determine type of layer to create in node /Transpose_6
Please try to parse the model again, using these end node names: /Mul_3, /Slice_3

Code:

import onnx
import torch
from torch import nn
from hailo_sdk_client import ClientRunner

def softmax(logits, axis=-1):
    """Compute a softmax over ``axis`` using only max/exp/sum/div primitives.

    The per-slice maximum is subtracted before exponentiation so that the
    largest exponent is zero, which keeps ``exp`` from overflowing.

    Args:
        logits: Input tensor.
        axis: Dimension along which to normalise. Defaults to the last one.

    Returns:
        Tensor of the same shape as ``logits`` whose entries sum to 1 along
        ``axis``.
    """
    # torch.max with ``dim`` returns a (values, indices) pair; only the values are needed.
    peak, _ = torch.max(logits, dim=axis, keepdim=True)
    numerator = torch.exp(logits - peak)
    denominator = torch.sum(numerator, dim=axis, keepdim=True)
    return numerator / denominator

def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return the padding that gives 'same'-shaped outputs.

    Args:
        k: Kernel size, either an int or an iterable of ints.
        p: Explicit padding. When not None it is returned unchanged.
        d: Dilation. When greater than 1 the kernel size is first replaced by
            its effective (dilated) size ``d * (k - 1) + 1``.

    Returns:
        ``p`` if it was supplied, otherwise half the (effective) kernel size,
        as an int for an int kernel or a list for an iterable kernel.
    """
    if d > 1:
        # Effective kernel extent once dilation gaps are accounted for.
        if isinstance(k, int):
            k = d * (k - 1) + 1
        else:
            k = [d * (side - 1) + 1 for side in k]
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [side // 2 for side in k]

class Conv(nn.Module):
    """Conv2d + BatchNorm2d + activation block.

    Constructor args are (ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).
    """

    # Activation used when ``act`` is exactly True.
    default_act = nn.SiLU()

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Build the bias-free convolution, its batch norm and the activation.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size.
            s: Stride.
            p: Padding; when None it is derived from ``k`` and ``d`` via ``autopad``.
            g: Number of groups.
            d: Dilation.
            act: True selects ``default_act``; an ``nn.Module`` is used as given;
                anything else yields ``nn.Identity``.
        """
        super().__init__()
        padding = autopad(k, p, d)
        self.conv = nn.Conv2d(c1, c2, k, s, padding, groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Run convolution, then batch normalization, then the activation on ``x``."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)

    def forward_fuse(self, x):
        """Run convolution followed directly by the activation, skipping batch normalization."""
        y = self.conv(x)
        return self.act(y)

class AAttn(nn.Module):
    """
    Area-attention module for YOLO models, providing efficient attention mechanisms.

    This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
    making it particularly effective for object detection tasks.

    When ``area > 1`` the flattened ``H * W`` token sequence is reshaped into ``area`` contiguous chunks that are
    folded into the batch dimension, so attention is computed independently inside each chunk.

    Attributes:
        area (int): Number of areas the feature map is divided.
        num_heads (int): Number of heads into which the attention mechanism is divided.
        head_dim (int): Dimension of each attention head.
        qkv (Conv): Convolution layer for computing query, key and value tensors.
        proj (Conv): Projection convolution layer.
        pe (Conv): Position encoding convolution layer.
        act (nn.Softmax): Softmax over the last dimension; created in ``__init__`` but not called by ``forward``.

    Methods:
        forward: Applies area-attention to input tensor.

    Examples:
        >>> attn = AAttn(dim=256, num_heads=8, area=4)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = attn(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    """

    def __init__(self, dim, num_heads, area=1):
        """
        Initializes an Area-attention module for YOLO models.

        Args:
            dim (int): Number of hidden channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            area (int): Number of areas the feature map is divided, default is 1.
        """
        super().__init__()
        self.area = area

        self.num_heads = num_heads
        # Integer division: all_head_dim equals dim only when dim is a multiple of num_heads.
        self.head_dim = head_dim = dim // num_heads
        all_head_dim = head_dim * self.num_heads

        # 1x1 conv producing q, k and v stacked along the channel axis (no activation).
        self.qkv = Conv(dim, all_head_dim * 3, 1, act=False)
        # 1x1 output projection (no activation).
        self.proj = Conv(all_head_dim, dim, 1, act=False)
        # 7x7 conv, stride 1, padding 3, groups=dim, applied to v in forward() (no activation).
        self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)
        # NOTE: forward() calls the module-level softmax() helper instead of this layer.
        self.act = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """
        Processes the input tensor 'x' through the area-attention.

        Args:
            x (torch.Tensor): Input unpacked as (B, C, H, W).

        Returns:
            (torch.Tensor): Result of ``self.proj`` applied to the attention output plus ``self.pe(v)``.

        Notes:
            The reshapes below use ``C`` for the per-token channel count, so they only succeed when the
            qkv conv's ``all_head_dim`` equals ``C``. With ``area > 1`` the reshape also needs ``H * W`` to be
            divisible by ``area``.
        """
        B, C, H, W = x.shape
        N = H * W

        # (B, 3 * all_head_dim, H, W) -> (B, 3 * all_head_dim, N) -> (B, N, 3 * all_head_dim)
        qkv = self.qkv(x).flatten(2).transpose(1, 2)
        if self.area > 1:
            # Fold the areas into the batch axis: (B, N, 3C) -> (B * area, N // area, 3C).
            qkv = qkv.reshape(B * self.area, N // self.area, C * 3)
            # From here on B and N refer to the per-area batch size and token count.
            B, N, _ = qkv.shape
        # Split channels per head: (B, N, num_heads, 3 * head_dim)
        qkv = qkv.view(B, N, self.num_heads, self.head_dim * 3)
        # Move tokens last: (B, num_heads, 3 * head_dim, N)
        qkv = qkv.permute(0, 2, 3, 1)
        # Three equal chunks along dim 2, each (B, num_heads, head_dim, N).
        q, k, v = torch.chunk(qkv, 3, dim=2)
        #q, k, v = qkv.split([self.head_dim, self.head_dim, self.head_dim], dim=2)

        # Scaled dot-product scores: (B, num_heads, N, N)
        attn = (q.transpose(-2, -1) @ k) * (self.head_dim**-0.5)
        # Normalise over the last axis with the hand-written softmax helper.
        attn = softmax(attn)
        #attn = attn.softmax(dim=-1)
        # Weighted sum of values: (B, num_heads, head_dim, N)
        x = v @ attn.transpose(-2, -1)
        # Tokens back in front of heads/channels: (B, N, num_heads, head_dim)
        x = x.permute(0, 3, 1, 2)
        v = v.permute(0, 3, 1, 2)

        if self.area > 1:
            # Undo the area folding: (B * area, N // area, ...) -> (B, N, C) in the original batch/token sizes.
            x = x.reshape(B // self.area, N * self.area, C)
            v = v.reshape(B // self.area, N * self.area, C)
            B, N, _ = x.shape

        # Restore the spatial layout and channel-first order: (B, H, W, C) -> (B, C, H, W)
        x = x.reshape(B, H, W, C)
        x = x.permute(0, 3, 1, 2)
        v = v.reshape(B, H, W, C)
        v = v.permute(0, 3, 1, 2)

        # Add the 7x7 conv of the values to the attention output, then project.
        x = x + self.pe(v)
        return self.proj(x)

if __name__ == "__main__":
    # Build the module under test and put it in evaluation mode before export.
    model = AAttn(dim=256, num_heads=8, area=4)
    model.eval()

    # All-zero FP32 dummy input; the same shape is passed to the parser below.
    data_shape = [1, 256, 32, 32]
    dummy_input = torch.zeros(data_shape, dtype=torch.float)

    # Step 1: export the module to ONNX.
    onnx_path = "test.onnx"
    export_options = {
        "export_params": True,
        "opset_version": 15,  # Adjust opset version if needed
        "do_constant_folding": True,
        "input_names": ['input'],
        "output_names": ['output'],
    }
    torch.onnx.export(model, dummy_input, onnx_path, **export_options)

    # Step 2: reload the exported file and run the ONNX checker on it.
    onnx_model = onnx.load(onnx_path)
    onnx.checker.check_model(onnx_model)

    # Step 3: use the names of the first graph input and output as the parse boundaries.
    start_node_name = onnx_model.graph.input[0].name
    end_node_name = onnx_model.graph.output[0].name

    # Step 4: hand the ONNX file to the Hailo parser.
    runner = ClientRunner(hw_arch="hailo8")
    _ = runner.translate_onnx_model(
        onnx_path,
        "parse.har",
        start_node_names=[start_node_name],
        end_node_names=[end_node_name],
        net_input_shapes={start_node_name: data_shape},
    )

Other things I have tried, all of which produced the same error:

  1. According to the Hailo Dataflow Compiler User Guide, permute is not in the list of supported layers, so I replaced it with two transposes:
    qkv.permute(0,2,3,1) -> transpose(1,2).transpose(2,3)
  2. Following p. 138 of the User Guide ("Features to Columns Reshape": reshaping a tensor from (batch, height, 1, F) to (batch, height, W′, F′), where F = W′ · F′), I inserted an unsqueeze before the view:
    qkv.view(B, N, self.num_heads, self.head_dim * 3) -> qkv.unsqueeze(2).view(B, N, self.num_heads, self.head_dim * 3)
  3. Created a standalone softmax function instead of using attn = attn.softmax(dim=-1):
def softmax(logits, axis=-1):
    """Compute a softmax over ``axis`` using only max/exp/sum/div primitives.

    The per-slice maximum is subtracted before exponentiation so that the
    largest exponent is zero, which keeps ``exp`` from overflowing.

    Args:
        logits: Input tensor.
        axis: Dimension along which to normalise. Defaults to the last one.

    Returns:
        Tensor of the same shape as ``logits`` whose entries sum to 1 along
        ``axis``.
    """
    # torch.max with ``dim`` returns a (values, indices) pair; only the values are needed.
    peak, _ = torch.max(logits, dim=axis, keepdim=True)
    numerator = torch.exp(logits - peak)
    denominator = torch.sum(numerator, dim=axis, keepdim=True)
    return numerator / denominator