Error while parsing .onnx file created from torch.nn.MultiheadAttention

How can I correctly convert a model that contains a torch.nn.MultiheadAttention layer to .onnx and subsequently to .har? Is there something I’m doing wrong?

def minimal_example():
    import torch  # torch==2.3.0+cu121
    import onnx   # onnx==1.16.1
    from hailo_sdk_client import ClientRunner  # noqa  hailo_sdk_client==3.27.0

    class AttentionWrapper(torch.nn.Module):
        def __init__(self, embed_dim: int):
            super().__init__()
            self.attn = torch.nn.MultiheadAttention(embed_dim=embed_dim, num_heads=1)

        def forward(self, x: torch.Tensor):
            return self.attn(query=x, key=x, value=x)[0]

    embed_dim = 4
    onnx_path = "example.onnx"
    OPSET_VERSION = 17
    input_shape = (1, 16, embed_dim)
    torch_input = torch.randn(input_shape)
    model = AttentionWrapper(embed_dim=embed_dim)
    # =============== Convert to .ONNX =================
    model.eval()
    with torch.no_grad():
        torch.onnx.export(
            model,
            torch_input,
            f=onnx_path,
            verbose=True,
            do_constant_folding=False,
            opset_version=OPSET_VERSION,  # Use the appropriate ONNX opset version
            input_names=["input"],
            output_names=['output']
        )
    onnx_model = onnx.load(onnx_path)
    onnx.checker.check_model(onnx_model)

    # ============== Parse the model (.onnx -> .har) ===============
    onnx_inputs = onnx_model.graph.input
    onnx_outputs = onnx_model.graph.output

    start_node_name = onnx_inputs[0].name
    end_node_name = onnx_outputs[0].name

    runner = ClientRunner(hw_arch="hailo8")
    _ = runner.translate_onnx_model(
        onnx_path,
        "parsed-model.haf",
        start_node_names=[start_node_name],
        end_node_names=[end_node_name],
        net_input_shapes={start_node_name: input_shape})

Error Message

hailo_sdk_client.model_translator.exceptions.ParsingWithRecommendationException: Parsing failed. The errors found in the graph are:
 UnsupportedShuffleLayerError in op /attn/Transpose_1: Failed to determine type of layer to create in node /attn/Transpose_1
 UnexpectedNodeError in op /attn/Squeeze: Unexpected node /attn/Squeeze (Squeeze)
 UnsupportedSliceLayerError in op /attn/Gather: Gather operation on the batch dimension is not supported. Please modify the model so that the batch dimension remains intact.       
 UnsupportedSliceLayerError in op /attn/Gather_1: Gather operation on the batch dimension is not supported. Please modify the model so that the batch dimension remains intact.     
 UnsupportedSliceLayerError in op /attn/Gather_2: Gather operation on the batch dimension is not supported. Please modify the model so that the batch dimension remains intact.     
 UnsupportedShuffleLayerError in op /attn/Reshape_6: Failed to determine type of layer to create in node /attn/Reshape_6
Please try to parse the model again, using these end node names: /attn/Unsqueeze_2

Even following the recommendation, the parsing still fails.
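
For clarity, this is roughly what following that recommendation looks like (a sketch reusing runner, onnx_path, start_node_name and input_shape from the snippet above; the end node name is the one suggested by the parser):

_ = runner.translate_onnx_model(
    onnx_path,
    "parsed-model",
    start_node_names=[start_node_name],
    end_node_names=["/attn/Unsqueeze_2"],  # end node suggested by the parser
    net_input_shapes={start_node_name: input_shape},
)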

Using the torch version that comes with the DFC (1.11.0), and a small change to opset=15, this works on my end.

I tried reinstalling torch==1.11.0 and changing the opset_version to 15, but it still doesn’t parse the model correctly. Now I receive a different error:

    _ = runner.translate_onnx_model(
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
    return func(self, *args, **kwargs)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_client/runner/client_runner.py", line 881, in translate_onnx_model
    return self._finalize_parsing(parser.return_data)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_client/runner/client_runner.py", line 937, in _finalize_parsing
    self.set_hn(return_data['hn_data'])
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
    return func(self, *args, **kwargs)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_client/runner/client_runner.py", line 1356, in set_hn
    self._sdk_backend = self._generate(hn)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_client/runner/client_runner.py", line 136, in _generate
    hn = self._load_hn(hn)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_client/runner/client_runner.py", line 1363, in _load_hn
    return HailoNN.from_hn(hn)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hailo_nn.py", line 1370, in from_hn
    return HNImporter().from_hn(hn_json)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hailo_nn.py", line 1602, in from_hn
    return self.from_parsed_hn(json.loads(ensure_str(hn_json)))
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hailo_nn.py", line 1585, in from_parsed_hn
    self._add_layers()
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hailo_nn.py", line 1643, in _add_layers
    layer_parsed = self.create_layer(layer_hn, layer_name)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hailo_nn.py", line 1653, in create_layer
    layer_parsed = cls.TYPE_TO_CLASS[layer_hn['type']].from_hn(layer_hn)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hn_layers/fused_conv2d.py", line 246, in from_hn
    layer = super(FusedConv2DLayer, cls).from_hn(hn)
  File "/.my-venv-path//lib/python3.8/site-packages/hailo_sdk_common/hailo_nn/hn_layers/conv2d.py", line 399, in from_hn
    raise UnsupportedModelError(
hailo_sdk_common.hailo_nn.exceptions.UnsupportedModelError: Input features and output features must be a multiply of groups for conv layer parsed-model/conv2 (translated from Add_70

Moreover, the official documentation for the Data Flow Compiler 3.27.0, under the Supported PyTorch APIs section (page 70), states that PyTorch versions 1.11.0 and higher are supported, and torch.nn.MultiheadAttention is listed in the Supported Layers list (page 72).

First, you are right, there’s a bug; it should be supported on torch==2.3.0. We have plans to rev up to this version later this year, so we will have to solve this :slight_smile:
I double-checked, and I made another change:
embed_dim = 128
There are constraints relating the input and output channels.
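
Putting those changes together, the adjusted export looks roughly like this (a sketch reusing the AttentionWrapper from the first post, with torch==1.11.0, opset 15 and embed_dim=128; not necessarily the exact script used on Hailo’s side):

embed_dim = 128
model = AttentionWrapper(embed_dim=embed_dim)
torch_input = torch.randn((1, 16, embed_dim))
model.eval()
with torch.no_grad():
    torch.onnx.export(
        model,
        torch_input,
        f="example.onnx",
        do_constant_folding=False,
        opset_version=15,  # opset 15 for torch==1.11.0
        input_names=["input"],
        output_names=["output"],
    )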

Thanks @Nadav, with torch 1.11.0 and opset=15 it works.
However, I get another error when I try to reproduce the SegFormer architecture with the following module:

from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
from loguru import logger


def nlc_to_nchw(x: torch.Tensor, h: int, w: int) -> torch.Tensor:
    n, _, _ = x.shape
    return x.permute(0, 2, 1).reshape(n, -1, h, w)


def nchw_to_nlc(x: torch.Tensor) -> torch.Tensor:
    return x.flatten(start_dim=2, end_dim=-1).transpose(1, 2)


class OverlappingPatchEmbedding(nn.Module):
    def __init__(self, in_channels: int, embed_dim: int, downsampling_factor: int):
        super().__init__()
        # TODO: Factorize the following if clauses in order to automatically retrieve k,s,p
        if downsampling_factor == 4:
            k, s, p = 7, downsampling_factor, 3
        elif downsampling_factor == 2:
            k, s, p = 3, downsampling_factor, 1
        else:
            raise NotImplementedError(
                f"OverlappingPatchEmbedding for downsampling_factor={downsampling_factor} is not implemented."
            )
        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=k, stride=s, padding=p
        )  # noqa
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = nchw_to_nlc(x)
        x = self.norm(x)
        return x, (H, W)


class MLP(nn.Module):
    def __init__(self, input_dim: int, embed_dim: int):
        super().__init__()
        self.proj = nn.Linear(input_dim, embed_dim)

    def forward(self, x):
        x = nchw_to_nlc(x)
        x = self.proj(x)
        return x


class MixFFN(nn.Module):
    def __init__(self, mlp_in_dim: int, mlp_hidden_dim: int, dropout_prob: float = 0.0):
        super().__init__()
        fc1 = nn.Conv2d(
            in_channels=mlp_in_dim,
            out_channels=mlp_hidden_dim,
            kernel_size=1,
            stride=1,
            bias=True,
        )
        depth_wise_conv = nn.Conv2d(
            in_channels=mlp_hidden_dim,
            out_channels=mlp_hidden_dim,
            kernel_size=3,
            stride=1,
            padding=(3 - 1) // 2,
            bias=True,
            groups=mlp_hidden_dim,
        )  # By setting groups=mlp_hidden_dim we will learn one filter
        # for each dimension in the input embedding space.
        fc2 = nn.Conv2d(
            in_channels=mlp_hidden_dim,
            out_channels=mlp_in_dim,
            kernel_size=1,
            stride=1,
            bias=True,
        )
        dropout = nn.Dropout(p=dropout_prob)
        gelu = nn.GELU()
        self.layers = nn.Sequential(fc1, depth_wise_conv, gelu, dropout, fc2, dropout)

    def forward(self, x_in: torch.Tensor, h: int, w: int):
        # From the paper x_out = MLP(GELU(Conv3×3(MLP(x_in)))) + x_in
        batch_size, seq_len, hidden_dim = x_in.shape
        assert h * w == seq_len
        x = nlc_to_nchw(x_in, h, w)
        x = self.layers(x)
        x = nchw_to_nlc(x)
        return x_in + x


class EfficientSelfAttention(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        reduction_ratio: int,
        num_heads: int,
        dropout_prob: float = 0.0,
    ):
        """
        This block isn't explained extremely well in the original paper; refer to the SRA block introduced in
        section 3.3 of "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
        (https://arxiv.org/pdf/2102.12122) for a more detailed explanation.
        """
        super().__init__()

        self.reduction_conv = nn.Conv2d(
            in_channels=embed_dim,
            out_channels=embed_dim,
            kernel_size=reduction_ratio,
            stride=reduction_ratio,
        )

        self.attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout_prob,
            batch_first=True,
        )
        self.norm = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, x_in: torch.Tensor, h: int, w: int):
        q = x_in  # q.shape: (N, HXW, embed_dim)
        x = nlc_to_nchw(x_in, h, w)
        x = self.reduction_conv(x)
        x = nchw_to_nlc(x)
        k = v = self.norm(x)  # k.shape: (N, (HXW)/(reduction_ratio**2), embed_dim )
        x = self.attn(query=q, key=k, value=v)[0]
        return x_in + x


class TransformerEncoderLayer(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        efficient_attn_reduction_ratio: int,
        mix_ffn_expansion_ratio: int,
        dropout_prob=0.0,
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.efficient_self_attn = EfficientSelfAttention(
            embed_dim=embed_dim,
            reduction_ratio=efficient_attn_reduction_ratio,
            num_heads=num_heads,
            dropout_prob=dropout_prob,
        )
        self.norm2 = nn.LayerNorm(embed_dim)
        mlp_hidden_dim = int(embed_dim * mix_ffn_expansion_ratio)
        self.mix_ffn = MixFFN(
            mlp_in_dim=embed_dim,
            mlp_hidden_dim=mlp_hidden_dim,
            dropout_prob=dropout_prob,
        )

    def forward(self, x: torch.Tensor, h: int, w: int):
        x = x + self.efficient_self_attn(self.norm1(x), h, w)
        x = x + self.mix_ffn(self.norm2(x), h, w)
        return x


class MixTransformer(nn.Module):
    def __init__(
        self,
        in_channels: int = 3,
        n_encoding_blocks: List[int] = (2, 2, 2, 2),
        embed_dims: List[int] = (32, 64, 160, 256),
        efficient_self_attention_num_heads: List[int] = (1, 2, 5, 8),
        efficient_self_attention_reduction_ratios: List[int] = (8, 4, 2, 1),
        mix_ffn_expansion_ratios: List[int] = (8, 8, 4, 4),
        downsampling_factors: List[int] = (4, 2, 2, 2),
        dropout_prob: float = 0.0,
    ):
        """
        :param in_channels: Number of channels in the input image.
        :param n_encoding_blocks: Number of transformer blocks in each stage.
        :param embed_dims: Embedding dimensions for each stage (C_i in the paper).
        :param efficient_self_attention_num_heads: Number of attention heads in each stage.
        :param efficient_self_attention_reduction_ratios: Reduction ratio for each stage's Effic. Self Attention block
        :param mix_ffn_expansion_ratios: Expansion ratios for each stage's MixFFN block. (E_i in the paper)
        :param downsampling_factors: At each encoding stage we down-sample H,W by the specified
            down-sampling factors; we do that by merging overlapping patches in the input as explained
            in section 3.1 "Overlapped Patch Merging" of the paper. In the original implementation these factors are
            4, 2, 2, 2 for MiT-B0.

        Reference implementation:
            https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/backbones/mit.py
        """
        super().__init__()
        if not (
            len(n_encoding_blocks)
            == len(embed_dims)
            == len(efficient_self_attention_num_heads)
            == len(downsampling_factors)
            == len(efficient_self_attention_reduction_ratios)
            == len(mix_ffn_expansion_ratios)
        ):
            logger.error(
                f"The parameters '{n_encoding_blocks}', '{embed_dims}', '{efficient_self_attention_num_heads}', "
                f"'{efficient_self_attention_reduction_ratios}', '{mix_ffn_expansion_ratios}' "
                f"and '{downsampling_factors}' "
                f"should be list of integers"
                f" with the same cardinality."
            )

        self.depths = n_encoding_blocks
        self.overlapping_patch_embed = nn.ModuleList()
        self.encoding_blocks = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        n_stages = len(n_encoding_blocks)
        for stage_idx in range(n_stages):
            self.overlapping_patch_embed.append(
                OverlappingPatchEmbedding(
                    in_channels, embed_dims[stage_idx], downsampling_factors[stage_idx]
                )
            )
            self.encoding_blocks.append(
                nn.ModuleList(
                    [
                        TransformerEncoderLayer(
                            embed_dim=embed_dims[stage_idx],
                            num_heads=efficient_self_attention_num_heads[stage_idx],
                            efficient_attn_reduction_ratio=efficient_self_attention_reduction_ratios[
                                stage_idx
                            ],
                            mix_ffn_expansion_ratio=mix_ffn_expansion_ratios[stage_idx],
                            dropout_prob=dropout_prob,
                        )
                        for _ in range(n_encoding_blocks[stage_idx])
                    ]
                )
            )
            self.norm_layers.append(nn.LayerNorm(embed_dims[stage_idx]))
            in_channels = embed_dims[stage_idx]

    def forward(self, x):
        features = []
        number_of_stages = len(self.depths)
        for stage_idx in range(number_of_stages):
            x, (h, w) = self.overlapping_patch_embed[stage_idx](x)
            for encoding_block in self.encoding_blocks[stage_idx]:
                x = encoding_block(x, h, w)
            x = self.norm_layers[stage_idx](x)
            x = nlc_to_nchw(x, h, w)
            features.append(x)
        return features


class MLPDecoder(nn.Module):
    def __init__(
        self,
        embed_dims: List[int],
        decoder_embed_dim: int,
        num_classes: int,
        dropout_prob: float = 0.0,
    ):
        """
        Reference implementation:
            https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/decode_heads/segformer_head.py
        :param embed_dims: [C_1, C_2, C_3, C_4] in the Paper
        :param decoder_embed_dim: C in the Paper.
        :param num_classes: Num classes to predict.
        :param dropout_prob: prob for Dropout layer just before the last linear layer.
        """
        super().__init__()
        self.linear_layers = nn.ModuleList(
            [MLP(embedding_dim, decoder_embed_dim) for embedding_dim in embed_dims]
        )
        self.relu = nn.ReLU()
        n_stages = len(embed_dims)
        self.linear_fuse = nn.Conv2d(
            in_channels=decoder_embed_dim * n_stages,
            out_channels=decoder_embed_dim,
            kernel_size=1,
        )
        self.dropout = nn.Dropout(dropout_prob)
        self.linear_pred = nn.Conv2d(decoder_embed_dim, num_classes, kernel_size=1)

    def forward(self, features: List[torch.Tensor]):
        new_features = []
        target_resolution = features[0].shape[2:]  # In the original implementation
        # this resolution corresponds to (H/4,W/4), where H and W are the height and width of the original image.
        for i in range(len(features)):
            encoder_output = features[i]
            n, _, h, w = encoder_output.shape
            out = self.linear_layers[i](encoder_output)
            out = nlc_to_nchw(out, h, w)
            up_sampled_out = F.interpolate(
                out, size=target_resolution, mode="bilinear", align_corners=False
            )
            new_features.append(up_sampled_out)
        x = torch.cat(new_features, 1)
        x = self.relu(self.linear_fuse(x))
        x = self.dropout(x)
        x = self.linear_pred(x)
        return x


class SegFormer(nn.Module):
    def __init__(
        self,
        in_channels: int = 3,
        depths: List[int] = (2, 2, 2, 2),
        embed_dims: List[int] = (32, 64, 160, 256),
        encoder_efficient_self_attention_num_heads: List[int] = (1, 2, 5, 8),
        encoder_efficient_self_attention_reduction_ratios: List[int] = (8, 4, 2, 1),
        encoder_mix_ffn_expansion_ratios: List[int] = (8, 8, 4, 4),
        encoder_downsampling_factors: List[int] = (4, 2, 2, 2),
        decoder_embed_dim: int = 256,
        num_classes: int = 19,
        dropout_prob: float = 0.0,
    ):
        """
        Check Figure 2. of the paper (https://arxiv.org/pdf/2105.15203) and section "Details of MiT Series" to have
        a deeper understanding of the parameters presented in this implementation.
        Here, the default parameters for the Mix Transformer (MiT) encoder correspond to the ones of MiT-B0
        as detailed in Table 6 of the Paper.
        :param in_channels: Number of channels in the input image.
        :param depths: Number of encoder layers in each stage.  (L_i in the paper)
        :param embed_dims: Embedding dimensions for each stage (C_i in the paper).
        :param encoder_efficient_self_attention_num_heads: Number of attention heads in each stage. (N_i in the paper)
        :param encoder_efficient_self_attention_reduction_ratios: Reduction ratio in each stage to improve time
            complexity while computing the Efficient Self Attention block. (R_i in the paper)
        :param encoder_mix_ffn_expansion_ratios: Expansion ratios for each stage's MixFFN block. (E_i in the paper)
        :param encoder_downsampling_factors: At each encoding stage we down-sample H,W by the specified
            down-sampling factors, we do that by merging overlapping patches in the input as explained
            in section 3.1 "Overlapped Patch Merging" of the paper.
        :param decoder_embed_dim: Embedding dimension for decoder's MLP (C in the paper)
        :param num_classes: Number of classes to predict from the segmentation model.
        """
        super().__init__()
        self.encoder = MixTransformer(
            in_channels=in_channels,
            n_encoding_blocks=depths,
            embed_dims=embed_dims,
            efficient_self_attention_num_heads=encoder_efficient_self_attention_num_heads,
            efficient_self_attention_reduction_ratios=encoder_efficient_self_attention_reduction_ratios,
            mix_ffn_expansion_ratios=encoder_mix_ffn_expansion_ratios,
            downsampling_factors=encoder_downsampling_factors,
            dropout_prob=dropout_prob,
        )
        self.decoder = MLPDecoder(
            embed_dims=embed_dims,
            decoder_embed_dim=decoder_embed_dim,
            num_classes=num_classes,
            dropout_prob=dropout_prob,
        )

    def forward(self, x):
        features = self.encoder(x)
        out = self.decoder(features)
        out = F.interpolate(out, size=x.shape[2:], mode="bilinear", align_corners=False)
        return out

    def print_num_parameters(self):
        def count_parameters(model):
            return sum(p.numel() for p in model.parameters() if p.requires_grad)

        logger.info(
            f"Encoder parameters: {count_parameters(self.encoder) / 1e6} M, "
            f"Decoder parameters: {count_parameters(self.decoder) / 1e6} M"
        )
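
A quick shape sanity check one can run on the module above (just a sketch, using the default MiT-B0-style parameters shown):

if __name__ == "__main__":
    model = SegFormer(num_classes=19)
    model.eval()
    x = torch.randn(1, 3, 256, 128)
    with torch.no_grad():
        out = model(x)
    print(out.shape)  # expected: torch.Size([1, 19, 256, 128])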

Then my code becomes:


import pathlib
from typing import Tuple, Union

import onnx
import torch
from loguru import logger


def convert_pytorch_2_onnx(torch_model: torch.nn.Module, input_shape_bchw: Tuple[int, int, int, int], output_path: Union[str, pathlib.Path]) -> None:
    torch_input = torch.randn(input_shape_bchw)
    logger.info("Converting model to onnx ...")
    OPSET_VERSION = 15  # for torch==1.11.0
    torch_model.eval()
    with torch.no_grad():
        torch.onnx.export(
            torch_model,
            torch_input,
            str(output_path),
            verbose=True,
            do_constant_folding=False,
            opset_version=OPSET_VERSION,  # Use the appropriate ONNX opset version
            input_names=["input"],
            output_names=['output']
        )
    onnx_model = onnx.load(str(output_path))
    onnx.checker.check_model(onnx_model)
 

def minimal_example():
    onnx_path = "example.onnx"
    model = SegFormer()
    input_shape = (1,3,256,128)

    # =============== Convert to .ONNX =================
    convert_pytorch_2_onnx(
        torch_model=model,
        input_shape_bchw=input_shape,
        output_path=onnx_path
    )
    # ============== Parse the model (.onnx -> .har) ===============
    onnx_model = onnx.load(onnx_path)
    onnx_inputs = onnx_model.graph.input
    onnx_outputs = onnx_model.graph.output

    start_node_name = onnx_inputs[0].name
    end_node_name = onnx_outputs[0].name

    runner = ClientRunner(hw_arch="hailo8")
    _ = runner.translate_onnx_model(
        onnx_path,
        "parsed-model",
        start_node_names=[start_node_name],
        end_node_names=[end_node_name],
        net_input_shapes={start_node_name: input_shape})

Error:

hailo_sdk_client.model_translator.exceptions.ParsingWithRecommendationException: Parsing failed. The errors found in the graph are:
 UnsupportedReduceMeanLayerError in op ReduceMean_441: Reduce mean layer ReduceMean_441 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_445: Reduce mean layer ReduceMean_445 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedShuffleLayerError in op Reshape_1756: Failed to determine type of layer to create in node Reshape_1756
 UnsupportedReduceMeanLayerError in op ReduceMean_452: Reduce mean layer ReduceMean_452 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_599: Reduce mean layer ReduceMean_599 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_645: Reduce mean layer ReduceMean_645 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_456: Reduce mean layer ReduceMean_456 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_792: Reduce mean layer ReduceMean_792 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_603: Reduce mean layer ReduceMean_603 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_838: Reduce mean layer ReduceMean_838 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_649: Reduce mean layer ReduceMean_649 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_796: Reduce mean layer ReduceMean_796 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_842: Reduce mean layer ReduceMean_842 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedShuffleLayerError in op Reshape_861: Failed to determine type of layer to create in node Reshape_861
 UnsupportedReduceMeanLayerError in op ReduceMean_486: Reduce mean layer ReduceMean_486 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_679: Reduce mean layer ReduceMean_679 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_490: Reduce mean layer ReduceMean_490 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedReduceMeanLayerError in op ReduceMean_683: Reduce mean layer ReduceMean_683 has unsupported axis -1 (must be over one spatial dimension only).
 UnsupportedShuffleLayerError in op Reshape_1796: Failed to determine type of layer to create in node Reshape_1796
 UnsupportedShuffleLayerError in op Reshape_1294: Failed to determine type of layer to create in node Reshape_1294
 UnsupportedShuffleLayerError in op Reshape_1836: Failed to determine type of layer to create in node Reshape_1836
 UnsupportedShuffleLayerError in op Reshape_1727: Failed to determine type of layer to create in node Reshape_1727
 UnsupportedShuffleLayerError in op Reshape_1876: Failed to determine type of layer to create in node Reshape_1876
Please try to parse the model again, using these end node names: Transpose_440, Transpose_1745

Any recommendation on how I should change the architecture in order to make the parser work properly?

Hi @andrea.bonvini, currently the support for transformers is limited; this is one of the areas we are investing in heavily as a company. With that said, for networks that involve transformers, the compiler is at the moment not as robust as with CNNs, and with each released version you will see better support. The current version doesn’t fully support LayerNorm (LN); currently, even in the Model Zoo, we use a BatchNorm version for SegFormer. In the coming release (July) we expect to get better support for transformers.


Please check the “Supported ONNX Operations” chapter.

Transpose, Squeeze and Reshape can be used only in specific cases.

Hailo does not support the Gather layer; it is only supported for TensorFlow Lite.

I suggest three solutions.

First, convert the ONNX/torch layers into a combination of layers supported by Hailo. However, this is a very difficult task, and there may not be supported layers that cover everything.

Second, change MultiheadAttention to another attention module that works on Hailo (see the sketch after this list).

Last, wait for an update. It’s been a year since I requested an update about Gather and Transpose, but Hailo keeps deferring the issue to later work.
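
To make the second suggestion concrete, here is a hedged sketch of a single-head self-attention built only from Linear layers, batched matmuls and softmax, which avoids the Gather/Squeeze pattern that nn.MultiheadAttention exports. It is only a sketch and has not been verified against the Hailo parser:

import torch
import torch.nn as nn

class SimpleSelfAttention(nn.Module):
    """Single-head self-attention using only Linear + matmul + softmax."""
    def __init__(self, embed_dim: int):
        super().__init__()
        self.q = nn.Linear(embed_dim, embed_dim)
        self.k = nn.Linear(embed_dim, embed_dim)
        self.v = nn.Linear(embed_dim, embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.scale = embed_dim ** -0.5

    def forward(self, x):  # x: (N, L, C), batch-first layout
        q, k, v = self.q(x), self.k(x), self.v(x)
        attn = torch.softmax((q @ k.transpose(1, 2)) * self.scale, dim=-1)
        return self.proj(attn @ v)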


Thank you @Nadav.
However, I’m not quite sure how to reproduce the same network as segformer_b0_bn in pure PyTorch code. On GitHub it seems that you used the original implementation, but if I try to build the model by passing torch.nn.BatchNorm1d instead of torch.nn.LayerNorm, the forward pass just doesn’t work, since the norm is initialized with the embedding dimension instead of the number of image patches.
e.g. here

A possible workaround may be to do something like this:

        if norm_layer == "LayerNorm":
            self.norm1 = nn.LayerNorm(normalized_shape=dim)
        elif norm_layer == "BatchNorm1d":
            self.norm1 = nn.BatchNorm1d(num_features=number_of_patches)  # number of patches -> HxW
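
As a small, hypothetical shape check to illustrate why the num_features choice matters: nn.BatchNorm1d on a 3D tensor normalizes over dim 1, so with an (N, num_patches, embed_dim) layout it has to be constructed with num_features equal to the number of patches, not the embedding dimension.

import torch
import torch.nn as nn

x = torch.randn(2, 64, 32)               # (N, num_patches, embed_dim)
bn_ok = nn.BatchNorm1d(num_features=64)  # num_features == num_patches
print(bn_ok(x).shape)                    # torch.Size([2, 64, 32])

bn_bad = nn.BatchNorm1d(num_features=32)  # num_features == embed_dim
# bn_bad(x) raises a RuntimeError, because dim 1 of x has size 64, not 32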

Does this correspond to what was done for the model in the model_zoo?

The links do not work. The first one is just the segmentation models table from the model_zoo github/hailo-ai/hailo_model_zoo/blob/master/docs/public_models/HAILO8/HAILO8_semantic_segmentation.rst

The second one is line 125 of the original implementation from NVLabs github/NVlabs/SegFormer/blob/65fa8cfa9b52b6ee7e8897a98705abf8570f9e32/mmseg/models/backbones/mix_transformer.py#L125

Moreover, I cannot find any BatchNormalization block in the .onnx downloadable from the Model Zoo; I mainly see ReduceMean blocks. So I guess some custom nn.Module was created that uses torch.mean() under the hood.
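
A hypothetical illustration of that guess (not the actual Model Zoo code): a LayerNorm-style module written with torch.mean() exports as ReduceMean/Sub/Sqrt nodes instead of a dedicated normalization op.

import torch
import torch.nn as nn

class MeanBasedNorm(nn.Module):
    """LayerNorm-like normalization written with torch.mean(), so the ONNX export
    contains ReduceMean nodes rather than a single LayerNorm op."""
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.bias = nn.Parameter(torch.zeros(dim))
        self.eps = eps

    def forward(self, x):  # x: (N, L, C)
        mu = x.mean(dim=-1, keepdim=True)                 # exported as ReduceMean
        var = (x - mu).pow(2).mean(dim=-1, keepdim=True)  # another ReduceMean
        return (x - mu) / torch.sqrt(var + self.eps) * self.weight + self.bias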
First ONNX layers in model_zoo’s model:


First ONNX layer in my custom model:

My Custom model:

# ---------------------------------------------------------------
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
#
# This work is licensed under the NVIDIA Source Code License
# ---------------------------------------------------------------
import collections
import math
import warnings
from itertools import repeat
from typing import List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F



def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)  # noqa
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    tensor.clamp_(min=a, max=b)
    return tensor


def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    # type: (Tensor, float, float, float, float) -> Tensor
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
    applied while sampling the normal with mean/std applied, therefore a, b args
    should be adjusted to match the range of mean, std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    with torch.no_grad():
        return _trunc_normal_(tensor, mean, std, a, b)


class Mlp(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = self.fc1(x)
        x = self.dwconv(x, H, W)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        sr_ratio=1,
        norm_layer: str = "LayerNorm",
        num_patches: Optional[int] = None
    ):
        super().__init__()
        assert (
            dim % num_heads == 0
        ), f"dim {dim} should be divided by num_heads {num_heads}."

        assert norm_layer in ("LayerNorm", "BatchNorm1d")
        if norm_layer == "BatchNorm1d":
            assert num_patches is not None

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            if norm_layer == "LayerNorm":
                self.norm = nn.LayerNorm(normalized_shape=dim)
            elif norm_layer == "BatchNorm1d":
                assert (num_patches % sr_ratio ** 2) == 0
                self.norm = nn.BatchNorm1d(num_features=num_patches // sr_ratio**2)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        B, N, C = x.shape
        q = (
            self.q(x)
            .reshape(B, N, self.num_heads, C // self.num_heads)
            .permute(0, 2, 1, 3)
        )

        if self.sr_ratio > 1:
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
            x_ = self.norm(x_)
            kv = (
                self.kv(x_)
                .reshape(B, -1, 2, self.num_heads, C // self.num_heads)
                .permute(2, 0, 3, 1, 4)
            )
        else:
            kv = (
                self.kv(x)
                .reshape(B, -1, 2, self.num_heads, C // self.num_heads)
                .permute(2, 0, 3, 1, 4)
            )
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (
        x.ndim - 1
    )  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f"drop_prob={round(self.drop_prob,3):0.3f}"


class Block(nn.Module):

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer: str = "LayerNorm",
        num_features: Optional[int] = None,
        sr_ratio=1,
    ):
        super().__init__()

        if norm_layer == "LayerNorm":
            self.norm1 = nn.LayerNorm(normalized_shape=dim)
        elif norm_layer == "BatchNorm1d":
            self.norm1 = nn.BatchNorm1d(num_features=num_features)

        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio,
            norm_layer=norm_layer,
            num_patches=num_features
        )
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        if norm_layer == "LayerNorm":
            self.norm2 = nn.LayerNorm(normalized_shape=dim)
        elif norm_layer == "BatchNorm1d":
            self.norm2 = nn.BatchNorm1d(num_features=num_features)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
        )
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
        return x


class OverlapPatchEmbed(nn.Module):
    """Image to Patch Embedding"""

    def __init__(
        self,
        img_size=224,
        patch_size=7,
        stride=4,
        in_chans=3,
        embed_dim=768,
        norm_layer: str = "LayerNorm",
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.img_size = img_size
        self.patch_size = patch_size
        padding_h, padding_w = patch_size[0] // 2, patch_size[1] // 2
        self.H, self.W = (
            (img_size[0] - patch_size[0] + 2 * padding_h) // stride + 1,
            (img_size[1] - patch_size[1] + 2 * padding_w) // stride + 1,
        )  # changed from the original implementation's img_size[0] // patch_size, img_size[1] // patch_size
        self.num_patches = (
            self.H * self.W
        )  # computed but not used in original implementation.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=stride,
            padding=(padding_h, padding_w),
        )

        if norm_layer == "LayerNorm":
            self.norm = nn.LayerNorm(normalized_shape=embed_dim)
        elif norm_layer == "BatchNorm1d":
            self.norm = nn.BatchNorm1d(num_features=self.num_patches)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x, H, W


def to_2tuple(x) -> Tuple[int, int]:
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
        return tuple(x)
    if isinstance(x, np.ndarray) and len(x) == 2:
        return tuple(x)
    return tuple(repeat(x, 2))


class MixVisionTransformer(
    nn.Module
):  # https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py
    def __init__(
        self,
        img_size: np.ndarray = np.array([224, 224]).astype(np.uint32),
        patch_size=16,  # FIXME: unused. I will leave it here since this is the original implementation.
        # (will be removed once Hailo supports all the layers we need)
        in_chans=3,
        num_classes=1000,
        embed_dims=[64, 128, 256, 512],
        num_heads=[1, 2, 4, 8],
        mlp_ratios=[4, 4, 4, 4],
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer: str = "LayerNorm",
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
    ):
        super().__init__()
        assert norm_layer in ("LayerNorm", "BatchNorm1d")
        self.num_classes = num_classes
        self.depths = depths

        # patch_embed
        self.patch_embed1 = OverlapPatchEmbed(
            img_size=img_size,
            patch_size=7,
            stride=4,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
            norm_layer=norm_layer,
        )
        self.patch_embed2 = OverlapPatchEmbed(
            img_size=img_size // 4,
            patch_size=3,
            stride=2,
            in_chans=embed_dims[0],
            embed_dim=embed_dims[1],
            norm_layer=norm_layer,
        )
        self.patch_embed3 = OverlapPatchEmbed(
            img_size=img_size // 8,
            patch_size=3,
            stride=2,
            in_chans=embed_dims[1],
            embed_dim=embed_dims[2],
            norm_layer=norm_layer,
        )
        self.patch_embed4 = OverlapPatchEmbed(
            img_size=img_size // 16,
            patch_size=3,
            stride=2,
            in_chans=embed_dims[2],
            embed_dim=embed_dims[3],
            norm_layer=norm_layer,
        )

        # transformer encoder
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]  # stochastic depth decay rule
        cur = 0
        self.block1 = nn.ModuleList(
            [
                Block(
                    dim=embed_dims[0],
                    num_heads=num_heads[0],
                    mlp_ratio=mlp_ratios[0],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[0],
                    num_features=(
                        np.prod(img_size // 4) if norm_layer == "BatchNorm1d" else None
                    ),
                )
                for i in range(depths[0])
            ]
        )

        if norm_layer == "LayerNorm":
            self.norm1 = nn.LayerNorm(normalized_shape=embed_dims[0])
        elif norm_layer == "BatchNorm1d":
            self.norm1 = nn.BatchNorm1d(num_features=np.prod(img_size // 4))

        cur += depths[0]
        self.block2 = nn.ModuleList(
            [
                Block(
                    dim=embed_dims[1],
                    num_heads=num_heads[1],
                    mlp_ratio=mlp_ratios[1],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[1],
                    num_features=(
                        np.prod(img_size // 8) if norm_layer == "BatchNorm1d" else None
                    ),
                )
                for i in range(depths[1])
            ]
        )
        if norm_layer == "LayerNorm":
            self.norm2 = nn.LayerNorm(normalized_shape=embed_dims[1])
        elif norm_layer == "BatchNorm1d":
            self.norm2 = nn.BatchNorm1d(num_features=np.prod(img_size // 8))

        cur += depths[1]
        self.block3 = nn.ModuleList(
            [
                Block(
                    dim=embed_dims[2],
                    num_heads=num_heads[2],
                    mlp_ratio=mlp_ratios[2],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[2],
                    num_features=(
                        np.prod(img_size // 16) if norm_layer == "BatchNorm1d" else None
                    ),
                )
                for i in range(depths[2])
            ]
        )
        if norm_layer == "LayerNorm":
            self.norm3 = nn.LayerNorm(normalized_shape=embed_dims[2])
        elif norm_layer == "BatchNorm1d":
            self.norm3 = nn.BatchNorm1d(num_features=np.prod(img_size // 16))

        cur += depths[2]
        self.block4 = nn.ModuleList(
            [
                Block(
                    dim=embed_dims[3],
                    num_heads=num_heads[3],
                    mlp_ratio=mlp_ratios[3],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[3],
                    num_features=(
                        np.prod(img_size // 32) if norm_layer == "BatchNorm1d" else None
                    ),
                )
                for i in range(depths[3])
            ]
        )
        if norm_layer == "LayerNorm":
            self.norm4 = nn.LayerNorm(normalized_shape=embed_dims[3])
        elif norm_layer == "BatchNorm1d":
            self.norm4 = nn.BatchNorm1d(num_features=np.prod(img_size // 32))

        # classification head
        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def reset_drop_path(self, drop_path_rate):
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        for i in range(self.depths[0]):
            self.block1[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[0]
        for i in range(self.depths[1]):
            self.block2[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[1]
        for i in range(self.depths[2]):
            self.block3[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[2]
        for i in range(self.depths[3]):
            self.block4[i].drop_path.drop_prob = dpr[cur + i]

    def freeze_patch_emb(self):
        self.patch_embed1.requires_grad = False

    @torch.jit.ignore
    def no_weight_decay(self):
        return {
            "pos_embed1",
            "pos_embed2",
            "pos_embed3",
            "pos_embed4",
            "cls_token",
        }  # has pos_embed may be better

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=""):
        self.num_classes = num_classes
        self.head = (
            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
        )

    def forward_features(self, x):
        B = x.shape[0]
        outs = []

        # stage 1
        x, H, W = self.patch_embed1(x)
        for i, blk in enumerate(self.block1):
            x = blk(x, H, W)
        x = self.norm1(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        # stage 2
        x, H, W = self.patch_embed2(x)
        for i, blk in enumerate(self.block2):
            x = blk(x, H, W)
        x = self.norm2(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        # stage 3
        x, H, W = self.patch_embed3(x)
        for i, blk in enumerate(self.block3):
            x = blk(x, H, W)
        x = self.norm3(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        # stage 4
        x, H, W = self.patch_embed4(x)
        for i, blk in enumerate(self.block4):
            x = blk(x, H, W)
        x = self.norm4(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)
        return outs

    def forward(self, x):
        x = self.forward_features(x)
        # x = self.head(x)

        return x


class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)
        return x

class MiTB0(MixVisionTransformer):
    def __init__(self, in_channels: int, img_size: np.ndarray, norm_layer: str):
        super().__init__(
            in_chans=in_channels,
            img_size=img_size,
            patch_size=4,
            embed_dims=[32, 64, 160, 256],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=norm_layer,
            depths=[2, 2, 2, 2],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
        )




# Note: MLP and nlc_to_nchw are reused from the earlier SegFormer snippet in this thread.
class MLPDecoder(nn.Module):
    def __init__(
        self,
        embed_dims: List[int],
        decoder_embed_dim: int,
        num_classes: int,
        dropout_prob: float = 0.0,
    ):
        super().__init__()
        self.linear_layers = nn.ModuleList(
            [MLP(embedding_dim, decoder_embed_dim) for embedding_dim in embed_dims]
        )
        self.relu = nn.ReLU()
        n_stages = len(embed_dims)
        self.linear_fuse = nn.Conv2d(
            in_channels=decoder_embed_dim * n_stages,
            out_channels=decoder_embed_dim,
            kernel_size=1,
        )
        self.dropout = nn.Dropout(dropout_prob)
        self.linear_pred = nn.Conv2d(decoder_embed_dim, num_classes, kernel_size=1)

    def forward(self, features: List[torch.Tensor]):
        new_features = []
        target_resolution = features[0].shape[2:]  # In the original implementation
        # this resolution corresponds to (H/4,W/4), where H and W are the height and width of the original image.
        for i in range(len(features)):
            encoder_output = features[i]
            n, _, h, w = encoder_output.shape
            out = self.linear_layers[i](encoder_output)
            out = nlc_to_nchw(out, h, w)
            up_sampled_out = F.interpolate(
                out, size=target_resolution, mode="bilinear", align_corners=False
            )
            new_features.append(up_sampled_out)
        x = torch.cat(new_features, 1)
        x = self.relu(self.linear_fuse(x))
        x = self.dropout(x)
        x = self.linear_pred(x)
        return x

class SegFormerOriginalWithBatchNorm(torch.nn.Module):
    def __init__(self, in_channels: int, img_size_hw: Tuple[int, int], num_classes: int):
        super().__init__()
        if isinstance(img_size_hw, list):
            img_size_hw = tuple(img_size_hw)
        assert isinstance(img_size_hw, tuple) and len(img_size_hw) == 2
        self.encoder = MiTB0(in_channels=in_channels, img_size=np.array(img_size_hw), norm_layer="BatchNorm1d")
        self.decoder = MLPDecoder(
            embed_dims=[32, 64, 160, 256], decoder_embed_dim=256, num_classes=num_classes
        )

    def forward(self, x: torch.Tensor):
        features = self.encoder(x)
        out = self.decoder(features)
        logits_mask = F.interpolate(out, size=x.shape[2:], mode="bilinear", align_corners=False)
        return logits_mask

How I export the model:

model = SegFormerOriginalWithBatchNorm(in_channels=3, num_classes=2, img_size_hw=(256, 128))
input_shape_bchw = (1, 3, 256, 128)

tmp_onnx_output_path = "SegFormerOriginalWithBatchNorm.onnx"
# Export .ONNX successfully.
convert_pytorch_2_onnx(torch_model=model, input_shape_bchw=input_shape_bchw, output_path=tmp_onnx_output_path)

tmp_har_path = "SegFormerOriginalWithBatchNorm.har"
# Crashes here
convert_onnx_2_har(onnx_path=tmp_onnx_output_path, har_path=tmp_har_path, input_shape_bchw=input_shape_bchw)
convert_torch_module_2_hef(
    torch_module=model,
    hef_path="MyTest.hef",
    npy_rgb_unnormalized_bhwc_calibration_dataset_path=npy_calibration_dataset_path,
    input_shape_bchw=input_shape_bchw,
    apply_normalization=True
)

Conversion functions:

import os
import pathlib
import numpy as np
from typing import Union, Tuple, Optional, List, Dict
import onnx  # ==1.14.0

import torch  # ==2.3.0

def convert_pytorch_2_onnx(torch_model: torch.nn.Module, input_shape_bchw: Tuple[int, int, int, int], output_path: Union[str, pathlib.Path]) -> None:
    torch_input = torch.randn(input_shape_bchw)
    torch_model.eval()
    with torch.no_grad():
        torch.onnx.export(
            torch_model,
            torch_input,
            str(output_path),
            verbose=False,
            do_constant_folding=False,
            opset_version=15,  # Use the appropriate ONNX opset version
            input_names=["input"],
            output_names=['output']
        )

    onnx_model = onnx.load(str(output_path))
    onnx.checker.check_model(onnx_model)


def convert_onnx_2_har(
        onnx_path: Union[str, pathlib.Path],
        har_path: Union[str, pathlib.Path],
        input_shape_bchw: Tuple[int, int, int, int],
        hw_arch: str = "hailo8",
):
    from hailo_sdk_client import ClientRunner, InferenceContext  # noqa  hailo_sdk_client==3.27.0

    onnx_model_name = pathlib.Path(onnx_path).stem
    onnx_model = onnx.load(onnx_path)
    onnx_inputs = onnx_model.graph.input
    onnx_outputs = onnx_model.graph.output

    start_node_name = onnx_inputs[0].name
    end_node_name = onnx_outputs[0].name

    runner = ClientRunner(hw_arch=hw_arch)
    _ = runner.translate_onnx_model(
        str(onnx_path),
        onnx_model_name,
        start_node_names=[start_node_name],
        end_node_names=[end_node_name],
        net_input_shapes={start_node_name: input_shape_bchw}
    ) 

    runner.save_har(str(har_path))
    test_images_bchw = np.random.randint(0, 255, size=input_shape_bchw)
    b,c,h,w = input_shape_bchw
    # test_images_bhwc = np.random.randint(0, 255, size=(b,h,w,c))
    with runner.infer_context(InferenceContext.SDK_NATIVE) as ctx:
        # CRASHES here.
        native_res = runner.infer(ctx, test_images_bchw)

Error:

Inference: 0entries [00:00, ?entries/s]Traceback (most recent call last):
  File "/my/project/conversion/conversion_tools.py", line 276, in <module>
    convert_torch_module_2_hef(
  File "/my/project/conversion/conversion_tools.py", line 199, in convert_torch_module_2_hef
    convert_onnx_2_har(onnx_path=tmp_onnx_output_path, har_path=tmp_har_path, input_shape_bchw=input_shape_bchw)
  File "/my/project/conversion/conversion_tools.py", line 127, in convert_onnx_2_har
    native_res = runner.infer(ctx, test_images_bchw)
  File ".venv/lib/python3.10/site-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
    return func(self, *args, **kwargs)
  File ".venv/lib/python3.10/site-packages/hailo_sdk_client/runner/client_runner.py", line 297, in infer
    return self._infer_emulator(data, data_count, batch_size, context)
  File ".venv/lib/python3.10/site-packages/hailo_sdk_client/runner/client_runner.py", line 316, in _infer_emulator
    return self._sdk_backend.acceleras_inference(dataset=dataset,
  File ".venv/lib/python3.10/site-packages/hailo_sdk_client/sdk_backend/sdk_backend.py", line 1015, in acceleras_inference
    return self._inference_flow.run(dataset, batch_size, data_count)
  File ".venv/lib/python3.10/site-packages/hailo_model_optimization/flows/inference_flow.py", line 43, in run
    infer_output = self.predict(dataset, callbacks=[callback], verbose=0)
  File ".venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File ".venv/lib/python3.10/site-packages/tensorflow/python/framework/func_graph.py", line 1200, in autograph_handler
    raise e.ag_error_metadata.to_exception(e)
tensorflow.python.framework.errors_impl.InvalidArgumentError: in user code:

    File ".venv/lib/python3.10/site-packages/keras/engine/training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File ".venv/lib/python3.10/site-packages/keras/engine/training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File ".venv/lib/python3.10/site-packages/keras/engine/training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File ".venv/lib/python3.10/site-packages/keras/engine/training.py", line 2111, in predict_step
        return self(x, training=False)
    File ".venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    InvalidArgumentError: Exception encountered when calling layer 'simulation_inference_model' (type SimulationInferenceModel).

    in user code:

        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/flows/inference_flow.py", line 99, in call  *
            return self._model(inputs, **kwargs)
        File ".venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/acceleras/model/hailo_model/hailo_model.py", line 948, in build
            self.compute_output_shape(input_shape)
        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/acceleras/model/hailo_model/hailo_model.py", line 895, in compute_output_shape
            return self.compute_and_verify_output_shape(input_shape, verify_layer_inputs_shape=False)
        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/acceleras/model/hailo_model/hailo_model.py", line 929, in compute_and_verify_output_shape
            layer_output_shape = layer.compute_output_shape(layer_input_shapes)
        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/acceleras/hailo_layers/base_hailo_layer.py", line 1321, in compute_output_shape
            op_output_shape = op.compute_output_shape(op_input_shapes)
        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/acceleras/atomic_ops/base_atomic_op.py", line 646, in compute_output_shape
            shapes = self._compute_output_shape(input_shape)
        File ".venv/lib/python3.10/site-packages/hailo_model_optimization/acceleras/atomic_ops/base_atomic_op.py", line 672, in _compute_output_shape

Are there any recent updates on MultiHeadAttention?

We have added many transformer-based models to the Model Zoo, such as:
DETR
DEiT
FastViT
Fast-SAM
ViTPose
CLIP
and others. You’re welcome to experiment with those.