Unable to convert simplest PyTorch model

Hi, I am trying to do the most basic things with the Hailo Dataflow Compiler, such as converting ONNX to HAR and running the quantization step. I probably do not understand some of the steps needed to run a simple network on Hailo.

Does the Dataflow Compiler understand what nn.Embedding (from PyTorch) or the Gather operation is?

Here is the model visualized with Netron:

Here is a simple script I am trying to work with:

import torch
import torch.nn as nn
import hailo_sdk_client
from hailo_sdk_client import ClientRunner

print(f'Hailo Dataflow Compiler v{hailo_sdk_client.__version__}')

batch_size = 1
input_len = 15
vocab_len = 256  # byte values (byte-level UTF-8)
embedding_len = 256

torch.manual_seed(0)
model = nn.Sequential(
    nn.Embedding(vocab_len, embedding_len),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(input_len * embedding_len, 256, bias=False),
    nn.ReLU(),
    nn.Linear(256, vocab_len, bias=False),
)

output = model(torch.zeros(batch_size, input_len, dtype=torch.long))
print(f"{output.mean()=}, {output.std(unbiased=False)=}, {output.shape=}")

with torch.no_grad():
    dummy_input = torch.randint(vocab_len, (batch_size, input_len))
    # torch.onnx.export(model, dummy_input, "model.onnx", verbose=True, input_names=["input"], output_names=["output"])
    torch.onnx.export(model, dummy_input, "model.onnx", verbose=True)

# chosen_hw_arch = "hailo8"
# chosen_hw_arch = "hailo15h"  # For Hailo-15 devices
chosen_hw_arch = "hailo8r"  # For Mini PCIe modules or Hailo-8R devices
runner = ClientRunner(hw_arch=chosen_hw_arch)
hn, npz = runner.translate_onnx_model(
    "model.onnx",
    "network",
    start_node_names=["/0/Gather"],
    end_node_names=["/5/MatMul"],
    net_input_shapes={"/0/Gather": [batch_size, input_len]},
)
runner.save_har("model.har")

runner.optimize(None)

hef = runner.compile()
file_name = "model.hef"
with open(file_name, "wb") as f:
    f.write(hef)

Output:

Hailo Dataflow Compiler v3.28.0
output.mean()=tensor(-0.0118, grad_fn=<MeanBackward0>), output.std(unbiased=False)=tensor(0.1509, grad_fn=<StdBackward0>), output.shape=torch.Size([1, 256])
Exported graph: graph(%input.1 : Long(1, 15, strides=[15, 1], requires_grad=0, device=cpu),
      %0.weight : Float(256, 256, strides=[256, 1], requires_grad=1, device=cpu),
      %onnx::MatMul_12 : Float(3840, 256, strides=[1, 3840], requires_grad=0, device=cpu),
      %onnx::MatMul_13 : Float(256, 256, strides=[1, 256], requires_grad=0, device=cpu)):
  %/0/Gather_output_0 : Float(1, 15, 256, strides=[3840, 256, 1], requires_grad=0, device=cpu) = onnx::Gather[onnx_name="/0/Gather"](%0.weight, %input.1), scope: torch.nn.modules.container.Sequential::/torch.nn.modules.sparse.Embedding::0 # /fedora/p/i/qubu/.venv/lib/python3.10/site-packages/torch/nn/functional.py:2267:0
  %/1/Relu_output_0 : Float(1, 15, 256, strides=[3840, 256, 1], requires_grad=0, device=cpu) = onnx::Relu[onnx_name="/1/Relu"](%/0/Gather_output_0), scope: torch.nn.modules.container.Sequential::/torch.nn.modules.activation.ReLU::1 # /fedora/p/i/qubu/.venv/lib/python3.10/site-packages/torch/nn/functional.py:1500:0
  %/2/Flatten_output_0 : Float(1, 3840, strides=[3840, 1], requires_grad=0, device=cpu) = onnx::Flatten[axis=1, onnx_name="/2/Flatten"](%/1/Relu_output_0), scope: torch.nn.modules.container.Sequential::/torch.nn.modules.flatten.Flatten::2 # /fedora/p/i/qubu/.venv/lib/python3.10/site-packages/torch/nn/modules/flatten.py:50:0
  %/3/MatMul_output_0 : Float(1, 256, strides=[256, 1], requires_grad=0, device=cpu) = onnx::MatMul[onnx_name="/3/MatMul"](%/2/Flatten_output_0, %onnx::MatMul_12), scope: torch.nn.modules.container.Sequential::/torch.nn.modules.linear.Linear::3 # /fedora/p/i/qubu/.venv/lib/python3.10/site-packages/torch/nn/modules/linear.py:117:0
  %/4/Relu_output_0 : Float(1, 256, strides=[256, 1], requires_grad=0, device=cpu) = onnx::Relu[onnx_name="/4/Relu"](%/3/MatMul_output_0), scope: torch.nn.modules.container.Sequential::/torch.nn.modules.activation.ReLU::4 # /fedora/p/i/qubu/.venv/lib/python3.10/site-packages/torch/nn/functional.py:1500:0
  %11 : Float(1, 256, strides=[256, 1], requires_grad=0, device=cpu) = onnx::MatMul[onnx_name="/5/MatMul"](%/4/Relu_output_0, %onnx::MatMul_13), scope: torch.nn.modules.container.Sequential::/torch.nn.modules.linear.Linear::5 # /fedora/p/i/qubu/.venv/lib/python3.10/site-packages/torch/nn/modules/linear.py:117:0
  return (%11)

[info] Translation started on ONNX model network
[info] Restored ONNX model network (completion time: 00:00:00.03)
[info] Extracted ONNXRuntime meta-data for Hailo model (completion time: 00:00:00.11)
Traceback (most recent call last):
  File "/fedora/p/i/qubu/main_converter.py", line 35, in <module>
    hn, npz = runner.translate_onnx_model(
  File "/fedora/p/i/qubu/.venv/lib/python3.10/site-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
    return func(self, *args, **kwargs)
  File "/fedora/p/i/qubu/.venv/lib/python3.10/site-packages/hailo_sdk_client/runner/client_runner.py", line 1158, in translate_onnx_model
    parser.translate_onnx_model(
  File "/fedora/p/i/qubu/.venv/lib/python3.10/site-packages/hailo_sdk_client/sdk_backend/parser/parser.py", line 209, in translate_onnx_model
    set_model_net_input_shapes(onnx_model, net_input_shapes)
  File "/fedora/p/i/qubu/.venv/lib/python3.10/site-packages/hailo_sdk_common/onnx_tools/onnx_shape_inference.py", line 202, in set_model_net_input_shapes
    raise UnsupportedGraphInputError(
hailo_sdk_common.onnx_tools.onnx_shape_inference.UnsupportedGraphInputError: Couldn't find predecessors for node /0/Gather in the given model.

Thank you.

Hey @ivanstepanovftw,

Welcome to the Hailo Community!

It looks like the Hailo Dataflow Compiler might not natively support the nn.Embedding layer or the Gather operation from PyTorch during the ONNX translation process. This is likely because embedding and gather operations are most common in NLP models, while Hailo’s hardware and tools are optimized primarily for vision models.

Here’s how you can approach this:

  1. Embedding/Gather Workaround:
    Since Hailo doesn’t directly support these layers, you can preprocess the embeddings outside the Hailo pipeline. Instead of using nn.Embedding, convert your inputs into a one-hot encoded representation or apply a precomputed embedding matrix before passing the data to Hailo. Because multiplying a one-hot vector by the embedding matrix selects exactly the row an embedding lookup would return, a fully connected layer can simulate the embedding behavior; export the modified model to ONNX as usual (see the sketch after this list).

  2. Input Shape Error:
    The error you’re seeing:

    UnsupportedGraphInputError: Couldn't find predecessors for node /0/Gather in the given model.
    

    suggests an issue with how the Gather node is structured. When you adjust the model to replace Gather, make sure the input tensor shape is explicitly defined and aligned with Hailo’s input requirements during the ONNX export.

  3. Alternative Approach:
    If modifying the embedding operation isn’t feasible, consider simplifying the model by removing unsupported layers, handling them in a preprocessing step on the host, and feeding only the compatible layers into the Hailo Dataflow Compiler.
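To make the one-hot workaround from point 1 concrete, here is a minimal sketch in plain PyTorch (independent of the Hailo toolchain) showing that a bias-free nn.Linear loaded with the transposed embedding matrix reproduces an nn.Embedding lookup on one-hot inputs exactly:

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_len, embedding_len = 256, 256
emb = nn.Embedding(vocab_len, embedding_len)

# nn.Linear computes x @ weight.T, so copying the transposed embedding
# matrix into the Linear weight turns a one-hot input into a row lookup.
fc = nn.Linear(vocab_len, embedding_len, bias=False)
with torch.no_grad():
    fc.weight.copy_(emb.weight.T)  # weight shape: (embedding_len, vocab_len)

indices = torch.randint(vocab_len, (1, 15))
one_hot = F.one_hot(indices, num_classes=vocab_len).float()

assert torch.allclose(emb(indices), fc(one_hot))  # identical outputs

The Linear-based model avoids the unsupported Gather operation entirely, at the cost of a larger (one-hot) input tensor.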

Let me know if you’d like more details on how to implement these adjustments! I’d be happy to assist.

Best regards

Thank you! Removing nn.Embedding indeed solves the problem. Here is the final code that works in Hailo Dataflow Compiler v3.28.0:

import torch
import torch.nn as nn
import hailo_sdk_client
from hailo_sdk_client import ClientRunner

print(f'Hailo Dataflow Compiler v{hailo_sdk_client.__version__}')

batch_size = 1
input_len = 15
vocab_len = 256  # byte values (byte-level UTF-8)
embedding_len = 256

torch.manual_seed(0)
model = nn.Sequential(
    nn.Linear(vocab_len, embedding_len),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(input_len * embedding_len, 256, bias=False),
    nn.ReLU(),
    nn.Linear(256, vocab_len, bias=False),
)

# Create one-hot input instead of embedding indices
input_data = torch.zeros(batch_size, input_len, vocab_len)
dummy_input = torch.randint(vocab_len, (batch_size, input_len))
for i in range(batch_size):
    for j in range(input_len):
        input_data[i, j, dummy_input[i, j]] = 1  # One-hot encoding

output = model(input_data)
print(f"{output.mean()=}, {output.std(unbiased=False)=}, {output.shape=}")

with torch.no_grad():
    # torch.onnx.export(model, input_data, "model.onnx", verbose=True, input_names=["input"], output_names=["output"])
    torch.onnx.export(model, input_data, "model.onnx", verbose=True)

# chosen_hw_arch = "hailo8"
# chosen_hw_arch = "hailo15h"  # For Hailo-15 devices
chosen_hw_arch = "hailo8r"  # For Mini PCIe modules or Hailo-8R devices
runner = ClientRunner(hw_arch=chosen_hw_arch)
hn, npz = runner.translate_onnx_model(
    "model.onnx",
    "network",
    start_node_names=["/0/MatMul"],
    end_node_names=["/5/MatMul"],
    net_input_shapes={"/0/MatMul": [batch_size, input_len, vocab_len]},
)
runner.save_har("model.har")

runner.optimize(None)

hef = runner.compile()
file_name = "model.hef"
with open(file_name, "wb") as f:
    f.write(hef)
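As a side note, the nested one-hot loop above can be written as a single torch.nn.functional.one_hot call, which builds the same tensor from the same dummy_input and vocab_len:

import torch.nn.functional as F

# (1, 15) long indices -> (1, 15, 256) float one-hot tensor
input_data = F.one_hot(dummy_input, num_classes=vocab_len).float()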