Output image is shifted on Hailo8

Hi, I use a Hailo-8 to run our image compression model. The output from onnxruntime on CPU is correct, but when the model runs on the Hailo-8 the output image is shifted relative to the input image.
input: [image]

Hailo-8 output: [image]

onnxruntime output: [image]

PyTorch model structure:

import torch
import torch.nn as nn
from net.channel import Channel
from loss.distortion import Distortion
from random import choice
import argparse

class ResBlock(nn.Module):
    def __init__(self, in_channel, out_channel, kernel_size=3, stride=1):
        super(ResBlock, self).__init__()
        self.relu1 = nn.ReLU()
        self.conv1 = nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding=kernel_size//2)
        self.relu2 = nn.ReLU()
        self.conv2 = nn.Conv2d(out_channel, out_channel, kernel_size, stride, padding=kernel_size//2)

    def forward(self, x):
        x_1 = self.relu1(x)
        first_layer = self.conv1(x_1)
        first_layer = self.relu2(first_layer)
        second_layer = self.conv2(first_layer)
        return x + second_layer


class EncoderOne(nn.Module):
    def __init__(self, out_channels) -> None:
        super().__init__()
        self.module = nn.Sequential(
            nn.Conv2d(3, out_channels, kernel_size=3, stride=2, padding=1),
            # DeformDownsample(3, out_channels, kernel_size=3, stride=2),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
            # conv(out_channels, out_channels, kernel_size=3, stride=2)
        )

    def forward(self, x):
        return self.module(x)

class EncoderTwo(nn.Module):  # TODO try the new idea, different dimension for different frames, first frame with full dimension and the rest with half dimension
    def __init__(self, in_channels, out_channels) -> None:
        super().__init__()
        self.module = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            # nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels)
        )

    def forward(self, x):
        return self.module(x)

class DecoderOne(nn.Module):
    def __init__(self, in_channels, out_channels) -> None:
        super().__init__()
        self.module = nn.Sequential(
            ResBlock(in_channels, in_channels),
            ResBlock(in_channels, in_channels),
            ResBlock(in_channels, in_channels),
            nn.ConvTranspose2d(in_channels,in_channels,kernel_size=3,stride=2,output_padding=1,padding=1),
            # deconv(in_channels, in_channels, kernel_size=3, stride=2),
            ResBlock(in_channels, in_channels),
            ResBlock(in_channels, in_channels),
            ResBlock(in_channels, in_channels),
            # nn.ConvTranspose2d(in_channels,in_channels,kernel_size=3,stride=2,output_padding=1,padding=1),
            ResBlock(in_channels, in_channels),
            ResBlock(in_channels, in_channels),
            ResBlock(in_channels, in_channels),
            nn.ConvTranspose2d(in_channels,out_channels,kernel_size=3,stride=2,output_padding=1,padding=1)
        )

    def forward(self, x):
        return self.module(x)

class DecoderTwo(nn.Module):
    def __init__(self, out_channels) -> None:
        super().__init__()
        self.module = nn.Sequential(
            # Added extra upsampling layer to compensate for the extra 2x downsampling from the deformable conv
            # deconv(out_channels, out_channels, kernel_size=3, stride=2),
            nn.ConvTranspose2d(out_channels, out_channels, kernel_size=3, stride=2, output_padding=1, padding=1),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            nn.ConvTranspose2d(out_channels, 3, kernel_size=3, stride=2, output_padding=1, padding=1)
            # deconv(out_channels, 3, kernel_size=3, stride=2)
        )

    def forward(self, x):
        return self.module(x)



class TsSemNet(nn.Module):
    def __init__(self, args, config):
        super(TsSemNet, self).__init__()
        self.encoder1 = EncoderOne(64)
        self.encoder2 = EncoderTwo(64, 128)
        self.decoder1 = DecoderOne(128, 64)
        self.decoder2 = DecoderTwo(64)
        self.channel = Channel(args, config)
        self.pass_channel = True
        self.squared_difference = torch.nn.MSELoss(reduction='none')
        self.distortion_loss = Distortion(args)
        self.multiple_snr = args.multiple_snr.split(",")
        for i in range(len(self.multiple_snr)):
            self.multiple_snr[i] = int(self.multiple_snr[i])

    def forward(self, input_image, given_SNR):
        feature = self.encoder1(input_image)
        feature = self.encoder2(feature)
        if self.training:
            feature = self.channel.forward(feature, given_SNR, False)
        feature = self.decoder1(feature)
        recon_image = self.decoder2(feature)
        return recon_image

Model conversion and run steps

# step one: export model to onnx format
torch.onnx.export(net, (img,13), "model/ts_semantic.onnx")
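
# (optional sanity check, a sketch: before moving on to the Hailo toolchain, confirm the
#  exported graph matches the PyTorch output; onnxruntime and the `img`/`net` variables
#  are assumed to be available as above)
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model/ts_semantic.onnx")
onnx_out = sess.run(None, {"input.1": img.detach().cpu().numpy()})[0]
torch_out = net(img, 13).detach().cpu().numpy()
print("max abs diff:", np.abs(onnx_out - torch_out).max())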

# step two: parse the model with the Hailo parser
from hailo_sdk_client import ClientRunner
from hailo_sdk_client.exposed_definitions import Dims
chosen_hw_arch="hailo8"
onnx_model_name = "se_256"
onnx_path = "ts_semantic.onnx"
runner = ClientRunner(hw_arch=chosen_hw_arch)
hn, npz = runner.translate_onnx_model(
    onnx_path,
    onnx_model_name,
    start_node_names=["input.1"],
    end_node_names=["241"],
    net_input_shapes={"input.1": [1, 3, 256, 256]},
    net_input_format={'input.1': [Dims.BATCH, Dims.CHANNELS, Dims.HEIGHT, Dims.WIDTH]},
    disable_shape_inference=True
    )
hailo_model_har_name = f'{onnx_model_name}_hailo_model.har'
runner.save_har(hailo_model_har_name)

# step three: optimize model
from hailo_sdk_client import ClientRunner
import numpy as np

calib_dataset = np.load('hailo_calib_256_1024.npy')

model_name = "se_256"
hailo_model_har_name = f'{model_name}_hailo_model.har'
runner = ClientRunner(har=hailo_model_har_name)
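# on-chip normalization: (x - 0) / 255 maps the uint8 input to [0, 1], so raw uint8 frames can be fed at runtime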
alls = "normalization1 = normalization([0.0, 0.0, 0.0], [255, 255, 255])\n"
runner.load_model_script(alls)
runner.optimize(calib_dataset)
quantized_model_har_path = f'{model_name}_quantized_model.har'
runner.save_har(quantized_model_har_path)

# step four: compile the model (shell command)
hailo compiler se_256_quantized_model.har

# step five: run the model with C++ code
#include "hailo/hailort.hpp"

#include <iostream>
#include <thread>
#include <filesystem>

#include <opencv2/opencv.hpp>

#undef max

constexpr int WIDTH  = 256;
constexpr int HEIGHT = 256;

using hailort::Device;
using hailort::Hef;
using hailort::Expected;
using hailort::make_unexpected;
using hailort::ConfiguredNetworkGroup;
using hailort::VStreamsBuilder;
using hailort::InputVStream;
using hailort::OutputVStream;
using hailort::MemoryView;


std::string dst_dir{};

std::string getCmdOption(int argc, char *argv[], const std::string &option)
{
    std::string cmd;
    for (int i = 1; i < argc; ++i)
    {
        std::string arg = argv[i];
        if (0 == arg.find(option, 0))
        {
            std::size_t found = arg.find("=", 0) + 1;
            cmd = arg.substr(found, 200);
            return cmd;
        }
    }
    return cmd;
}

Expected<std::shared_ptr<ConfiguredNetworkGroup>> configure_network_group(Device &device, const std::string &hef_file)
{
    auto hef = Hef::create(hef_file);
    if (!hef) {
        return make_unexpected(hef.status());
    }

    auto configure_params = hef->create_configure_params(HAILO_STREAM_INTERFACE_PCIE);
    if (!configure_params) {
        return make_unexpected(configure_params.status());
    }

    auto network_groups = device.configure(hef.value(), configure_params.value());
    if (!network_groups) {
        return make_unexpected(network_groups.status());
    }

    if (1 != network_groups->size()) {
        std::cerr << "Invalid amount of network groups" << std::endl;
        return make_unexpected(HAILO_INTERNAL_FAILURE);
    }

    return std::move(network_groups->at(0));
}

template <typename T=InputVStream>
std::string info_to_str(T &stream)
{
    std::string result = stream.get_info().name;
    result += " (";
    result += std::to_string(stream.get_info().shape.height);
    result += ", ";
    result += std::to_string(stream.get_info().shape.width);
    result += ", ";
    result += std::to_string(stream.get_info().shape.features);
    result += ")";
    return result;
}

template <typename T>
hailo_status write_all(std::vector<InputVStream> &input, std::string &video_path)
{
    std::vector<cv::String> file_names;
    cv::glob(video_path, file_names, false);
    std::cout << "-I- Started write thread " << video_path << std::endl;

    for (std::string file : file_names) {
        auto rgb_frame = cv::imread(file,  cv::IMREAD_COLOR);

        if (rgb_frame.channels() == 3)
            cv::cvtColor(rgb_frame, rgb_frame, cv::COLOR_BGR2RGB);


        if (rgb_frame.rows != HEIGHT || rgb_frame.cols != WIDTH)
            cv::resize(rgb_frame, rgb_frame, cv::Size(WIDTH, HEIGHT), cv::INTER_AREA);
        // rgb_frame.convertTo(rgb_frame, CV_32FC3, 1.0/255.0f);
        int factor = std::is_same<T, uint8_t>::value ? 1 : 4;                                  // In case we use float32_t, we have 4 bytes per component
        auto status = input[0].write(MemoryView(rgb_frame.data, HEIGHT * WIDTH * 3 * factor)); // Writing HEIGHT * WIDTH, 3 channels of uint8
        if (HAILO_SUCCESS != status)
            return status;
    }
    return HAILO_SUCCESS;
}

template <typename T>
hailo_status read_all(OutputVStream &output, std::string &video_path)
{
    // std::vector<T> data(output.get_frame_size());
    int data_size = WIDTH * HEIGHT * 3 * 4;
    cv::Mat ret_mat(HEIGHT, WIDTH, CV_32FC3);
    std::vector<cv::String> file_names;
    std::cout << "-I- Started read thread " << std::endl;
    cv::glob(video_path, file_names, false);
    size_t num_frames = 0;
    for (std::string file : file_names) {

        auto status = output.read(MemoryView(ret_mat.data, data_size));
        if (HAILO_SUCCESS != status)
            return status;
        num_frames++;
        cv::Mat save_mat;
        ret_mat.convertTo(save_mat, CV_8UC3, 255.0f);
        cv::cvtColor(save_mat, save_mat, cv::COLOR_RGB2BGR);
        std::filesystem::path file_path(file);
        std::string file_name = file_path.filename().string();
        std::string save_path = dst_dir + "/" + file_name;
        cv::imwrite(save_path, save_mat);
        std::cout << "save img " << save_path << std::endl;
    }
    std::cout << "-I- Finished read thread " << std::endl;
    return HAILO_SUCCESS;
}

void print_net_banner(std::pair< std::vector<InputVStream>, std::vector<OutputVStream> > &vstreams) {
    std::cout << "-I---------------------------------------------------------------------" << std::endl;
    std::cout << "-I- Dir  Name                                     " << std::endl;
    std::cout << "-I---------------------------------------------------------------------" << std::endl;
    for (auto &value: vstreams.first)
        std::cout << "-I- IN:  " << info_to_str<InputVStream>(value) << std::endl;
    std::cout << "-I---------------------------------------------------------------------" << std::endl;
    for (auto &value: vstreams.second)
        std::cout << "-I- OUT: " << info_to_str<OutputVStream>(value) << std::endl;
    std::cout << "-I---------------------------------------------------------------------" << std::endl;
}

template <typename IN_T, typename OUT_T>
hailo_status infer(std::vector<InputVStream> &inputs, std::vector<OutputVStream> &outputs, std::string video_path)
{
    hailo_status input_status = HAILO_UNINITIALIZED;
    hailo_status output_status = HAILO_UNINITIALIZED;
    std::vector<std::thread> output_threads;

    std::thread input_thread([&inputs, &video_path, &input_status]() { input_status = write_all<IN_T>(inputs, video_path); });

    for (auto &output: outputs)
        output_threads.push_back( std::thread([&output, &video_path, &output_status]() { output_status = read_all<OUT_T>(output, video_path); }) );

    input_thread.join();

    for (auto &out: output_threads)
        out.join();

    if ((HAILO_SUCCESS != input_status) || (HAILO_SUCCESS != output_status)) {
        return HAILO_INTERNAL_FAILURE;
    }

    std::cout << "-I- Inference finished successfully" << std::endl;
    return HAILO_SUCCESS;
}

int main(int argc, char**argv)
{
    std::string hef_file   = getCmdOption(argc, argv, "-hef=");
    std::string src_dir = getCmdOption(argc, argv, "-src=");
    dst_dir = getCmdOption(argc, argv, "-dst=");
    auto all_devices       = Device::scan_pcie();
    std::cout << "-src dir: " << src_dir << std::endl;
    std::cout << "hef: " << hef_file << std::endl;
    std::cout << "dst dir: " << dst_dir << std::endl;

    auto device = Device::create_pcie(all_devices.value()[0]);
    if (!device) {
        std::cerr << "-E- Failed create_pcie " << device.status() << std::endl;
        return device.status();
    }

    auto network_group = configure_network_group(*device.value(), hef_file);
    if (!network_group) {
        std::cerr << "-E- Failed to configure network group " << hef_file << std::endl;
        return network_group.status();
    }

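    // Input vstream takes raw uint8 RGB (the on-chip normalization divides by 255);
    // the output vstream returns float32 values in [0, 1], hence the x255 scaling when saving.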
    auto input_vstream_params = network_group.value()->make_input_vstream_params(true, HAILO_FORMAT_TYPE_UINT8, HAILO_DEFAULT_VSTREAM_TIMEOUT_MS, HAILO_DEFAULT_VSTREAM_QUEUE_SIZE);
    auto output_vstream_params = network_group.value()->make_output_vstream_params(false, HAILO_FORMAT_TYPE_FLOAT32, HAILO_DEFAULT_VSTREAM_TIMEOUT_MS, HAILO_DEFAULT_VSTREAM_QUEUE_SIZE);
    auto input_vstreams  = VStreamsBuilder::create_input_vstreams(*network_group.value(), input_vstream_params.value());
    auto output_vstreams = VStreamsBuilder::create_output_vstreams(*network_group.value(), output_vstream_params.value());
    if (!input_vstreams || !output_vstreams) {
        std::cerr << "-E- Failed creating input: " << input_vstreams.status() << " output status:" << output_vstreams.status() << std::endl;
        return input_vstreams.status();
    }
    auto vstreams = std::make_pair(input_vstreams.release(), output_vstreams.release());

    print_net_banner(vstreams);

    auto activated_network_group = network_group.value()->activate();
    if (!activated_network_group) {
        std::cerr << "-E- Failed activated network group " << activated_network_group.status();
        return activated_network_group.status();
    }

    auto status  = infer<uint8_t, float32_t>(vstreams.first, vstreams.second, src_dir);

    if (HAILO_SUCCESS != status) {
        std::cerr << "-E- Inference failed "  << status << std::endl;
        return status;
    }
    return HAILO_SUCCESS;
}

Could you please help me solve this problem? If needed, I can provide my model so you can reproduce the issue. Thanks.

Hey @Sun_Jie1,

Welcome to the Hailo Community!

The root cause is how our Dataflow Compiler handles ConvTranspose operations. We currently support 3×3 deconvolutions with stride 2×2 using SAME_TENSORFLOW padding mode, which means your 16×16→32×32 and 32×32→64×64 upsampling follows TensorFlow’s “same” padding conventions. Meanwhile, PyTorch’s ConvTranspose2d with pad=1 and output_padding=1 uses ONNX’s more flexible padding rules, and that can place the kernel anchors one pixel off from where Hailo puts them.

That single pixel difference in kernel positioning is what’s causing the slight shift you’re seeing in your final 256×256 output.
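
As a quick sanity check (a sketch using your decoder's 3×3, stride-2 settings), both conventions give the same output size, so the mismatch shows up only as a positional shift, never as a shape error:

in_size, stride, pad, kernel, output_padding = 16, 2, 1, 3, 1
out_pytorch = (in_size - 1) * stride - 2 * pad + kernel + output_padding  # 32
out_tf_same = in_size * stride                                            # 32
# Same size, but the implied left/right crop differs by one pixel
# (PyTorch effectively crops at the start, TF SAME at the end),
# which is what offsets the kernel anchors.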

Here are three ways to fix this:

Option 1: Replace deconv with upsample + conv (my recommendation)
Instead of using ConvTranspose2d, try rewriting your decoder layers like this:

import torch.nn.functional as F

class UpsampleBlock(nn.Module):
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # nearest-neighbor upsample by 2, then 3×3 conv
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        return self.conv(x)

Our compiler handles Resize operations (both nearest-neighbor and bilinear) really well, so this approach should give you bit-identical results between CPU and Hailo.
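
For example, DecoderTwo could be rewritten roughly like this (a sketch using the UpsampleBlock above; DecoderOne's two ConvTranspose2d layers would get the same treatment):

class DecoderTwo(nn.Module):
    def __init__(self, out_channels) -> None:
        super().__init__()
        self.module = nn.Sequential(
            UpsampleBlock(out_channels, out_channels),  # replaces the first ConvTranspose2d
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            ResBlock(out_channels, out_channels),
            UpsampleBlock(out_channels, 3)              # replaces the final ConvTranspose2d
        )

    def forward(self, x):
        return self.module(x)

Keep in mind the parameterization changes, so the decoder will likely need retraining or at least fine-tuning before you re-export.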

Option 2: Use PixelShuffle
If you prefer keeping it as a single fused operation, replace each:

nn.ConvTranspose2d(in_ch, out_ch, kernel_size=3, stride=2, padding=1, output_padding=1)

with:

nn.Sequential(
    nn.Conv2d(in_ch, out_ch * 4, kernel_size=3, padding=1),
    nn.PixelShuffle(upscale_factor=2),
)

PixelShuffle just reorders feature maps into spatial blocks, so there’s no padding semantics to disagree on between platforms.
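
A quick shape check (a sketch) shows the replacement upsamples by 2× exactly like the original layer:

import torch
import torch.nn as nn

x = torch.randn(1, 64, 64, 64)
deconv = nn.ConvTranspose2d(64, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
shuffle = nn.Sequential(nn.Conv2d(64, 64 * 4, kernel_size=3, padding=1), nn.PixelShuffle(upscale_factor=2))
print(deconv(x).shape, shuffle(x).shape)  # both torch.Size([1, 64, 128, 128])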

Option 3: Align ONNX padding to TensorFlow (if you must keep ConvTranspose)
Our compiler automatically treats 3×3 stride-2 deconvolutions as SAME_TENSORFLOW padding, but PyTorch’s ONNX exporter usually sets explicit pads and output_padding attributes. You can try:

  1. Re-export your model so every ConvTranspose node uses auto_pad='SAME_UPPER' (ONNX's counterpart of TensorFlow's "same" padding) instead of explicit pads and output_padding attributes
  2. Make sure disable_shape_inference=False during translation so the compiler picks up the ONNX auto_pad correctly
  3. Check your ONNX graph (Netron works great for this, or see the short script below) to confirm each ConvTranspose has auto_pad='SAME_UPPER' and no conflicting pads/output_padding
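
If Netron isn't handy, a short script along these lines (a sketch; the path matches step two above) prints the relevant attributes:

import onnx

model = onnx.load("ts_semantic.onnx")
for node in model.graph.node:
    if node.op_type == "ConvTranspose":
        attrs = {a.name: onnx.helper.get_attribute_value(a) for a in node.attribute}
        print(node.name, attrs)  # inspect auto_pad, pads, output_padding, strides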

I’d recommend the upsample+conv approach as it’s the most reliable way to eliminate any deconvolution padding mismatches on Hailo-8.

Hope this helps!

Hi @omria
Thank you for your help. I’ve tried the first solution and it solved my problem.