Hi, I am using a Hailo-8 to run our image compression model. The output image from ONNX Runtime on the CPU is correct, but when the same model runs on the Hailo-8 the output image is shifted relative to the input image.
input:
hailo8 output:
onnxruntime output:
PyTorch model structure:
import torch
import torch.nn as nn
from net.channel import Channel
from loss.distortion import Distortion
from random import choice
import argparse
class ResBlock(nn.Module):
    """Residual block: ReLU -> conv -> ReLU -> conv, summed with the input.

    Both convolutions use ``kernel_size // 2`` padding and the same
    ``stride``. The skip connection is a plain ``x + branch`` addition, so
    the branch output has to be addable to the unmodified input.
    """

    def __init__(self, in_channel, out_channel, kernel_size=3, stride=1):
        super().__init__()
        half_kernel = kernel_size // 2
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys and the order in which weights are initialised.
        self.relu1 = nn.ReLU()
        self.conv1 = nn.Conv2d(
            in_channel, out_channel, kernel_size, stride, padding=half_kernel
        )
        self.relu2 = nn.ReLU()
        self.conv2 = nn.Conv2d(
            out_channel, out_channel, kernel_size, stride, padding=half_kernel
        )

    def forward(self, x):
        """Return ``x`` plus the output of the two-convolution branch."""
        branch = self.conv1(self.relu1(x))
        branch = self.conv2(self.relu2(branch))
        return x + branch
class EncoderOne(nn.Module):
    """First encoder stage: stride-2 conv, three ResBlocks, stride-2 conv.

    The input convolution expects 3 channels; every later layer works on
    ``out_channels`` channels.
    """

    def __init__(self, out_channels) -> None:
        super().__init__()
        # Layers are created in the same order as they appear in the
        # Sequential, so indices (and therefore state_dict keys) are stable.
        layers = [nn.Conv2d(3, out_channels, kernel_size=3, stride=2, padding=1)]
        layers.extend(ResBlock(out_channels, out_channels) for _ in range(3))
        layers.append(
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
        )
        self.module = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        encoded = self.module(x)
        return encoded
# TODO try the new idea, different dimension for different frames, first frame
# with full dimension and the rest with half dimension
class EncoderTwo(nn.Module):
    """Second encoder stage.

    Layout: stride-2 conv (``in_channels`` -> ``out_channels``), six
    ResBlocks, stride-2 conv, three ResBlocks.
    """

    def __init__(self, in_channels, out_channels) -> None:
        super().__init__()

        def res_stack(count):
            # Fresh ResBlocks, created left to right.
            return [ResBlock(out_channels, out_channels) for _ in range(count)]

        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        ]
        layers += res_stack(6)
        layers.append(
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
        )
        layers += res_stack(3)
        self.module = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        encoded = self.module(x)
        return encoded
class DecoderOne(nn.Module):
    """First decoder stage.

    Layout: three ResBlocks, stride-2 transposed conv (channels unchanged),
    six ResBlocks, stride-2 transposed conv (``in_channels`` ->
    ``out_channels``).
    """

    def __init__(self, in_channels, out_channels) -> None:
        super().__init__()

        def res_stack(count):
            # Fresh ResBlocks, created left to right.
            return [ResBlock(in_channels, in_channels) for _ in range(count)]

        def upsample(target_channels):
            return nn.ConvTranspose2d(
                in_channels,
                target_channels,
                kernel_size=3,
                stride=2,
                output_padding=1,
                padding=1,
            )

        layers = res_stack(3)
        layers.append(upsample(in_channels))
        layers += res_stack(6)
        layers.append(upsample(out_channels))
        self.module = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        decoded = self.module(x)
        return decoded
class DecoderTwo(nn.Module):
    """Second decoder stage: transposed conv, three ResBlocks, transposed conv.

    The final transposed convolution maps ``out_channels`` to 3 channels.
    """

    def __init__(self, out_channels) -> None:
        super().__init__()

        def upsample(target_channels):
            return nn.ConvTranspose2d(
                out_channels,
                target_channels,
                kernel_size=3,
                stride=2,
                output_padding=1,
                padding=1,
            )

        # Original author's note on the first layer: added extra upsampling
        # layer to compensate for the extra 2x downsampling from the
        # deformable conv.
        layers = [upsample(out_channels)]
        layers.extend(ResBlock(out_channels, out_channels) for _ in range(3))
        layers.append(upsample(3))
        self.module = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        decoded = self.module(x)
        return decoded
class TsSemNet(nn.Module):
    """Encoder -> (channel, training only) -> decoder image network."""

    def __init__(self, args, config):
        """Build the sub-modules.

        ``args`` and ``config`` are forwarded to ``Channel``; ``args`` is also
        passed to ``Distortion`` and must expose ``multiple_snr`` as a
        comma-separated string of integers.
        """
        super().__init__()
        self.encoder1 = EncoderOne(64)
        self.encoder2 = EncoderTwo(64, 128)
        self.decoder1 = DecoderOne(128, 64)
        self.decoder2 = DecoderTwo(64)
        self.channel = Channel(args, config)
        self.pass_channel = True
        self.squared_difference = torch.nn.MSELoss(reduction='none')
        self.distortion_loss = Distortion(args)
        # "1,4,7" -> [1, 4, 7]
        self.multiple_snr = [int(snr) for snr in args.multiple_snr.split(",")]

    def forward(self, input_image, given_SNR):
        """Reconstruct ``input_image``.

        ``given_SNR`` is only used in training mode, where the encoded
        feature is passed through ``self.channel`` before decoding.
        """
        latent = self.encoder2(self.encoder1(input_image))
        if self.training:
            latent = self.channel.forward(latent, given_SNR, False)
        return self.decoder2(self.decoder1(latent))
Model conversion and run steps:
# step one: export the model to ONNX format
# The tuple holds the positional arguments handed to the model during export
# (presumably TsSemNet.forward's input_image and given_SNR -- confirm).
export_inputs = (img, 13)
torch.onnx.export(net, export_inputs, "model/ts_semantic.onnx")
# step two: parse the ONNX model with the Hailo parser
from hailo_sdk_client import ClientRunner
from hailo_sdk_client.exposed_definitions import Dims
# Target device and naming for the translated model.
chosen_hw_arch = "hailo8"
onnx_model_name = "se_256"
onnx_path = "ts_semantic.onnx"

# The single graph input and the layout it is declared with.
input_node = "input.1"
nchw_layout = [Dims.BATCH, Dims.CHANNELS, Dims.HEIGHT, Dims.WIDTH]

runner = ClientRunner(hw_arch=chosen_hw_arch)
hn, npz = runner.translate_onnx_model(
    onnx_path,
    onnx_model_name,
    start_node_names=[input_node],
    end_node_names=["241"],
    net_input_shapes={input_node: [1, 3, 256, 256]},
    net_input_format={input_node: nchw_layout},
    disable_shape_inference=True,
)

# Save the parsed model as a HAR file.
hailo_model_har_name = f'{onnx_model_name}_hailo_model.har'
runner.save_har(hailo_model_har_name)
# step three: optimize model
from hailo_sdk_client import ClientRunner
import numpy as np
# Calibration data is loaded first, exactly as before.
calib_dataset = np.load('hailo_calib_256_1024.npy')

# File names derived from the model name.
model_name = "se_256"
hailo_model_har_name = f'{model_name}_hailo_model.har'
quantized_model_har_path = f'{model_name}_quantized_model.har'

# Load the parsed HAR, apply the model script, then optimize with the
# calibration set.
runner = ClientRunner(har=hailo_model_har_name)
alls = "normalization1 = normalization([0.0, 0.0, 0.0], [255, 255, 255])\n"
runner.load_model_script(alls)
runner.optimize(calib_dataset)

runner.save_har(quantized_model_har_path)
# step four: compile the quantized HAR saved in step three
hailo compiler se_256_quantized_model.har
# step five: run the model with C++ code
#include "hailo/hailort.hpp"
#include <iostream>
#include <thread>
#include <filesystem>
#include <opencv2/opencv.hpp>
#undef max
constexpr int WIDTH = 256;
constexpr int HEIGHT = 256;
using hailort::Device;
using hailort::Hef;
using hailort::Expected;
using hailort::make_unexpected;
using hailort::ConfiguredNetworkGroup;
using hailort::VStreamsBuilder;
using hailort::InputVStream;
using hailort::OutputVStream;
using hailort::MemoryView;
std::string dst_dir{};
/**
 * Look up a "-name=value" style command-line option.
 *
 * Scans argv[1..argc-1] for the first argument that starts with `option`
 * and returns everything after that argument's first '='. If the matching
 * argument contains no '=', the whole argument is returned.
 *
 * The value is returned in full; earlier versions cut it off after 200
 * characters, which silently corrupted long paths.
 *
 * @param argc    Number of entries in argv.
 * @param argv    Command-line arguments; argv[0] is skipped.
 * @param option  Prefix to look for, e.g. "-hef=".
 * @return The option value, or an empty string when no argument matches.
 */
std::string getCmdOption(int argc, char *argv[], const std::string &option)
{
    for (int i = 1; i < argc; ++i)
    {
        const std::string arg = argv[i];
        // Prefix comparison only; no need to search the rest of the argument.
        if (0 != arg.compare(0, option.size(), option))
        {
            continue;
        }
        // npos + 1 wraps to 0, so an argument without '=' yields itself.
        const std::size_t value_start = arg.find('=') + 1;
        return arg.substr(value_start);
    }
    return std::string();
}
/**
 * Load a HEF file and configure it on `device`.
 *
 * @param device    Device to configure.
 * @param hef_file  Path of the HEF file to load.
 * @return The single configured network group, or the failing status of the
 *         step that went wrong. HAILO_INTERNAL_FAILURE is returned when the
 *         HEF does not yield exactly one network group.
 */
Expected<std::shared_ptr<ConfiguredNetworkGroup>> configure_network_group(Device &device, const std::string &hef_file)
{
    auto parsed_hef = Hef::create(hef_file);
    if (!parsed_hef) {
        return make_unexpected(parsed_hef.status());
    }

    auto params = parsed_hef.value().create_configure_params(HAILO_STREAM_INTERFACE_PCIE);
    if (!params) {
        return make_unexpected(params.status());
    }

    auto groups = device.configure(parsed_hef.value(), params.value());
    if (!groups) {
        return make_unexpected(groups.status());
    }

    // This program drives exactly one network group.
    if (groups->size() != 1) {
        std::cerr << "Invalid amount of network groups" << std::endl;
        return make_unexpected(HAILO_INTERNAL_FAILURE);
    }

    return std::move(groups->at(0));
}
/**
 * Format a vstream as "name (height, width, features)".
 *
 * @tparam T      Stream type; must provide get_info() with `name` and
 *                `shape.{height,width,features}` members.
 * @param stream  Stream to describe.
 * @return The formatted description.
 */
template <typename T=InputVStream>
std::string info_to_str(T &stream)
{
    const auto &info = stream.get_info();
    const auto &shape = info.shape;

    std::string description = info.name;
    description += " (" + std::to_string(shape.height);
    description += ", " + std::to_string(shape.width);
    description += ", " + std::to_string(shape.features);
    description += ")";
    return description;
}
/**
 * Feed every image matched by `video_path` to the first input vstream.
 *
 * Each file is loaded with OpenCV, converted from BGR to RGB, resized to
 * WIDTH x HEIGHT when necessary and written as one frame.
 *
 * @tparam T  Host-side element type of the frame. uint8_t sends the 8-bit
 *            pixels unchanged; any other type is treated as a 4-byte float
 *            and the frame is converted to CV_32FC3 before it is written, so
 *            the buffer really holds the number of bytes passed to write().
 * @param input       Input vstreams; only input[0] is written to.
 * @param video_path  Pattern handed to cv::glob to enumerate the images.
 * @return HAILO_SUCCESS when all frames were written, the status of the
 *         first failing write otherwise, or HAILO_INTERNAL_FAILURE when
 *         there is no input vstream or an image cannot be decoded.
 */
template <typename T>
hailo_status write_all(std::vector<InputVStream> &input, std::string &video_path)
{
    if (input.empty()) {
        std::cerr << "-E- No input vstream to write to" << std::endl;
        return HAILO_INTERNAL_FAILURE;
    }

    std::vector<cv::String> file_names;
    cv::glob(video_path, file_names, false);
    std::cout << "-I- Started write thread " << video_path << std::endl;

    constexpr bool is_uint8 = std::is_same<T, uint8_t>::value;
    // 3 channels; 1 byte per component for uint8, 4 bytes for float32.
    const size_t frame_bytes = static_cast<size_t>(HEIGHT) * WIDTH * 3 * (is_uint8 ? 1 : 4);

    for (const auto &file : file_names) {
        cv::Mat rgb_frame = cv::imread(file, cv::IMREAD_COLOR);
        if (rgb_frame.empty()) {
            // The reader thread expects one frame per globbed file, so a
            // skipped file would leave it waiting; fail loudly instead.
            std::cerr << "-E- Failed to read image " << file << std::endl;
            return HAILO_INTERNAL_FAILURE;
        }
        if (rgb_frame.channels() == 3)
            cv::cvtColor(rgb_frame, rgb_frame, cv::COLOR_BGR2RGB);
        if (rgb_frame.rows != HEIGHT || rgb_frame.cols != WIDTH)
            cv::resize(rgb_frame, rgb_frame, cv::Size(WIDTH, HEIGHT), cv::INTER_AREA);
        if (!is_uint8) {
            // Previously the 8-bit buffer was read as if it were 4x larger.
            // NOTE(review): values stay in 0..255 here; the original kept a
            // 1/255 scaling commented out -- confirm the expected range.
            rgb_frame.convertTo(rgb_frame, CV_32FC3);
        }
        if (!rgb_frame.isContinuous())
            rgb_frame = rgb_frame.clone();

        auto status = input[0].write(MemoryView(rgb_frame.data, frame_bytes));
        if (HAILO_SUCCESS != status)
            return status;
    }
    return HAILO_SUCCESS;
}
/**
 * Read one output frame per input image and save it under `dst_dir`.
 *
 * The frame is read as HEIGHT x WIDTH x 3 float32 values, scaled by 255 into
 * 8-bit, converted from RGB to BGR and written with the same file name as
 * the corresponding input image.
 *
 * @tparam T  Unused by the body; the read buffer is always float32.
 * @param output      Output vstream to read from.
 * @param video_path  Pattern handed to cv::glob; determines how many frames
 *                    are read and the names of the saved files.
 * @return HAILO_SUCCESS when every frame was read, the status of the first
 *         failing read otherwise, or HAILO_INTERNAL_FAILURE when the
 *         vstream's frame size does not match the float32 image buffer.
 *         A failed image save is reported on stderr and does not stop the
 *         loop, so the writer thread is never left without a reader.
 */
template <typename T>
hailo_status read_all(OutputVStream &output, std::string &video_path)
{
    const size_t data_size = static_cast<size_t>(WIDTH) * HEIGHT * 3 * sizeof(float);
    cv::Mat ret_mat(HEIGHT, WIDTH, CV_32FC3);
    std::vector<cv::String> file_names;
    std::cout << "-I- Started read thread " << std::endl;

    // Catch a layout/size mismatch up front with a readable message instead
    // of handing the vstream a buffer of the wrong size.
    const size_t frame_size = output.get_frame_size();
    if (frame_size != data_size) {
        std::cerr << "-E- Output frame size is " << frame_size << " bytes, expected " << data_size << std::endl;
        return HAILO_INTERNAL_FAILURE;
    }

    cv::glob(video_path, file_names, false);
    for (const auto &file : file_names) {
        auto status = output.read(MemoryView(ret_mat.data, data_size));
        if (HAILO_SUCCESS != status)
            return status;

        cv::Mat save_mat;
        ret_mat.convertTo(save_mat, CV_8UC3, 255.0f);
        cv::cvtColor(save_mat, save_mat, cv::COLOR_RGB2BGR);

        std::filesystem::path file_path(file);
        std::string file_name = file_path.filename().string();
        std::string save_path = dst_dir + "/" + file_name;
        if (!cv::imwrite(save_path, save_mat)) {
            std::cerr << "-E- Failed to save img " << save_path << std::endl;
            continue;
        }
        std::cout << "save img " << save_path << std::endl;
    }
    std::cout << "-I- Finished read thread " << std::endl;
    return HAILO_SUCCESS;
}
/**
 * Print a banner listing every input and output vstream.
 *
 * @param vstreams  Pair of (input vstreams, output vstreams).
 */
void print_net_banner(std::pair< std::vector<InputVStream>, std::vector<OutputVStream> > &vstreams) {
    const char *const rule = "-I---------------------------------------------------------------------";

    std::cout << rule << std::endl;
    std::cout << "-I- Dir Name " << std::endl;
    std::cout << rule << std::endl;
    for (auto &in_stream : vstreams.first) {
        std::cout << "-I- IN: " << info_to_str<InputVStream>(in_stream) << std::endl;
    }
    std::cout << rule << std::endl;
    for (auto &out_stream : vstreams.second) {
        std::cout << "-I- OUT: " << info_to_str<OutputVStream>(out_stream) << std::endl;
    }
    std::cout << rule << std::endl;
}
/**
 * Run inference over all images matched by `video_path`.
 *
 * One thread writes frames to the input vstreams while one thread per
 * output vstream reads and saves the results. All threads are joined before
 * returning.
 *
 * Each reader thread records its result in its own slot. A single shared
 * status variable would be written concurrently by every reader and could
 * hide a failure behind a later success.
 *
 * @tparam IN_T   Element type forwarded to write_all.
 * @tparam OUT_T  Element type forwarded to read_all.
 * @param inputs      Input vstreams.
 * @param outputs     Output vstreams; one reader thread is started per entry.
 * @param video_path  Pattern identifying the input images.
 * @return HAILO_SUCCESS when the writer and every reader succeeded,
 *         HAILO_INTERNAL_FAILURE otherwise (including when `outputs` is
 *         empty).
 */
template <typename IN_T, typename OUT_T>
hailo_status infer(std::vector<InputVStream> &inputs, std::vector<OutputVStream> &outputs, std::string video_path)
{
    hailo_status input_status = HAILO_UNINITIALIZED;
    std::vector<hailo_status> output_statuses(outputs.size(), HAILO_UNINITIALIZED);
    std::vector<std::thread> output_threads;
    output_threads.reserve(outputs.size());

    std::thread input_thread([&inputs, &video_path, &input_status]() { input_status = write_all<IN_T>(inputs, video_path); });
    for (size_t i = 0; i < outputs.size(); ++i) {
        output_threads.emplace_back([&outputs, &output_statuses, &video_path, i]() {
            output_statuses[i] = read_all<OUT_T>(outputs[i], video_path);
        });
    }

    input_thread.join();
    for (auto &out: output_threads)
        out.join();

    // With no outputs nothing was read, which counts as a failure (as before).
    bool outputs_ok = !output_statuses.empty();
    for (const auto status : output_statuses) {
        if (HAILO_SUCCESS != status) {
            outputs_ok = false;
        }
    }

    if ((HAILO_SUCCESS != input_status) || !outputs_ok) {
        return HAILO_INTERNAL_FAILURE;
    }
    std::cout << "-I- Inference finished successfully" << std::endl;
    return HAILO_SUCCESS;
}
int main(int argc, char**argv)
{
std::string hef_file = getCmdOption(argc, argv, "-hef=");
std::string src_dir = getCmdOption(argc, argv, "-src=");
dst_dir = getCmdOption(argc, argv, "-dst=");
auto all_devices = Device::scan_pcie();
std::cout << "-src dir: " << src_dir << std::endl;
std::cout << "hef: " << hef_file << std::endl;
std::cout << "dst dir: " << dst_dir << std::endl;
auto device = Device::create_pcie(all_devices.value()[0]);
if (!device) {
std::cerr << "-E- Failed create_pcie " << device.status() << std::endl;
return device.status();
}
auto network_group = configure_network_group(*device.value(), hef_file);
if (!network_group) {
std::cerr << "-E- Failed to configure network group " << hef_file << std::endl;
return network_group.status();
}
auto input_vstream_params = network_group.value()->make_input_vstream_params(true, HAILO_FORMAT_TYPE_UINT8, HAILO_DEFAULT_VSTREAM_TIMEOUT_MS, HAILO_DEFAULT_VSTREAM_QUEUE_SIZE);
auto output_vstream_params = network_group.value()->make_output_vstream_params(false, HAILO_FORMAT_TYPE_FLOAT32, HAILO_DEFAULT_VSTREAM_TIMEOUT_MS, HAILO_DEFAULT_VSTREAM_QUEUE_SIZE);
auto input_vstreams = VStreamsBuilder::create_input_vstreams(*network_group.value(), input_vstream_params.value());
auto output_vstreams = VStreamsBuilder::create_output_vstreams(*network_group.value(), output_vstream_params.value());
if (!input_vstreams || !output_vstreams) {
std::cerr << "-E- Failed creating input: " << input_vstreams.status() << " output status:" << output_vstreams.status() << std::endl;
return input_vstreams.status();
}
auto vstreams = std::make_pair(input_vstreams.release(), output_vstreams.release());
print_net_banner(vstreams);
auto activated_network_group = network_group.value()->activate();
if (!activated_network_group) {
std::cerr << "-E- Failed activated network group " << activated_network_group.status();
return activated_network_group.status();
}
auto status = infer<uint8_t, float32_t>(vstreams.first, vstreams.second, src_dir);
if (HAILO_SUCCESS != status) {
std::cerr << "-E- Inference failed " << status << std::endl;
return status;
}
return HAILO_SUCCESS;
}
Could you please help me solve this problem? If needed, I can provide my model so that you can reproduce it. Thanks.