Hailo8 - help needed with YOLOv4 output interpretation

Hello,
I need some assistance to run an example inference application on the Hailo8.

I’m trying to use the license plate detection network with the infer_pipeline_example.

I see that even though the in-out buffers are setup no data is written/read from it, so no actual inference is performed.

The network requires a 416x416x3 image as input, so I provided one in the form of a txt file, where the bytes of the image are represented as characters.

At this point I need to read out the bounding box informations. I used the little information that I could find online regarding yolo v4 output structure, but the result is still wrong. I attached below my code, any guidance would be appreciated.

void manualDrawBox(
    std::vector<uint8_t> &image_data,
    int width,
    int height,
    const std::vector<uint8_t> &centers,
    const std::vector<uint8_t> &obj,
    const std::vector<uint8_t> &probs,
    const std::vector<uint8_t> &scales,
    float confidence_threshold,
    float obj_threshold, int r, int g, int b)
{

    int channels = 3; // Assuming RGB channels

    size_t num_boxes = obj.size(); // Assuming obj vector length represents the number of boxes

    for (size_t i = 0; i < num_boxes; ++i)
    {
        float objectness = static_cast<float>(obj[i]) / 255.0f;
        if (objectness > 0.5)
        {
            cout << "found a good box" << endl;

            // Extract the center and scale information
            int center_x = static_cast<int>((static_cast<float>(centers[i * 2]) / 255.0f) * width);
            int center_y = static_cast<int>((static_cast<float>(centers[i * 2 + 1]) / 255.0f) * height);
            int box_width = static_cast<int>((static_cast<float>(scales[i * 2]) / 255.0f) * width);
            int box_height = static_cast<int>((static_cast<float>(scales[i * 2 + 1]) / 255.0f) * height);

            // center = output + offset of relative cell

            cout << "[" << center_x << " - " << center_y << "]" << "w: " << box_width << "h: " << box_height << endl;

            // Calculate top-left and bottom-right corners
            int top_left_x = std::max(center_x - box_width / 2, 0);
            int top_left_y = std::max(center_y - box_height / 2, 0);
            int bottom_right_x = std::min(center_x + box_width / 2, width - 1);
            int bottom_right_y = std::min(center_y + box_height / 2, height - 1);

            // Draw top and bottom horizontal lines of the bounding box
            for (int x = top_left_x; x <= bottom_right_x; ++x)
            {
                // Top border
                int top_index = (top_left_y * width + x) * channels;
                image_data[top_index] = r;     // Blue
                image_data[top_index + 1] = g; // Green
                image_data[top_index + 2] = b; // Red

                // Bottom border
                int bottom_index = (bottom_right_y * width + x) * channels;
                image_data[bottom_index] = r;     // Blue
                image_data[bottom_index + 1] = g; // Green
                image_data[bottom_index + 2] = b; // Red
            }

            // Draw left and right vertical lines of the bounding box
            for (int y = top_left_y; y <= bottom_right_y; ++y)
            {
                // Left border
                int left_index = (y * width + top_left_x) * channels;
                image_data[left_index] = r;     // Blue
                image_data[left_index + 1] = g; // Green
                image_data[left_index + 2] = b; // Red

                // Right border
                int right_index = (y * width + bottom_right_x) * channels;
                image_data[right_index] = r;     // Blue
                image_data[right_index + 1] = g; // Green
                image_data[right_index + 2] = b; // Red
            }
        }
    }
}

hailo_status infer(InferVStreams &pipeline)
{
    const size_t frames_count = 1;
    std::vector<uint8_t> image_data = read_image_from_file("image.txt");

    // Set input buffers and MemoryView
    std::map<std::string, std::vector<uint8_t>> input_data;
    auto input_vstreams = pipeline.get_input_vstreams();
    for (const auto &input_vstream : input_vstreams)
    {
        input_data.emplace(input_vstream.get().name(), std::vector<uint8_t>(input_vstream.get().get_frame_size() * frames_count));
    }
    for (const auto &input_vstream : input_vstreams)
    {
        auto &input_buffer = input_data[input_vstream.get().name()];
        for (size_t i = 0; i < image_data.size(); ++i)
        {
            input_buffer[i] = static_cast<uint8_t>(image_data[i] + 128);
        }
    }
    std::map<std::string, MemoryView> input_data_mem_views;
    for (const auto &input_vstream : input_vstreams)
    {
        auto &input_buffer = input_data[input_vstream.get().name()];
        input_data_mem_views.emplace(input_vstream.get().name(), MemoryView(input_buffer.data(), input_buffer.size()));
    }

    // Set output buffers and MemoryView
    std::map<std::string, std::vector<uint8_t>> output_data;
    auto output_vstreams = pipeline.get_output_vstreams();
    for (const auto &output_vstream : output_vstreams)
    {
        output_data.emplace(output_vstream.get().name(), std::vector<uint8_t>(output_vstream.get().get_frame_size() * frames_count));
    }
    std::map<std::string, MemoryView> output_data_mem_views;
    for (const auto &output_vstream : output_vstreams)
    {
        auto &output_buffer = output_data[output_vstream.get().name()];
        output_data_mem_views.emplace(output_vstream.get().name(), MemoryView(output_buffer.data(), output_buffer.size()));
    }

    // Inference
    hailo_status status = pipeline.infer(input_data_mem_views, output_data_mem_views, frames_count);
    if (status != HAILO_SUCCESS)
    {
        return status;
    }

    // Read output
    vector<uint8_t> centers_19;
    vector<uint8_t> obj_19;
    vector<uint8_t> probs_19;
    vector<uint8_t> scales_19;

    vector<uint8_t> centers_21;
    vector<uint8_t> obj_21;
    vector<uint8_t> probs_21;
    vector<uint8_t> scales_21;

    string s;

// Process output
    std::vector<std::pair<std::string, std::vector<float>>> processed_output;
    for (const auto &output_vstream : output_vstreams)
    {
        s = output_vstream.get().name();

        if (s == "tiny_yolov4_license_plates/conv19_centers")
        {
            centers_19 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv19_obj")
        {
            obj_19 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv19_probs")
        {
            probs_19 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv19_scales")
        {
            scales_19 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv21_centers")
        {
            centers_21 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv21_obj")
        {
            obj_21 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv21_probs")
        {
            probs_21 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }

        if (s == "tiny_yolov4_license_plates/conv21_scales")
        {
            scales_21 = output_data[s];
            cout << s << " - " << output_vstream.get().get_frame_size() << endl;
        }
    }

    manualDrawBox(image_data, 416, 416, centers_19, obj_19, probs_19, scales_19, 0.1, 0.1, 255, 0, 0);
    manualDrawBox(image_data, 416, 416, centers_21, obj_21, probs_21, scales_21, 0.1, 0.1, 0, 0, 255);

    // write image to file etc

    return HAILO_SUCCESS;
}

Hi @Stefano_Nicolis
If you are ok using other models, you can see our guide on license plate recognition: A Comprehensive Guide to Building a License Plate Recognition (LPR) Systems

I got it to work by using this function to extract the detection boxes and “writing” them directly in the image data:

std::vector<unsigned char> draw_yolov4_boxes(
    std::vector<unsigned char> &input_image,
    uint8_t *conv19_centers, size_t conv19_centers_size,
    uint8_t *conv19_obj, size_t conv19_obj_size,
    uint8_t *conv19_probs, size_t conv19_probs_size,
    uint8_t *conv19_scales, size_t conv19_scales_size,
    uint8_t *conv21_centers, size_t conv21_centers_size,
    uint8_t *conv21_obj, size_t conv21_obj_size,
    uint8_t *conv21_probs, size_t conv21_probs_size,
    uint8_t *conv21_scales, size_t conv21_scales_size,
    bool use_direct_sizes = false)
{

    // Image dimensions
    const int img_width = 416;
    const int img_height = 416;
    const int channels = 3;

    // Grid dimensions and strides
    const int grid_sizes[2] = {13, 26};
    const float strides[2] = {32.0f, 16.0f};

    // Anchor boxes for 416x416 input
    const float anchors[2][3][2] = {
        {{23.0f, 27.0f}, {81.0f, 82.0f}, {224.0f, 246.0f}},
        {{10.0f, 14.0f}, {37.0f, 58.0f}, {135.0f, 169.0f}}};

    // Output image (copy input)
    std::vector<unsigned char> output_image = input_image;

    // Process each detection layer (conv19 and conv21)
    for (int layer = 0; layer < 2; ++layer)
    {
        const int grid_size = grid_sizes[layer];
        const float stride = strides[layer];
        const int num_anchors = 3;

        // Select the appropriate arrays for this layer
        uint8_t *centers = (layer == 0) ? conv19_centers : conv21_centers;
        uint8_t *obj = (layer == 0) ? conv19_obj : conv21_obj;
        uint8_t *probs = (layer == 0) ? conv19_probs : conv21_probs;
        uint8_t *scales = (layer == 0) ? conv19_scales : conv21_scales;

        size_t centers_size = (layer == 0) ? conv19_centers_size : conv21_centers_size;
        size_t obj_size = (layer == 0) ? conv19_obj_size : conv21_obj_size;
        size_t probs_size = (layer == 0) ? conv19_probs_size : conv21_probs_size;
        size_t scales_size = (layer == 0) ? conv19_scales_size : conv21_scales_size;

        // Process each grid cell
        for (int i = 0; i < grid_size; ++i)
        {
            for (int j = 0; j < grid_size; ++j)
            {
                for (int a = 0; a < num_anchors; ++a)
                {
                    int base_idx = (i * grid_size + j) * num_anchors + a;
                    int centers_idx = base_idx * 2;
                    int scales_idx = base_idx * 2;

                    // Validate indices
                    if (centers_idx + 1 >= centers_size || scales_idx + 1 >= scales_size ||
                        base_idx >= obj_size || base_idx >= probs_size)
                    {
                        std::cerr << "Error: Index out of bounds at grid (" << i << "," << j
                                  << "), anchor " << a << std::endl;
                        continue;
                    }

                    // Extract values
                    float tx = centers[centers_idx];
                    float ty = centers[centers_idx + 1];
                    float tw = scales[scales_idx];
                    float th = scales[scales_idx + 1];
                    float objectness = obj[base_idx];
                    float class_prob = probs[base_idx];

                    // Apply sigmoid
                    objectness = sigmoid(objectness);
                    class_prob = sigmoid(class_prob);

                    // Filter by objectness threshold
                    if (objectness <= 0.5f)
                    {
                        continue;
                    }

                    // Compute bounding box coordinates
                    float bx = (sigmoid(tx) + static_cast<float>(j)) * stride;
                    float by = (sigmoid(ty) + static_cast<float>(i)) * stride;

                    float bw, bh;
                    if (use_direct_sizes)
                    {
                        bw = tw;
                        bh = th;
                    }
                    else
                    {
                        float anchor_w = anchors[layer][a][0];
                        float anchor_h = anchors[layer][a][1];
                        bw = anchor_w * std::exp(tw);
                        bh = anchor_h * std::exp(th);
                    }

                    // Convert to (x_min, y_min, x_max, y_max)
                    float x_min = bx - bw / 2.0f;
                    float y_min = by - bh / 2.0f;
                    float x_max = bx + bw / 2.0f;
                    float y_max = by + bh / 2.0f;

                    // Clip to image boundaries
                    x_min = std::max(0.0f, std::min(x_min, static_cast<float>(img_width - 1)));
                    x_max = std::max(0.0f, std::min(x_max, static_cast<float>(img_width - 1)));
                    y_min = std::max(0.0f, std::min(y_min, static_cast<float>(img_height - 1)));
                    y_max = std::max(0.0f, std::min(y_max, static_cast<float>(img_height - 1)));

                    // Draw bounding box
                    int x1 = static_cast<int>(x_min);
                    int x2 = static_cast<int>(x_max);
                    int y1 = static_cast<int>(y_min);
                    int y2 = static_cast<int>(y_max);

                    // Draw top and bottom edges
                    for (int x = x1; x <= x2; ++x)
                    {
                        if (y1 >= 0 && y1 < img_height && x >= 0 && x < img_width)
                        {
                            int idx = (y1 * img_width + x) * channels;
                            output_image[idx] = 255;   // R
                            output_image[idx + 1] = 0; // G
                            output_image[idx + 2] = 0; // B
                        }
                        if (y2 >= 0 && y2 < img_height && x >= 0 && x < img_width)
                        {
                            int idx = (y2 * img_width + x) * channels;
                            output_image[idx] = 255;
                            output_image[idx + 1] = 0;
                            output_image[idx + 2] = 0;
                        }
                    }

                    // Draw left and right edges
                    for (int y = y1; y <= y2; ++y)
                    {
                        if (y >= 0 && y < img_height && x1 >= 0 && x1 < img_width)
                        {
                            int idx = (y * img_width + x1) * channels;
                            output_image[idx] = 255;
                            output_image[idx + 1] = 0;
                            output_image[idx + 2] = 0;
                        }
                        if (y >= 0 && y < img_height && x2 >= 0 && x2 < img_width)
                        {
                            int idx = (y * img_width + x2) * channels;
                            output_image[idx] = 255;
                            output_image[idx + 1] = 0;
                            output_image[idx + 2] = 0;
                        }
                    }
                }
            }
        }
    }

    return output_image;
}

And calling it:


// getting output vector
    for (const auto &output_vstream : output_vstreams)
    {
        s = output_vstream.get().name();

        if (s == "tiny_yolov4_license_plates/conv19_centers")
        {
            centers_19 = output_data[s];
        }
       // etc........
}
    image_data = draw_yolov4_boxes(
        image_data,
        centers_19, 1014, // centers_19 and its size
        obj_19, 507,      // obj_19 and its size
        probs_19, 507,    // probs_19 and its size
        scales_19, 1014,  // scales_19 and its size
        centers_21, 4056, // centers_21 and its size
        obj_21, 2028,     // obj_21 and its size
        probs_21, 2028,   // probs_21 and its size
        scales_21, 4056,  // scales_21 and its size
        true              // use_direct_sizes
    );
1 Like