YOLOv8 output boxes not making any sense

I’m using a custom yolov8n model. Does infer_results require any additional postprocessing? What am I missing? Any help would be appreciated.

import cv2
import os, random, time
import numpy as np
from hailo_platform import (HEF, Device, VDevice, HailoStreamInterface, InferVStreams, ConfigureParams,
    InputVStreamParams, OutputVStreamParams, InputVStreams, OutputVStreams, FormatType)

# yolov8n input resolution (640x640)
INPUT_RES_H = 640
INPUT_RES_W = 640

# Loading compiled HEFs to device:
hef_path = 'yolov8n.hef'
video_file = "test.mp4"
hef = HEF(hef_path)

devices = Device.scan()

with VDevice(device_ids=devices) as target:
        configure_params = ConfigureParams.create_from_hef(hef, interface=HailoStreamInterface.PCIe)
        network_group = target.configure(hef, configure_params)[0]
        network_group_params = network_group.create_params()
        input_vstream_info = hef.get_input_vstream_infos()[0]
        output_vstream_info = hef.get_output_vstream_infos()[0]
        input_vstreams_params = InputVStreamParams.make_from_network_group(network_group, quantized=False, format_type=FormatType.FLOAT32)
        output_vstreams_params = OutputVStreamParams.make_from_network_group(network_group, quantized=False, format_type=FormatType.FLOAT32)
        height, width, channels = hef.get_input_vstream_infos()[0].shape
                            
        source = 'video'
        cap = cv2.VideoCapture(video_file)

        # check if the video source was opened successfully
        if not cap.isOpened():
            print("Could not open video source")
            exit()
        
        start_time = time.time()
        frame_count = 0

        while True:
            # read a frame from the video source
            ret, frame = cap.read()

            # check if the frame was successfully read
            if not ret:
                print("Could not read frame")
                break
            frame_count += 1
            # Get height and width from capture
            orig_w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  
            orig_h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)        

            # loop if video source
            if source == 'video' and not cap.get(cv2.CAP_PROP_POS_FRAMES) % cap.get(cv2.CAP_PROP_FRAME_COUNT):
                cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

            # resize frame to the model's 640x640 input resolution and infer it
            resized_img = cv2.resize(frame, (INPUT_RES_W, INPUT_RES_H), interpolation=cv2.INTER_AREA)
            with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
                input_data = {input_vstream_info.name: np.expand_dims(np.asarray(resized_img), axis=0).astype(np.float32)}    
                with network_group.activate(network_group_params):
                    infer_results = infer_pipeline.infer(input_data)
                    
            print(infer_results['yolov8n/yolov8_nms_postprocess'])  
            print("===========================")   
            for key in infer_results.keys():
                for cls, results in enumerate(infer_results[key][0]):
                    for x,y,w,h,conf in results:
                        if conf > 0.5:
                            x *= orig_w
                            y *= orig_h
                            w *= orig_w
                            h *= orig_h
                            cv2.rectangle(frame, (int(x-w/2),int(y-h/2)), (int(x+w/2),int(y+h/2)), (0,0,255), 2)
                    
            cv2.imshow('frame',frame)
            
            if cv2.waitKey(1) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                break
            avg_fps = frame_count / (time.time() - start_time)
            print(f"FPS = {avg_fps}")

Hi @theprimetux,
Can you please also share what the image looks like with the nonsensical boxes drawn on it?

Hey @Nadav

Thanks for the reply. I tried the above code with the pretrained yolov8s model. Using the object detection Python example in Hailo-Application-Code-Examples, the model detected the 3 people in the image, but with the code I posted I got the output below. It did detect 3 boxes, but all of them are misplaced.

The x, y, w, h, conf values I got are:

0.08861363 0.534286 0.999343 0.83911073 0.8983882
0.05825618 0.0012002736 0.9957857 0.35066473 0.887064
0.052458048 0.2884963 0.9969375 0.5481765 0.871965

hailortcli parse-hef yolov8s.hef gives the following output:

Architecture HEF was compiled for: HAILO8L
Network group name: yolov8s, Multi Context - Number of contexts: 3
    Network name: yolov8s/yolov8s
        VStream infos:
            Input  yolov8s/input_layer1 UINT8, NHWC(640x640x3)
            Output yolov8s/yolov8_nms_postprocess FLOAT32, HAILO NMS(number of classes: 80, maximum bounding boxes per class: 100, maximum frame size: 160320)
            Operation:
                Op YOLOV8
                Name: YOLOV8-Post-Process
                Score threshold: 0.200
                IoU threshold: 0.70
                Classes: 80
                Cross classes: false
                Max bboxes per class: 100
                Image height: 640
                Image width: 640
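
If I read that correctly (an assumption on my part), the maximum frame size is consistent with a per-class layout of one float32 detection count followed by up to 100 boxes of 5 float32 values each:

classes, max_boxes, floats_per_box = 80, 100, 5
# per class: 1 count float + 100 boxes * 5 floats, each float32 (4 bytes)
print(classes * (1 + max_boxes * floats_per_box) * 4)  # 160320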

Ok,
First, a small fix to the inference code: since you are feeding ordinary 8-bit images, the input should be sent as UINT8:

with VDevice() as target:
    configure_params = ConfigureParams.create_from_hef(hef, interface=HailoStreamInterface.PCIe)
    network_group = target.configure(hef, configure_params)[0]
    network_group_params = network_group.create_params()
    input_vstream_info = hef.get_input_vstream_infos()[0]
    input_vstreams_params = InputVStreamParams.make_from_network_group(network_group, quantized=False, format_type=FormatType.UINT8)
    output_vstreams_params = OutputVStreamParams.make_from_network_group(network_group, quantized=False, format_type=FormatType.FLOAT32)
    with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
        input_data = {input_vstream_info.name: np.expand_dims(resized_img, axis=0).astype(np.uint8)}
        with network_group.activate(network_group_params):
            infer_results = infer_pipeline.infer(input_data)
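
Also note that the NMS postprocess is already compiled into the HEF, so each detection row is not (x, y, w, h, conf): per class you get an array of detections in (ymin, xmin, ymax, xmax, score) order, with the coordinates normalized to [0, 1]. A minimal sketch of how to walk that structure, reusing the orig_w, orig_h, and infer_results names from your loop:

for name, batch in infer_results.items():
    per_class = batch[0]  # first (and only) image in the batch
    for class_id, dets in enumerate(per_class):
        for ymin, xmin, ymax, xmax, score in dets:
            if score > 0.5:
                # scale the normalized corners back to the original frame size
                cv2.rectangle(frame, (int(xmin * orig_w), int(ymin * orig_h)), (int(xmax * orig_w), int(ymax * orig_h)), (0, 0, 255), 2)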

Then, if you use our box drawing logic, you get good boxes on your picture:

import json
import numpy as np
from PIL import ImageDraw, ImageFont

def get_label(class_id):
    # label map file: {"1": "person", "2": "bicycle", ...}
    with open('/data/coco/labels.json', 'r') as f:
        labels = json.load(f)
    return labels[str(class_id)]

def draw_detection(draw, d, c, color, scale_factor_x, scale_factor_y):
    """Draw box and label for 1 detection."""
    label = get_label(c)
    ymin, xmin, ymax, xmax = d
    font = ImageFont.truetype('LiberationSans-Regular.ttf', size=15)
    draw.rectangle([(xmin * scale_factor_x, ymin * scale_factor_y), (xmax * scale_factor_x, ymax * scale_factor_y)], outline=color, width=2)
    draw.text((xmin * scale_factor_x + 4, ymin * scale_factor_y + 4), label, fill=color, font=font)
    return label

def annotate_image(image, results, thr=0.45, dim=640, offset_background=True):
    COLORS = np.random.randint(0, 255, size=(90, 3), dtype=np.uint8)
    draw = ImageDraw.Draw(image)
    oh, ow, _ = np.asarray(image).shape
    rh, rw = oh / dim, ow / dim  # scale factors from model input back to the original image
    for idx, class_detections in enumerate(results[list(results.keys())[0]][0]):
        if class_detections.shape[0] > 0:
            color = tuple(int(c) for c in COLORS[idx])
            for det in class_detections:
                if det[4] > thr:  # det holds (ymin, xmin, ymax, xmax, score), normalized
                    if offset_background:
                        label = draw_detection(draw, det[0:4] * dim, idx + 1, color, rw, rh)
                    else:
                        label = draw_detection(draw, det[0:4] * dim, idx, color, rw, rh)
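
For completeness, one way to call this from your OpenCV loop (a sketch; the BGR-to-RGB round trip through PIL is my assumption based on your capture code):

from PIL import Image

# inside the capture loop, once infer_results is available:
pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
annotate_image(pil_image, infer_results, thr=0.5)
frame = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
cv2.imshow('frame', frame)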

Then the output image looks like this:


Oh, so that’s how to draw the output. Thank you so much, that fixed my problem 🙂