Accessing Hailo Processed Frame (with Bounding Boxes) for Communication Purposes
Hello,
I’m working on integrating the hailo-rpi5-examples/basic_pipelines/detection.py example into my own project, particularly for processing and communicating detection data. I’ve modified the user_app_callback_class to include communication-related variables and functions specific to my use case.
Currently, inside the app_callback function, I can access the frame using get_numpy_from_buffer(), but this returns the raw, unprocessed frame. While I can draw bounding boxes on this frame myself using the detection outputs (coordinates and labels), this approach adds CPU load and causes FPS drops.
However, when running the example, I can see that Hailo’s pipeline already renders the frame with bounding boxes, labels, and other overlays, as shown in the Hailo detection app window.
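(For reference, from skimming the pipeline code in the repo, the relevant part of the GStreamer pipeline seems to look roughly like the fragment below. This is simplified and from memory, so the exact element names may differ, but the point is that the user callback probe is attached to the identity element, which sits upstream of hailooverlay; that would explain why the frame I get in app_callback has no overlays yet.)
... ! hailonet ! hailofilter ! identity name=identity_callback ! hailooverlay ! videoconvert ! fpsdisplaysink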
My Question:
Is there any way to access the same final processed frame that is rendered and displayed by the detection app (i.e., the frame with bounding boxes already drawn by Hailo’s internal pipeline)?
I would like to retrieve and send this fully processed frame via my communication pipeline, instead of redrawing everything manually on the raw frame.
I’ve searched through the provided APIs and codebase but couldn’t find where or how that processed frame is rendered or made accessible. If this is handled by internal Hailo functions or GStreamer elements, is there a way to tap into the pipeline at the right point to retrieve that final video frame?
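To make the question concrete, this is the kind of tap I have in mind. It is a rough, untested sketch: I’m assuming the app object exposes its Gst pipeline as app.pipeline, and "hailo_overlay" is just my placeholder for whatever name the pipeline actually assigns to the overlay element:
def overlay_probe(pad, info, user_data):
    # Same pattern as app_callback, but attached AFTER the overlay element,
    # so the frame should already contain the boxes and labels drawn by Hailo.
    buffer = info.get_buffer()
    if buffer is None:
        return Gst.PadProbeReturn.OK
    fmt, width, height = get_caps_from_pad(pad)
    if fmt is not None and width is not None and height is not None:
        frame = get_numpy_from_buffer(buffer, fmt, width, height)
        user_data.send_video(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    return Gst.PadProbeReturn.OK

# "hailo_overlay" is a placeholder; the real element name would need to be confirmed
# (e.g., by dumping the pipeline graph with GST_DEBUG_DUMP_DOT_DIR).
overlay = app.pipeline.get_by_name("hailo_overlay")
if overlay is not None:
    overlay.get_static_pad("src").add_probe(Gst.PadProbeType.BUFFER, overlay_probe, user_data)
Would something along these lines work, or is there a supported API for this?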
What I’m doing now in the app_callback function:
# (Imports as in the example; the hailo_apps_infra path may differ between repo versions.)
import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst
import cv2
import hailo
from hailo_apps_infra.hailo_rpi_common import get_caps_from_pad, get_numpy_from_buffer

def app_callback(pad, info, user_data):
    # Get the GstBuffer from the probe info
    buffer = info.get_buffer()
    # Check if the buffer is valid
    if buffer is None:
        return Gst.PadProbeReturn.OK

    # Use the user_data to count the number of frames
    user_data.increment()
    string_to_print = f"Frame count: {user_data.get_count()}\n"

    # Get the caps from the pad
    format, width, height = get_caps_from_pad(pad)

    # If user_data.use_frame is True, we can get the video frame from the buffer.
    # NOTE: in the example this flag normally comes from the --use-frame CLI option;
    # I force it here so the frame is extracted on every callback
    # (I am not sure this is how it is meant to be used).
    user_data.use_frame = True
    frame = None
    if user_data.use_frame and format is not None and width is not None and height is not None:
        # Get the video frame (raw, without overlays)
        frame = get_numpy_from_buffer(buffer, format, width, height)

    # Get the detections from the buffer
    roi = hailo.get_roi_from_buffer(buffer)
    detections = roi.get_objects_typed(hailo.HAILO_DETECTION)

    data = shared.get_data()  # 'shared' is my own module for exchanging data between processes
    error_x_center, error_y_center = data[0], data[1]

    # Parse the detections
    detection_count = 0
    for detection in detections:
        label = detection.get_label()
        bbox = detection.get_bbox()
        confidence = detection.get_confidence()
        if confidence > 0.4:
            detection_count += 1
            # Bounding-box coordinates (normalized to 0-1)
            x_min = bbox.xmin()
            y_min = bbox.ymin()
            box_width = bbox.width()
            box_height = bbox.height()
            x_max = x_min + box_width
            y_max = y_min + box_height
            # Center point (also normalized to 0-1)
            center_x = x_min + (box_width / 2)
            # center_y = (y_min + (box_height / 2) - 0.22) * 1.83  # ?
            center_y = y_min + (box_height / 2)
            # Debug print for coordinates
            print(f"{label.capitalize()} detected!\n"
                  f"Position: center=({center_x:.2f}, {center_y:.2f})\n"
                  f"Bounds: xmin={x_min:.2f}, ymin={y_min:.2f}, xmax={x_max:.2f}, ymax={y_max:.2f}\n"
                  f"Confidence: {confidence:.2f}\n")
            if frame is None:
                continue  # nothing to draw on
            if label == "triangle":
                red_center, red_w, red_h = (center_x, center_y), box_width, box_height
                # Line from the (offset) frame center to the target center
                cv2.line(frame,
                         (int(width / 2) + error_x_center, int(height / 2) + error_y_center),
                         (int(red_center[0] * width), int(red_center[1] * height)),
                         (255, 0, 0), 2)
                # Bounding box
                cv2.rectangle(frame,
                              (int(x_min * width), int(y_min * height)),
                              (int(x_max * width), int(y_max * height)),
                              (255, 0, 0), 2)
            elif label == "hexagon":
                blue_center, blue_w, blue_h = (center_x, center_y), box_width, box_height
                # Line from the (offset) frame center to the target center
                cv2.line(frame,
                         (int(width / 2) + error_x_center, int(height / 2) + error_y_center),
                         (int(blue_center[0] * width), int(blue_center[1] * height)),
                         (0, 0, 255), 2)
                # Bounding box
                cv2.rectangle(frame,
                              (int(x_min * width), int(y_min * height)),
                              (int(x_max * width), int(y_max * height)),
                              (0, 0, 255), 2)
            # Label text above the box
            text = f"{label} ({confidence:.2f})"
            cv2.putText(frame, text,
                        (int(x_min * width), int(y_min * height) - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    if user_data.use_frame and frame is not None:
        # Note: imshow will not work here, since the callback does not run in the main thread.
        # Print the detection count on the frame
        cv2.putText(frame, f"Detections: {detection_count}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        # Example of using new_variable / new_function from user_data:
        # cv2.putText(frame, f"{user_data.new_function()} {user_data.new_variable}", (10, 60),
        #             cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        # Convert the frame to BGR and send it over my communication pipeline
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        user_data.send_video(frame)

    print(string_to_print)
    return Gst.PadProbeReturn.OK
This works, but it’s costly — I’m duplicating work that Hailo already seems to be doing for display purposes.
Any suggestions or guidance would be greatly appreciated!
Thank you in advance