Postprocessing scrfd_10g output

Hi there, when I run the following code, I get this output:

Output 'scrfd_10g/conv42' shape: [80, 80, 8]
Output 'scrfd_10g/conv41' shape: [80, 80, 2]
Output 'scrfd_10g/conv43' shape: [80, 80, 20]
Output 'scrfd_10g/conv50' shape: [40, 40, 8]
Output 'scrfd_10g/conv49' shape: [40, 40, 2]
Output 'scrfd_10g/conv51' shape: [40, 40, 20]
Output 'scrfd_10g/conv57' shape: [20, 20, 8]
Output 'scrfd_10g/conv56' shape: [20, 20, 2]
Output 'scrfd_10g/conv58' shape: [20, 20, 20]
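
If I understand the SCRFD head correctly, these nine outputs are the score, box-distance, and landmark branches of three feature levels (strides 8, 16, 32), with two anchors per spatial cell, which would explain the 2 / 8 / 20 channel counts (2x1 scores, 2x4 box distances, 2x10 landmark offsets). This is the mapping I am assuming, based only on the printed shapes:

# Assumed mapping of output names to (stride, branch), inferred from the
# printed shapes above; channel counts are 2 anchors x {1, 4, 10} values
OUTPUT_LAYOUT = {
    'scrfd_10g/conv41': (8, 'scores'),      # [80, 80, 2]
    'scrfd_10g/conv42': (8, 'bboxes'),      # [80, 80, 8]
    'scrfd_10g/conv43': (8, 'landmarks'),   # [80, 80, 20]
    'scrfd_10g/conv49': (16, 'scores'),     # [40, 40, 2]
    'scrfd_10g/conv50': (16, 'bboxes'),     # [40, 40, 8]
    'scrfd_10g/conv51': (16, 'landmarks'),  # [40, 40, 20]
    'scrfd_10g/conv56': (32, 'scores'),     # [20, 20, 2]
    'scrfd_10g/conv57': (32, 'bboxes'),     # [20, 20, 8]
    'scrfd_10g/conv58': (32, 'landmarks'),  # [20, 20, 20]
}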

I am not sure how to proceed from here. I followed the link below, but it did not get me any results.

hailo_model_zoo

Here is the code.

import numpy as np
from hailo_platform import VDevice, HailoSchedulingAlgorithm, HEF
import cv2

timeout_ms = 1000

params = VDevice.create_params()
params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

with VDevice(params) as vdevice:

    # Create an infer model from an HEF:
    infer_model = vdevice.create_infer_model('/home/suraas/Desktop/face_detection/models/scrfd_10g.hef')

    # Configure the infer model and create bindings for it
    with infer_model.configure() as configured_infer_model:
        bindings = configured_infer_model.create_bindings()

        # Load and preprocess image
        # Load image
        image = cv2.imread('/home/suraas/Desktop/face_detection/sample.png')
        # Resize image to match the model's expected input shape (640x640)
        resized_image = cv2.resize(image, (640, 640))  # Note: cv2.resize takes (width, height)
        # Convert to appropriate format (usually float32 or uint8 depending on model)
        input_tensor = resized_image.astype(np.uint8)
        # Set the preprocessed image as input
        bindings.input().set_buffer(input_tensor)

        # Get all output tensors
        output_tensors = infer_model.outputs

        # Print output shapes for debugging
        for output in output_tensors:
            print(f"Output '{output.name}' shape: {output.shape}")

        # Create buffers for each output
        output_buffers = {}
        for output in output_tensors:
            # Create buffer with exact shape and use uint8 dtype
            buffer = np.zeros(output.shape, dtype=np.uint8)
            bindings.output(output.name).set_buffer(buffer)
            output_buffers[output.name] = buffer

        # Run synchronous inference
        configured_infer_model.run([bindings], timeout_ms)

        # Get results from each output buffer
        results = {}
        for output_name, buffer in output_buffers.items():
            results[output_name] = bindings.output(output_name).get_buffer()
        
        # Alternatively, the same inference can be run asynchronously (note: this runs it a second time):
        job = configured_infer_model.run_async([bindings])
        job.wait(timeout_ms)
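
One thing I am unsure about is the output data type: the buffers above are raw uint8, i.e. still quantized. If I read the HailoRT InferModel API correctly, you can ask it to dequantize on the host by setting the output format type before configuring the model; a sketch of what I mean (the FormatType usage is my assumption):

from hailo_platform import FormatType

# Assumption: setting the format type before infer_model.configure() makes
# HailoRT return dequantized float32 tensors instead of raw quantized uint8
for output in infer_model.outputs:
    infer_model.output(output.name).set_format_type(FormatType.FLOAT32)

# The output buffers then have to be allocated as float32 to match:
#     buffer = np.zeros(output.shape, dtype=np.float32)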

I tried a couple of ways to extract the bounding boxes, but they all produce a seemingly random number of detections, like 700+ or even 1000+ faces, when the image actually contains only 8-10.
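
Since the buffers are uint8, I assume the "scores" are raw quantized values in [0, 255] rather than probabilities, which is why the thresholds in the code below are values like 95 and 100. A small debugging sketch to sanity-check the raw score range per level:

# Debugging sketch: inspect the raw (quantized) score range of each level
# to pick a sensible threshold on the uint8 scale
for name in ('scrfd_10g/conv41', 'scrfd_10g/conv49', 'scrfd_10g/conv56'):
    raw = results[name]
    print(name, 'min:', raw.min(), 'max:', raw.max(),
          '99th percentile:', np.percentile(raw, 99))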

Here is the code I am using to extract the bounding boxes of the detected faces.

import numpy as np
from hailo_platform import VDevice, HailoSchedulingAlgorithm, HEF
import cv2

timeout_ms = 1000

params = VDevice.create_params()
params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

with VDevice(params) as vdevice:

    # Create an infer model from an HEF:
    infer_model = vdevice.create_infer_model('/home/suraas/Desktop/face_detection/models/scrfd_10g.hef')

    # Configure the infer model and create bindings for it
    with infer_model.configure() as configured_infer_model:
        bindings = configured_infer_model.create_bindings()

        # Load and preprocess image
        # Load image
        image_path = '/home/suraas/Desktop/face_detection/class.png'
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Failed to load image at path: {image_path}")
        original_height, original_width = image.shape[:2]
        # Scale factors for later use
        scale_x = original_width / 640
        scale_y = original_height / 640
        # Resize image to match the model's expected input shape (640x640)
        resized_image = cv2.resize(image, (640, 640))  # Note: cv2.resize takes (width, height)
        # Convert to appropriate format (usually float32 or uint8 depending on model)
        input_tensor = resized_image.astype(np.uint8)
        # Set the preprocessed image as input
        bindings.input().set_buffer(input_tensor)

        # Get all output tensors
        output_tensors = infer_model.outputs

        for output in output_tensors:
            print(output.name, output.shape)

        # Create buffers for each output
        output_buffers = {}
        for output in output_tensors:
            # Create buffer with exact shape and use uint8 dtype
            buffer = np.zeros(output.shape, dtype=np.uint8)
            bindings.output(output.name).set_buffer(buffer)
            output_buffers[output.name] = buffer

        # Run synchronous inference
        configured_infer_model.run([bindings], timeout_ms)

        # Get results from each output buffer
        results = {}
        for output_name, buffer in output_buffers.items():
            results[output_name] = bindings.output(output_name).get_buffer().astype(np.float32) 
        
        # Alternatively, the same inference can be run asynchronously (note: this runs it a second time):
        job = configured_infer_model.run_async([bindings])
        job.wait(timeout_ms)

def process_outputs(results, input_size=640):
    # Convert outputs to float32 for easier processing
    scores_list = []
    bboxes_list = []
    
    # Process outputs for each feature level (scrfd_10g has 3 levels).
    # Note: the stride-32 level does not follow the +8 naming pattern;
    # per the printed shapes, conv56 holds the scores and conv57 the boxes.
    level_outputs = [
        (8, 'scrfd_10g/conv41', 'scrfd_10g/conv42'),
        (16, 'scrfd_10g/conv49', 'scrfd_10g/conv50'),
        (32, 'scrfd_10g/conv56', 'scrfd_10g/conv57'),
    ]
    for stride, score_name, bbox_name in level_outputs:
        # Get the relevant outputs for this level
        scores = results[score_name].astype(np.float32)      # confidence scores
        bbox_preds = results[bbox_name].astype(np.float32)   # bounding box distance predictions
        
        height = input_size // stride
        width = input_size // stride
        
        # Print shapes for debugging
        print(f"Stride {stride}:")
        print(f"Original scores shape: {scores.shape}")
        
        # Generate anchors
        anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
        anchor_centers = (anchor_centers * stride).reshape((-1, 2))
        
        # Flatten the spatial dimensions; every level now has 2 score channels,
        # so no stride-32 special case is needed
        scores = scores.reshape((-1, 2))
        
        # Flatten bbox_preds; [:, :4] keeps only the first anchor's 4 distances
        bbox_preds = bbox_preds.reshape((height * width, -1))[:, :4]
        
        print(f"scores shape after reshape: {scores.shape}")
        print(f"anchor_centers shape: {anchor_centers.shape}")
        print(f"bbox_preds shape: {bbox_preds.shape}")
        
        # Threshold on the raw uint8 score scale (the outputs are still quantized),
        # then make sure pos_inds stays within the anchor grid
        pos_inds = np.where(scores[:, 1] > 95)[0]
        pos_inds = pos_inds[pos_inds < len(anchor_centers)]
        
        if len(pos_inds) > 0:
            bboxes = distance2bbox(anchor_centers[pos_inds], bbox_preds[pos_inds])
            scores = scores[pos_inds, 1]
            
            scores_list.append(scores)
            bboxes_list.append(bboxes)
    
    # Combine all detections
    scores = np.concatenate(scores_list, axis=0) if scores_list else np.array([])
    bboxes = np.concatenate(bboxes_list, axis=0) if bboxes_list else np.array([])
    
    # Nothing detected at any level
    if len(scores) == 0:
        return np.array([]), np.array([])

    # Apply NMS
    keep = nms(bboxes, scores, nms_threshold=0.6)
    if keep.size > 0:
        return bboxes[keep], scores[keep]
    return np.array([]), np.array([])
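
# For comparison, here is a minimal per-level decode that follows the
# insightface reference postprocessing more closely: it assumes dequantized
# float32 outputs, duplicates each anchor center for SCRFD's 2 anchors per
# cell, and scales the distance predictions by the stride. The name
# decode_level is mine; this is a sketch of the scheme, not verified on device.
def decode_level(scores, bbox_preds, stride, score_threshold=0.5):
    # scores: (H, W, 2) float32, bbox_preds: (H, W, 8) float32
    height, width = scores.shape[:2]
    # Anchor centers in input-image pixels, one per spatial cell...
    anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
    anchor_centers = (anchor_centers * stride).reshape((-1, 2))
    # ...duplicated so they line up with the 2 anchors per cell after flattening
    anchor_centers = np.stack([anchor_centers] * 2, axis=1).reshape((-1, 2))

    scores = scores.reshape((-1, 1))                   # one score per anchor
    bbox_preds = bbox_preds.reshape((-1, 4)) * stride  # distances are in stride units

    pos_inds = np.where(scores[:, 0] > score_threshold)[0]
    bboxes = distance2bbox(anchor_centers[pos_inds], bbox_preds[pos_inds])
    return bboxes, scores[pos_inds, 0]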

def distance2bbox(points, distance):
    x1 = points[:, 0] - distance[:, 0]
    y1 = points[:, 1] - distance[:, 1]
    x2 = points[:, 0] + distance[:, 2]
    y2 = points[:, 1] + distance[:, 3]
    return np.stack([x1, y1, x2, y2], axis=-1)

def nms(boxes, scores, nms_threshold=0.6):
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= nms_threshold)[0]
        order = order[inds + 1]
    return np.array(keep)

def calculate_iou(box1, box2):
    # Calculate intersection over union between two boxes
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    
    return intersection / (area1 + area2 - intersection)

def remove_multi_face_overlaps(boxes, scores, iou_threshold=0.3):
    keep = []
    n_boxes = len(boxes)
    
    for i in range(n_boxes):
        overlaps_multiple = False
        overlap_count = 0
        
        # Check how many other boxes this box overlaps with
        for j in range(n_boxes):
            if i != j:
                iou = calculate_iou(boxes[i], boxes[j])
                if iou > iou_threshold:
                    overlap_count += 1
                    
        # Keep only if it doesn't overlap with multiple boxes
        if overlap_count <= 1:  # Allow overlap with at most one other box
            keep.append(i)
            
    return np.array(keep)

# After getting results, process them
bboxes, scores = process_outputs(results)

# Sort detections by confidence score
if len(scores) > 0:
    # Sort indices by descending confidence (NMS already ran inside process_outputs)
    sorted_idx = np.argsort(scores)[::-1]
    bboxes = bboxes[sorted_idx]
    scores = scores[sorted_idx]
    
    # Then remove boxes that overlap with multiple faces
    keep_idx = remove_multi_face_overlaps(bboxes, scores, iou_threshold=0.3)
    bboxes = bboxes[keep_idx]
    scores = scores[keep_idx]

# Draw and save detected faces
original_image = cv2.imread('/home/suraas/Desktop/face_detection/class.png')
output_image = original_image.copy()

# Keep only top N detections (e.g., top 20)
max_detections = 20
if len(scores) > max_detections:
    bboxes = bboxes[:max_detections]
    scores = scores[:max_detections]

# After getting bboxes and scores, scale them back to original image size
if len(bboxes) > 0:
    # Scale bounding boxes to original image size
    bboxes[:, [0, 2]] *= scale_x  # scale x coordinates
    bboxes[:, [1, 3]] *= scale_y  # scale y coordinates

print("\nDetected Faces:")
print("---------------")
for idx, (bbox, score) in enumerate(zip(bboxes, scores)):
    bbox = bbox.astype(np.int32)
    
    # Ensure bbox coordinates are within image bounds
    x1, y1, x2, y2 = bbox
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(original_image.shape[1], x2)
    y2 = min(original_image.shape[0], y2)
    
    # Print detection info
    print(f"Face #{idx + 1}:")
    print(f"  Position: x1={x1}, y1={y1}, x2={x2}, y2={y2}")
    print(f"  Confidence Score: {score:.3f}")
    print(f"  Box Size: {x2-x1}x{y2-y1} pixels")
    print("---------------")
    
    # Extract face region
    face = original_image[y1:y2, x1:x2]
    
    # Save the face only if the region is valid and the score is high enough
    if face.size > 0 and score > 100:  # threshold is on the raw uint8 score scale
        cv2.imwrite(f'face_{idx}_conf_{score:.2f}.png', face)
    
        # Draw rectangle and score on the output image
        cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(output_image, f'{score:.2f}', (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Save annotated image with all detections
cv2.imwrite('detected_faces_all.png', output_image)

@hyperwolf
Please see our post on how we integrated the postprocessor for SCRFD models: SCRFD Model Support in DeGirum PySDK