Hi there, when I run the following code, I get this output:
Output 'scrfd_10g/conv42' shape: [80, 80, 8]
Output 'scrfd_10g/conv41' shape: [80, 80, 2]
Output 'scrfd_10g/conv43' shape: [80, 80, 20]
Output 'scrfd_10g/conv50' shape: [40, 40, 8]
Output 'scrfd_10g/conv49' shape: [40, 40, 2]
Output 'scrfd_10g/conv51' shape: [40, 40, 20]
Output 'scrfd_10g/conv57' shape: [20, 20, 8]
Output 'scrfd_10g/conv56' shape: [20, 20, 2]
Output 'scrfd_10g/conv58' shape: [20, 20, 20]
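From what I understand of SCRFD, each level uses 2 anchors per grid cell, so per stride the 2-channel output should be the face scores (2 anchors × 1), the 8-channel output the box distances (2 anchors × 4: left, top, right, bottom), and the 20-channel output the landmarks (2 anchors × 10, i.e. 5 points × 2 coordinates). Here is a minimal sketch of how I assume the tensors flatten per anchor (just my reading of the layout, not verified):

import numpy as np

# Stride-8 level: 80x80 grid with 2 anchors per cell -> 12800 anchors
scores = np.zeros((80, 80, 2))   # e.g. 'scrfd_10g/conv41'
boxes = np.zeros((80, 80, 8))    # e.g. 'scrfd_10g/conv42'
kps = np.zeros((80, 80, 20))     # e.g. 'scrfd_10g/conv43'

scores = scores.reshape(-1, 1)   # (12800, 1): one score per anchor
boxes = boxes.reshape(-1, 4)     # (12800, 4): l/t/r/b distance per anchor
kps = kps.reshape(-1, 10)        # (12800, 10): 5 (x, y) landmarks per anchor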
I am confused about how to proceed from here. I also followed the linked example, but it gave no results. Here is the code:
import numpy as np
import cv2
from hailo_platform import VDevice, HailoSchedulingAlgorithm, HEF

timeout_ms = 1000

params = VDevice.create_params()
params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

with VDevice(params) as vdevice:
    # Create an infer model from an HEF
    infer_model = vdevice.create_infer_model('/home/suraas/Desktop/face_detection/models/scrfd_10g.hef')

    # Configure the infer model and create bindings for it
    with infer_model.configure() as configured_infer_model:
        bindings = configured_infer_model.create_bindings()

        # Load the image
        image = cv2.imread('/home/suraas/Desktop/face_detection/sample.png')

        # Resize to the model's expected input shape of 640x640
        # (note: cv2.resize takes (width, height))
        resized_image = cv2.resize(image, (640, 640))

        # The HEF expects uint8 input
        input_tensor = resized_image.astype(np.uint8)

        # Set the preprocessed image as input
        bindings.input().set_buffer(input_tensor)

        # Print output shapes for debugging
        output_tensors = infer_model.outputs
        for output in output_tensors:
            print(f"Output '{output.name}' shape: {output.shape}")

        # Create a uint8 buffer for each output
        output_buffers = {}
        for output in output_tensors:
            buffer = np.zeros(output.shape, dtype=np.uint8)
            bindings.output(output.name).set_buffer(buffer)
            output_buffers[output.name] = buffer

        # Run synchronous inference
        configured_infer_model.run([bindings], timeout_ms)

        # Get results from each output buffer
        results = {}
        for output_name, buffer in output_buffers.items():
            results[output_name] = bindings.output(output_name).get_buffer()

        # Alternatively, run asynchronous inference
        job = configured_infer_model.run_async([bindings])
        job.wait(timeout_ms)
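One thing I suspect: the outputs come back as raw uint8, so a threshold like 95 is a quantized value, not a probability. If I read the HailoRT docs correctly, the infer model streams can be asked to return dequantized float32 instead, by setting the format type before configure(). A minimal sketch of what I mean (assuming set_format_type works the way I think it does):

from hailo_platform import FormatType

# Before infer_model.configure(): request dequantized float32 outputs
for output in infer_model.outputs:
    output.set_format_type(FormatType.FLOAT32)

# The output buffers then need to be float32 as well:
# buffer = np.zeros(output.shape, dtype=np.float32)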
I tried a couple of ways to extract the bboxes, but they all lead to a seemingly random number of detections, such as 700+ or 1000+ faces, when the actual number of faces in the image is only 8-10. Here is the code I use to extract the bboxes for the detected faces:
import numpy as np
import cv2
from hailo_platform import VDevice, HailoSchedulingAlgorithm, HEF

timeout_ms = 1000

params = VDevice.create_params()
params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

with VDevice(params) as vdevice:
    # Create an infer model from an HEF
    infer_model = vdevice.create_infer_model('/home/suraas/Desktop/face_detection/models/scrfd_10g.hef')

    # Configure the infer model and create bindings for it
    with infer_model.configure() as configured_infer_model:
        bindings = configured_infer_model.create_bindings()

        # Load the image
        image_path = '/home/suraas/Desktop/face_detection/class.png'
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Failed to load image at path: {image_path}")
        original_height, original_width = image.shape[:2]

        # Scale factors for mapping boxes back to the original image later
        scale_x = original_width / 640
        scale_y = original_height / 640

        # Resize to the model's expected input shape of 640x640
        # (note: cv2.resize takes (width, height))
        resized_image = cv2.resize(image, (640, 640))

        # The HEF expects uint8 input
        input_tensor = resized_image.astype(np.uint8)

        # Set the preprocessed image as input
        bindings.input().set_buffer(input_tensor)

        # Print output names and shapes for debugging
        output_tensors = infer_model.outputs
        for output in output_tensors:
            print(output.name, output.shape)

        # Create a uint8 buffer for each output
        output_buffers = {}
        for output in output_tensors:
            buffer = np.zeros(output.shape, dtype=np.uint8)
            bindings.output(output.name).set_buffer(buffer)
            output_buffers[output.name] = buffer

        # Run synchronous inference
        configured_infer_model.run([bindings], timeout_ms)

        # Get results from each output buffer as float32
        results = {}
        for output_name, buffer in output_buffers.items():
            results[output_name] = bindings.output(output_name).get_buffer().astype(np.float32)

        # Alternatively, run asynchronous inference
        job = configured_infer_model.run_async([bindings])
        job.wait(timeout_ms)

def process_outputs(results, input_size=640):
    scores_list = []
    bboxes_list = []

    # Process outputs for each feature level (scrfd_10g has 3 levels)
    for idx, stride in enumerate([8, 16, 32]):
        # Get the relevant outputs for this level
        scores = results[f'scrfd_10g/conv{41 + idx * 8}'].astype(np.float32)      # confidence scores
        bbox_preds = results[f'scrfd_10g/conv{42 + idx * 8}'].astype(np.float32)  # bbox predictions

        height = input_size // stride
        width = input_size // stride

        # Print shapes for debugging
        print(f"Stride {stride}:")
        print(f"Original scores shape: {scores.shape}")

        # Generate anchor centers on the feature-map grid
        anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
        anchor_centers = (anchor_centers * stride).reshape((-1, 2))

        # Reshape scores while flattening the spatial dimensions
        if stride == 32:
            scores = scores.reshape((height * width, -1))[:, :2]  # ensure 2 channels
        else:
            scores = scores.reshape((-1, 2))

        # Reshape bbox_preds to match; always take the first 4 channels
        bbox_preds = bbox_preds.reshape((height * width, -1))[:, :4]

        print(f"scores shape after reshape: {scores.shape}")
        print(f"anchor_centers shape: {anchor_centers.shape}")
        print(f"bbox_preds shape: {bbox_preds.shape}")

        # Threshold on the raw (still quantized) uint8 score values,
        # and ensure pos_inds stays within bounds
        pos_inds = np.where(scores[:, 1] > 95)[0]
        pos_inds = pos_inds[pos_inds < len(anchor_centers)]

        if len(pos_inds) > 0:
            bboxes = distance2bbox(anchor_centers[pos_inds], bbox_preds[pos_inds])
            scores = scores[pos_inds, 1]
            scores_list.append(scores)
            bboxes_list.append(bboxes)

    # Combine detections from all levels
    if not scores_list:
        return np.array([]), np.array([])
    scores = np.concatenate(scores_list, axis=0)
    bboxes = np.concatenate(bboxes_list, axis=0)

    # Apply NMS
    keep = nms(bboxes, scores, nms_threshold=0.6)
    if keep.size > 0:
        return bboxes[keep], scores[keep]
    return np.array([]), np.array([])

def distance2bbox(points, distance):
    # Decode (left, top, right, bottom) distances from anchor centers
    # into corner-format boxes
    x1 = points[:, 0] - distance[:, 0]
    y1 = points[:, 1] - distance[:, 1]
    x2 = points[:, 0] + distance[:, 2]
    y2 = points[:, 1] + distance[:, 3]
    return np.stack([x1, y1, x2, y2], axis=-1)

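# Sanity check for distance2bbox with a toy example (my own numbers): an
# anchor center at (16, 16) with distances (4, 4, 4, 4) should decode to
# the box (12, 12, 20, 20).
assert np.allclose(
    distance2bbox(np.array([[16.0, 16.0]]), np.array([[4.0, 4.0, 4.0, 4.0]])),
    np.array([[12.0, 12.0, 20.0, 20.0]]))
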
def nms(boxes, scores, nms_threshold=0.6):
    # Standard greedy non-maximum suppression over corner-format boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= nms_threshold)[0]
        order = order[inds + 1]
    return np.array(keep)

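# Toy usage of nms (my own numbers): two heavily overlapping boxes plus one
# separate box; with nms_threshold=0.6 the lower-scoring duplicate (index 1)
# should be suppressed, so the expected result is [0, 2].
_boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], dtype=np.float32)
_scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)
print("nms sanity check:", nms(_boxes, _scores, nms_threshold=0.6))
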
def calculate_iou(box1, box2):
    # Intersection over union between two corner-format boxes
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return intersection / (area1 + area2 - intersection)

def remove_multi_face_overlaps(boxes, scores, iou_threshold=0.3):
    keep = []
    n_boxes = len(boxes)
    for i in range(n_boxes):
        overlap_count = 0
        # Count how many other boxes this box overlaps with
        for j in range(n_boxes):
            if i != j:
                iou = calculate_iou(boxes[i], boxes[j])
                if iou > iou_threshold:
                    overlap_count += 1
        # Keep the box only if it overlaps with at most one other box
        if overlap_count <= 1:
            keep.append(i)
    return np.array(keep, dtype=np.int64)

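# Toy check for remove_multi_face_overlaps (my own numbers): the middle box
# overlaps both neighbours with IoU > 0.3, so only indices 0 and 2 should
# survive.
_b = np.array([[0, 0, 10, 10], [2, 2, 12, 12], [4, 4, 14, 14]], dtype=np.float32)
_s = np.array([0.9, 0.8, 0.7], dtype=np.float32)
print("overlap filter sanity check:", remove_multi_face_overlaps(_b, _s))
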
# After getting results, process them
bboxes, scores = process_outputs(results)

if len(scores) > 0:
    # Sort detections by confidence score (NMS was already applied
    # inside process_outputs)
    sorted_idx = np.argsort(scores)[::-1]
    bboxes = bboxes[sorted_idx]
    scores = scores[sorted_idx]

    # Then remove boxes that overlap with multiple other faces
    keep_idx = remove_multi_face_overlaps(bboxes, scores, iou_threshold=0.3)
    bboxes = bboxes[keep_idx]
    scores = scores[keep_idx]

# Reload the original image for drawing and saving detected faces
original_image = cv2.imread('/home/suraas/Desktop/face_detection/class.png')
output_image = original_image.copy()

# Keep only the top N detections (e.g. top 20)
max_detections = 20
if len(scores) > max_detections:
    bboxes = bboxes[:max_detections]
    scores = scores[:max_detections]

if len(bboxes) > 0:
    # Scale bounding boxes back to the original image size
    bboxes[:, [0, 2]] *= scale_x  # x coordinates
    bboxes[:, [1, 3]] *= scale_y  # y coordinates

    print("\nDetected Faces:")
    print("---------------")
    for idx, (bbox, score) in enumerate(zip(bboxes, scores)):
        bbox = bbox.astype(np.int32)

        # Clamp bbox coordinates to the image bounds
        x1, y1, x2, y2 = bbox
        x1 = max(0, x1)
        y1 = max(0, y1)
        x2 = min(original_image.shape[1], x2)
        y2 = min(original_image.shape[0], y2)

        # Print detection info
        print(f"Face #{idx + 1}:")
        print(f"  Position: x1={x1}, y1={y1}, x2={x2}, y2={y2}")
        print(f"  Confidence Score: {score:.3f}")
        print(f"  Box Size: {x2 - x1}x{y2 - y1} pixels")
        print("---------------")

        # Extract the face region
        face = original_image[y1:y2, x1:x2]

        # Save the crop only if the region is valid and the raw score
        # clears the (quantized) threshold
        if face.size > 0 and score > 100:
            cv2.imwrite(f'face_{idx}_conf_{score:.2f}.png', face)

        # Draw rectangle and score on the output image
        cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(output_image, f'{score:.2f}', (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Save the annotated image with all detections
cv2.imwrite('detected_faces_all.png', output_image)
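For completeness, here is what I think the score handling should look like once the outputs are real floats: dequantize, apply sigmoid (if the compiled HEF does not already include it, which I am not sure about), and threshold at a probability like 0.5 instead of 95 or 100. The scale and zero-point values below are placeholders I would read from the HEF's output quantization info, not real numbers:

def dequantize(raw, scale, zero_point):
    # Generic affine dequantization: float = scale * (q - zero_point).
    # scale/zero_point are hypothetical here; the real values come from
    # the HEF's per-output quantization parameters.
    return scale * (raw.astype(np.float32) - zero_point)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Hypothetical quantization parameters for the stride-8 score output:
raw_scores = results['scrfd_10g/conv41']
probs = sigmoid(dequantize(raw_scores, scale=0.1, zero_point=128))
pos_inds = np.where(probs.reshape(-1) > 0.5)[0]  # probability threshold

Does this look like the right direction, or am I missing a post-processing step that HailoRT can do for me?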