Sorry, I might be a bit late to the party, but I was running into very similar issues, and this post from another thread helped me out immensely: Help with inference/post-processing for the scrfd model for face detection - #3 by olof.
The issue the post revealed in my case was that the class layer output also needed to be rescaled (from the UINT8 range [0, 255] to the float range [0, 1]), as in the function below. Hope this helps.
def rescale_network_outputs(self, outputs):
    """Rescale the raw SCRFD output layers to float32 and flatten them.

    Only ``outputs[0]`` is read. It must be a mapping from output-layer name
    to an array-like object supporting ``astype`` and ``reshape`` (presumably
    quantized UINT8 NumPy arrays from the network -- confirm against caller).

    Each layer is converted to float32, rescaled according to its kind
    (box / class / landmark), and reshaped to ``(1, -1, last_dim)``.

    Args:
        outputs: Sequence whose first element maps layer names to arrays.

    Returns:
        A list of float32 arrays, one per layer, in the iteration order of
        ``outputs[0]``.

    Raises:
        ValueError: If a layer name is not one of the known box, class or
            landmark layer names.
    """
    box_layer_names = frozenset(
        ("scrfd_2_5g/conv43", "scrfd_2_5g/conv50", "scrfd_2_5g/conv56")
    )
    class_layer_names = frozenset(
        ("scrfd_2_5g/conv42", "scrfd_2_5g/conv49", "scrfd_2_5g/conv55")
    )
    landmark_layer_names = frozenset(
        ("scrfd_2_5g/conv44", "scrfd_2_5g/conv51", "scrfd_2_5g/conv57")
    )

    # Magic number, but probably the maximum downscale factor.
    box_downscale_factor = 32
    # Approximate dequantization parameters for the landmark layers; the
    # exact values are determined when the model is compiled from ONNX to
    # HEF and could not be found in the model.
    landmark_zero_point = 113
    landmark_scale = 29

    rescaled_outputs = []
    for output_name, output in outputs[0].items():
        # Convert to float32 first so the arithmetic below cannot overflow
        # or wrap around in the original integer dtype.
        output = output.astype(np.float32)

        if output_name in box_layer_names:
            output = output / box_downscale_factor
        elif output_name in class_layer_names:
            # From range UINT8 [0, 255] to FLOAT32 [0, 1].
            output = output / 255
        elif output_name in landmark_layer_names:
            # Converts from quantized UINT8 to FLOAT32.
            output = (output - landmark_zero_point) / landmark_scale
        else:
            raise ValueError(f"Unknown output name: {output_name}")

        # Collapse every leading dimension into one, keeping the last axis.
        rescaled_outputs.append(output.reshape(1, -1, output.shape[-1]))

    return rescaled_outputs