PySDK long delay on first inference, subsequent inferences fast

I’ve pretty easily implemented the Facial Recognition system documented here: Facial Recognition with PySDK, and it’s working great. My one concern is that the very first time I use the facial detection model, it takes about 1.5 seconds. The same goes for the first time I use the facial vectoring model (again, about 1.5 seconds). After that first use, inference drops to about 0.02 seconds. Huge difference. Is this a known behavior where we should be doing some sort of initialization before using the models in real-time frame analysis? Should we just run a blank inference on start-up to “prime” them? The images show the times for the first and second inference.


Code:

face_rec_model_name = "arcface_mobilefacenet--112x112_quant_hailort_hailo8l_1"
face_det_model_name = "scrfd_500m--640x640_quant_hailort_hailo8l_1"

# Face Model
FaceModel = dg.load_model(
    model_name=face_det_model_name,
    inference_host_address=your_host_address,
    zoo_url=your_model_zoo,
    token=your_token,
    device_type=device_type,
    overlay_color=[(255,255,0),(0,255,0)]   
)

# Face Recognition Model
FaceVectorModel = dg.load_model(
    model_name=face_rec_model_name,
    inference_host_address=your_host_address,
    zoo_url=your_model_zoo,
    token=your_token,
    device_type=device_type,
)

# INSIDE SOME FUNCTION...

        print(f"{time.monotonic()}: Before Face Detection")
        faceDetectResult = FaceModel(imageForFaceRec)  
        print(f"{time.monotonic()}: After Facial Detection")
        if len(faceDetectResult.results) > 0:
            face = faceDetectResult.results[0]
            if "label" in face and face["label"] == "face":
                if "score" in face and face["score"] > min_confidence:
                    # Extract bounding box (assumed in [x1, y1, x2, y2] format)
                    x1, y1, x2, y2 = map(int, face["bbox"])  # Convert bbox coordinates to integers
                    cropped_face, _ = self.crop_with_context(imageForFaceRec, [x1, y1, x2, y2])

                    # Display cropped faces for testing
                    if DEBUG:
                        cv2.imshow('Face', cropped_face)

                    # align and crop face.
                    landmarks = [landmark["landmark"] for landmark in face["landmarks"]]
                    print(f"{time.monotonic()}: Before align and crop")
                    aligned_face, _ = self.align_and_crop(imageForFaceRec, landmarks)
                    print(f"{time.monotonic()}: After align and crop")
                    # Display the aligned face
                    if DEBUG:
                        cv2.imshow('Aligned Face', aligned_face)

                    # Get the embeddings
                    print(f"{time.monotonic()}: Before Facial Vecotring")
                    face_embedding = FaceVectorModel(aligned_face).results[0]["data"][0]
                    print(f"{time.monotonic()}: After Facial Vecotring")
                    self.faceVector = face_embedding

Thank you.

Hi @user116
Yes, the first inference for a model includes model-loading overhead. As you suggested, you can run a warm-up inference on all models before the main application code.
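
For example, a minimal warm-up sketch (assuming the FaceModel and FaceVectorModel variables from your code, and that each dummy input matches the input resolution in the model name, 640x640 and 112x112):

import numpy as np

# Run one throwaway inference per model so that the loading overhead
# happens at start-up instead of on the first real frame. Results are discarded.
for model, shape in [(FaceModel, (640, 640, 3)), (FaceVectorModel, (112, 112, 3))]:
    model(np.zeros(shape, dtype=np.uint8))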

How many models can be loaded at one time? I have pose detection as my main model, then face detection and facial analysis (vector creation) if certain conditions are met. Would I need to warm up between each switch of inference model?

Hi @user116, you do not need to warm up between every switch of inference models. You can load all the models at once and run a quick warm-up before starting the inference loop. Please see the code below for reference:

import cv2, numpy as np, degirum as dg

# ------------------------------------------------------------------
# 1. SETUP
# ------------------------------------------------------------------
host = "@local"
zoo = "degirum/hailo"
device_type = "HAILORT/HAILO8L"
token = ""
pose_model_name = "yolov8n_relu6_coco_pose--640x640_quant_hailort_hailo8l_1"
face_model_name = "scrfd_500m--640x640_quant_hailort_hailo8l_1"
face_vec_model_name = "arcface_mobilefacenet--112x112_quant_hailort_hailo8l_1"

print("Loading models...")
PoseModel = dg.load_model(model_name=pose_model_name, inference_host_address=host, zoo_url=zoo, token=token, device_type=device_type)
FaceModel = dg.load_model(model_name=face_model_name, inference_host_address=host, zoo_url=zoo, token=token, device_type=device_type)
FaceVectorModel = dg.load_model(model_name=face_vec_model_name, inference_host_address=host, zoo_url=zoo, token=token, device_type=device_type)


# Dummy image for warm-up
dummy_pose_img = np.zeros((640,640,3), dtype=np.uint8)
dummy_face_img = np.zeros((640,640,3), dtype=np.uint8)
dummy_face_crop = np.zeros((112,112,3), dtype=np.uint8)

print("Warming up models...")
PoseModel(dummy_pose_img)
FaceModel(dummy_face_img)
FaceVectorModel(dummy_face_crop)

print("Warm-up complete. Models are ready for real-time inference.")


cap = cv2.VideoCapture(0) # change to a file path or RTSP if needed

while True:
  ret, frame = cap.read()
  if not ret: break

  # --- Pose inference ---
  pose_result = PoseModel(frame)
  print("\n========== POSE MODEL ==========")
  # print(f"Raw Output: {pose_result} \n ")
  print(f"Number of people detected: {len(pose_result.results)}")
  overlay = pose_result.image_overlay.copy()

  # Dummy condition: run face inference only if >1 person detected
  if len(pose_result.results) > 1:
    print(f"***** Condition met: {len(pose_result.results)} persons found")

    # --- Run Face Detection ---
    face_result = FaceModel(frame)
    print("\n========== FACE DETECTION MODEL ==========")
    # print(f"Raw Output: {face_result}")
    print(f"Number of faces detected: {len(face_result.results)}")
    for face in face_result.results:
      x1,y1,x2,y2 = map(int, face["bbox"])
      face_crop = frame[y1:y2, x1:x2]

      # Resize to 112x112 for FaceVec
      if face_crop.shape[0] > 0 and face_crop.shape[1] > 0:
        face_resized = cv2.resize(face_crop, (112,112))
        vec_result = FaceVectorModel(face_resized)
        embedding = np.asarray(vec_result.results[0]["data"]).flatten()

        # Optional: print embedding length or preview ID
        print("\n========== FACE VECTOR MODEL ==========")
        # print(f"Raw Output: {vec_result}")
        print(f"Embedding Length: {len(embedding)}")
        print(f"Embedding Vector (first 5): {embedding[:5]}")
        print(f"Embedding Norm: {np.linalg.norm(embedding)}")
        cv2.putText(overlay, f"VecID: {embedding[0]:.2f}", (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 1)

  # Show overlay
  cv2.imshow("Pose + Optional Face Pipeline", overlay)
  if cv2.waitKey(1) & 0xFF in (ord('q'), ord('x')):
    break

cap.release()
cv2.destroyAllWindows()
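
Once you have the embeddings, the usual next step is comparing them. As a minimal sketch (assuming cosine similarity, the common metric for ArcFace-style embeddings; the 0.5 threshold is a placeholder you would tune on your own data):

import numpy as np

def cosine_similarity(a, b):
    # 1.0 means the vectors point the same way; values near 0 mean unrelated.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical usage: match a live embedding against an enrolled one.
# if cosine_similarity(embedding, enrolled_embedding) > 0.5:
#     print("Match")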

You can also try the sample code below to measure the latency difference between running the model with and without warm-up:

import time, numpy as np, degirum as dg

# Load a Hailo model
model = dg.load_model(
    model_name="yolov8n_relu6_coco_pose--640x640_quant_hailort_hailo8l_1",
    inference_host_address="@local",
    token="dg_8PNGrkCskAPQooMPxoBRT8qBPSzac2cKoF2Qo",
    zoo_url="degirum/hailo",
    device_type="HAILORT/HAILO8L"
)

dummy_input = np.zeros((640,640,3), dtype=np.uint8)

# --- Inference WITHOUT warm-up ---
start = time.time()
_ = model(dummy_input)
t1 = time.time() - start
print(f"First inference (no warm-up): {t1*1000:.1f} ms")

# --- Inference WITH warm-up ---
_ = model(dummy_input)  # warm-up step

start = time.time()
_ = model(dummy_input)
t2 = time.time() - start
print(f"Subsequent inference (warmed up): {t2*1000:.1f} ms")

Please let us know if you have further questions.