Latency Benchmark for various batch sizes (1, 8, 16, 32, 63)

Hi, I have written the Python script below (measure_latency_kpi_sync) to benchmark latency for various batch sizes (1, 8, 16, 32, 63) using the synchronous run command, but the results do not match the hailortcli benchmark tool.

Similarly, I benchmark FPS (measure_fps_kpi_async) using run_async, but the results don't match the numbers published in the Hailo8L Model Zoo.

Could you please check and let us know what we are doing wrong, and how we can improve the code so it matches the published Hailo8L results? Thanks.

HailoRT version 4.22.0
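
For reference, the numbers I compare against come from the CLI benchmark tool, run roughly as below (the exact flags may differ per version; hailortcli benchmark --help lists them):

hailortcli benchmark model.hef --batch-size 8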

import os
import time
from functools import partial

import numpy as np
from hailo_platform import VDevice, HailoSchedulingAlgorithm

def measure_latency_kpi_sync(hef_file, batch_size):
	timeout_ms = 10000
	number_of_frames = 100
	inference_times = []

	params = VDevice.create_params()
	params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

	# The vdevice is used as a context manager ("with" statement) to ensure it's released on time.
	with VDevice(params) as vdevice:

		# Create an infer model from an HEF:
		infer_model = vdevice.create_infer_model(hef_file)
		infer_model.set_batch_size(batch_size)
		# Configure the infer model and create bindings for it
		with infer_model.configure() as configured_infer_model:
			bindings = configured_infer_model.create_bindings()
			#print(infer_model.input().shape)
			for i in range(number_of_frames):
				# Set input and output buffers
				buffer = np.empty(infer_model.input().shape, dtype=np.uint8)
				bindings.input().set_buffer(buffer)
				if len(bindings._output_names) == 1:
					bindings.output().set_buffer(np.empty(infer_model.output().shape, dtype=np.uint8))
				else:
					for name in bindings._output_names:
						bindings.output(name).set_buffer(np.empty(infer_model.output(name).shape, dtype=np.uint8))
				# The same bindings object is queued batch_size times per run
				bindings_list = [bindings] * batch_size
				# Run synchronous inference and access the output buffers
				start_time = time.time()
				configured_infer_model.run(bindings_list, timeout_ms)
				inference_time = time.time() - start_time
				inference_times.append(inference_time)
			latency_mean = np.mean(inference_times) * 1000
			latency_std = np.std(inference_times) * 1000
			print(f"Average latency of model {os.path.basename(hef_file)} is {latency_mean} ms and std: {latency_std} ms for batch_size {batch_size}\n")
			return latency_mean, latency_std
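
The async function below passes example_callback to run_async; it is not shown in the snippet, so here is a minimal placeholder for it (assuming the callback receives the completion-info object, as in the HailoRT async inference examples):

def example_callback(completion_info, bindings):
	# Placeholder callback: just surface errors; the output buffers are
	# already attached to the bindings passed in via functools.partial.
	if completion_info.exception:
		print(f"Inference error: {completion_info.exception}")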

def measure_fps_kpi_async(hef_file):
	number_of_frames = 5000
	timeout_ms = 10000
	fps_times = []
	params = VDevice.create_params()
	params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN
	params.group_id = "SHARED"
	with VDevice(params) as vdevice:
		# Create an infer model from an HEF:
		infer_model = vdevice.create_infer_model(hef_file)
		
		# Once the infer model is set, configure the infer model
		with infer_model.configure() as configured_infer_model:
			bindings = configured_infer_model.create_bindings()
			for i in range(3):
				inference_times = []
				for _ in range(number_of_frames):
					# Create bindings for it and set buffers
					bindings.input().set_buffer(np.empty(infer_model.input().shape, dtype=np.uint8))
					if len(bindings._output_names) == 1:
						bindings.output().set_buffer(np.empty(infer_model.output().shape, dtype=np.uint8))
					else:
						for name in bindings._output_names:
							bindings.output(name).set_buffer(np.empty(infer_model.output(name).shape, dtype=np.uint8))
					start_time = time.time()
					configured_infer_model.wait_for_async_ready(timeout_ms=10000)
					job = configured_infer_model.run_async([bindings], partial(example_callback, bindings=bindings))
					configured_infer_model.wait_for_async_ready(timeout_ms=10000)
					inference_time = time.time() - start_time
					inference_times.append(inference_time)
				job.wait(timeout_ms)
				fps_times.append(1/np.mean(inference_times))
			fps_mean = np.mean(fps_times)
			fps_std = np.std(fps_times)
			print(f" FPS of model {os.path.basename(hef_file)} is {fps_mean} and std: {fps_std} \n")
			return fps_mean, fps_std
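
For completeness, the two functions are driven roughly like this (the HEF path is a placeholder):

if __name__ == "__main__":
	hef_path = "model.hef"  # placeholder path to the compiled HEF
	for bs in (1, 8, 16, 32, 63):
		measure_latency_kpi_sync(hef_path, bs)
	measure_fps_kpi_async(hef_path)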