Input and output issues of the CLIP ViT-B/16 image encoder

li_hua · November 14, 2025, 10:11am

I encountered an issue when using the clip_vit_b_16_image_encoder public model from hailo_model_zoo (version 2.17). The model library link is: hailo_model_zoo/docs/public_models/HAILO8/HAILO8_zero_shot_classification.rst at update-to-version-2.17 · hailo-ai/hailo_model_zoo · GitHub .

I fed the same preprocessed data into both the ONNX model and HEF model included in the library, but the output vectors of the two models are inconsistent. Could anyone help with this? I would like to know how to configure or adjust the operations to make the output vector of the HEF model consistent with that of the ONNX model.

The following is the running code
import onnxruntime as ort

import numpy as np

from PIL import Image

from torchvision import transforms

from torchvision.transforms import functional as TF

from hailo_platform import HEF, VDevice, InputVStreamParams, OutputVStreamParams, FormatType, HailoStreamInterface, InferVStreams, ConfigureParams,HailoSchedulingAlgorithm




normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],

                                 std=[0.26862954, 0.26130258, 0.27577711])




def onnx_infer(onnx_path,img_array):

    # 创建ONNX Runtime会话

    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

    input_name = session.get_inputs()[0].name

    output_name = session.get_outputs()[0].name

    onnx_output = session.run([output_name], {input_name: img_array})[0]

    return onnx_output.squeeze()




def hef_infer(hef_path,img_array):

    first_hef2 = HEF(hef_path)

    hefs = [first_hef2]




    # Creating the VDevice target with scheduler enabled

    params = VDevice.create_params()

    params.scheduling_algorithm = HailoSchedulingAlgorithm.ROUND_ROBIN

    params.multi_process_service = True

    params.group_id = "SHARED"

    target = VDevice(params)




    configure_params0 = ConfigureParams.create_from_hef(hefs[0], interface=HailoStreamInterface.PCIe)

    model_name0 = hefs[0].get_network_group_names()[0]

    batch_size = 1

    configure_params0[model_name0].batch_size = batch_size

    network_groups0 = target.configure(hefs[0], configure_params0)

    network_group0 = network_groups0[0]




    input_vstreams_params0 = InputVStreamParams.make(network_group0, quantized=False,format_type=FormatType.FLOAT32)

    output_vstreams_params0 = OutputVStreamParams.make(network_group0, quantized=True,format_type=FormatType.FLOAT32)

    input_vstream_info0 = hefs[0].get_input_vstream_infos()[0]

    output_vstream_info0 = hefs[0].get_output_vstream_infos()[0]





    input = input_vstream_info0.name

    input_dict = {input: img_array}




    with InferVStreams(network_group0, input_vstreams_params0, output_vstreams_params0, tf_nms_format = True) as infer_pipeline0:

        output_data = infer_pipeline0.infer(input_dict)

        tt = output_data[output_vstream_info0.name].squeeze()

        return tt




def read_img(img_path):

    img = Image.open(img_path).convert('RGB')

    img = TF.resize(img, 224, transforms.InterpolationMode.LANCZOS)

    img = TF.center_crop(img, (224,224))

    img = TF.to_tensor(img).to('cpu')

    img = normalize(img)

    img_np = img.numpy() 

    img_array = np.expand_dims(img_np, axis=0)

    return img_array





img_array=read_img('./mountain-477832_1280.jpg')

onnx=onnx_infer('./clip_vit_b_16.onnx',img_array)

hef=hef_infer('./clip_vit_b_16_image_encoder.hef',img_array)




cos_sim = np.dot(onnx, hef) / (

    np.linalg.norm(onnx) * np.linalg.norm(hef)

)

l2_dist = np.linalg.norm(onnx - hef)




print(f"cos_sim:{cos_sim},l2_dist:{l2_dist}")

The following is the running result