Hi, I have a 6-GPU Ubuntu server, and when compiling the ONNX model using the following command
hailomz compile yolov8m --ckpt=hf100yolov8m.onnx --hw-arch hailo8 --calib-path train/images --classes 1 --performance
the following error occurs:
Detected at node 'NcclAllReduce_1' defined at (most recent call last):
File "/home/spx/anaconda3/envs/lbz/bin/hailomz", line 33, in <module>
sys.exit(load_entry_point('hailo-model-zoo', 'console_scripts', 'hailomz')())
File "/home/spx/yolov8compile/hailo_model_zoo/hailo_model_zoo/main.py", line 122, in main
run(args)
File "/home/spx/yolov8compile/hailo_model_zoo/hailo_model_zoo/main.py", line 111, in run
return handlers[args.command](args)
File "/home/spx/yolov8compile/hailo_model_zoo/hailo_model_zoo/main_driver.py", line 248, in compile
_ensure_optimized(runner, logger, args, network_info)
File "/home/spx/yolov8compile/hailo_model_zoo/hailo_model_zoo/main_driver.py", line 91, in _ensure_optimized
optimize_model(
File "/home/spx/yolov8compile/hailo_model_zoo/hailo_model_zoo/core/main_utils.py", line 353, in optimize_model
runner.optimize(calib_feed_callback)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
return func(self, *args, **kwargs)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_client/runner/client_runner.py", line 2201, in optimize
result = self._optimize(
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_common/states/states.py", line 16, in wrapped_func
return func(self, *args, **kwargs)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_client/runner/client_runner.py", line 2020, in _optimize
checkpoint_info = self._sdk_backend.full_quantization(
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_client/sdk_backend/sdk_backend.py", line 1196, in full_quantization
new_checkpoint_info = self._full_acceleras_run(
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_client/sdk_backend/sdk_backend.py", line 1434, in _full_acceleras_run
new_checkpoint_info = self._optimization_flow_runner(optimization_flow, checkpoint_info)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_sdk_client/sdk_backend/sdk_backend.py", line 2088, in _optimization_flow_runner
optimization_flow.run()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/tools/orchestator.py", line 239, in wrapper
return func(self, *args, **kwargs)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/flows/optimization_flow.py", line 357, in run
step_func()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/tools/subprocess_wrapper.py", line 125, in parent_wrapper
proc.start()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/context.py", line 281, in _Popen
return Popen(process_obj)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/popen_fork.py", line 71, in _launch
code = process_obj._bootstrap(parent_sentinel=child_r)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/tools/subprocess_wrapper.py", line 104, in child_wrapper
func(self, *args, **kwargs)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/flows/optimization_flow.py", line 382, in step2
self.post_quantization_optimization()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/flows/optimization_flow.py", line 450, in post_quantization_optimization
self._finetune()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/flows/optimization_flow.py", line 748, in _finetune
algo.run()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/algorithms/optimization_algorithm.py", line 55, in run
return super().run()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/algorithms/algorithm_base.py", line 159, in run
self._run_int()
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/algorithms/finetune/qft.py", line 387, in _run_int
self.run_qft(self._model_native, self._model, metrics=self.metrics)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/hailo_model_optimization/algorithms/finetune/qft.py", line 507, in run_qft
self.main_train_summary_per_epoch = qft_distiller.fit(
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/keras/engine/training.py", line 1685, in fit
tmp_logs = self.train_function(iterator)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function
return step_function(self, iterator)
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/spx/anaconda3/envs/lbz/lib/python3.10/site-packages/keras/optimizers/utils.py", line 175, in _all_reduce_sum_fn
return distribution.extended.batch_reduce_to(
Node: 'NcclAllReduce_1'
6 root error(s) found.
(0) UNKNOWN: Error invoking NCCL: unhandled cuda error
[[{{node NcclAllReduce}}]]
(1) UNKNOWN: Error invoking NCCL: unhandled cuda error
[[{{node NcclAllReduce_5}}]]
(2) UNKNOWN: Error invoking NCCL: unhandled cuda error
[[{{node NcclAllReduce_4}}]]
(3) UNKNOWN: Error invoking NCCL: unhandled cuda error
[[{{node NcclAllReduce_3}}]]
(4) UNKNOWN: Error invoking NCCL: unhandled cuda error
[[{{node NcclAllReduce_2}}]]
(5) UNKNOWN: Error invoking NCCL: unhandled cuda error
[[{{node NcclAllReduce_1}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1429544]
It seems to be a problem with optimizing on multiple GPUs, so I tried restricting the run to a single GPU:
export CUDA_VISIBLE_DEVICES=0
and it worked!
I would like to know whether the current version of hailomz does not support multiple GPUs, or whether there are other options that I have not added.
Thank you very much for your answer.