From 1dab9bc8a9192a6081821c3a6b6c4aee3b7912c3 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 3 Jul 2024 17:56:59 -0600 Subject: [PATCH] [Bugfix] set OMP_NUM_THREADS to 1 by default for multiprocessing (#6109) Signed-off-by: Travis Johnson Co-authored-by: Nick Hill --- vllm/executor/multiproc_gpu_executor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index ae5062bd..dcde2797 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -37,6 +37,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU + # contention amongst the shards + if "OMP_NUM_THREADS" not in os.environ: + os.environ["OMP_NUM_THREADS"] = "1" + assert world_size <= cuda_device_count_stateless(), ( "please set tensor_parallel_size to less than max local gpu count")