From 1dab9bc8a9192a6081821c3a6b6c4aee3b7912c3 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 3 Jul 2024 17:56:59 -0600
Subject: [PATCH] [Bugfix] set OMP_NUM_THREADS to 1 by default for
 multiprocessing (#6109)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
---
 vllm/executor/multiproc_gpu_executor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index ae5062bd..dcde2797 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -37,6 +37,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
+        # contention amongst the shards
+        if "OMP_NUM_THREADS" not in os.environ:
+            os.environ["OMP_NUM_THREADS"] = "1"
+
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")