From 30bad5c49278ec5c3836a7bf00faa1316e8827b8 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 12 Dec 2023 22:01:53 -0800
Subject: [PATCH] Fix peak memory profiling (#2031)

---
 vllm/utils.py         | 5 -----
 vllm/worker/worker.py | 7 +++----
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index d5d8d4ef..7ec9e328 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -40,11 +40,6 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     return int(max_shared_mem)
 
 
-def get_gpu_memory(gpu: int = 0) -> int:
-    """Returns the total memory of the GPU in bytes."""
-    return torch.cuda.get_device_properties(gpu).total_memory
-
-
 def get_cpu_memory() -> int:
     """Returns the total CPU memory of the node in bytes."""
     return psutil.virtual_memory().total
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 6f5e16f0..e3294911 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -13,7 +13,6 @@ from vllm.model_executor.parallel_utils.parallel_state import (
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
-from vllm.utils import get_gpu_memory
 
 
 class Worker:
@@ -81,7 +80,6 @@ class Worker:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
@@ -90,8 +88,9 @@ class Worker:
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
         torch.cuda.synchronize()
-        peak_memory = torch.cuda.max_memory_allocated()
-        total_gpu_memory = get_gpu_memory()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        peak_memory = total_gpu_memory - free_gpu_memory
+
         cache_block_size = CacheEngine.get_cache_block_size(
             block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int(
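
Why this fix matters: torch.cuda.max_memory_allocated() only tracks allocations made
through PyTorch's caching allocator, so it misses memory consumed by the CUDA context,
NCCL buffers, and other non-PyTorch allocations; the patch instead derives peak usage
from torch.cuda.mem_get_info(), which reports device-wide free and total bytes. Below
is a minimal sketch of the difference, not vLLM's actual code: it assumes a CUDA-capable
GPU, and run_dummy_forward_pass is a hypothetical stand-in for the worker's real
profiling forward pass.

    import torch

    def run_dummy_forward_pass() -> None:
        # Hypothetical stand-in for the worker's profiling run: allocate and
        # use some GPU memory the way a real forward pass would.
        x = torch.randn(1024, 1024, device="cuda")
        (x @ x).sum().item()

    def peak_memory_torch_only() -> int:
        # Old approach: counts only PyTorch-tracked allocations.
        torch.cuda.reset_peak_memory_stats()
        run_dummy_forward_pass()
        torch.cuda.synchronize()
        return torch.cuda.max_memory_allocated()

    def peak_memory_device_wide() -> int:
        # New approach: total minus free, as reported by the CUDA driver,
        # captures everything resident on the device.
        run_dummy_forward_pass()
        torch.cuda.synchronize()
        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
        return total_gpu_memory - free_gpu_memory

    print(f"torch-only peak:  {peak_memory_torch_only() / 1e9:.2f} GB")
    print(f"device-wide peak: {peak_memory_device_wide() / 1e9:.2f} GB")

Note that the device-wide measurement also includes memory held by other processes on
the same GPU, which is the safer bound when deciding how many KV-cache blocks fit in
the remaining free memory.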