[Misc] Add logging for CUDA memory (#10027)
Signed-off-by: Chenghao Yang <yangalan1996@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Chenghao Yang <yangalan1996@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
parent cd34029e91
commit 09d3550372
@@ -48,9 +48,10 @@ from vllm.prompt_adapter.worker_manager import (
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
-                        flatten_2d_lists, is_pin_memory_available,
-                        supports_dynamo, weak_ref_tensor)
+from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache,
+                        async_tensor_h2d, flatten_2d_lists,
+                        is_pin_memory_available, supports_dynamo,
+                        weak_ref_tensor)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
     _add_attn_metadata_broadcastable_dict,
@@ -1383,16 +1384,16 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.info("Capturing cudagraphs for decoding. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
-                    "If you are running out of memory, consider decreasing "
-                    "`gpu_memory_utilization` or enforcing eager mode. "
-                    "You can also reduce the `max_num_seqs` as needed "
-                    "to decrease memory usage.")
+        logger.info("If out-of-memory error occurs during cudagraph capture,"
+                    " consider decreasing `gpu_memory_utilization` or "
+                    "switching to eager mode. You can also reduce the "
+                    "`max_num_seqs` as needed to decrease memory usage.")
         start_time = time.perf_counter()
+        start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = self.max_batchsize_to_capture
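For reference, torch.cuda.mem_get_info() reports device-level memory as a (free_bytes, total_bytes) tuple, so index [0] is the free memory snapshotted just before capture starts. A minimal standalone sketch of that call (the printed values are illustrative only):

import torch

# (free_bytes, total_bytes) for the current CUDA device; the diff above
# keeps only the free-memory value.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"free: {free_bytes / (1 << 30):.2f} GiB, "
      f"total: {total_bytes / (1 << 30):.2f} GiB")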
@@ -1497,9 +1498,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     graph_runner)
 
         end_time = time.perf_counter()
+        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time
+        cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+                    elapsed_time, cuda_graph_size / GiB_bytes)
 
     def _update_inputs_to_capture_for_enc_dec_model(self,
                                                     capture_inputs: Dict[str,
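The new logging follows a simple before/after pattern: snapshot free device memory on either side of graph capture and report the delta. Below is a self-contained sketch of that pattern outside vLLM; measure_gpu_memory is a hypothetical helper, and the local GiB_bytes constant is assumed to equal 2**30 like the vllm.utils value imported above.

import time
import torch

GiB_bytes = 1 << 30  # assumption: mirrors the vllm.utils constant


def measure_gpu_memory(fn):
    # Snapshot wall time and free device memory, run the workload, then
    # report how long it took and how much device memory it consumed.
    start_time = time.perf_counter()
    start_free_gpu_memory = torch.cuda.mem_get_info()[0]

    fn()

    end_time = time.perf_counter()
    end_free_gpu_memory = torch.cuda.mem_get_info()[0]
    used = start_free_gpu_memory - end_free_gpu_memory
    print("finished in %.0f secs, took %.2f GiB"
          % (end_time - start_time, used / GiB_bytes))


if __name__ == "__main__":
    tensors = []
    # Allocating a 1 GiB fp32 tensor (256M elements x 4 bytes) should
    # report roughly 1 GiB of device memory used.
    measure_gpu_memory(lambda: tensors.append(
        torch.empty(256 * 1024 * 1024, dtype=torch.float32, device="cuda")))

Note that mem_get_info() measures free memory at the driver level, so the reported delta includes everything allocated during capture, not just what the caching allocator attributes to the process.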