[misc] [doc] [frontend] LLM torch profiler support (#7943)
This commit is contained in:
parent
29f49cd6e3
commit
12dd715807
@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.

 .. tip::

     Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.

-Example commands:
+.. tip::
+
+    When the profiler is stopped, it flushes all of the profile trace files to the directory. This takes time: for about 100 requests' worth of data for a Llama 70B, it takes roughly 10 minutes to flush out on an H100.
+    Set the environment variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a large value before you start the server, e.g. 30 minutes:
+    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+
+Example commands and usage:
+===========================
+
+Offline Inference:
+------------------
+
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
+

 OpenAI Server:
+--------------

 .. code-block:: bash

-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B

 benchmark_serving.py:
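The tips above note that the traces can get large and that they can be loaded into https://ui.perfetto.dev/ without untarring. A minimal sketch for checking what the profiler wrote, assuming the output directory used in the commands above (./vllm_profile); the exact file names depend on the torch profiler's trace handler:

from pathlib import Path

# List whatever the torch profiler flushed into the configured directory.
# Each trace file can be dropped into https://ui.perfetto.dev/ as-is.
trace_dir = Path("./vllm_profile")
for trace in sorted(trace_dir.glob("*")):
    size_mb = trace.stat().st_size / 1e6
    print(f"{trace.name}: {size_mb:.1f} MB")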
examples/offline_inference_with_profiler.py (new file, 33 lines)

@@ -0,0 +1,33 @@
+import os
+
+from vllm import LLM, SamplingParams
+
+# enable torch profiler, can also be set on cmd line
+os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="facebook/opt-125m")
+
+llm.start_profile()
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+
+llm.stop_profile()
+
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
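Since stop_profile() is what triggers the (potentially slow) trace flush, it can be convenient to guarantee it runs even if generation raises. A small sketch built on the llm.start_profile() / llm.stop_profile() API added in this commit; the torch_profile helper is illustrative, not part of vLLM:

from contextlib import contextmanager

@contextmanager
def torch_profile(llm):
    """Illustrative helper: profile everything inside the `with` block."""
    llm.start_profile()
    try:
        yield
    finally:
        # Always stop, so the trace files get flushed to VLLM_TORCH_PROFILER_DIR.
        llm.stop_profile()

# Usage with the example above:
#     with torch_profile(llm):
#         outputs = llm.generate(prompts, sampling_params)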
@@ -1914,6 +1914,12 @@ class LLMEngine:
         self.tokenizer.check_health()
         self.model_executor.check_health()

+    def start_profile(self) -> None:
+        self.model_executor.start_profile()
+
+    def stop_profile(self) -> None:
+        self.model_executor.stop_profile()
+
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None
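The new engine methods simply forward to the executor. For code that drives LLMEngine directly rather than through the LLM class, profiling a few steps might look roughly like the sketch below; engine construction and request handling are simplified and illustrative, not the committed test code:

import os

from vllm import EngineArgs, LLMEngine, SamplingParams

os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("profile-0", "The capital of France is",
                   SamplingParams(temperature=0.8, top_p=0.95))

engine.start_profile()                 # forwarded to model_executor, then the workers
while engine.has_unfinished_requests():
    engine.step()
engine.stop_profile()                  # flushes the torch profiler traces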
@@ -560,6 +560,12 @@ class LLM:
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput)

+    def start_profile(self) -> None:
+        self.llm_engine.start_profile()
+
+    def stop_profile(self) -> None:
+        self.llm_engine.stop_profile()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
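One thing these public methods make easy, sketched below (assuming VLLM_TORCH_PROFILER_DIR is set as in the docs above), is scoping the profile to steady-state generation while leaving warmup out of the trace:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Warmup request outside the profiled region (keeps the trace small and focused).
llm.generate(["Hello, my name is"], sampling_params)

llm.start_profile()
outputs = llm.generate(["The future of AI is"], sampling_params)
llm.stop_profile()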
@@ -296,6 +296,12 @@ class CPUExecutor(ExecutorBase):
         for result in parallel_worker_tasks:
             result.get()

+    def start_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "start_profile")
+
+    def stop_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "stop_profile")
+

 class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
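CPUExecutor forwards the calls by method name through driver_method_invoker rather than calling the worker directly. A generic sketch of that invoke-by-name pattern (not vLLM's actual implementation) for readers unfamiliar with it:

def invoke_by_name(worker, method: str, *args, **kwargs):
    # Resolving the method by name on whatever `worker` happens to be
    # (an in-process object or a proxy to another process) lets one code
    # path dispatch "start_profile" / "stop_profile" either way.
    return getattr(worker, method)(*args, **kwargs)

# e.g. invoke_by_name(driver_worker, "start_profile")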
@@ -169,6 +169,12 @@ class GPUExecutor(ExecutorBase):
         # it's running.
         return

+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+

 class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
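Both executors ultimately call start_profile() / stop_profile() on the driver worker, which is not part of this excerpt. A minimal sketch of what a worker-side implementation could look like with torch.profiler and the VLLM_TORCH_PROFILER_DIR variable from the docs; the class name, handler options, and structure here are assumptions, not the committed code:

import os

import torch


class ProfilerHookSketch:
    """Assumed worker-side hook, not the committed vLLM code."""

    def __init__(self) -> None:
        trace_dir = os.getenv("VLLM_TORCH_PROFILER_DIR")
        self.profiler = None
        if trace_dir:
            # Profile CPU + CUDA activity and write Perfetto/Chrome-compatible
            # traces into the configured directory when the profiler stops.
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    trace_dir, use_gzip=True),
            )

    def start_profile(self) -> None:
        if self.profiler is not None:
            self.profiler.start()

    def stop_profile(self) -> None:
        if self.profiler is not None:
            # stop() triggers on_trace_ready and flushes the trace files;
            # this is the step the docs above warn can take several minutes.
            self.profiler.stop()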