[misc] [doc] [frontend] LLM torch profiler support (#7943)
This commit is contained in:
parent
29f49cd6e3
commit
12dd715807
@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.

 .. tip::

     Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.

-Example commands:
+.. tip::
+
+    When the profiler is stopped, it flushes all of the profile trace files to the directory. This takes time: for about 100 requests' worth of data for a Llama 70B, it takes roughly 10 minutes to flush out on an H100.
+    Set the environment variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a large value before you start the server, e.g. 30 minutes:
+    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+
+Example commands and usage:
+===========================
+
+Offline Inference:
+------------------
+
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
+

 OpenAI Server:
+--------------

 .. code-block:: bash

-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B

 benchmark_serving.py:
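The tips above note that the traces can get large and that they can be loaded into https://ui.perfetto.dev/ without untarring. A minimal sketch for checking what the profiler wrote, assuming the output directory used in the commands above (./vllm_profile); the exact file names depend on the torch profiler's trace handler:

from pathlib import Path

# List whatever the torch profiler flushed into the configured directory.
# Each trace file can be dropped into https://ui.perfetto.dev/ as-is.
trace_dir = Path("./vllm_profile")
for trace in sorted(trace_dir.glob("*")):
    size_mb = trace.stat().st_size / 1e6
    print(f"{trace.name}: {size_mb:.1f} MB")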
examples/offline_inference_with_profiler.py (new file, 33 lines)

@@ -0,0 +1,33 @@
+import os
+
+from vllm import LLM, SamplingParams
+
+# enable torch profiler, can also be set on cmd line
+os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="facebook/opt-125m")
+
+llm.start_profile()
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+
+llm.stop_profile()
+
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
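Since stop_profile() is what triggers the (potentially slow) trace flush, it can be convenient to guarantee it runs even if generation raises. A small sketch built on the llm.start_profile() / llm.stop_profile() API added in this commit; the torch_profile helper is illustrative, not part of vLLM:

from contextlib import contextmanager

@contextmanager
def torch_profile(llm):
    """Illustrative helper: profile everything inside the `with` block."""
    llm.start_profile()
    try:
        yield
    finally:
        # Always stop, so the trace files get flushed to VLLM_TORCH_PROFILER_DIR.
        llm.stop_profile()

# Usage with the example above:
#     with torch_profile(llm):
#         outputs = llm.generate(prompts, sampling_params)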
@@ -1914,6 +1914,12 @@ class LLMEngine:
         self.tokenizer.check_health()
         self.model_executor.check_health()

+    def start_profile(self) -> None:
+        self.model_executor.start_profile()
+
+    def stop_profile(self) -> None:
+        self.model_executor.stop_profile()
+
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None
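The new engine methods simply forward to the executor. For code that drives LLMEngine directly rather than through the LLM class, profiling a few steps might look roughly like the sketch below; engine construction and request handling are simplified and illustrative, not the committed test code:

import os

from vllm import EngineArgs, LLMEngine, SamplingParams

os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("profile-0", "The capital of France is",
                   SamplingParams(temperature=0.8, top_p=0.95))

engine.start_profile()                 # forwarded to model_executor, then the workers
while engine.has_unfinished_requests():
    engine.step()
engine.stop_profile()                  # flushes the torch profiler traces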
@@ -560,6 +560,12 @@ class LLM:
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput)

+    def start_profile(self) -> None:
+        self.llm_engine.start_profile()
+
+    def stop_profile(self) -> None:
+        self.llm_engine.stop_profile()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
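One thing these public methods make easy, sketched below (assuming VLLM_TORCH_PROFILER_DIR is set as in the docs above), is scoping the profile to steady-state generation while leaving warmup out of the trace:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Warmup request outside the profiled region (keeps the trace small and focused).
llm.generate(["Hello, my name is"], sampling_params)

llm.start_profile()
outputs = llm.generate(["The future of AI is"], sampling_params)
llm.stop_profile()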
@@ -296,6 +296,12 @@ class CPUExecutor(ExecutorBase):
         for result in parallel_worker_tasks:
             result.get()

+    def start_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "start_profile")
+
+    def stop_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "stop_profile")
+

 class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
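CPUExecutor forwards the calls by method name through driver_method_invoker rather than calling the worker directly. A generic sketch of that invoke-by-name pattern (not vLLM's actual implementation) for readers unfamiliar with it:

def invoke_by_name(worker, method: str, *args, **kwargs):
    # Resolving the method by name on whatever `worker` happens to be
    # (an in-process object or a proxy to another process) lets one code
    # path dispatch "start_profile" / "stop_profile" either way.
    return getattr(worker, method)(*args, **kwargs)

# e.g. invoke_by_name(driver_worker, "start_profile")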
@@ -169,6 +169,12 @@ class GPUExecutor(ExecutorBase):
         # it's running.
         return

+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+

 class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
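Both executors ultimately call start_profile() / stop_profile() on the driver worker, which is not part of this excerpt. A minimal sketch of what a worker-side implementation could look like with torch.profiler and the VLLM_TORCH_PROFILER_DIR variable from the docs; the class name, handler options, and structure here are assumptions, not the committed code:

import os

import torch


class ProfilerHookSketch:
    """Assumed worker-side hook, not the committed vLLM code."""

    def __init__(self) -> None:
        trace_dir = os.getenv("VLLM_TORCH_PROFILER_DIR")
        self.profiler = None
        if trace_dir:
            # Profile CPU + CUDA activity and write Perfetto/Chrome-compatible
            # traces into the configured directory when the profiler stops.
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    trace_dir, use_gzip=True),
            )

    def start_profile(self) -> None:
        if self.profiler is not None:
            self.profiler.start()

    def stop_profile(self) -> None:
        if self.profiler is not None:
            # stop() triggers on_trace_ready and flushes the trace files;
            # this is the step the docs above warn can take several minutes.
            self.profiler.stop()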