[misc] [doc] [frontend] LLM torch profiler support (#7943)

2024-09-06 17:48:48 -07:00 · 2024-09-06 17:48:48 -07:00 · 12dd715807
commit 12dd715807
parent 29f49cd6e3
6 changed files with 74 additions and 3 deletions
--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::

   Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-   
-Example commands:
+
+.. tip::
+
+   Stopping the profiler flushes all of the profile trace files to the directory, which takes time. For example, about 100 requests' worth of data for a Llama 70B model takes about 10 minutes to flush on an H100.
+   Set the environment variable ``VLLM_RPC_GET_DATA_TIMEOUT_MS`` to a large value before you start the server, for example 30 minutes:
+   ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+  
+Example commands and usage:
+===========================
+
+Offline Inference:
+------------------
+
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
+

 OpenAI Server:
+--------------

 .. code-block:: bash

-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 

 benchmark_serving.py:

--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
@ -0,0 +1,33 @@
+import os
+
+from vllm import LLM, SamplingParams
+
+# Enable the torch profiler; VLLM_TORCH_PROFILER_DIR can also be set on the command line.
+os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="facebook/opt-125m")
+
+llm.start_profile()
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+
+llm.stop_profile()
+
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@ -1914,6 +1914,12 @@ class LLMEngine:
            self.tokenizer.check_health()
        self.model_executor.check_health()

+    def start_profile(self) -> None:
+        self.model_executor.start_profile()
+
+    def stop_profile(self) -> None:
+        self.model_executor.stop_profile()
+
    def is_tracing_enabled(self) -> bool:
        return self.tracer is not None

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -560,6 +560,12 @@ class LLM:
        outputs = self._run_engine(use_tqdm=use_tqdm)
        return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput)

+    def start_profile(self) -> None:
+        self.llm_engine.start_profile()
+
+    def stop_profile(self) -> None:
+        self.llm_engine.stop_profile()
+
    # LEGACY
    def _convert_v1_inputs(
        self,
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@ -296,6 +296,12 @@ class CPUExecutor(ExecutorBase):
        for result in parallel_worker_tasks:
            result.get()

+    def start_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "start_profile")
+
+    def stop_profile(self) -> None:
+        self.driver_method_invoker(self.driver_worker, "stop_profile")
+

 class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):

--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@ -169,6 +169,12 @@ class GPUExecutor(ExecutorBase):
        # it's running.
        return

+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+

 class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):