diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0231ce67..300c2276 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -67,6 +67,7 @@ Documentation
    serving/deploying_with_triton
    serving/deploying_with_docker
    serving/serving_with_langchain
+   serving/metrics
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst
new file mode 100644
index 00000000..15e57bd3
--- /dev/null
+++ b/docs/source/serving/metrics.rst
@@ -0,0 +1,13 @@
+Production Metrics
+==================
+
+vLLM exposes a number of metrics that can be used to monitor the health of the
+system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM
+OpenAI-compatible API server.
+
+The following metrics are exposed:
+
+.. literalinclude:: ../../../vllm/engine/metrics.py
+    :language: python
+    :start-after: begin-metrics-definitions
+    :end-before: end-metrics-definitions
diff --git a/requirements.txt b/requirements.txt
index e8a44328..9a27eae8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ xformers >= 0.0.22.post7  # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13  # Required for OpenAI server.
+aioprometheus[starlette]
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b7dd60df..2400dd53 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -7,6 +7,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.metrics import record_metrics
 from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -591,8 +592,8 @@ class LLMEngine:
         else:
             self.num_generation_tokens.append((now, num_batched_tokens))
 
-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
             return
 
         # Discard the old stats.
@@ -631,6 +632,16 @@
         else:
             cpu_cache_usage = 0.0
 
+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
         logger.info("Avg prompt throughput: "
                     f"{avg_prompt_throughput:.1f} tokens/s, "
                     "Avg generation throughput: "
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
new file mode 100644
index 00000000..c6407120
--- /dev/null
+++ b/vllm/engine/metrics.py
@@ -0,0 +1,51 @@
+from aioprometheus import Gauge
+
+# The begin-* and end-* here are used by the documentation generator
+# to extract the metrics definitions.
+
+# begin-metrics-definitions
+gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
+                                    "Average prefill throughput in tokens/s.")
+gauge_avg_generation_throughput = Gauge(
+    "vllm:avg_generation_throughput_toks_per_s",
+    "Average generation throughput in tokens/s.")
+
+gauge_scheduler_running = Gauge(
+    "vllm:num_requests_running",
+    "Number of requests that are currently running for inference.")
+gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
+                                "Number of requests swapped to CPU.")
+gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
+                                "Number of requests waiting to be processed.")
+
+gauge_gpu_cache_usage = Gauge(
+    "vllm:gpu_cache_usage_perc",
+    "GPU KV-cache usage. 1 means 100 percent usage.")
+gauge_cpu_cache_usage = Gauge(
+    "vllm:cpu_cache_usage_perc",
+    "CPU KV-cache usage. 1 means 100 percent usage.")
+# end-metrics-definitions
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
+
+def record_metrics(
+    avg_prompt_throughput: float,
+    avg_generation_throughput: float,
+    scheduler_running: int,
+    scheduler_swapped: int,
+    scheduler_waiting: int,
+    gpu_cache_usage: float,
+    cpu_cache_usage: float,
+):
+    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
+    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
+    gauge_scheduler_running.set(labels, scheduler_running)
+    gauge_scheduler_swapped.set(labels, scheduler_swapped)
+    gauge_scheduler_waiting.set(labels, scheduler_waiting)
+    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
+    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ef9a3985..39ea750a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -9,6 +9,8 @@ import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
 
+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
 from fastapi import Request
@@ -18,6 +20,7 @@ from fastapi.responses import JSONResponse, StreamingResponse, Response
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import add_global_metrics_labels
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -82,6 +85,10 @@ def parse_args():
     return parser.parse_args()
 
 
+app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
+app.add_route("/metrics", metrics)  # Exposes HTTP metrics
+
+
 def create_error_response(status_code: HTTPStatus,
                           message: str) -> JSONResponse:
     return JSONResponse(ErrorResponse(message=message,
@@ -722,6 +729,9 @@ if __name__ == "__main__":
         trust_remote_code=engine_model_config.trust_remote_code)
     load_chat_template(args, tokenizer)
 
+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
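Reviewer note: the two aioprometheus calls added to api_server.py are all the wiring needed to attach the exporter to an ASGI app. Below is a minimal standalone sketch of that wiring; the app object, the /ping route, and the suggested file/uvicorn command are illustrative and not part of this change. It can be handy for checking that the new aioprometheus[starlette] requirement installs and serves correctly.

    from aioprometheus import MetricsMiddleware
    from aioprometheus.asgi.starlette import metrics
    import fastapi

    # Same wiring as in vllm/entrypoints/openai/api_server.py: the middleware
    # collects per-route HTTP metrics, and /metrics serves the Prometheus
    # text exposition for every registered collector.
    app = fastapi.FastAPI()
    app.add_middleware(MetricsMiddleware)
    app.add_route("/metrics", metrics)


    @app.get("/ping")
    def ping() -> dict:
        # Any request handled here shows up in the middleware's HTTP metrics.
        return {"status": "ok"}

    # Run with, for example: uvicorn smoke_test:app --port 8000
    # (assuming the snippet is saved as smoke_test.py)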
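Once the real server is running, the claim in docs/source/serving/metrics.rst can be verified directly. This is a minimal sketch, assuming the OpenAI-compatible server is listening on localhost:8000 (the default port) and that the third-party requests package is available; it prints only the vllm: gauges defined in vllm/engine/metrics.py.

    import requests

    # Fetch the Prometheus text exposition produced by the /metrics route.
    resp = requests.get("http://localhost:8000/metrics", timeout=5)
    resp.raise_for_status()

    # Keep only the vLLM gauges (and their HELP/TYPE comment lines); the
    # endpoint also reports the HTTP metrics collected by MetricsMiddleware.
    for line in resp.text.splitlines():
        if "vllm:" in line:
            print(line)

Because record_metrics passes the module-level labels dict on every set() call, each printed sample should carry the model_name label registered at start-up via add_global_metrics_labels.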