Add Production Metrics in Prometheus format (#1890)
parent 5f09cbdb63
commit 5313c2cb8b
docs/source/index.rst
@@ -67,6 +67,7 @@ Documentation
    serving/deploying_with_triton
    serving/deploying_with_docker
    serving/serving_with_langchain
+   serving/metrics

 .. toctree::
    :maxdepth: 1

docs/source/serving/metrics.rst (new file, 13 lines)
@@ -0,0 +1,13 @@
+Production Metrics
+==================
+
+vLLM exposes a number of metrics that can be used to monitor the health of the
+system. These metrics are exposed via the `/metrics` endpoint on the vLLM
+OpenAI compatible API server.
+
+The following metrics are exposed:
+
+.. literalinclude:: ../../../vllm/engine/metrics.py
+    :language: python
+    :start-after: begin-metrics-definitions
+    :end-before: end-metrics-definitions
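
Once the server is up, the endpoint can be read with any Prometheus scraper or a plain HTTP request. A minimal sketch, assuming an OpenAI compatible server running on its default localhost:8000:

    # Sketch: fetch the Prometheus exposition text and print the vLLM series.
    # Assumes a vLLM OpenAI compatible server on localhost:8000 (the default).
    import urllib.request

    with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
        body = resp.read().decode("utf-8")

    for line in body.splitlines():
        # vLLM's own gauges are all prefixed with "vllm:".
        if "vllm:" in line:
            print(line)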

requirements.txt
@@ -12,3 +12,4 @@ xformers >= 0.0.22.post7 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13 # Required for OpenAI server.
+aioprometheus[starlette]

vllm/engine/llm_engine.py
@@ -7,6 +7,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.metrics import record_metrics
 from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -591,8 +592,8 @@ class LLMEngine:
         else:
             self.num_generation_tokens.append((now, num_batched_tokens))

-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
             return

         # Discard the old stats.
@@ -631,6 +632,16 @@ class LLMEngine:
         else:
             cpu_cache_usage = 0.0

+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
         logger.info("Avg prompt throughput: "
                     f"{avg_prompt_throughput:.1f} tokens/s, "
                     "Avg generation throughput: "
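
The gauges are refreshed from the same periodic stats path that produces the throughput log line, so they update at most once per logging interval. A standalone sketch of that gating pattern (the interval value and the stats dict are illustrative, not the engine's actual code):

    # Illustrative: update the vLLM gauges at most once per interval.
    import time

    from vllm.engine.metrics import record_metrics

    _LOGGING_INTERVAL_SEC = 5.0  # assumed interval for this sketch
    _last_logging_time = 0.0


    def maybe_record(stats: dict) -> None:
        global _last_logging_time
        now = time.monotonic()
        should_log = now - _last_logging_time >= _LOGGING_INTERVAL_SEC
        if not should_log:
            return
        _last_logging_time = now
        record_metrics(
            avg_prompt_throughput=stats["prompt_toks_per_s"],
            avg_generation_throughput=stats["gen_toks_per_s"],
            scheduler_running=stats["running"],
            scheduler_swapped=stats["swapped"],
            scheduler_waiting=stats["waiting"],
            gpu_cache_usage=stats["gpu_cache_usage"],
            cpu_cache_usage=stats["cpu_cache_usage"],
        )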

vllm/engine/metrics.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+from aioprometheus import Gauge
+
+# The begin-* and end-* here are used by the documentation generator
+# to extract the metrics definitions.
+
+# begin-metrics-definitions
+gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
+                                    "Average prefill throughput in tokens/s.")
+gauge_avg_generation_throughput = Gauge(
+    "vllm:avg_generation_throughput_toks_per_s",
+    "Average generation throughput in tokens/s.")
+
+gauge_scheduler_running = Gauge(
+    "vllm:num_requests_running",
+    "Number of requests that are currently running for inference.")
+gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
+                                "Number of requests swapped to CPU.")
+gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
+                                "Number of requests waiting to be processed.")
+
+gauge_gpu_cache_usage = Gauge(
+    "vllm:gpu_cache_usage_perc",
+    "GPU KV-cache usage. 1 means 100 percent usage.")
+gauge_cpu_cache_usage = Gauge(
+    "vllm:cpu_cache_usage_perc",
+    "CPU KV-cache usage. 1 means 100 percent usage.")
+# end-metrics-definitions
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
+
+def record_metrics(
+    avg_prompt_throughput: float,
+    avg_generation_throughput: float,
+    scheduler_running: int,
+    scheduler_swapped: int,
+    scheduler_waiting: int,
+    gpu_cache_usage: float,
+    cpu_cache_usage: float,
+):
+    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
+    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
+    gauge_scheduler_running.set(labels, scheduler_running)
+    gauge_scheduler_swapped.set(labels, scheduler_swapped)
+    gauge_scheduler_waiting.set(labels, scheduler_waiting)
+    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
+    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
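
A short usage sketch of the new module, calling only the two helpers it exports (the model name and numbers below are made up for illustration):

    # Sketch: label the vLLM gauges and push one snapshot of engine state.
    from vllm.engine.metrics import add_global_metrics_labels, record_metrics

    # Every sample emitted afterwards carries this label set.
    add_global_metrics_labels(model_name="facebook/opt-125m")

    record_metrics(
        avg_prompt_throughput=123.4,
        avg_generation_throughput=56.7,
        scheduler_running=2,
        scheduler_swapped=0,
        scheduler_waiting=1,
        gpu_cache_usage=0.25,
        cpu_cache_usage=0.0,
    )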

vllm/entrypoints/openai/api_server.py
@@ -9,6 +9,8 @@ import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
 from fastapi import Request
@@ -18,6 +20,7 @@ from fastapi.responses import JSONResponse, StreamingResponse, Response

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import add_global_metrics_labels
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -82,6 +85,10 @@ def parse_args():
     return parser.parse_args()


+app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
+app.add_route("/metrics", metrics)  # Exposes HTTP metrics
+
+
 def create_error_response(status_code: HTTPStatus,
                           message: str) -> JSONResponse:
     return JSONResponse(ErrorResponse(message=message,
@@ -722,6 +729,9 @@ if __name__ == "__main__":
         trust_remote_code=engine_model_config.trust_remote_code)
     load_chat_template(args, tokenizer)

+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,