Add Production Metrics in Prometheus format (#1890)

Simon Mo 2023-12-02 16:37:44 -08:00 committed by GitHub
parent 5f09cbdb63
commit 5313c2cb8b
6 changed files with 89 additions and 2 deletions

docs/source/index.rst

@@ -67,6 +67,7 @@ Documentation
    serving/deploying_with_triton
    serving/deploying_with_docker
    serving/serving_with_langchain
+   serving/metrics

 .. toctree::
    :maxdepth: 1

docs/source/serving/metrics.rst (new file)

@@ -0,0 +1,13 @@
Production Metrics
==================

vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are exposed via the `/metrics` endpoint on the vLLM
OpenAI compatible API server.

The following metrics are exposed:

.. literalinclude:: ../../../vllm/engine/metrics.py
    :language: python
    :start-after: begin-metrics-definitions
    :end-before: end-metrics-definitions
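
Not part of this diff, but a quick way to check the new endpoint once the OpenAI compatible server is running: a minimal sketch in Python, assuming the default host and port of localhost:8000.

# Fetch the Prometheus text exposition from a running vLLM server and print
# only the vLLM-specific lines. Standard library only; host and port are
# assumptions, not values taken from this commit.
from urllib.request import urlopen

with urlopen("http://localhost:8000/metrics") as resp:
    body = resp.read().decode("utf-8")

for line in body.splitlines():
    if "vllm:" in line:
        print(line)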

requirements.txt

@@ -12,3 +12,4 @@ xformers >= 0.0.22.post7 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13 # Required for OpenAI server.
+aioprometheus[starlette]

vllm/engine/llm_engine.py

@@ -7,6 +7,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.metrics import record_metrics
 from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -591,8 +592,8 @@ class LLMEngine:
         else:
             self.num_generation_tokens.append((now, num_batched_tokens))

-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
             return

         # Discard the old stats.
@@ -631,6 +632,16 @@
         else:
             cpu_cache_usage = 0.0

+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
         logger.info("Avg prompt throughput: "
                     f"{avg_prompt_throughput:.1f} tokens/s, "
                     "Avg generation throughput: "

vllm/engine/metrics.py (new file, 51 lines)

@@ -0,0 +1,51 @@
from aioprometheus import Gauge

# The begin-* and end-* markers here are used by the documentation generator
# to extract the metrics definitions.

# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
                                    "Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
    "vllm:avg_generation_throughput_toks_per_s",
    "Average generation throughput in tokens/s.")
gauge_scheduler_running = Gauge(
    "vllm:num_requests_running",
    "Number of requests that are currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
                                "Number of requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
                                "Number of requests waiting to be processed.")
gauge_gpu_cache_usage = Gauge(
    "vllm:gpu_cache_usage_perc",
    "GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
    "vllm:cpu_cache_usage_perc",
    "CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions

# Global label set attached to every recorded sample (e.g. the model name).
labels = {}


def add_global_metrics_labels(**kwargs):
    labels.update(kwargs)


def record_metrics(
    avg_prompt_throughput: float,
    avg_generation_throughput: float,
    scheduler_running: int,
    scheduler_swapped: int,
    scheduler_waiting: int,
    gpu_cache_usage: float,
    cpu_cache_usage: float,
):
    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
    gauge_scheduler_running.set(labels, scheduler_running)
    gauge_scheduler_swapped.set(labels, scheduler_swapped)
    gauge_scheduler_waiting.set(labels, scheduler_waiting)
    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
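
Not part of the commit, but the intended call pattern for this module, pieced together from the call sites elsewhere in this change; the model name and every metric value below are made-up examples:

# Illustrative usage of vllm.engine.metrics: the API server registers constant
# labels once at startup, and the engine records gauge values periodically.
from vllm.engine.metrics import add_global_metrics_labels, record_metrics

# At server startup: every recorded sample will carry this label set.
add_global_metrics_labels(model_name="facebook/opt-125m")  # example model name

# From the engine's stats-logging path, once per logging interval:
record_metrics(
    avg_prompt_throughput=512.0,      # tokens/s, made-up value
    avg_generation_throughput=96.0,   # tokens/s, made-up value
    scheduler_running=4,
    scheduler_swapped=0,
    scheduler_waiting=2,
    gpu_cache_usage=0.37,             # fraction of KV-cache blocks in use
    cpu_cache_usage=0.0,
)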

vllm/entrypoints/openai/api_server.py

@@ -9,6 +9,8 @@ import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
 from fastapi import Request
@@ -18,6 +20,7 @@ from fastapi.responses import JSONResponse, StreamingResponse, Response
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import add_global_metrics_labels
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -82,6 +85,10 @@ def parse_args():
     return parser.parse_args()


+app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
+app.add_route("/metrics", metrics)  # Exposes HTTP metrics
+
+
 def create_error_response(status_code: HTTPStatus,
                           message: str) -> JSONResponse:
     return JSONResponse(ErrorResponse(message=message,
@@ -722,6 +729,9 @@ if __name__ == "__main__":
         trust_remote_code=engine_model_config.trust_remote_code)
     load_chat_template(args, tokenizer)

+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)
+
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
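
For reference, the same aioprometheus wiring in a stand-alone script, using only the pieces this commit already imports; the demo gauge, route, and port below are assumptions for illustration, not vLLM code:

# Stand-alone sketch of the wiring added to api_server.py above:
# MetricsMiddleware traces HTTP server metrics, and the Starlette `metrics`
# handler serves the Prometheus text format on /metrics.
import fastapi
import uvicorn
from aioprometheus import Gauge, MetricsMiddleware
from aioprometheus.asgi.starlette import metrics

app = fastapi.FastAPI()
app.add_middleware(MetricsMiddleware)  # per-route HTTP request metrics
app.add_route("/metrics", metrics)     # Prometheus scrape endpoint

# A demo gauge, not one of the vLLM metrics defined in this commit.
demo_gauge = Gauge("demo_value", "Example gauge for this sketch.")


@app.get("/bump")
async def bump() -> dict:
    demo_gauge.set({"source": "demo"}, 1.0)
    return {"ok": True}


if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)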