[Frontend] Add max_tokens prometheus metric (#9881)
Signed-off-by: Tomer Asida <tomera@ai21.com>
parent 9a5664d4a4
commit ac04a97a9f
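With this change, a running vLLM OpenAI-compatible server exports a new vllm:request_params_max_tokens histogram on its existing /metrics endpoint. A minimal sketch of pulling the new series, assuming a server on localhost:8000 (the URL is an assumption about your deployment):

    import urllib.request

    # Assumes a vLLM OpenAI-compatible server is listening on localhost:8000.
    with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
        body = resp.read().decode("utf-8")

    # A Prometheus histogram is exported as _bucket, _sum, and _count samples.
    for line in body.splitlines():
        if line.startswith("vllm:request_params_max_tokens"):
            print(line)

Once scraped, a PromQL query such as histogram_quantile(0.9, rate(vllm:request_params_max_tokens_bucket[5m])) would estimate the p90 of requested token budgets.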
@@ -70,10 +70,14 @@ EXPECTED_VALUES = {
     [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
      ("_count", _NUM_REQUESTS)],
     "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+    "vllm:request_params_max_tokens":
+    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+     ("_count", _NUM_REQUESTS)],
     "vllm:prompt_tokens": [("_total",
                             _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens":
-    [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
+    "vllm:generation_tokens": [
+        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
+    ],
     "vllm:request_success": [("_total", _NUM_REQUESTS)],
 }
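The expected _sum works out because the test issues _NUM_REQUESTS requests, each with max_tokens set to _NUM_GENERATION_TOKENS_PER_REQUEST, so every histogram observation equals that constant. A hedged sketch of the equivalent direct check, presuming an in-process engine has already recorded metrics into the default registry (the constants and the model_name label value are illustrative; the real test iterates EXPECTED_VALUES):

    from prometheus_client import REGISTRY

    # Illustrative constants and label set; vLLM's metrics carry a
    # model_name label, so get_sample_value needs it to match.
    _NUM_REQUESTS = 10
    _NUM_GENERATION_TOKENS_PER_REQUEST = 10
    labels = {"model_name": "facebook/opt-125m"}  # assumed

    # _sum: every request observed the same max_tokens value.
    assert REGISTRY.get_sample_value(
        "vllm:request_params_max_tokens_sum",
        labels) == _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST
    # _count: one observation per finished request.
    assert REGISTRY.get_sample_value(
        "vllm:request_params_max_tokens_count", labels) == _NUM_REQUESTS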
@@ -149,6 +153,9 @@ EXPECTED_METRICS = [
     "vllm:request_params_n_sum",
     "vllm:request_params_n_bucket",
     "vllm:request_params_n_count",
+    "vllm:request_params_max_tokens_sum",
+    "vllm:request_params_max_tokens_bucket",
+    "vllm:request_params_max_tokens_count",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
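Three names per histogram are listed because the Prometheus text format expands one histogram into _sum, _bucket, and _count sample families. A standalone check with prometheus_client:

    from prometheus_client import CollectorRegistry, Histogram

    registry = CollectorRegistry()
    hist = Histogram("vllm:request_params_max_tokens",
                     "Histogram of the max_tokens request parameter.",
                     buckets=[1, 2, 5, 10, 20],
                     registry=registry)
    hist.observe(16)

    # collect() exposes the _bucket, _sum, and _count families (recent
    # client versions also emit a _created sample).
    sample_names = {s.name for metric in registry.collect()
                    for s in metric.samples}
    print(sorted(sample_names))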
@@ -365,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
         "vllm:request_prompt_tokens",
         "vllm:request_generation_tokens",
         "vllm:request_params_n",
+        "vllm:request_params_max_tokens",
     ]
     for metric_name in request_histogram_metrics:
         metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
@@ -1685,6 +1685,7 @@ class LLMEngine:
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
         n_requests: List[int] = []
+        max_tokens_requests: List[int] = []
         finished_reason_requests: List[str] = []

         # Lora requests
@@ -1792,6 +1793,8 @@
                     ])
                     if seq_group.sampling_params is not None:
                         n_requests.append(seq_group.sampling_params.n)
+                        max_tokens_requests.append(
+                            seq_group.sampling_params.max_tokens)
                     finished_reason_requests.extend([
                         SequenceStatus.get_finished_reason(seq.status)
                         for seq in seq_group.get_finished_seqs()
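One caveat worth hedging: SamplingParams.max_tokens is Optional[int] in vLLM (None means "generate until EOS"), so a request carrying max_tokens=None would eventually hand None to the histogram's observe call. A defensive variant of the two appended lines could look like:

    # Sketch: skip requests without an explicit max_tokens so the
    # histogram only ever observes integers.
    if seq_group.sampling_params.max_tokens is not None:
        max_tokens_requests.append(
            seq_group.sampling_params.max_tokens)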
@@ -1847,6 +1850,7 @@
            num_prompt_tokens_requests=num_prompt_tokens_requests,
            num_generation_tokens_requests=num_generation_tokens_requests,
            n_requests=n_requests,
+           max_tokens_requests=max_tokens_requests,
            finished_reason_requests=finished_reason_requests,
            max_lora=str(max_lora_stat),
            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
@@ -179,6 +179,12 @@ class Metrics:
             labelnames=labelnames,
             buckets=[1, 2, 5, 10, 20],
         )
+        self.histogram_max_tokens_request = self._histogram_cls(
+            name="vllm:request_params_max_tokens",
+            documentation="Histogram of the max_tokens request parameter.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
         self.counter_request_success = self._counter_cls(
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
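build_1_2_5_buckets is an existing helper in vLLM's metrics code (the token-count histograms already use it); a reconstruction of the 1-2-5 bucket ladder it produces, not necessarily the verbatim source:

    from typing import List

    def build_1_2_5_buckets(max_value: int) -> List[int]:
        """Return [1, 2, 5, 10, 20, 50, ...] capped at max_value."""
        mantissas = [1, 2, 5]
        exponent = 0
        buckets: List[int] = []
        while True:
            for m in mantissas:
                value = m * 10**exponent
                if value > max_value:
                    return buckets
                buckets.append(value)
            exponent += 1

    print(build_1_2_5_buckets(4096))
    # [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]

Sizing the buckets from max_model_len keeps the histogram resolution proportional to the largest max_tokens a request can legally ask for.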
@@ -547,6 +553,8 @@ class PrometheusStatLogger(StatLoggerBase):
             self.metrics.histogram_num_generation_tokens_request,
             stats.num_generation_tokens_requests)
         self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        self._log_histogram(self.metrics.histogram_max_tokens_request,
+                            stats.max_tokens_requests)

     def _log_prometheus_interval(self, prompt_throughput: float,
                                  generation_throughput: float) -> None:
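_log_histogram is the logger's existing helper; conceptually it records one observation per collected value under the logger's label set — roughly the following sketch, not the verbatim implementation:

    from typing import List, Union

    def _log_histogram(self, histogram,
                       data: List[Union[int, float]]) -> None:
        # One observation per finished request in this logging interval.
        for datum in data:
            histogram.labels(**self.labels).observe(datum)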
@@ -53,6 +53,7 @@ class Stats:
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
+    max_tokens_requests: List[int]
     finished_reason_requests: List[str]
     waiting_lora_adapters: List[str]
     running_lora_adapters: List[str]