Add metrics to RequestOutput (#2876)

Antoni Baum 2024-02-20 21:55:57 -08:00 committed by GitHub
parent 181b27d881
commit 017d9f1515
6 changed files with 61 additions and 9 deletions
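
The changes below thread a new RequestMetrics object from SequenceGroup through the scheduler and engine and expose it as RequestOutput.metrics. For orientation, a minimal sketch of how a consumer might read the new fields once this commit lands; the offline LLM API is vLLM's own, but the model choice and the printed derivations are illustrative, not part of the change:

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # any model works; illustrative pick
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))

    for out in outputs:
        m = out.metrics  # RequestMetrics from this commit (may be None)
        if m is None:
            continue
        if m.first_token_time is not None:
            # Time to first token = queueing delay + prefill + first decode.
            print(f"{out.request_id}: TTFT "
                  f"{m.first_token_time - m.arrival_time:.3f}s")
        if m.finished_time is not None:
            print(f"{out.request_id}: end-to-end "
                  f"{m.finished_time - m.arrival_time:.3f}s")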


@@ -64,7 +64,7 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished)))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1


@@ -33,7 +33,7 @@ class FCFS(Policy):
         now: float,
         seq_group: SequenceGroup,
     ) -> float:
-        return now - seq_group.arrival_time
+        return now - seq_group.metrics.arrival_time


 class PolicyFactory:
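
The FCFS priority is simply the request's age, now read through the metrics object: the longer a request has waited since arrival, the higher it sorts. A self-contained sketch of that ordering rule (the tuples are hypothetical stand-ins for SequenceGroups):

    # Each tuple is (request_id, arrival_time); an older request has a larger
    # now - arrival_time, so it comes first under FCFS.
    waiting = [("a", 10.0), ("b", 12.5), ("c", 11.0)]
    now = 13.0
    ordered = sorted(waiting, key=lambda req: now - req[1], reverse=True)
    assert [req[0] for req in ordered] == ["a", "c", "b"]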


@@ -365,10 +365,13 @@ class Scheduler:
         # This function call changes the internal states of the scheduler
         # such as self.running, self.swapped, and self.waiting.
         scheduler_outputs = self._schedule()
+        now = time.time()

         # Create input data structures.
         seq_group_metadata_list: List[SequenceGroupMetadata] = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
+            seq_group.maybe_set_first_scheduled_time(now)
+
             seq_data: Dict[int, SequenceData] = {}
             block_tables: Dict[int, List[int]] = {}
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
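
The scheduler takes a single time.time() per scheduling step, so every group scheduled in the same batch is stamped with the same timestamp, and time_in_queue falls out as the gap back to arrival. A standalone sketch of that bookkeeping; QueueTimer is a hypothetical stand-in, not vLLM code:

    import time
    from typing import Optional

    class QueueTimer:  # hypothetical stand-in for RequestMetrics bookkeeping
        def __init__(self, arrival_time: float) -> None:
            self.arrival_time = arrival_time
            self.first_scheduled_time: Optional[float] = None
            self.time_in_queue: Optional[float] = None

        def maybe_set_first_scheduled_time(self, now: float) -> None:
            # Only the first scheduling event counts; re-scheduling after
            # preemption must not overwrite the original value.
            if self.first_scheduled_time is None:
                self.first_scheduled_time = now
                self.time_in_queue = now - self.arrival_time

    t = QueueTimer(arrival_time=time.time() - 0.5)
    t.maybe_set_first_scheduled_time(time.time())
    first = t.first_scheduled_time
    t.maybe_set_first_scheduled_time(time.time() + 1.0)  # no-op: already set
    assert t.first_scheduled_time == first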


@@ -728,6 +728,7 @@ class LLMEngine:
     def _process_model_outputs(
             self, output: SamplerOutput,
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+        now = time.time()
         # Update the scheduled sequence groups with the model outputs.
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
         for seq_group, outputs in zip(scheduled_seq_groups, output):
@@ -739,6 +740,7 @@ class LLMEngine:
         # Create the outputs.
         request_outputs: List[RequestOutput] = []
         for seq_group in scheduled_seq_groups:
+            seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
         for seq_group in scheduler_outputs.ignored_seq_groups:
@@ -876,11 +878,12 @@ class LLMEngine:
         # Latency Timings.
         time_last_iters = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
-            # Time since last token. (n.b. updates seq_group.last_token_time)
+            # Time since last token. (n.b. updates seq_group.metrics.last_token_time)
             time_last_iters.append(seq_group.get_last_latency(now))
             # Time since arrival for all finished requests.
             if seq_group.is_finished():
-                time_e2e_requests.append(now - seq_group.arrival_time)
+                time_e2e_requests.append(now -
+                                         seq_group.metrics.arrival_time)

         time_to_first_tokens = time_last_iters if prompt_run else []
         time_per_output_tokens = [] if prompt_run else time_last_iters
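
One marker serves both timers here: last_token_time is seeded with arrival_time, so the first get_last_latency reading on a prompt run is the time to first token, and each later reading on a decode run is a per-output-token latency; that is why the same time_last_iters list feeds whichever bucket prompt_run selects. A sketch of that contract (LatencyClock is a hypothetical mirror; the timestamps are exact binary fractions so the assertions hold):

    class LatencyClock:  # hypothetical mirror of the last_token_time marker
        def __init__(self, arrival_time: float) -> None:
            self.last_token_time = arrival_time

        def get_last_latency(self, now: float) -> float:
            # Return the gap since the previous reading, then advance it.
            latency = now - self.last_token_time
            self.last_token_time = now
            return latency

    clock = LatencyClock(arrival_time=0.0)
    assert clock.get_last_latency(0.5) == 0.5    # prompt run: TTFT
    assert clock.get_last_latency(0.75) == 0.25  # decode run: per-output-token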


@@ -1,7 +1,8 @@
 from typing import List, Optional
+import time

 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus)
+                           SequenceStatus, RequestMetrics)
 from vllm.lora.request import LoRARequest
@@ -60,6 +61,7 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
     """
@@ -71,6 +73,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        metrics: Optional[RequestMetrics] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
@@ -79,6 +82,7 @@ class RequestOutput:
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.metrics = metrics
         self.lora_request = lora_request

     @classmethod
@@ -115,12 +119,15 @@ class RequestOutput:
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
         return cls(seq_group.request_id,
                    prompt,
                    prompt_token_ids,
                    prompt_logprobs,
                    outputs,
                    finished,
+                   seq_group.metrics,
                    lora_request=seq_group.lora_request)

     def __repr__(self) -> str:
@@ -130,4 +137,5 @@ class RequestOutput:
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
                 f"lora_request={self.lora_request})")


@@ -1,6 +1,7 @@
 """Sequence and its related classes."""
 import copy
 import enum
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Union

 from vllm.block import LogicalTokenBlock
@@ -49,6 +50,26 @@ class SequenceStatus(enum.Enum):
         return finish_reason


+@dataclass
+class RequestMetrics:
+    """Metrics associated with a request.
+
+    Args:
+        arrival_time: The time when the request arrived.
+        last_token_time: The time when the last token was generated.
+        first_scheduled_time: The time when the request was first scheduled.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        finished_time: The time when the request was finished.
+    """
+    arrival_time: float
+    last_token_time: float
+    first_scheduled_time: Optional[float]
+    first_token_time: Optional[float]
+    time_in_queue: Optional[float]
+    finished_time: Optional[float] = None
+
+
 class SequenceData:
     """Data associated with a sequence.
@@ -252,8 +272,11 @@ class SequenceGroup:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.last_token_time = arrival_time
+        self.metrics = RequestMetrics(arrival_time=arrival_time,
+                                      last_token_time=arrival_time,
+                                      first_scheduled_time=None,
+                                      first_token_time=None,
+                                      time_in_queue=None)
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None
@@ -276,10 +299,25 @@ class SequenceGroup:
     def get_last_latency(self, now: float) -> float:
         """Gets last token latency for Request level timings."""
-        latency = now - self.last_token_time
-        self.last_token_time = now
+        latency = now - self.metrics.last_token_time
+        self.metrics.last_token_time = now
         return latency

+    def maybe_set_first_token_time(self, time: float) -> None:
+        """Sets the first token time for Request level timings."""
+        if self.metrics.first_token_time is None:
+            self.metrics.first_token_time = time
+
+    def maybe_set_first_scheduled_time(self, time: float) -> None:
+        """Sets the first scheduled time and time in queue for Request level timings."""
+        if self.metrics.first_scheduled_time is None:
+            self.metrics.first_scheduled_time = time
+            self.metrics.time_in_queue = time - self.metrics.arrival_time
+
+    def set_finished_time(self, time: Optional[float]) -> None:
+        """Sets the finished time for Request level timings."""
+        self.metrics.finished_time = time
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
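
Putting the helpers together, the intended call order over a request's lifetime is: the constructor seeds arrival_time and last_token_time, the scheduler calls maybe_set_first_scheduled_time, the engine calls maybe_set_first_token_time when the first output arrives, and from_seq_group calls set_finished_time at the end. A compact sketch of that lifecycle against the real dataclass, with invented timestamps and the helper logic inlined:

    from vllm.sequence import RequestMetrics  # added by this commit

    m = RequestMetrics(arrival_time=0.0, last_token_time=0.0,
                       first_scheduled_time=None, first_token_time=None,
                       time_in_queue=None)

    # Scheduler step (maybe_set_first_scheduled_time): first call wins.
    if m.first_scheduled_time is None:
        m.first_scheduled_time = 0.25
        m.time_in_queue = m.first_scheduled_time - m.arrival_time

    # First model output (maybe_set_first_token_time): later calls are no-ops.
    if m.first_token_time is None:
        m.first_token_time = 0.5

    # Request finishes (set_finished_time, called from from_seq_group).
    m.finished_time = 2.0

    assert (m.time_in_queue, m.first_token_time, m.finished_time) == (
        0.25, 0.5, 2.0)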