diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 81baab3f..3f0a8d3d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -145,7 +145,7 @@ class EngineArgs:
     max_cpu_loras: Optional[int] = None
     device: str = 'auto'
     num_scheduler_steps: int = 1
-    multi_step_stream_outputs: bool = False
+    multi_step_stream_outputs: bool = True
     ray_workers_use_nsight: bool = False
     num_gpu_blocks_override: Optional[int] = None
     num_lookahead_slots: int = 0
@@ -603,13 +603,17 @@ class EngineArgs:
         parser.add_argument(
             '--multi-step-stream-outputs',
-            action='store_true',
-            help='If True, then multi-step will stream outputs for every step')
+            action=StoreBoolean,
+            default=EngineArgs.multi_step_stream_outputs,
+            nargs="?",
+            const="True",
+            help='If False, then multi-step will stream outputs at the end '
+            'of all steps')
         parser.add_argument(
             '--scheduler-delay-factor',
             type=float,
             default=EngineArgs.scheduler_delay_factor,
-            help='Apply a delay (of delay factor multiplied by previous'
+            help='Apply a delay (of delay factor multiplied by previous '
             'prompt latency) before scheduling next prompt.')
         parser.add_argument(
             '--enable-chunked-prefill',
@@ -632,7 +636,7 @@ class EngineArgs:
         parser.add_argument(
             type=nullable_str,
             choices=[*QUANTIZATION_METHODS, None],
             default=EngineArgs.speculative_model_quantization,
-            help='Method used to quantize the weights of speculative model.'
+            help='Method used to quantize the weights of speculative model. '
             'If None, we first check the `quantization_config` '
             'attribute in the model config file. If that is '
             'None, we assume the model weights are not '