[Misc] Remove deprecated arg for cuda graph capture (#9864)
Signed-off-by: Roger Wang <ywang@roblox.com>
commit 3ea2dc2ec4 (parent d087bf863e)
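
This change drops the deprecated `max_context_len_to_capture` argument from `ModelConfig`, `EngineArgs`, and `LLM`. A minimal migration sketch for callers of the `LLM` entrypoint (the model name here is only a placeholder, not part of this change):

from vllm import LLM

# Previously (deprecated, now removed entirely):
#   llm = LLM(model="facebook/opt-125m", max_context_len_to_capture=8192)
# Now: pass max_seq_len_to_capture instead (its default stays 8192).
llm = LLM(model="facebook/opt-125m", max_seq_len_to_capture=8192)
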
@@ -84,9 +84,6 @@ class ModelConfig:
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
             If None, the user did not specify, so default to False.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode. Additionally for encoder-decoder models, if the
@@ -147,7 +144,6 @@ class ModelConfig:
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
         enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 20,
         disable_sliding_window: bool = False,
@@ -181,9 +177,6 @@ class ModelConfig:
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
-        if max_context_len_to_capture is not None:
-            raise ValueError("`max_context_len_to_capture` is deprecated. "
-                             "Use `max_seq_len_to_capture` instead.")
         self.max_seq_len_to_capture = max_seq_len_to_capture
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
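
With the deprecation guard above removed, `ModelConfig` simply no longer knows the old name. A small sketch using the standard `inspect` module to show the resulting surface (nothing below is part of the diff itself):

import inspect

from vllm.config import ModelConfig

# The removed parameter is gone from the constructor signature, so passing it
# now fails as an unexpected keyword argument rather than raising the old
# explicit ValueError.
params = inspect.signature(ModelConfig.__init__).parameters
assert "max_context_len_to_capture" not in params
assert "max_seq_len_to_capture" in params
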
@@ -126,7 +126,6 @@ class EngineArgs:
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
     enforce_eager: Optional[bool] = None
-    max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
@@ -504,14 +503,6 @@ class EngineArgs:
                             help='Always use eager-mode PyTorch. If False, '
                             'will use eager mode and CUDA graph in hybrid '
                             'for maximal performance and flexibility.')
-        parser.add_argument('--max-context-len-to-capture',
-                            type=int,
-                            default=EngineArgs.max_context_len_to_capture,
-                            help='Maximum context length covered by CUDA '
-                            'graphs. When a sequence has context length '
-                            'larger than this, we fall back to eager mode. '
-                            '(DEPRECATED. Use --max-seq-len-to-capture instead'
-                            ')')
         parser.add_argument('--max-seq-len-to-capture',
                             type=int,
                             default=EngineArgs.max_seq_len_to_capture,
@@ -939,7 +930,6 @@ class EngineArgs:
             quantization=self.quantization,
             quantization_param_path=self.quantization_param_path,
             enforce_eager=self.enforce_eager,
-            max_context_len_to_capture=self.max_context_len_to_capture,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
             max_logprobs=self.max_logprobs,
             disable_sliding_window=self.disable_sliding_window,
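
On the CLI side, only `--max-seq-len-to-capture` remains after the `EngineArgs` changes above. A sketch of how that looks from Python, assuming the existing `FlexibleArgumentParser` helper in `vllm.utils` and a placeholder model name:

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)

# --max-context-len-to-capture would now be rejected as an unrecognized
# argument; the replacement flag still defaults to 8192.
args = parser.parse_args(
    ["--model", "facebook/opt-125m", "--max-seq-len-to-capture", "8192"])
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.max_seq_len_to_capture)  # 8192
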
@@ -93,9 +93,6 @@ class LLM:
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode. Additionally for encoder-decoder models, if the
@@ -152,7 +149,6 @@ class LLM:
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         disable_async_output_proc: bool = False,
@@ -193,7 +189,6 @@ class LLM:
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
             disable_async_output_proc=disable_async_output_proc,
@@ -995,7 +995,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         # Python can be expensive. To optimize this, we cache the block table
         # in numpy and only copy the actual input content at every iteration.
         # The shape of the cached block table will be
-        # (max batch size to capture, max context len to capture / block size).
+        # (max batch size to capture, max seq len to capture / block size).
         self.graph_block_tables = np.zeros(
             (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
             dtype=np.int32)
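
The comment fix in the model runner above matches how the cached CUDA-graph block table is actually sized. A standalone numpy sketch of that shape arithmetic, with illustrative values (the batch size and block size below are assumptions, not values from this diff):

import numpy as np

max_batchsize_to_capture = 256  # assumed example value
max_seq_len_to_capture = 8192   # default shown in the EngineArgs hunk above
block_size = 16                 # assumed KV-cache block size

# One row per captured sequence slot, one column per KV-cache block needed to
# cover max_seq_len_to_capture tokens (rounded up).
max_blocks_per_seq = -(-max_seq_len_to_capture // block_size)
graph_block_tables = np.zeros(
    (max_batchsize_to_capture, max_blocks_per_seq), dtype=np.int32)
print(graph_block_tables.shape)  # (256, 512)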