[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425)
This commit is contained in:
parent
019877253b
commit
c16369455f
@ -843,6 +843,13 @@ class EngineArgs:
|
||||
device_config = DeviceConfig(device=self.device)
|
||||
model_config = self.create_model_config()
|
||||
|
||||
if model_config.is_multimodal_model:
|
||||
if self.enable_prefix_caching:
|
||||
logger.warning(
|
||||
"--enable-prefix-caching is currently not "
|
||||
"supported for multimodal models and has been disabled.")
|
||||
self.enable_prefix_caching = False
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=self.block_size if self.device != "neuron" else
|
||||
self.max_model_len, # neuron needs block_size = max_model_len
|
||||
@ -874,7 +881,10 @@ class EngineArgs:
|
||||
# If not explicitly set, enable chunked prefill by default for
|
||||
# long context (> 32K) models. This is to avoid OOM errors in the
|
||||
# initial memory profiling phase.
|
||||
if use_long_context:
|
||||
|
||||
# Chunked prefill is currently disabled for multimodal models by
|
||||
# default.
|
||||
if use_long_context and not model_config.is_multimodal_model:
|
||||
is_gpu = device_config.device_type == "cuda"
|
||||
use_sliding_window = (model_config.get_sliding_window()
|
||||
is not None)
|
||||
|
||||
@ -90,12 +90,12 @@ _MULTIMODAL_MODELS = {
|
||||
"PaliGemmaForConditionalGeneration": ("paligemma",
|
||||
"PaliGemmaForConditionalGeneration"),
|
||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
|
||||
"PixtralForConditionalGeneration": ("pixtral",
|
||||
"PixtralForConditionalGeneration"),
|
||||
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
|
||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl",
|
||||
"Qwen2VLForConditionalGeneration"),
|
||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||
}
|
||||
_CONDITIONAL_GENERATION_MODELS = {
|
||||
"BartModel": ("bart", "BartForConditionalGeneration"),
|
||||
|
||||
Loading…
Reference in New Issue
Block a user