diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 83e869ef..b005d83c 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -273,7 +273,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
             config.projector_hidden_act = "gelu"
 
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index d33d4ac5..9466e72e 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -277,7 +277,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multimodal_config = multimodal_config
 
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
         self.multi_modal_projector = LlavaMultiModalProjector(
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index d02cf904..43eec43d 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -256,7 +256,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multimodal_config = multimodal_config
 
         # Initialize the vision tower only up to the required feature layer
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.vision_resampler = LlavaNextVideoPooler(config)
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 10aa8049..47e62409 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -400,7 +400,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multimodal_config = multimodal_config
 
         # Initialize the vision tower only up to the required feature layer
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
         self.language_model = init_vllm_registered_model(
             config.text_config, cache_config, quant_config)
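
All four call sites pass `require_post_norm=False` to `init_vision_tower_for_llava`, since LLaVA-family models read features from an intermediate encoder layer rather than the final post-layernorm output. The sketch below is a minimal, hypothetical illustration of how such a flag could gate construction of a post-layernorm; `ToyVisionTower` and its parameters are invented for illustration and are not vLLM's actual `init_vision_tower_for_llava` or CLIP/SigLIP implementation.

```python
# Hypothetical sketch: a `require_post_norm` flag that decides whether a
# CLIP-style vision tower instantiates its final post-layernorm. When the
# caller only consumes an intermediate feature layer, the post-norm weights
# are never used, so they need not be created or loaded.
import torch
import torch.nn as nn


class ToyVisionTower(nn.Module):

    def __init__(self,
                 hidden_size: int,
                 num_layers: int,
                 require_post_norm: bool = True) -> None:
        super().__init__()
        self.layers = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(num_layers))
        # Only build the post-norm when the final, normalized hidden state
        # is actually needed by the caller.
        self.post_layernorm = (nn.LayerNorm(hidden_size)
                               if require_post_norm else nn.Identity())

    def forward(self,
                x: torch.Tensor,
                feature_layer: int = -2) -> torch.Tensor:
        hidden_states = []
        for layer in self.layers:
            x = layer(x)
            hidden_states.append(x)
        # Intermediate features (e.g. feature_layer=-2, as LLaVA-style
        # models typically use) bypass the post-norm entirely.
        return hidden_states[feature_layer]


tower = ToyVisionTower(hidden_size=16, num_layers=4, require_post_norm=False)
print(tower.post_layernorm)  # Identity() -> no unused LayerNorm parameters
```

Skipping the unused module keeps the state dict aligned with the weights that are actually loaded, instead of carrying parameters that never affect the output.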