Signed-off-by: Peter Salas <peter@fixie.ai>
Parent: 82bfc38d07
Commit: ffc0f2b47a
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type
 
 import openvino as ov
 import torch
@@ -7,6 +7,7 @@ import torch
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap
 
 
 def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
     # Shape: scalar
     # Type: i32
     max_context_len: torch.Tensor
+
+    # The index maps that relate multi-modal embeddings to the corresponding
+    # placeholders.
+    #
+    # N.B. These aren't really related to attention and don't belong on this
+    # type -- this is just a temporary solution to make them available to
+    # `model_executable`.
+    multi_modal_placeholder_index_maps: Optional[Dict[
+        str, MultiModalPlaceholderMap.IndexMap]]
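
For context, here is a minimal sketch (hypothetical names, not vLLM's actual classes) of how an attention-metadata dataclass can carry an optional per-modality placeholder index map, and how a consumer guards on the None case:

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class IndexMap:
    # Hypothetical stand-in for MultiModalPlaceholderMap.IndexMap: source
    # positions in the multi-modal embedding paired with destination
    # positions in the input token sequence.
    src: List[int]
    dest: List[int]


@dataclass
class MetadataSketch:
    # Mirrors the shape of the new field: None for text-only batches.
    max_context_len: int
    multi_modal_placeholder_index_maps: Optional[Dict[str, IndexMap]] = None


def scatter_embeddings(meta: MetadataSketch) -> None:
    # Downstream code must tolerate the None case before iterating.
    if meta.multi_modal_placeholder_index_maps is None:
        return
    for modality, index_map in meta.multi_modal_placeholder_index_maps.items():
        assert len(index_map.src) == len(index_map.dest)
        print(f"{modality}: {len(index_map.src)} embedding positions mapped")


scatter_embeddings(MetadataSketch(max_context_len=16))
scatter_embeddings(
    MetadataSketch(
        max_context_len=16,
        multi_modal_placeholder_index_maps={"image": IndexMap([0, 1], [5, 6])},
    ))
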
@@ -21,8 +21,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
                               tensor_model_parallel_all_gather)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
     dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long)
-    return dummy_seqdata, {"image": dummy_imgdata}
+    return DummyData(dummy_seqdata, {"image": dummy_imgdata})
 
 
 def pad_images(
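
As a rough illustration (hypothetical names, not vLLM's DummyData API), wrapping the dummy sequence data and multi-modal inputs in a small typed container instead of a bare tuple makes the return contract explicit:

from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class DummyDataSketch:
    # Hypothetical stand-in for the DummyData container used above.
    seq_data: Any
    multi_modal_data: Optional[Dict[str, Any]] = None


def build_dummy_data(seq_len: int) -> DummyDataSketch:
    # A text-only placeholder sequence paired with dummy image inputs.
    seq_data = [0] * seq_len
    return DummyDataSketch(seq_data, {"image": {"seq_len": seq_len}})


print(build_dummy_data(8))
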