Signed-off-by: Peter Salas <peter@fixie.ai>
Parent: 82bfc38d07
Commit: ffc0f2b47a
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type
 
 import openvino as ov
 import torch
@@ -7,6 +7,7 @@ import torch
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap
 
 
 def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
     # Shape: scalar
     # Type: i32
     max_context_len: torch.Tensor
+
+    # The index maps that relate multi-modal embeddings to the corresponding
+    # placeholders.
+    #
+    # N.B. These aren't really related to attention and don't belong on this
+    # type -- this is just a temporary solution to make them available to
+    # `model_executable`.
+    multi_modal_placeholder_index_maps: Optional[Dict[
+        str, MultiModalPlaceholderMap.IndexMap]]
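
For context, here is a minimal sketch (hypothetical names, not vLLM's actual classes) of how an attention-metadata dataclass can carry an optional per-modality placeholder index map, and how a consumer guards on the None case:

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class IndexMap:
    # Hypothetical stand-in for MultiModalPlaceholderMap.IndexMap: source
    # positions in the multi-modal embedding paired with destination
    # positions in the input token sequence.
    src: List[int]
    dest: List[int]


@dataclass
class MetadataSketch:
    # Mirrors the shape of the new field: None for text-only batches.
    max_context_len: int
    multi_modal_placeholder_index_maps: Optional[Dict[str, IndexMap]] = None


def scatter_embeddings(meta: MetadataSketch) -> None:
    # Downstream code must tolerate the None case before iterating.
    if meta.multi_modal_placeholder_index_maps is None:
        return
    for modality, index_map in meta.multi_modal_placeholder_index_maps.items():
        assert len(index_map.src) == len(index_map.dest)
        print(f"{modality}: {len(index_map.src)} embedding positions mapped")


scatter_embeddings(MetadataSketch(max_context_len=16))
scatter_embeddings(
    MetadataSketch(
        max_context_len=16,
        multi_modal_placeholder_index_maps={"image": IndexMap([0, 1], [5, 6])},
    ))
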
@@ -21,8 +21,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
                               tensor_model_parallel_all_gather)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
     dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long)
-    return dummy_seqdata, {"image": dummy_imgdata}
+    return DummyData(dummy_seqdata, {"image": dummy_imgdata})
 
 
 def pad_images(
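
As a rough illustration (hypothetical names, not vLLM's DummyData API), wrapping the dummy sequence data and multi-modal inputs in a small typed container instead of a bare tuple makes the return contract explicit:

from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class DummyDataSketch:
    # Hypothetical stand-in for the DummyData container used above.
    seq_data: Any
    multi_modal_data: Optional[Dict[str, Any]] = None


def build_dummy_data(seq_len: int) -> DummyDataSketch:
    # A text-only placeholder sequence paired with dummy image inputs.
    seq_data = [0] * seq_len
    return DummyDataSketch(seq_data, {"image": {"seq_len": seq_len}})


print(build_dummy_data(8))
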