From 9c1f78d5d66c3630ad5f5e149eca6f84b2d254a2 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Thu, 15 Aug 2024 09:44:14 -0700
Subject: [PATCH] [Bugfix] update neuron for version > 0.5.0 (#7175)

Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
Co-authored-by: Cyrus Leung
---
 vllm/engine/arg_utils.py           | 2 +-
 vllm/executor/neuron_executor.py   | 5 ++---
 vllm/worker/neuron_model_runner.py | 1 +
 vllm/worker/neuron_worker.py       | 3 +++
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d9938754..48d01fcf 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -316,7 +316,7 @@ class EngineArgs:
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 128, 256, 512, 1024, 2048],
                             help='Token block size for contiguous chunks of '
                             'tokens.')
 
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 5d4c4f49..b45d5d86 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -100,9 +100,8 @@ class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
         self,
         execute_model_req: ExecuteModelRequest,
     ) -> List[SamplerOutput]:
-        output = await make_async(
-            self.driver_worker.execute_model
-        )(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, )
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
         return output
 
     async def check_health_async(self) -> None:
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 6448e5ff..4f3fed2d 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -197,6 +197,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         virtual_engine: int = 0,
         finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForNeuron:
+        multi_modal_kwargs = None
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
         is_prompt = seq_group_metadata_list[0].is_prompt
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index f7525e04..3b0ded36 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -89,6 +89,9 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         return WorkerInput(num_seq_groups=len(
             execute_model_req.seq_group_metadata_list), )
 
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        pass
+
     def get_cache_block_size_bytes(self) -> int:
         """Determine the size in bytes of a cache block.
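
Editor's note: the neuron_executor.py hunk forwards the whole ExecuteModelRequest
instead of unpacking seq_group_metadata_list, matching the keyword the updated
worker-side execute_model expects in vLLM > 0.5.0. Below is a minimal,
self-contained sketch of that calling pattern; make_async is re-implemented here
purely for illustration (not a verbatim copy of vLLM's helper), and
FakeDriverWorker is a hypothetical stand-in for the Neuron driver worker.

    import asyncio
    from functools import partial
    from typing import Any, Awaitable, Callable


    def make_async(func: Callable[..., Any]) -> Callable[..., Awaitable[Any]]:
        """Wrap a blocking callable so it runs in the default thread pool."""

        async def _async_wrapper(*args: Any, **kwargs: Any) -> Any:
            loop = asyncio.get_running_loop()
            # Run the sync function off the event loop so it cannot block it.
            return await loop.run_in_executor(None,
                                              partial(func, *args, **kwargs))

        return _async_wrapper


    class FakeDriverWorker:
        """Hypothetical stand-in mirroring the updated worker signature."""

        def execute_model(self, execute_model_req: Any) -> str:
            return f"executed {execute_model_req!r}"


    async def main() -> None:
        worker = FakeDriverWorker()
        # The fix in the hunk: pass execute_model_req itself rather than
        # unpacking seq_group_metadata_list from it.
        output = await make_async(worker.execute_model)(
            execute_model_req="req-0")
        print(output)


    asyncio.run(main())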
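
Editor's note: the neuron_worker.py hunk adds a no-op execute_worker because,
as the diff implies (and assuming the base class declares the method abstract),
LocalOrDistributedWorkerBase requires every backend worker to provide that
hook, and Neuron has no worker-local cache operations to perform there. A
minimal sketch of that contract, using simplified stand-ins for vLLM's actual
classes:

    from abc import ABC, abstractmethod
    from dataclasses import dataclass


    @dataclass
    class WorkerInput:
        """Simplified stand-in for vLLM's WorkerInput."""
        num_seq_groups: int


    class LocalOrDistributedWorkerBase(ABC):
        """Simplified base: subclasses must implement the per-step hook."""

        @abstractmethod
        def execute_worker(self, worker_input: WorkerInput) -> None:
            """Do worker-local work (e.g. cache swaps) before model runs."""


    class NeuronWorker(LocalOrDistributedWorkerBase):
        def execute_worker(self, worker_input: WorkerInput) -> None:
            # Neuron has nothing to do here, but omitting the override would
            # leave NeuronWorker abstract and raise TypeError on instantiation.
            pass


    worker = NeuronWorker()  # OK only because execute_worker is implemented
    worker.execute_worker(WorkerInput(num_seq_groups=1))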