From 9c1f78d5d66c3630ad5f5e149eca6f84b2d254a2 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Thu, 15 Aug 2024 09:44:14 -0700
Subject: [PATCH] [Bugfix] update neuron for version > 0.5.0 (#7175)

Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
Co-authored-by: Cyrus Leung
---
 vllm/engine/arg_utils.py           | 2 +-
 vllm/executor/neuron_executor.py   | 5 ++---
 vllm/worker/neuron_model_runner.py | 1 +
 vllm/worker/neuron_worker.py       | 3 +++
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d9938754..48d01fcf 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -316,7 +316,7 @@ class EngineArgs:
         parser.add_argument('--block-size',
                             type=int,
                             default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 128, 256, 512, 1024, 2048],
                             help='Token block size for contiguous chunks of '
                             'tokens.')
 
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 5d4c4f49..b45d5d86 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -100,9 +100,8 @@ class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
         self,
         execute_model_req: ExecuteModelRequest,
     ) -> List[SamplerOutput]:
-        output = await make_async(
-            self.driver_worker.execute_model
-        )(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, )
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
         return output
 
     async def check_health_async(self) -> None:
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 6448e5ff..4f3fed2d 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -197,6 +197,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         virtual_engine: int = 0,
         finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForNeuron:
+        multi_modal_kwargs = None
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.
         is_prompt = seq_group_metadata_list[0].is_prompt
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index f7525e04..3b0ded36 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -89,6 +89,9 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         return WorkerInput(num_seq_groups=len(
             execute_model_req.seq_group_metadata_list), )
 
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        pass
+
     def get_cache_block_size_bytes(self) -> int:
         """Determine the size in bytes of a cache block.
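
Editor's note: the neuron_executor.py hunk forwards the whole ExecuteModelRequest
instead of unpacking seq_group_metadata_list, matching the keyword the updated
worker-side execute_model expects in vLLM > 0.5.0. Below is a minimal,
self-contained sketch of that calling pattern; make_async is re-implemented here
purely for illustration (not a verbatim copy of vLLM's helper), and
FakeDriverWorker is a hypothetical stand-in for the Neuron driver worker.

    import asyncio
    from functools import partial
    from typing import Any, Awaitable, Callable


    def make_async(func: Callable[..., Any]) -> Callable[..., Awaitable[Any]]:
        """Wrap a blocking callable so it runs in the default thread pool."""

        async def _async_wrapper(*args: Any, **kwargs: Any) -> Any:
            loop = asyncio.get_running_loop()
            # Run the sync function off the event loop so it cannot block it.
            return await loop.run_in_executor(None,
                                              partial(func, *args, **kwargs))

        return _async_wrapper


    class FakeDriverWorker:
        """Hypothetical stand-in mirroring the updated worker signature."""

        def execute_model(self, execute_model_req: Any) -> str:
            return f"executed {execute_model_req!r}"


    async def main() -> None:
        worker = FakeDriverWorker()
        # The fix in the hunk: pass execute_model_req itself rather than
        # unpacking seq_group_metadata_list from it.
        output = await make_async(worker.execute_model)(
            execute_model_req="req-0")
        print(output)


    asyncio.run(main())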
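
Editor's note: the neuron_worker.py hunk adds a no-op execute_worker because,
as the diff implies (and assuming the base class declares the method abstract),
LocalOrDistributedWorkerBase requires every backend worker to provide that
hook, and Neuron has no worker-local cache operations to perform there. A
minimal sketch of that contract, using simplified stand-ins for vLLM's actual
classes:

    from abc import ABC, abstractmethod
    from dataclasses import dataclass


    @dataclass
    class WorkerInput:
        """Simplified stand-in for vLLM's WorkerInput."""
        num_seq_groups: int


    class LocalOrDistributedWorkerBase(ABC):
        """Simplified base: subclasses must implement the per-step hook."""

        @abstractmethod
        def execute_worker(self, worker_input: WorkerInput) -> None:
            """Do worker-local work (e.g. cache swaps) before model runs."""


    class NeuronWorker(LocalOrDistributedWorkerBase):
        def execute_worker(self, worker_input: WorkerInput) -> None:
            # Neuron has nothing to do here, but omitting the override would
            # leave NeuronWorker abstract and raise TypeError on instantiation.
            pass


    worker = NeuronWorker()  # OK only because execute_worker is implemented
    worker.execute_worker(WorkerInput(num_seq_groups=1))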