diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 43905082..5452ce6b 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -6,7 +6,8 @@ import pytest
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
 def test_full_graph(model):
     # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
+    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
+        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
 
     from vllm import LLM, SamplingParams
     prompts = [
diff --git a/vllm/envs.py b/vllm/envs.py
index b3678399..2003ede9 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -202,6 +202,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     (os.environ.get("VLLM_DYNAMO_USE_CUSTOM_DISPATCHER", "True").lower() in
      ("true", "1")),
 
+    # Internal flag to control whether we use custom op,
+    # or use the native pytorch implementation
+    "VLLM_TEST_COMPILE_NO_CUSTOM_OPS":
+    lambda: int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")),
+
     # Internal flag to enable Dynamo fullgraph capture
     "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
     lambda: bool(
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 49247cd5..9102b5e1 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,5 +1,6 @@
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.platforms import current_platform
 from vllm.utils import is_cpu, is_hip, is_xpu
 
@@ -53,6 +54,10 @@ class CustomOp(nn.Module):
     def dispatch_forward(self):
         # NOTE(woosuk): Here we assume that vLLM was built for only one
         # specific backend. Currently, we do not support dynamic dispatching.
+
+        if envs.VLLM_TEST_COMPILE_NO_CUSTOM_OPS:
+            return self.forward_native
+
         if is_hip():
             return self.forward_hip
         elif is_cpu():
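
For reference, a minimal sketch (not part of the PR) of how the new flag would be exercised. It assumes a vLLM build that includes this change; everything beyond the two names taken from the diff (`VLLM_TEST_COMPILE_NO_CUSTOM_OPS` and `vllm.envs`) is illustrative.

```python
# Minimal sketch, assuming a vLLM install with this change applied.
import os

# Set before vllm.envs reads it. The value is parsed with int(), so "0"
# (the default) keeps custom ops, and any nonzero integer string selects
# the native PyTorch implementations.
os.environ["VLLM_TEST_COMPILE_NO_CUSTOM_OPS"] = "1"

import vllm.envs as envs

assert envs.VLLM_TEST_COMPILE_NO_CUSTOM_OPS == 1
# With the flag set, CustomOp.dispatch_forward should return
# self.forward_native instead of a platform-specific forward
# (forward_hip, forward_cpu, ...), per the custom_op.py hunk above.
```

Note that the flag is read through the lambda in `vllm/envs.py`, so it must be present in the environment before the value is accessed; exporting it in the shell before launching the test achieves the same effect.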