2024-09-18 21:56:58 +08:00
|
|
|
import os
|
|
|
|
|
|
2024-11-17 10:02:14 +08:00
|
|
|
from vllm.config import CompilationLevel
|
2024-10-11 03:39:36 +08:00
|
|
|
|
2024-08-29 07:10:12 +08:00
|
|
|
from ..utils import compare_two_settings
|
|
|
|
|
|
2024-09-18 21:56:58 +08:00
|
|
|
# On TPU, --enforce-eager still triggers graph compilation, which takes long
# enough to time out the default health check in the MQLLMEngine. Raise the
# RPC timeout to 30s (the value is given in milliseconds) before any test runs.
os.environ.update({"VLLM_RPC_TIMEOUT": "30000"})
|
|
|
|
|
|
2024-08-29 07:10:12 +08:00
|
|
|
|
|
|
|
|
def test_custom_dispatcher():
    """Compare gemma-2b under two Dynamo compilation levels.

    Both argument lists pass ``--enforce-eager``; they differ only in the
    ``-O`` level (``DYNAMO_ONCE`` versus ``DYNAMO_AS_IS``). No extra
    environment variables are supplied for either setting.

    NOTE(review): ``compare_two_settings`` comes from ``..utils``; presumably
    it runs the model under each setting and checks that the results agree —
    confirm against that helper.
    """
    model_name = "google/gemma-2b"

    # Setting 1: Dynamo level DYNAMO_ONCE.
    dynamo_once_args = [
        "--enforce-eager",
        f"-O{CompilationLevel.DYNAMO_ONCE}",
    ]
    # Setting 2: Dynamo level DYNAMO_AS_IS.
    dynamo_as_is_args = [
        "--enforce-eager",
        f"-O{CompilationLevel.DYNAMO_AS_IS}",
    ]

    compare_two_settings(
        model_name,
        arg1=dynamo_once_args,
        arg2=dynamo_as_is_args,
        env1={},
        env2={},
    )
|