[ci][test] use load dummy for testing (#9165)
commit c8627cd41b
parent 8bfaa4e31e
.buildkite/test-pipeline.yaml

@@ -269,7 +269,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  command: pytest -v -s quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
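The pipeline change above is the one consumer that overrides the new default: quantization tests exercise real quantized checkpoints, so they pin the load format back to "auto". A minimal, vLLM-independent sketch of that precedence (the helper name is hypothetical; the lookup mirrors the envs.py entry added further down):

import os


def effective_load_format() -> str:
    # Hypothetical stand-in for the registry entry in vllm/envs.py:
    # read the variable lazily and fall back to "dummy" when unset.
    return os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy")


if __name__ == "__main__":
    print(effective_load_format())  # "dummy" when the variable is unset
    os.environ["VLLM_TEST_FORCE_LOAD_FORMAT"] = "auto"
    print(effective_load_format())  # "auto", as set by the CI command above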
tests/utils.py

@@ -16,6 +16,7 @@ import requests
 from openai.types.completion import Completion
 from typing_extensions import ParamSpec, assert_never
 
+import vllm.envs as envs
 from tests.models.utils import TextTextLogprobs
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -352,10 +353,26 @@ def compare_all_settings(model: str,
         tokenizer_mode=tokenizer_mode,
     )
 
+    can_force_load_format = True
+
+    for args in all_args:
+        if "--load-format" in args:
+            can_force_load_format = False
+            break
+
     prompt = "Hello, my name is"
     token_ids = tokenizer(prompt).input_ids
     ref_results: List = []
     for i, (args, env) in enumerate(zip(all_args, all_envs)):
+        if can_force_load_format:
+            # We are comparing results across settings,
+            # so we usually don't need real weights.
+            # Force dummy weights by default;
+            # this should work for most cases.
+            # If not, the VLLM_TEST_FORCE_LOAD_FORMAT
+            # environment variable can force the load format,
+            # e.g. in quantization tests.
+            args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT]
         compare_results: List = []
         results = ref_results if i == 0 else compare_results
         with RemoteOpenAIServer(model,
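A self-contained sketch of the guard introduced above, with the vLLM plumbing stubbed out. force_load_format is a hypothetical helper, not part of tests/utils.py, but it applies the same rule: one explicit --load-format anywhere disables the dummy-weight override for every invocation, so callers that really need a specific format always win.

from typing import List


def force_load_format(all_args: List[List[str]],
                      load_format: str = "dummy") -> List[List[str]]:
    # Mirrors can_force_load_format: if any invocation already pins a
    # load format, leave all invocations untouched.
    if any("--load-format" in args for args in all_args):
        return all_args
    return [args + ["--load-format", load_format] for args in all_args]


if __name__ == "__main__":
    # No explicit format: dummy weights are injected everywhere.
    print(force_load_format([["--model", "m"], ["--model", "m", "-tp", "2"]]))
    # Explicit format present: nothing is rewritten.
    print(force_load_format([["--load-format", "auto"], ["--model", "m"]]))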
vllm/envs.py

@@ -397,6 +397,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda:
     (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
      ("1", "true")),
+    "VLLM_TEST_FORCE_LOAD_FORMAT":
+    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),
 
     # Time in ms for the zmq client to wait for a response from the backend
     # server for simple data operations
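Entries in this dict are zero-argument callables, so every read reflects the current process environment rather than import-time state. A hedged, standalone approximation of how such a lazy registry can be surfaced as module attributes (vllm/envs.py uses a module-level __getattr__ along these lines; this snippet is an illustration, not the actual module):

import os
import sys
from typing import Any, Callable, Dict

environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_TEST_FORCE_LOAD_FORMAT":
    lambda: os.getenv("VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"),
}


def __getattr__(name: str) -> Any:
    # PEP 562: called when normal module attribute lookup fails, so
    # envs.VLLM_TEST_FORCE_LOAD_FORMAT re-reads os.environ on each access.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(name)


if __name__ == "__main__":
    module = sys.modules[__name__]
    print(module.VLLM_TEST_FORCE_LOAD_FORMAT)  # "dummy" unless overridden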