[Bugfix] Enable some fp8 and quantized fullgraph tests (#10171)
Signed-off-by: Bill Nell <bill@neuralmagic.com>
parent 8e1529dc57
commit f192aeba74
@@ -9,29 +9,26 @@ from vllm.platforms import current_platform
 
 TEST_MODELS = [
     ("facebook/opt-125m", {}),
-    # TODO: add fake implementation for compressed-tensors
-    # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-    #     "dtype": torch.float16,
-    #     "quantization": "compressed-tensors"
-    # }),
+    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+        "dtype": torch.float16,
+        "quantization": "compressed-tensors"
+    }),
     ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
         "dtype": torch.float16,
         "quantization": "fp8"
     }),
-    # TODO: add fake implementation for compressed-tensors
-    # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
-    #     "quantization": "compressed-tensors"
-    # }),
+    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
+        "quantization": "compressed-tensors"
+    }),
     ("meta-llama/Meta-Llama-3-8B", {}),
 ]
 
-# TODO: enable in pytorch 2.5
-if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
+if is_quant_method_supported("aqlm"):
     TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
         "quantization": "aqlm"
     }))
 
-# TODO: enable in pytorch 2.5
+# TODO: figure out why this fails.
 if False and is_quant_method_supported("gguf"):  # noqa: SIM223
     TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
         "quantization": "gguf"
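Each TEST_MODELS entry above is a (model name, extra kwargs) pair, and the kwargs (for example dtype or quantization) are forwarded to the engine when a test builds it. Below is a minimal sketch of how one such entry can be expanded, assuming the public vllm.LLM API; run_entry is a hypothetical helper name for illustration, not part of this commit.

# Sketch only: expand one (model, kwargs) entry from TEST_MODELS into an
# engine call. `run_entry` is a hypothetical helper, not part of this commit.
import torch

from vllm import LLM, SamplingParams


def run_entry(model: str, model_kwargs: dict):
    # kwargs carry per-model settings such as dtype or quantization
    llm = LLM(model=model, **model_kwargs)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=16)
    return llm.generate(["Hello, my name is"], sampling_params)


# Example (matches the FP8 entry in the list above):
# run_entry("neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
#           {"dtype": torch.float16, "quantization": "fp8"})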
@@ -71,13 +68,13 @@ def check_full_graph_support(model,
     os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
     os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
 
-    # Inductor doesn't support fp8 and the base meta llama uses too
-    # much memory.
-    quantization = model_kwargs.get("quantization")
-    if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B")
+    # The base meta llama uses too much memory.
+    if (model == "meta-llama/Meta-Llama-3-8B"
             and optimization_level >= CompilationLevel.PIECEWISE):
         return
 
+    print(f"MODEL={model}")
+
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
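For orientation, here is a hedged sketch of how check_full_graph_support might be driven from a parametrized pytest test. The test name, the chosen compilation levels, and the import paths below are assumptions for illustration, not taken from this commit.

# Hedged sketch of a caller; names and import paths are assumptions.
import pytest

from vllm.config import CompilationLevel  # import path varies across vLLM versions

from .utils import TEST_MODELS, check_full_graph_support  # assumed module layout


@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize(
    "optimization_level",
    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
def test_full_graph(model_info, optimization_level):
    model, model_kwargs = model_info
    # Any further parameters of check_full_graph_support (e.g. a tensor-parallel
    # size) are not visible in this hunk and are left at their defaults here.
    check_full_graph_support(model, model_kwargs, optimization_level)

The environment variables set at the top of the changed function (VLLM_TORCH_COMPILE_LEVEL and VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE) are what make each parametrized case exercise full-graph capture at the requested compilation level rather than falling back to eager execution.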