"""
This script is mainly used to test whether trtion kernels can run normally
under different conditions, including various batches, numbers of LoRA , and
maximum ranks.
"""
import pytest
import torch

# Enable custom op registration
import vllm.lora.ops.bgmv_expand
import vllm.lora.ops.bgmv_expand_slice
import vllm.lora.ops.bgmv_shrink
import vllm.lora.ops.sgmv_expand
import vllm.lora.ops.sgmv_expand_slice
import vllm.lora.ops.sgmv_shrink  # noqa: F401
from vllm.platforms import current_platform

from .utils import (generate_data, generate_data_for_expand_nslices,
                    ref_torch_groupgemm)

HIDDEN_SIZES = [4097]
BATCHES = [1, 4, 16, 32]
NUM_LORA = [1, 8, 32, 128]
DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
SCALES = [0.5]
SEED = [0]
CUDA_DEVICES = [f"cuda:{0}"]
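

# Loose tolerances: the Triton kernels and the torch reference can reduce in
# different orders, so bit-exact agreement is not expected; fp16/bf16 get the
# widest margins.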
def assert_close(a, b):
    rtol, atol = {
        torch.float16: (6e-2, 6e-2),
        torch.bfloat16: (6e-2, 6e-2),
        torch.float32: (1e-2, 1e-2),
    }[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)


# Unlike test_punica_sizes.py, here we go through the registered custom ops
# directly, which also verifies that these ops were registered correctly.
bgmv_expand = torch.ops.vllm.bgmv_expand
bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
bgmv_shrink = torch.ops.vllm.bgmv_shrink
sgmv_expand = torch.ops.vllm.sgmv_expand
sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
sgmv_shrink = torch.ops.vllm.sgmv_shrink
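# The lookups above raise AttributeError at import time if the imports at the
# top failed to register the ops, so a broken registration is caught before
# any test runs.
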
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_sgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    seq_length = 128
    (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )
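    # seq_len_tensor.max() returns a 0-dim tensor here; the tuple branch is
    # defensive, covering reduction APIs that return (values, indices).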
    max_seq_length = seq_len_tensor.max()
    token_nums = seq_len_tensor.sum().item()
    if isinstance(max_seq_length, tuple):
        max_seq_length = max_seq_length[0].item()
    else:
        max_seq_length = max_seq_length.item()
    if op_type == "shrink":
        sgmv_shrink(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            b_seq_start_loc,
            seq_len_tensor,
            lora_indices_tensor,
            batches,
            max_seq_length,
            token_nums,
            scaling,
        )
    else:
        sgmv_expand(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            b_seq_start_loc,
            seq_len_tensor,
            lora_indices_tensor,
            batches,
            max_seq_length,
            token_nums,
            add_inputs=True,
        )
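    # The reference runs the same group-GEMM with plain torch matmuls; expand
    # uses scale 1.0 because scaling is only applied on the shrink side.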
    ref_torch_groupgemm(
        ref_out_tensor,
        inputs_tensor,
        lora_weights,
        lora_indices_tensor,
        seq_len_tensor,
        batches,
        scaling if op_type == "shrink" else 1.0,
        op_type,
    )
    if op_type == "shrink":
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)


@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_bgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
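
    # bgmv covers the decode case: every sequence contributes exactly one
    # token, so each batch entry is a single row.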
    seq_length = 1
    (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )
    if op_type == "shrink":
        bgmv_shrink(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            indices,
            scaling,
        )
    else:
        bgmv_expand(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            indices,
            add_inputs=True,
        )
    ref_torch_groupgemm(
        ref_out_tensor,
        inputs_tensor,
        lora_weights,
        lora_indices_tensor,
        seq_len_tensor,
        batches,
        scaling if op_type == "shrink" else 1.0,
        op_type,
    )
    if op_type == "shrink":
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)


@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_expand_nslices(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
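
    # sgmv exercises the prefill path (multi-token sequences); bgmv uses a
    # single token per sequence, as in decode.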
    seq_length = 128 if op_type == "sgmv" else 1
    (
        inputs_tensor,
        lora_weights_lst,
        our_outputs,
        ref_outputs,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data_for_expand_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        nslices,
        device,
    )
    max_seq_length = seq_len_tensor.max()
    token_nums = seq_len_tensor.sum().item()
    if isinstance(max_seq_length, tuple):
        max_seq_length = max_seq_length[0].item()
    else:
        max_seq_length = max_seq_length.item()
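
    # Each slice writes its own hidden_size-wide window of the stacked
    # output, mimicking fused weights (e.g. a merged QKV projection) whose
    # sub-matrices are updated by separate LoRA slices side by side.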
    slice_offset = 0
    for index in range(nslices):
        lora_weights = lora_weights_lst[index]
        if op_type == "sgmv":
            sgmv_expand_slice(
                inputs_tensor,
                lora_weights,
                our_outputs,
                b_seq_start_loc,
                seq_len_tensor,
                lora_indices_tensor,
                batches,
                max_seq_length,
                token_nums,
                slice_offset,
                hidden_size,
                add_inputs=True,
            )
        else:
            bgmv_expand_slice(
                inputs_tensor,
                lora_weights,
                our_outputs,
                indices,
                slice_offset,
                slice_size=hidden_size,
                add_inputs=True,
            )
        ref_torch_groupgemm(
            ref_outputs[:, slice_offset:slice_offset + hidden_size],
            inputs_tensor,
            lora_weights,
            lora_indices_tensor,
            seq_len_tensor,
            batches,
            1.0,
            op_type="expand",
        )

        slice_offset += hidden_size

    assert_close(our_outputs, ref_outputs)
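

# A minimal smoke-run sketch (not part of the pytest suite; the chosen
# parameter values are an arbitrary point from the grids above). The
# pytest.mark.parametrize decorators only attach marks, so the test
# functions stay directly callable:
if __name__ == "__main__":
    test_punica_bgmv(
        batches=4,
        num_loras=8,
        rank=16,
        hidden_size=4097,
        scaling=0.5,
        dtype=torch.float16,
        op_type="shrink",
        seed=0,
        device="cuda:0",
    )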