from typing import Optional, Tuple, Type

import torch

try:
    from vllm._C import cache_ops as vllm_cache_ops
    from vllm._C import ops as vllm_ops
except ImportError:
    pass


# activation ops
def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
    vllm_ops.silu_and_mul(out, x)


def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
    vllm_ops.gelu_and_mul(out, x)


def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
    vllm_ops.gelu_tanh_and_mul(out, x)


def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
    vllm_ops.gelu_fast(out, x)


def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
    vllm_ops.gelu_new(out, x)


# page attention ops
def paged_attention_v1(
    out: torch.Tensor,
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    num_kv_heads: int,
    scale: float,
    block_tables: torch.Tensor,
    seq_lens: torch.Tensor,
    block_size: int,
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
    kv_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
    blocksparse_block_size: int = 64,
    blocksparse_head_sliding_step: int = 0,
) -> None:
    vllm_ops.paged_attention_v1(
        out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
        seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
        kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride,
        blocksparse_block_size, blocksparse_head_sliding_step)


def paged_attention_v2(
    out: torch.Tensor,
    exp_sum: torch.Tensor,
    max_logits: torch.Tensor,
    tmp_out: torch.Tensor,
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    num_kv_heads: int,
    scale: float,
    block_tables: torch.Tensor,
    seq_lens: torch.Tensor,
    block_size: int,
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
    kv_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
    blocksparse_block_size: int = 64,
    blocksparse_head_sliding_step: int = 0,
) -> None:
    vllm_ops.paged_attention_v2(
        out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
        alibi_slopes, kv_cache_dtype, kv_scale, tp_rank,
        blocksparse_local_blocks, blocksparse_vert_stride,
        blocksparse_block_size, blocksparse_head_sliding_step)


# pos encoding ops
def rotary_embedding(
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    head_size: int,
    cos_sin_cache: torch.Tensor,
    is_neox: bool,
) -> None:
    vllm_ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache,
                              is_neox)


def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
                             key: torch.Tensor, head_size: int,
                             cos_sin_cache: torch.Tensor, is_neox: bool,
                             rot_dim: int,
                             cos_sin_cache_offsets: torch.Tensor) -> None:
    vllm_ops.batched_rotary_embedding(positions, query, key, head_size,
                                      cos_sin_cache, is_neox, rot_dim,
                                      cos_sin_cache_offsets)


# layer norm ops
def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
             epsilon: float) -> None:
    vllm_ops.rms_norm(out, input, weight, epsilon)


def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
                       weight: torch.Tensor, epsilon: float) -> None:
    vllm_ops.fused_add_rms_norm(input, residual, weight, epsilon)


# quantization ops
# awq
def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
                   zeros: torch.Tensor, split_k_iters: int, thx: int,
                   thy: int) -> torch.Tensor:
    return vllm_ops.awq_dequantize(qweight, scales, zeros, split_k_iters, thx,
                                   thy)


def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
             scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
    return vllm_ops.awq_gemm(input, qweight, qzeros, scales, split_k_iters)


# gptq
def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
              b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
              b_g_idx: torch.Tensor, use_exllama: bool,
              bit: int) -> torch.Tensor:
    return vllm_ops.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
                              b_g_idx, use_exllama, bit)


def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
                 bit: int) -> None:
    vllm_ops.gptq_shuffle(q_weight, q_perm, bit)


# squeezellm
def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor,
                    lookup_table: torch.Tensor) -> None:
    vllm_ops.squeezellm_gemm(vec, mat, mul, lookup_table)


# marlin
def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int,
                size_n: int, size_k: int) -> torch.Tensor:
    return vllm_ops.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m,
                                size_n, size_k)


# marlin_24
def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                        b_meta: torch.Tensor, b_scales: torch.Tensor,
                        workspace: torch.Tensor, num_bits: int, size_m: int,
                        size_n: int, size_k: int) -> torch.Tensor:
    return vllm_ops.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
                                        workspace, num_bits, size_m, size_n,
                                        size_k)


# cutlass
def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
                         a_scales: torch.Tensor, b_scales: torch.Tensor,
                         out_dtype: Type[torch.dtype]) -> torch.Tensor:
    assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)

    m = a.shape[0]
    n = b.shape[1]
    out = torch.empty((m, n), dtype=out_dtype, device=a.device)

    vllm_ops.cutlass_scaled_mm_dq(out, a, b, a_scales, b_scales)

    return out
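

# Illustrative usage sketch (not part of the original module): a thin caller
# around cutlass_scaled_mm_dq. The quantized inputs, per-tensor scales, and
# the `_example_` name are assumptions for demonstration only; the only hard
# constraints visible above are that both dimensions of `b` are multiples of
# 16 and that `out_dtype` is torch.float16 or torch.bfloat16.
def _example_cutlass_scaled_mm_dq(a_q: torch.Tensor, b_q: torch.Tensor,
                                  a_scales: torch.Tensor,
                                  b_scales: torch.Tensor) -> torch.Tensor:
    # Dequantizing matmul: roughly out ~= (a_q * a_scales) @ (b_q * b_scales),
    # materialized directly in half precision.
    return cutlass_scaled_mm_dq(a_q, b_q, a_scales, b_scales,
                                out_dtype=torch.float16)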


# aqlm
def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
              codebooks: torch.Tensor, scales: torch.Tensor,
              codebook_partition_sizes: torch.Tensor,
              bias: Optional[torch.Tensor]) -> torch.Tensor:
    return vllm_ops.aqlm_gemm(input, codes, codebooks, scales,
                              codebook_partition_sizes, bias)


def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
                 codebook_partition_sizes: torch.Tensor) -> torch.Tensor:
    return vllm_ops.aqlm_dequant(codes, codebooks, codebook_partition_sizes)


# gptq_marlin
def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                       size_k: int, size_n: int,
                       num_bits: int) -> torch.Tensor:
    return vllm_ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
                                       num_bits)


def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                     b_scales: torch.Tensor, g_idx: torch.Tensor,
                     perm: torch.Tensor, workspace: torch.Tensor,
                     num_bits: int, size_m: int, size_n: int, size_k: int,
                     is_k_full: bool) -> torch.Tensor:
    return vllm_ops.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm,
                                     workspace, num_bits, size_m, size_n,
                                     size_k, is_k_full)


# fp8
def scaled_fp8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    batch_dim_padding: Optional[int] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize the input tensor to FP8 and return the quantized tensor and
    its scale.

    This function supports both static and dynamic quantization: if you
    provide the scale, static scaling is used; if you omit it, the scale is
    determined dynamically. The function also allows optional padding of the
    output tensor for downstream kernels that benefit from padding.

    Args:
        input: The input tensor to be quantized to FP8.
        scale: Optional scaling factor for the FP8 quantization.
        batch_dim_padding: If specified, pad the first dimension
            of the output to at least this value.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and the
            scaling factor.
    """
    if batch_dim_padding:
        shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:])
        output = torch.empty(shape,
                             device=input.device,
                             dtype=torch.float8_e4m3fn)
    else:
        output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
    if scale is None:
        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
        vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        vllm_ops.static_scaled_fp8_quant(output, input, scale)
    return output, scale
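

# Illustrative usage sketch (not part of the original module): exercising the
# static and dynamic paths described in the docstring above. The `_example_`
# name and the padding value are assumptions for demonstration only.
def _example_scaled_fp8_quant_usage(x: torch.Tensor) -> None:
    # Dynamic quantization: no scale is passed, so one is computed on the fly.
    x_fp8_dyn, dyn_scale = scaled_fp8_quant(x)
    # Static quantization: reuse a precomputed per-tensor scale.
    x_fp8_static, _ = scaled_fp8_quant(x, scale=dyn_scale)
    # Pad the batch dimension for downstream kernels that prefer a larger M.
    x_fp8_padded, _ = scaled_fp8_quant(x, batch_dim_padding=32)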


# int8
def static_scaled_int8_quant(input: torch.Tensor,
                             scale: float) -> torch.Tensor:
    """
    Quantize the input tensor to int8 and return the quantized tensor.

    Args:
        input: The input tensor to be quantized to int8.
        scale: Scaling factor for the int8 quantization.

    Returns:
        torch.Tensor: Output tensor in int8.
    """
    q = torch.empty_like(input, dtype=torch.int8)
    vllm_ops.static_scaled_int8_quant(q, input, scale)
    return q
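

# Illustrative usage sketch (not part of the original module): quantizing
# activations with a precomputed per-tensor scale. The `_example_` name and
# the literal scale value are assumptions for demonstration only.
def _example_static_scaled_int8_quant_usage(x: torch.Tensor) -> torch.Tensor:
    # The kernel maps x to int8 using the given per-tensor scale
    # (x ~= q * scale).
    return static_scaled_int8_quant(x, scale=0.02)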


# moe
def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                         block_size: int, sorted_token_ids: torch.Tensor,
                         experts_ids: torch.Tensor,
                         num_tokens_post_pad: torch.Tensor) -> None:
    vllm_ops.moe_align_block_size(topk_ids, num_experts, block_size,
                                  sorted_token_ids, experts_ids,
                                  num_tokens_post_pad)


def reshape_and_cache(
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    kv_scale: float,
) -> None:
    vllm_cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
                                     slot_mapping, kv_cache_dtype, kv_scale)


def reshape_and_cache_flash(
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    vllm_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
                                           slot_mapping, kv_cache_dtype)


def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,
                block_mapping: torch.Tensor) -> None:
    vllm_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)


def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                block_mapping: torch.Tensor) -> None:
    vllm_cache_ops.swap_blocks(src, dst, block_mapping)


def convert_fp8(output: torch.Tensor,
                input: torch.Tensor,
                scale: float = 1.0,
                kv_dtype: str = "fp8") -> None:
    vllm_cache_ops.convert_fp8(output, input, scale, kv_dtype)


#TODO: cuda_utils, custom_ar