[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests running parallel jobs (#9279)

Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
hissu-hyvarinen 2024-11-04 21:37:46 +02:00 committed by GitHub
parent 1c45f4c385
commit 5208dc7a20
2 changed files with 12 additions and 6 deletions

.buildkite/run-amd-test.sh

@@ -107,11 +107,12 @@ fi
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used
+  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    #replace shard arguments
-    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
-    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-    echo "Shard ${GPU} commands:$commands"
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then
         -e HF_HOME=${HF_MOUNT} \
         --name ${container_name}_${GPU} \
         ${image_name} \
-        /bin/bash -c "${commands}" \
+        /bin/bash -c "${commands_gpu}" \
         |& while read -r line; do echo ">>Shard $GPU: $line"; done &
     PIDS+=($!)
   done
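
The crux of the fix: bash's ${var//pattern/replacement} expansion was assigned back to commands inside the loop, so the literal "--shard-id= " pattern was consumed on the first iteration and every container ended up running shard 0. Below is a minimal sketch of the before/after behavior; the cmd template is illustrative, not the actual CI command.

#!/bin/bash
PARALLEL_JOB_COUNT=8
cmd='pytest -v -s lora --shard-id= --num-shards= '

# Fixed approach: substitute --num-shards once, then derive a fresh
# per-shard string from the unmodified template on every iteration.
cmd=${cmd//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
for GPU in $(seq 0 $((PARALLEL_JOB_COUNT-1))); do
  cmd_gpu=${cmd//"--shard-id= "/"--shard-id=${GPU} "}
  echo "Shard ${GPU}: ${cmd_gpu}"
done

# The old code reassigned the template itself:
#   cmd=${cmd//"--shard-id= "/"--shard-id=${GPU} "}
# After iteration 0 the pattern "--shard-id= " no longer occurs in
# $cmd, so every later iteration leaves "--shard-id=0" in place and
# all eight containers run the same shard.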

tests/lora/test_minicpmv.py

@@ -1,8 +1,11 @@
 from typing import List
 
+import pytest
+
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
@@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     return generated_texts
 
 
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
         trust_remote_code=True,
         gpu_memory_utilization=0.97 # This model is pretty big for CI gpus
     )
-
     output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output1[i])
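
For context on the second change: pytest.mark.xfail with a boolean first argument marks the test as an expected failure only when the condition holds, so the test still runs everywhere but a ROCm failure no longer fails the pipeline. A hypothetical invocation; the path assumes the usual vLLM test layout.

# On a ROCm host this reports XFAIL (or XPASS if it unexpectedly
# passes); on other platforms it runs as a normal test.
pytest tests/lora/test_minicpmv.py -v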