[
  {
    "test_name": "llama8B_tp1_sharegpt",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 500,
      "port": 8000,
      "reuse_server": false
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama8B_tp1_sonnet_512_16",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 16,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama8B_tp1_sonnet_512_256",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 256,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sharegpt",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 500,
      "port": 8000,
      "reuse_server": false
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sonnet_512_16",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 16,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sonnet_512_256",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 256,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  }
]