From 3c9de9fb767cc051333da0d31d7b09197505f5f6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 3 Oct 2025 11:47:36 -0700 Subject: [PATCH 1/2] [no ci] Add add-gemma-3n-E2B-it Signed-off-by: Huy Do --- .../benchmarks/cuda/latency-tests.json | 11 ++++++++++ .../benchmarks/cuda/serving-tests.json | 21 +++++++++++++++++++ .../benchmarks/cuda/throughput-tests.json | 12 +++++++++++ 3 files changed, 44 insertions(+) diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json index 719b4339..d311c6c9 100644 --- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json @@ -117,6 +117,17 @@ "max_model_len": 8192 } }, + { + "test_name": "latency_gemma_3n_e2b_tp1", + "parameters": { + "model": "google/gemma-3n-E2B-it", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, { "test_name": "latency_qwen3_30b_a3b_tp8", "parameters": { diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 1aaef17f..0cebd825 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -532,5 +532,26 @@ "random_input_len": 5250, "random_output_len": 8250 } + }, + { + "test_name": "serving_gemma_3n_e2b_tp1_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "google/gemma-3n-E2B-it", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "google/gemma-3n-E2B-it", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index 
9ff9cdad..f8621e4a 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -128,6 +128,18 @@ "max_model_len": 8192 } }, + { + "test_name": "throughput_gemma_3n_e2b_tp1", + "parameters": { + "model": "google/gemma-3n-E2B-it", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, { "test_name": "throughput_qwen3_30b_a3b_tp8", "parameters": { From 44ad43fece5a463a0024de941a81bb6099061ab0 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 3 Oct 2025 12:48:48 -0700 Subject: [PATCH 2/2] [no ci] Run with VLLM_USE_STANDALONE_COMPILE set Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index d4202ed3..5d2d5d7e 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -263,6 +263,8 @@ jobs: # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + # Test + VLLM_USE_STANDALONE_COMPILE: 1 run: | set -eux @@ -282,6 +284,7 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e VLLM_USE_STANDALONE_COMPILE \ -e ON_CPU="${ON_CPU}" \ --ipc=host \ --tty \