From 4f217e5ac043f62f0f138cb16e7323bac83cab15 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Mon, 6 Oct 2025 21:57:45 -0700
Subject: [PATCH 1/5] Add a script to run lm-eval

Signed-off-by: Huy Do
---
 ...Llama-4-Maverick-17B-128E-Instruct-FP8.yml |  14 ++
 vllm-eval-harness/cuda/openai/gpt-oss-20b.yml |  14 ++
 vllm-eval-harness/run_vllm_eval_harness.py    | 133 ++++++++++++++++++
 3 files changed, 161 insertions(+)
 create mode 100644 vllm-eval-harness/cuda/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml
 create mode 100644 vllm-eval-harness/cuda/openai/gpt-oss-20b.yml
 create mode 100644 vllm-eval-harness/run_vllm_eval_harness.py

diff --git a/vllm-eval-harness/cuda/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml b/vllm-eval-harness/cuda/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml
new file mode 100644
index 00000000..4279893c
--- /dev/null
+++ b/vllm-eval-harness/cuda/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml
@@ -0,0 +1,14 @@
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  device: b200
+  tp: 8
+  # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0
+  - name: "exact_match,flexible-extract"
+    value: 0
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
diff --git a/vllm-eval-harness/cuda/openai/gpt-oss-20b.yml b/vllm-eval-harness/cuda/openai/gpt-oss-20b.yml
new file mode 100644
index 00000000..16f01bf9
--- /dev/null
+++ b/vllm-eval-harness/cuda/openai/gpt-oss-20b.yml
@@ -0,0 +1,14 @@
+model_name: "openai/gpt-oss-20b"
+tasks:
+- name: "gsm8k"
+  device: b200
+  tp: 1
+  # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0
+  - name: "exact_match,flexible-extract"
+    value: 0
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
diff --git a/vllm-eval-harness/run_vllm_eval_harness.py b/vllm-eval-harness/run_vllm_eval_harness.py
new file mode 100644
index 00000000..c954cad8
--- /dev/null
+++ b/vllm-eval-harness/run_vllm_eval_harness.py
@@ -0,0 +1,133 @@
+import os
+import glob
+import lm_eval
+import yaml
+from logging import warning, info
+from argparse import Action, ArgumentParser, Namespace
+import torch
+from typing import Dict, Any, List, Optional
+
+
+class ValidateDir(Action):
+    def __call__(
+        self,
+        parser: ArgumentParser,
+        namespace: Namespace,
+        values: Any,
+        option_string: Optional[str] = None,
+    ) -> None:
+        if os.path.isdir(values):
+            setattr(namespace, self.dest, values)
+            return
+
+        parser.error(f"{values} is not a valid directory")
+
+
+def parse_args() -> Any:
+    parser = ArgumentParser("Run vLLM lm-eval harness")
+
+    parser.add_argument(
+        "--configs-dir",
+        type=str,
+        action=ValidateDir,
+        help="the directory contains vLLM lm-eval harness configs",
+        required=True,
+    )
+    parser.add_argument(
+        "--models",
+        type=str,
+        default="",
+        help="the comma-separated list of models to evaluate (optional)",
+    )
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        default="",
+        help="the comma-separated list of tasks to evaluate (optional)",
+    )
+
+    return parser.parse_args()
+
+
+def run(
+    model_name: str, tasks: List[str], tp_size: int, config: Dict[str, Any]
+) -> None:
+    trust_remote_code = config.get("trust_remote_code", False)
+    max_model_len = config.get("max_model_len", 8192)
+
+    model_args = (
+        f"pretrained={model_name},"
+        f"tensor_parallel_size={tp_size},"
+        f"add_bos_token=true,"
+        f"trust_remote_code={trust_remote_code},"
+        f"max_model_len={max_model_len}"
+    )
+    print(model_args)
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=tasks,
+        num_fewshot=config["num_fewshot"],
+        limit=config["limit"],
+        batch_size="auto",
+    )
+    print(results)
+
+
+def run_lm_eval(configs_dir: str, models: List[str], tasks: List[str]) -> None:
+    device_name = torch.cuda.get_device_name().lower()
+    device_count = torch.cuda.device_count()
+
+    for file in glob.glob(f"{configs_dir}/**/*.yml", recursive=True):
+        config = yaml.safe_load(file)
+        # Check the model name
+        model_name = config.get("model_name", "").lower()
+        if models and model_name not in models:
+            info(f"Skip {model_name} from {file}")
+            continue
+
+        tp_size = 0
+        selected_tasks = []
+
+        # Check the lm-eval tasks, the selected device, and tp
+        for t in config.get("tasks", []):
+            task_name = t["name"]
+            if not task_name:
+                warning(f"{model_name} from {file}: skip missing task")
+                continue
+
+            if tasks and task_name not in tasks:
+                info(f"{model_name} from {file}: {task_name} not selected")
+
+            selected_device = t["device"].lower()
+            if selected_device not in device_name:
+                continue
+
+            tp = t["tp"]
+            if device_count < tp:
+                warning(
+                    f"{model_name} from {file}: device count {device_count} < tp {tp} in {task_name}"
+                )
+                continue
+
+            selected_tasks.push(task_name)
+            if not tp_size:
+                tp_size = tp
+            assert tp_size == tp
+
+        if not selected_tasks:
+            info(f"Skip {model_name} from {file}: no task")
+            continue
+
+        run(model_name, selected_tasks, tp_size, config)
+
+
+def main() -> None:
+    args = parse_args()
+    models = [m.strip().lower() for m in args.models.split(",") if m.strip()]
+    tasks = [m.strip().lower() for m in args.runners.split(",") if m.strip()]
+    run_lm_eval(args.configs_dir, models, tasks)
+
+
+if __name__ == "__main__":
+    main()
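A note on this first version: each task in the YAML configs carries baseline `metrics` values, but `run_lm_eval` never compares the lm-eval results against them, and the `if tasks and task_name not in tasks` filter only logs without a `continue`, so unselected tasks still run when the device matches. A minimal baseline check could look like the sketch below; the result layout (`results["results"][task][metric]`) follows lm-eval's output dict, and `check_baselines` and `RTOL` are hypothetical names, not anything the patch defines.

    # Sketch only: compare lm-eval results against the YAML baselines.
    RTOL = 0.05  # hypothetical relative tolerance

    def check_baselines(results, config):
        failures = []
        for task in config.get("tasks", []):
            measured = results.get("results", {}).get(task["name"], {})
            for metric in task.get("metrics", []):
                baseline = metric["value"]
                if not baseline:  # value 0 means no baseline yet
                    continue
                value = measured.get(metric["name"])
                if value is not None and value < baseline * (1 - RTOL):
                    failures.append((task["name"], metric["name"], value, baseline))
        return failures

With the configs above, the script would be invoked as, for example, `python run_vllm_eval_harness.py --configs-dir cuda --models openai/gpt-oss-20b --tasks gsm8k` (model and task filters are matched lowercase).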
f"max_model_len={max_model_len}" + ) + print(model_args) + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=tasks, + num_fewshot=config["num_fewshot"], + limit=config["limit"], + batch_size="auto", + ) + print(results) + + +def run_lm_eval(configs_dir: str, models: List[str], tasks: List[str]) -> None: + device_name = torch.cuda.get_device_name().lower() + device_count = torch.cuda.device_count() + + for file in glob.glob(f"{configs_dir}/**/*.yml", recursive=True): + config = yaml.safe_load(file) + # Check the model name + model_name = config.get("model_name", "").lower() + if models and model_name not in models: + info(f"Skip {model_name} from {file}") + continue + + tp_size = 0 + selected_tasks = [] + + # Check the lm-eval tasks, the selected device, and tp + for t in config.get("tasks", []): + task_name = t["name"] + if not task_name: + warning(f"{model_name} from {file}: skip missing task") + continue + + if tasks and task_name not in tasks: + info(f"{model_name} from {file}: {task_name} not selected") + + selected_device = t["device"].lower() + if selected_device not in device_name: + continue + + tp = t["tp"] + if device_count < tp: + warning( + f"{model_name} from {file}: device count {device_count} < tp {tp} in {task_name}" + ) + continue + + selected_tasks.push(task_name) + if not tp_size: + tp_size = tp + assert tp_size == tp + + if not selected_tasks: + info(f"Skip {model_name} from {file}: no task") + continue + + run(model_name, selected_tasks, tp_size, config) + + +def main() -> None: + args = parse_args() + models = [m.strip().lower() for m in args.models.split(",") if m.strip()] + tasks = [m.strip().lower() for m in args.runners.split(",") if m.strip()] + run_lm_eval(args.configs_dir, models, tasks) + + +if __name__ == "__main__": + main() From 3e9212788fc72ec84260df77893da9a28af3ca10 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 6 Oct 2025 22:09:01 -0700 Subject: [PATCH 2/5] Minor tweak Signed-off-by: Huy Do --- vllm-eval-harness/run_vllm_eval_harness.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm-eval-harness/run_vllm_eval_harness.py b/vllm-eval-harness/run_vllm_eval_harness.py index c954cad8..623a51bd 100644 --- a/vllm-eval-harness/run_vllm_eval_harness.py +++ b/vllm-eval-harness/run_vllm_eval_harness.py @@ -79,7 +79,8 @@ def run_lm_eval(configs_dir: str, models: List[str], tasks: List[str]) -> None: device_count = torch.cuda.device_count() for file in glob.glob(f"{configs_dir}/**/*.yml", recursive=True): - config = yaml.safe_load(file) + with open(file) as f: + config = yaml.safe_load(f) # Check the model name model_name = config.get("model_name", "").lower() if models and model_name not in models: @@ -125,7 +126,7 @@ def run_lm_eval(configs_dir: str, models: List[str], tasks: List[str]) -> None: def main() -> None: args = parse_args() models = [m.strip().lower() for m in args.models.split(",") if m.strip()] - tasks = [m.strip().lower() for m in args.runners.split(",") if m.strip()] + tasks = [m.strip().lower() for m in args.tasks.split(",") if m.strip()] run_lm_eval(args.configs_dir, models, tasks) From 17cc82fadba813f92d3d21008f99e4cfa222899c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 6 Oct 2025 23:52:38 -0700 Subject: [PATCH 3/5] Add existing models Signed-off-by: Huy Do --- .../cuda/deepseek-ai/DeepSeek-R1.yml | 14 ++++ .../cuda/deepseek-ai/DeepSeek-V3.1.yml | 14 ++++ .../cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml | 14 ++++ .../cuda/google/gemma-3-27b-it.yml | 14 ++++ 
From 17cc82fadba813f92d3d21008f99e4cfa222899c Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Mon, 6 Oct 2025 23:52:38 -0700
Subject: [PATCH 3/5] Add existing models

Signed-off-by: Huy Do
---
 .../cuda/deepseek-ai/DeepSeek-R1.yml          | 14 ++++
 .../cuda/deepseek-ai/DeepSeek-V3.1.yml        | 14 ++++
 .../cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml    | 14 ++++
 .../cuda/google/gemma-3-27b-it.yml            | 14 ++++
 .../Llama-4-Scout-17B-16E-Instruct.yml        | 14 ++++
 .../cuda/openai/gpt-oss-120b.yml              | 14 ++++
 vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml | 14 ++++
 vllm-eval-harness/cuda/qwen/Qwen3-8B.yml      | 14 ++++
 vllm-eval-harness/run_vllm_eval_harness.py    | 66 +++++++++++++++++--
 9 files changed, 171 insertions(+), 7 deletions(-)
 create mode 100644 vllm-eval-harness/cuda/deepseek-ai/DeepSeek-R1.yml
 create mode 100644 vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.1.yml
 create mode 100644 vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml
 create mode 100644 vllm-eval-harness/cuda/google/gemma-3-27b-it.yml
 create mode 100644 vllm-eval-harness/cuda/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml
 create mode 100644 vllm-eval-harness/cuda/openai/gpt-oss-120b.yml
 create mode 100644 vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml
 create mode 100644 vllm-eval-harness/cuda/qwen/Qwen3-8B.yml

diff --git a/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-R1.yml b/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-R1.yml
new file mode 100644
index 00000000..c97b4227
--- /dev/null
+++ b/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-R1.yml
@@ -0,0 +1,14 @@
+model_name: "deepseek-ai/DeepSeek-R1"
+tasks:
+- name: "gsm8k"
+  device: b200
+  tp: 8
+  # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0
+  - name: "exact_match,flexible-extract"
+    value: 0
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
diff --git a/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.1.yml b/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.1.yml
new file mode 100644
index 00000000..833fae02
--- /dev/null
+++ b/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.1.yml
@@ -0,0 +1,14 @@
+model_name: "deepseek-ai/DeepSeek-V3.1"
+tasks:
+- name: "gsm8k"
+  device: b200
+  tp: 8
+  # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0
+  - name: "exact_match,flexible-extract"
+    value: 0
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
diff --git a/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml b/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml
new file mode 100644
index 00000000..38543069
--- /dev/null
+++ b/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml
@@ -0,0 +1,14 @@
+model_name: "deepseek-ai/DeepSeek-V3.2-Exp"
+tasks:
+- name: "gsm8k"
+  device: b200
+  tp: 8
+  # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0
+  - name: "exact_match,flexible-extract"
+    value: 0
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
diff --git a/vllm-eval-harness/cuda/google/gemma-3-27b-it.yml b/vllm-eval-harness/cuda/google/gemma-3-27b-it.yml
new file mode 100644
index 00000000..a9f51d88
--- /dev/null
+++ b/vllm-eval-harness/cuda/google/gemma-3-27b-it.yml
@@ -0,0 +1,14 @@
+model_name: "google/gemma-3-27b-it"
+tasks:
+- name: "gsm8k"
+  device: b200
+  tp: 8
+  # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0
+  - name: "exact_match,flexible-extract"
+    value: 0
+limit: 1000
+num_fewshot: 5
+trust_remote_code: True
diff --git a/vllm-eval-harness/cuda/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml b/vllm-eval-harness/cuda/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml
new file mode 100644
index 00000000..965486b2
--- /dev/null
+++ b/vllm-eval-harness/cuda/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml
@@ -0,0 +1,14 @@
+model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct" +tasks: +- name: "gsm8k" + device: b200 + tp: 4 + # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline + metrics: + - name: "exact_match,strict-match" + value: 0 + - name: "exact_match,flexible-extract" + value: 0 +limit: 1000 +num_fewshot: 5 +trust_remote_code: True diff --git a/vllm-eval-harness/cuda/openai/gpt-oss-120b.yml b/vllm-eval-harness/cuda/openai/gpt-oss-120b.yml new file mode 100644 index 00000000..3b328220 --- /dev/null +++ b/vllm-eval-harness/cuda/openai/gpt-oss-120b.yml @@ -0,0 +1,14 @@ +model_name: "openai/gpt-oss-120b" +tasks: +- name: "gsm8k" + device: b200 + tp: 8 + # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline + metrics: + - name: "exact_match,strict-match" + value: 0 + - name: "exact_match,flexible-extract" + value: 0 +limit: 1000 +num_fewshot: 5 +trust_remote_code: True diff --git a/vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml b/vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml new file mode 100644 index 00000000..d8c8a6ff --- /dev/null +++ b/vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml @@ -0,0 +1,14 @@ +model_name: "Qwen/Qwen3-30B-A3B" +tasks: +- name: "gsm8k" + device: b200 + tp: 8 + # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline + metrics: + - name: "exact_match,strict-match" + value: 0 + - name: "exact_match,flexible-extract" + value: 0 +limit: 1000 +num_fewshot: 5 +trust_remote_code: True diff --git a/vllm-eval-harness/cuda/qwen/Qwen3-8B.yml b/vllm-eval-harness/cuda/qwen/Qwen3-8B.yml new file mode 100644 index 00000000..18ac330a --- /dev/null +++ b/vllm-eval-harness/cuda/qwen/Qwen3-8B.yml @@ -0,0 +1,14 @@ +model_name: "Qwen/Qwen3-8B" +tasks: +- name: "gsm8k" + device: b200 + tp: 1 + # Adopted from vLLM lm-eval-harness, set the value to 0 if there is no baseline + metrics: + - name: "exact_match,strict-match" + value: 0 + - name: "exact_match,flexible-extract" + value: 0 +limit: 1000 +num_fewshot: 5 +trust_remote_code: True diff --git a/vllm-eval-harness/run_vllm_eval_harness.py b/vllm-eval-harness/run_vllm_eval_harness.py index 623a51bd..abd3bb9b 100644 --- a/vllm-eval-harness/run_vllm_eval_harness.py +++ b/vllm-eval-harness/run_vllm_eval_harness.py @@ -1,3 +1,4 @@ +import json import os import glob import lm_eval @@ -8,6 +9,10 @@ from typing import Dict, Any, List, Optional +# See lm-eval docs for the list of acceptable values +LM_EVAL_MODEL_SOURCE = os.environ.get("LM_EVAL_MODEL_SOURCE", "vllm") + + class ValidateDir(Action): def __call__( self, @@ -49,9 +54,43 @@ def parse_args() -> Any: return parser.parse_args() +def convert_to_pytorch_benchmark_format( + model_name: str, tp_size: int, results: Dict[str, Any] +) -> List[Any]: + records = [] + configs = results.get("configs", {}) + + for task_name, metrics in results.get("results", {}).items(): + for metric_name, metric_value in metrics.items(): + if type(metric_value) is str: + continue + + record = { + "benchmark": { + "name": "vLLM lm-eval harness", + "extra_info": { + "args": { + "tensor_parallel_size": tp_size, + }, + "configs": configs.get(task_name, {}), + }, + }, + "model": { + "name": model_name, + }, + "metric": { + "name": metric_name, + "benchmark_values": [metric_value], + }, + } + records.append(record) + + return records + + def run( model_name: str, tasks: List[str], tp_size: int, config: Dict[str, Any] -) -> None: +) -> Dict[str, Any]: trust_remote_code = config.get("trust_remote_code", False) max_model_len = config.get("max_model_len", 8192) @@ 
From ea9a61224ba465f40efcc17aee6c5022f9b0c46c Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Tue, 7 Oct 2025 00:00:08 -0700
Subject: [PATCH 4/5] Rename

Signed-off-by: Huy Do
---
 vllm-eval-harness/{cuda => configs}/deepseek-ai/DeepSeek-R1.yml   | 0
 vllm-eval-harness/{cuda => configs}/deepseek-ai/DeepSeek-V3.1.yml | 0
 .../{cuda => configs}/deepseek-ai/DeepSeek-V3.2-Exp.yml           | 0
 vllm-eval-harness/{cuda => configs}/google/gemma-3-27b-it.yml     | 0
 .../meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml         | 0
 .../meta-llama/Llama-4-Scout-17B-16E-Instruct.yml                 | 0
 vllm-eval-harness/{cuda => configs}/openai/gpt-oss-120b.yml       | 0
 vllm-eval-harness/{cuda => configs}/openai/gpt-oss-20b.yml        | 0
 vllm-eval-harness/{cuda => configs}/qwen/Qwen3-30B-A3B.yml        | 0
 vllm-eval-harness/{cuda => configs}/qwen/Qwen3-8B.yml             | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename vllm-eval-harness/{cuda => configs}/deepseek-ai/DeepSeek-R1.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/deepseek-ai/DeepSeek-V3.1.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/deepseek-ai/DeepSeek-V3.2-Exp.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/google/gemma-3-27b-it.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/openai/gpt-oss-120b.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/openai/gpt-oss-20b.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/qwen/Qwen3-30B-A3B.yml (100%)
 rename vllm-eval-harness/{cuda => configs}/qwen/Qwen3-8B.yml (100%)

diff --git 
a/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-R1.yml b/vllm-eval-harness/configs/deepseek-ai/DeepSeek-R1.yml
similarity index 100%
rename from vllm-eval-harness/cuda/deepseek-ai/DeepSeek-R1.yml
rename to vllm-eval-harness/configs/deepseek-ai/DeepSeek-R1.yml
diff --git a/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.1.yml b/vllm-eval-harness/configs/deepseek-ai/DeepSeek-V3.1.yml
similarity index 100%
rename from vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.1.yml
rename to vllm-eval-harness/configs/deepseek-ai/DeepSeek-V3.1.yml
diff --git a/vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml b/vllm-eval-harness/configs/deepseek-ai/DeepSeek-V3.2-Exp.yml
similarity index 100%
rename from vllm-eval-harness/cuda/deepseek-ai/DeepSeek-V3.2-Exp.yml
rename to vllm-eval-harness/configs/deepseek-ai/DeepSeek-V3.2-Exp.yml
diff --git a/vllm-eval-harness/cuda/google/gemma-3-27b-it.yml b/vllm-eval-harness/configs/google/gemma-3-27b-it.yml
similarity index 100%
rename from vllm-eval-harness/cuda/google/gemma-3-27b-it.yml
rename to vllm-eval-harness/configs/google/gemma-3-27b-it.yml
diff --git a/vllm-eval-harness/cuda/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml b/vllm-eval-harness/configs/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml
similarity index 100%
rename from vllm-eval-harness/cuda/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml
rename to vllm-eval-harness/configs/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8.yml
diff --git a/vllm-eval-harness/cuda/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml b/vllm-eval-harness/configs/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml
similarity index 100%
rename from vllm-eval-harness/cuda/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml
rename to vllm-eval-harness/configs/meta-llama/Llama-4-Scout-17B-16E-Instruct.yml
diff --git a/vllm-eval-harness/cuda/openai/gpt-oss-120b.yml b/vllm-eval-harness/configs/openai/gpt-oss-120b.yml
similarity index 100%
rename from vllm-eval-harness/cuda/openai/gpt-oss-120b.yml
rename to vllm-eval-harness/configs/openai/gpt-oss-120b.yml
diff --git a/vllm-eval-harness/cuda/openai/gpt-oss-20b.yml b/vllm-eval-harness/configs/openai/gpt-oss-20b.yml
similarity index 100%
rename from vllm-eval-harness/cuda/openai/gpt-oss-20b.yml
rename to vllm-eval-harness/configs/openai/gpt-oss-20b.yml
diff --git a/vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml b/vllm-eval-harness/configs/qwen/Qwen3-30B-A3B.yml
similarity index 100%
rename from vllm-eval-harness/cuda/qwen/Qwen3-30B-A3B.yml
rename to vllm-eval-harness/configs/qwen/Qwen3-30B-A3B.yml
diff --git a/vllm-eval-harness/cuda/qwen/Qwen3-8B.yml b/vllm-eval-harness/configs/qwen/Qwen3-8B.yml
similarity index 100%
rename from vllm-eval-harness/cuda/qwen/Qwen3-8B.yml
rename to vllm-eval-harness/configs/qwen/Qwen3-8B.yml

From 8177b05c507e46898f82fa5e1cdc2b124d00bd52 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 8 Oct 2025 09:57:39 -0700
Subject: [PATCH 5/5] Set max_model_len to auto

---
 vllm-eval-harness/run_vllm_eval_harness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm-eval-harness/run_vllm_eval_harness.py b/vllm-eval-harness/run_vllm_eval_harness.py
index abd3bb9b..3a2de086 100644
--- a/vllm-eval-harness/run_vllm_eval_harness.py
+++ b/vllm-eval-harness/run_vllm_eval_harness.py
@@ -92,7 +92,7 @@ def run(
     model_name: str, tasks: List[str], tp_size: int, config: Dict[str, Any]
 ) -> Dict[str, Any]:
     trust_remote_code = config.get("trust_remote_code", False)
-    max_model_len = config.get("max_model_len", 8192)
+    max_model_len = config.get("max_model_len", "auto")
 
     model_args = (
         f"pretrained={model_name},"