From fb4e1b284449b7e88a2668cb1d8e14850cc6d71e Mon Sep 17 00:00:00 2001 From: Chen Hany Date: Sun, 14 Jun 2026 20:48:58 +0000 Subject: [PATCH 1/4] Add Nemotron NVFP4 MTP TRTLLM SPEED-bench config Signed-off-by: Chen Hany --- .../specdec_bench_mtp_trtllm.yaml | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml new file mode 100644 index 00000000000..f786b516725 --- /dev/null +++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml @@ -0,0 +1,73 @@ +# SPEED-bench MTP speculative-decoding run for NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 via TRT-LLM. +# +# Nemotron-3-Super-120B-A12B is 120B total params (MoE; 12B active per +# token). BF16 weights = 240 GB total, so tp_size=4 minimum on 80 GB +# H100/A100. Size by total expert storage, not active params. 
+# +# Slurm run — cells override per-cell knobs via pipeline.task_N.args+=[...]: +# +# uv run slurm.py \ +# --yaml modules/Model-Optimizer/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml \ +# --yes detach=true \ +# pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep_name>/qualitative","--draft_length 7"] \ +# pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep_name>/throughput_32k","--num_requests 80","--draft_length 7"] + +job_name: NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4_specdec_bench_mtp_trtllm + +pipeline: + global_vars: + hf_model: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 + + # task_0: SPEED qualitative split + task_0: + script: common/specdec_bench/run.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative + - --engine TRTLLM + - --speculative_algorithm MTP + - --draft_length 3 + - --tp_size 4 + - --ep_size 1 + - --concurrency 32 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/{sweep_name_default}/qualitative + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 + + # task_1: SPEED throughput_32k split + task_1: + script: common/specdec_bench/run.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k + - --engine TRTLLM + - --speculative_algorithm MTP + - --draft_length 3 + - --tp_size 4 + - --ep_size 1 + - --concurrency 8 + - --num_requests 80 + - --runtime_params common/specdec_bench/runtime_params_throughput_32k.yaml + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/{sweep_name_default}/throughput_32k + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: 
"slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 From ee206ed3c4fdaf44c63f2db95b4508c1ab864290 Mon Sep 17 00:00:00 2001 From: Chen Hany Date: Sun, 14 Jun 2026 22:00:59 +0000 Subject: [PATCH 2/4] Use TRTLLM launcher for specbench MTP config Signed-off-by: Chen Hany --- .../specdec_bench_mtp_trtllm.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml index f786b516725..6792511c7be 100644 --- a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml +++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml @@ -20,7 +20,7 @@ pipeline: # task_0: SPEED qualitative split task_0: - script: common/specdec_bench/run.sh + script: common/specdec_bench/quick_check.sh args: - --dataset speed - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative @@ -37,16 +37,17 @@ pipeline: environment: - HF_MODEL_CKPT: <<global_vars.hf_model>> - HF_LOCAL: /hf-local + - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch slurm_config: _factory_: "slurm_factory" nodes: 1 - ntasks_per_node: 1 + ntasks_per_node: 4 gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 # task_1: SPEED throughput_32k split task_1: - script: common/specdec_bench/run.sh + script: common/specdec_bench/quick_check.sh args: - --dataset speed - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k @@ -65,9 +66,10 @@ pipeline: environment: - HF_MODEL_CKPT: <<global_vars.hf_model>> - HF_LOCAL: /hf-local + - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch slurm_config: _factory_: "slurm_factory" nodes: 1 - ntasks_per_node: 1 + ntasks_per_node: 4 gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 From 
1bce81b233519eb0642b676cf7b2336fcc8a2a69 Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Mon, 15 Jun 2026 00:44:16 +0000 Subject: [PATCH 3/4] Wire TRTLLM max_num_tokens CLI override Signed-off-by: Pensieve Intern --- examples/specdec_bench/run.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/specdec_bench/run.py b/examples/specdec_bench/run.py index ca2f9908966..c2dc3a34907 100644 --- a/examples/specdec_bench/run.py +++ b/examples/specdec_bench/run.py @@ -178,6 +178,10 @@ def run_simple(args): f"or extend _MAX_SEQ_LEN_KEY in run.py." ) engine_args[key] = args.max_seq_len + if args.max_num_tokens is not None: + if args.engine != "TRTLLM": + raise ValueError("--max_num_tokens is currently only wired for --engine TRTLLM.") + engine_args["max_num_tokens"] = args.max_num_tokens sampling_kwargs = args.runtime_params.get("sampling_kwargs", {"temperature": 0}) if args.temperature is not None: sampling_kwargs["temperature"] = args.temperature @@ -349,6 +353,16 @@ def run_simple(args): "throughput_32k split (32K input + 4K output + 4K headroom)." ), ) + parser.add_argument( + "--max_num_tokens", + type=int, + required=False, + default=None, + help=( + "TRT-LLM max batched tokens. Overrides engine_args.max_num_tokens " + "from --runtime_params for --engine TRTLLM." 
+ ), + ) parser.add_argument( "--output_length", type=int, required=False, default=4096, help="Output length" ) From cff6e3e64b6c28da08f8e7bd064458c2a1345789 Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Mon, 15 Jun 2026 00:44:39 +0000 Subject: [PATCH 4/4] Restore TRTLLM throughput runtime params path Signed-off-by: Pensieve Intern --- .../specdec_bench_mtp_trtllm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml index 6792511c7be..dee3d6fe8c4 100644 --- a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml +++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml @@ -58,7 +58,6 @@ pipeline: - --ep_size 1 - --concurrency 8 - --num_requests 80 - - --runtime_params common/specdec_bench/runtime_params_throughput_32k.yaml - --output_length 4096 - --aa_timing - --show_progress