From fb4e1b284449b7e88a2668cb1d8e14850cc6d71e Mon Sep 17 00:00:00 2001
From: Chen Hany <chenhany@nvidia.com>
Date: Sun, 14 Jun 2026 20:48:58 +0000
Subject: [PATCH 1/4] Add Nemotron NVFP4 MTP TRTLLM SPEED-bench config

Signed-off-by: Chen Hany <chenhany@nvidia.com>
---
 .../specdec_bench_mtp_trtllm.yaml             | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
new file mode 100644
index 00000000000..f786b516725
--- /dev/null
+++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
@@ -0,0 +1,73 @@
+# SPEED-bench MTP speculative-decoding run for NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 via TRT-LLM.
+#
+# Nemotron-3-Super-120B-A12B is 120B total params (MoE; 12B active per token).
+# Size by total expert storage, not active params: in BF16 that is ~240 GB, i.e.
+# tp_size=4 minimum on 80 GB H100/A100. This NVFP4 checkpoint is smaller; tp_size=4 is kept.
+#
+# Slurm run — cells override per-cell knobs via pipeline.task_N.args+=[...]:
+#
+#   uv run slurm.py \
+#     --yaml modules/Model-Optimizer/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml \
+#     --yes detach=true \
+#     pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/qualitative","--draft_length 7"] \
+#     pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/throughput_32k","--num_requests 80","--draft_length 7"]
+
+job_name: NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4_specdec_bench_mtp_trtllm
+
+pipeline:
+  global_vars:
+    hf_model: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
+
+  # task_0: SPEED qualitative split
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine TRTLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 4
+      - --ep_size 1
+      - --concurrency 32
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/qualitative
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # task_1: SPEED throughput_32k split
+  task_1:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine TRTLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 4
+      - --ep_size 1
+      - --concurrency 8
+      - --num_requests 80
+      - --runtime_params common/specdec_bench/runtime_params_throughput_32k.yaml
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/throughput_32k
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

From ee206ed3c4fdaf44c63f2db95b4508c1ab864290 Mon Sep 17 00:00:00 2001
From: Chen Hany <chenhany@nvidia.com>
Date: Sun, 14 Jun 2026 22:00:59 +0000
Subject: [PATCH 2/4] Use TRTLLM launcher for specbench MTP config

Signed-off-by: Chen Hany <chenhany@nvidia.com>
---
 .../specdec_bench_mtp_trtllm.yaml                      | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
index f786b516725..6792511c7be 100644
--- a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
+++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
@@ -20,7 +20,7 @@ pipeline:
 
   # task_0: SPEED qualitative split
   task_0:
-    script: common/specdec_bench/run.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --dataset speed
       - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
@@ -37,16 +37,17 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
       - HF_LOCAL: /hf-local
+      - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 1
+      ntasks_per_node: 4
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # task_1: SPEED throughput_32k split
   task_1:
-    script: common/specdec_bench/run.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --dataset speed
       - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
@@ -65,9 +66,10 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
       - HF_LOCAL: /hf-local
+      - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 1
+      ntasks_per_node: 4
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

From 1bce81b233519eb0642b676cf7b2336fcc8a2a69 Mon Sep 17 00:00:00 2001
From: Pensieve Intern <chenhany@nvidia.com>
Date: Mon, 15 Jun 2026 00:44:16 +0000
Subject: [PATCH 3/4] Wire TRTLLM max_num_tokens CLI override

Signed-off-by: Pensieve Intern <chenhany@nvidia.com>
---
 examples/specdec_bench/run.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/examples/specdec_bench/run.py b/examples/specdec_bench/run.py
index ca2f9908966..c2dc3a34907 100644
--- a/examples/specdec_bench/run.py
+++ b/examples/specdec_bench/run.py
@@ -178,6 +178,10 @@ def run_simple(args):
                 f"or extend _MAX_SEQ_LEN_KEY in run.py."
             )
         engine_args[key] = args.max_seq_len
+    if args.max_num_tokens is not None:
+        if args.engine != "TRTLLM":
+            raise ValueError("--max_num_tokens is currently only wired for --engine TRTLLM.")
+        engine_args["max_num_tokens"] = args.max_num_tokens
     sampling_kwargs = args.runtime_params.get("sampling_kwargs", {"temperature": 0})
     if args.temperature is not None:
         sampling_kwargs["temperature"] = args.temperature
@@ -349,6 +353,16 @@ def run_simple(args):
             "throughput_32k split (32K input + 4K output + 4K headroom)."
         ),
     )
+    parser.add_argument(
+        "--max_num_tokens",
+        type=int,
+        required=False,
+        default=None,
+        help=(
+            "TRT-LLM max batched tokens. Overrides engine_args.max_num_tokens "
+            "from --runtime_params for --engine TRTLLM."
+        ),
+    )
     parser.add_argument(
         "--output_length", type=int, required=False, default=4096, help="Output length"
     )

From cff6e3e64b6c28da08f8e7bd064458c2a1345789 Mon Sep 17 00:00:00 2001
From: Pensieve Intern <chenhany@nvidia.com>
Date: Mon, 15 Jun 2026 00:44:39 +0000
Subject: [PATCH 4/4] Drop --runtime_params from TRTLLM throughput_32k task

Signed-off-by: Pensieve Intern <chenhany@nvidia.com>
---
 .../specdec_bench_mtp_trtllm.yaml                                | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
index 6792511c7be..dee3d6fe8c4 100644
--- a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
+++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml
@@ -58,7 +58,6 @@ pipeline:
       - --ep_size 1
       - --concurrency 8
       - --num_requests 80
-      - --runtime_params common/specdec_bench/runtime_params_throughput_32k.yaml
       - --output_length 4096
       - --aa_timing
       - --show_progress