diff --git a/examples/specdec_bench/run.py b/examples/specdec_bench/run.py index ca2f9908966..c2dc3a34907 100644 --- a/examples/specdec_bench/run.py +++ b/examples/specdec_bench/run.py @@ -178,6 +178,10 @@ def run_simple(args): f"or extend _MAX_SEQ_LEN_KEY in run.py." ) engine_args[key] = args.max_seq_len + if args.max_num_tokens is not None: + if args.engine != "TRTLLM": + raise ValueError("--max_num_tokens is currently only wired for --engine TRTLLM.") + engine_args["max_num_tokens"] = args.max_num_tokens sampling_kwargs = args.runtime_params.get("sampling_kwargs", {"temperature": 0}) if args.temperature is not None: sampling_kwargs["temperature"] = args.temperature @@ -349,6 +353,16 @@ def run_simple(args): "throughput_32k split (32K input + 4K output + 4K headroom)." ), ) + parser.add_argument( + "--max_num_tokens", + type=int, + required=False, + default=None, + help=( + "TRT-LLM max batched tokens. Overrides engine_args.max_num_tokens " + "from --runtime_params for --engine TRTLLM." + ), + ) parser.add_argument( "--output_length", type=int, required=False, default=4096, help="Output length" ) diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml new file mode 100644 index 00000000000..dee3d6fe8c4 --- /dev/null +++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml @@ -0,0 +1,74 @@ +# SPEED-bench MTP speculative-decoding run for NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 via TRT-LLM. +# +# Nemotron-3-Super-120B-A12B is 120B total params (MoE; 12B active per +# token). BF16 weights = 240 GB total, so tp_size=4 minimum on 80 GB +# H100/A100. Size by total expert storage, not active params. 
+# +# Slurm run — cells override per-cell knobs via pipeline.task_N.args+=[...]: +# +# uv run slurm.py \ +# --yaml modules/Model-Optimizer/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/specdec_bench_mtp_trtllm.yaml \ +# --yes detach=true \ +# pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep_name>/qualitative","--draft_length 7"] \ +# pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep_name>/throughput_32k","--num_requests 80","--draft_length 7"] + +job_name: NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4_specdec_bench_mtp_trtllm + +pipeline: + global_vars: + hf_model: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 + + # task_0: SPEED qualitative split + task_0: + script: common/specdec_bench/quick_check.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative + - --engine TRTLLM + - --speculative_algorithm MTP + - --draft_length 3 + - --tp_size 4 + - --ep_size 1 + - --concurrency 32 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/{sweep_name_default}/qualitative + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 + + # task_1: SPEED throughput_32k split + task_1: + script: common/specdec_bench/quick_check.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k + - --engine TRTLLM + - --speculative_algorithm MTP + - --draft_length 3 + - --tp_size 4 + - --ep_size 1 + - --concurrency 8 + - --num_requests 80 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/{sweep_name_default}/throughput_32k + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch + 
slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10