diff --git a/Makefile b/Makefile
index 8685d171be..69b9d5c7a0 100644
--- a/Makefile
+++ b/Makefile
@@ -28,6 +28,14 @@ all: core jvm
 
 core:
 	cd native && cargo build $(FEATURES_ARG)
+# Build the single libcomet cdylib WITH the default-off `ballista` feature, so the
+# in-process Ballista offload (and its NativeBallista JNI entries) is folded into
+# libcomet. Required before running the org.apache.comet.ballista offload suites.
+# The default `core` target stays Ballista-free.
+# Declared phony: the target names a command, not a file, so a stray file or
+# directory called `core-ballista` must never make this build look up to date.
+.PHONY: core-ballista
+core-ballista:
+	cd native && cargo build --features ballista $(FEATURES_ARG)
 test-rust:
 	# We need to compile CometException so that the cargo test can pass
 	./mvnw compile -pl common -DskipTests $(PROFILES)
diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md
new file mode 100644
index 0000000000..fb07445658
--- /dev/null
+++ b/docs/source/contributor-guide/ballista_execution.md
@@ -0,0 +1,163 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Experimental: Native Execution on Apache DataFusion Ballista
+
+> **Status: experimental / research.** This is an in-progress exploration, not a supported
+> feature. Design discussion is tracked in
+> [issue #4796](https://github.com/apache/datafusion-comet/issues/4796) and the initial code
+> in draft PR [#4800](https://github.com/apache/datafusion-comet/pull/4800). Interfaces,
+> configuration, and behavior described here are subject to change.
+
+## Overview
+
+Today Comet accelerates Spark by running native operators **inside Spark executors** via JNI.
+This page describes an additional, optional deployment mode being prototyped: running Comet's
+native operators on a distributed **Apache DataFusion Ballista** data plane instead of inside
+Spark executors.
+
+In this mode the Spark cluster acts as a lightweight **control plane** — the driver plans the
+query and hands it off — while all computation happens on Ballista. The existing in-Spark
+accelerator is unchanged; this is purely additive.
+
+The feature is **not tied to Spark Connect**. The native side only consumes a serialized Comet
+plan; it does not care how the plan was produced. Because Comet's `QueryPlanSerde` emits that
+plan from *any* Spark physical plan, the same mechanism supports whole-query offload from a
+regular Spark application as well as from a Spark Connect client. Execution is **all-or-nothing**:
+a query offloaded to Ballista runs entirely on Ballista and its result terminates at the driver
+(or is written out by Ballista). Interleaving Ballista execution with Spark's own distributed
+execution within a single job is out of scope.
+
+## Architecture
+
+```
+  Spark app / Spark Connect client
+              │
+              ▼
+  ┌─────────────────────────────┐   CONTROL PLANE (Spark driver only)
+  │ Spark driver                │   • Catalyst + Comet driver-side plan rules
+  │  + Comet plan rules         │   • serialize the Comet Operator protobuf
+  │  + Ballista client          │   • submit to Ballista; collect results here
+  └──────────────┬──────────────┘   • NO Spark executor tasks run
+                 │  Comet plan protobuf
+                 ▼
+  ┌─────────────────────────────┐   DATA PLANE (Ballista, fully native)
+  │ Ballista scheduler          │   • splits into stages / owns shuffle
+  │ Ballista executors ×N       │   • rebuild the Comet plan over datafusion-ffi
+  │   (Comet-flavored)          │   • run Comet operators + expressions
+  └─────────────────────────────┘
+```
+
+Two design choices make this mostly integration rather than new invention:
+
+- **`datafusion-ffi` boundary, not co-linking.** Comet and Ballista track DataFusion on
+  independent schedules, so their crates are **not** linked together. Comet exposes a native
+  plan (built by its own `PhysicalPlanner`) as an `FFI_ExecutionPlan`; the Ballista side consumes
+  it as a `ForeignExecutionPlan`. They share only a stable C ABI (compatible within a DataFusion
+  major version). This means Comet's `planner.rs`, operators, and expressions are reused as-is,
+  with no reimplementation of plan translation.
+- **Driver-side offload.** Comet's driver-side rules already build a root `CometNativeExec` that
+  holds the whole-query serialized plan. In Ballista mode the driver submits that plan to Ballista
+  and returns results at the driver, instead of dispatching an RDD job to Spark executors.
+
+## Components
+
+**Rust** (`native/`):
+- `comet_ffi_plan_from_proto` (`datafusion-comet` core) — decodes a Comet `Operator` proto, builds
+  the plan with the existing `PhysicalPlanner`, returns an `FFI_ExecutionPlan`.
+- `execution::ballista` module (`datafusion-comet` core, gated behind the default-off `ballista`
+  Cargo feature — built with `cargo build --features ballista` / `make core-ballista`, so it is
+  folded into the single `libcomet` cdylib rather than a separate library):
+  - `CometScanExec` — a serializable DataFusion leaf that rebuilds the plan over FFI at execute time.
+  - `CometPhysicalCodec` / `CometLogicalCodec` — extension codecs that compose with Ballista's own
+    (delegating non-Comet nodes) so Comet plans can be shipped to Ballista executors.
+  - `CometTableProvider` — exposes a Comet plan to Ballista as a table.
+  - the `Java_org_apache_comet_ballista_NativeBallista_*` JNI entries (driver-side submission).
+
+**JVM** (`spark/`):
+- Driver-side offload hook and configuration (see below).
+
+## Configuration (planned / experimental)
+
+| Config | Default | Description |
+| --- | --- | --- |
+| `spark.comet.exec.ballista.enabled` | `false` | Offload Comet plans to Ballista at the driver instead of executing in Spark executors. |
+| `spark.comet.exec.ballista.scheduler.url` | _(unset)_ | External Ballista scheduler to submit to. When unset, an in-process Ballista engine is used. |
+
+## Roadmap
+
+Legend: ✅ done · ◐ partially done · 🔨 in progress · ⬜ planned
+
+- ✅ **Rust core** — FFI plan export + gated `execution::ballista` module in `datafusion-comet`
+  (`CometScanExec`, composed codecs, `CometTableProvider`) folded into `libcomet` behind the
+  default-off `ballista` feature, with codec round-trip and standalone distributed tests.
+- 🔨 **R1 — driver-side offload (single-stage).** A Spark app runs a query with
+  `spark.comet.exec.ballista.enabled=true`; the driver submits the whole Comet plan to Ballista and
+  returns results, with zero Spark-executor tasks. First target query: TPC-H Q1.
+  - ✅ R1-T1 — JVM → native → in-process Ballista → JVM Arrow round-trip (spike).
+  - ✅ R1-T2 — config flag + driver `executeCollect` override.
+  - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2.
+  - ✅ R1-T4 (R1b) — **external cluster:** a distributed Comet plan submitted to a separate
+    `comet-scheduler` process runs on a separate, **JVM-less** `comet-executor`
+    process and returns correct results (no Ballista change needed — the config `override_*_codec`
+    fields already exist). Config: `spark.comet.exec.ballista.scheduler.url`. Proven at two layers:
+    the Rust harness (`ballista_external_cluster.rs`, a `NativeScan`→shuffle→`Filter` plan), and — via
+    `CometBallistaExternalClusterQ1Suite` (`-Pspark-4.0`, feature-built `libcomet` + binaries) — a
+    **live Spark driver** offloading **full TPC-H Q1's aggregate** to spawned `comet-scheduler` +
+    `comet-executor` child processes, results matching Spark row-for-row (incl. decimal scale) with 0
+    Spark-executor tasks. The full agg fragment (partial-agg `NativeScan` leaf → hash shuffle →
+    final-agg over a `Scan` leaf) runs on the separate JVM-less executor process without a `JAVA_VM`
+    panic.
+- 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results — **full TPC-H Q1's aggregate now runs distributed on Ballista and matches Spark.**
+  - ✅ R2-T1 (Ballista) — accept a pre-built physical plan for distribution (a `physical_plan` submission variant; its own Ballista branch/PR).
+  - ✅ R2-T2 (Comet native) — feed a `ScanExec` leaf from a native `RecordBatchStream` (not only a JVM input).
+  - ✅ R2-T3 (`comet-ballista`) — `CometFragmentExec`: a Comet fragment whose `Scan` leaf is fed by DataFusion child streams.
+  - ✅ R2-T4 — 2-block `count(*)` single-key distributes across the shuffle; results match Spark, 0 executor tasks.
+  - ✅ R2-T5 — **full TPC-H Q1 aggregate distributed** (no `ORDER BY`): `sum`×4, `avg`×3, `count` over decimals grouped by two keys (`l_returnflag`, `l_linestatus`). `avg`'s partial (sum + count) state and decimal partial sums round-trip through Ballista's Arrow IPC shuffle and compose in the Comet final aggregate; results match Spark's own Q1 row-for-row (incl. decimal scale), 0 executor tasks.
+  - ⬜ N-block generalization (a trailing `ORDER BY` / range exchange is a third stage — still out of scope).
+- ⬜ **JVM-free executor.** Feature-gate the JNI bridge so the native execution crates build without
+  `libjvm`, enabling a standalone Ballista executor binary.
+- ⬜ **Multi-partition scans.** Map a scan's file groups to multiple partitions (currently a
+  `NativeScan` proto encodes a single partition).
+- ⬜ **Wider coverage.** Broaden operator/expression coverage; capture real plans from the JVM
+  `QueryPlanSerde` for validation.
+- ⬜ **Spark Connect front-end.** Package the driver as a Spark Connect server for unchanged clients.
+
+## Known limitations
+
+- R1 is single-stage only and rejects plans containing an exchange; multi-stage distribution is limited to the 2-block shape tracked under R2.
+- Scans are single-partition today.
+- Queries with dynamic partition pruning or correlated scalar subqueries may still launch Spark
+  executor tasks to resolve those inputs, even in Ballista mode.
+- The FFI boundary requires Comet and Ballista to be built against the same DataFusion **major**
+  version.
+- Comet core links the JNI bridge, so `libjvm` must be present at runtime even where JNI is unused.
+- The offload is folded into the single `libcomet` cdylib behind the default-off `ballista` Cargo
+  feature (there is no separate `comet-ballista` cdylib / second copy of Comet core), so a
+  Comet-on-executor query and an in-process Ballista offload share one `JAVA_VM` and coexist in the
+  same JVM without the "JAVA_VM not initialized" panic. Building with the feature is required for
+  the offload entries (`make core-ballista`); the default build stays Ballista-free.
+- A trailing `ORDER BY` (range exchange) would make Q1's final sort a third stage, which is outside
+  the current 2-block scope; sort on the driver, or wait for the N-block generalization.
+
+## References
+
+- Proposal and discussion: [issue #4796](https://github.com/apache/datafusion-comet/issues/4796)
+- Prototype code: draft PR [#4800](https://github.com/apache/datafusion-comet/pull/4800)
+- [Arrow FFI Usage in Comet](ffi.md)
diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
index 5236d03b9d..95a9c1e85e 100644
--- a/docs/source/contributor-guide/index.md
+++ b/docs/source/contributor-guide/index.md
@@ -52,6 +52,14 @@ ANSI Error Propagation <sql_error_propagation>
 S3 Credential Provider Design <s3-credential-provider-design>
 ```
 
+```{toctree}
+:maxdepth: 2
+:caption: Experimental
+:hidden:
+
+Native Execution on Ballista <ballista_execution>
+```
+
 ```{toctree}
 :maxdepth: 2
 :caption: Adding Functionality
diff --git a/native/Cargo.lock b/native/Cargo.lock
index adb764fbfb..136c9b8e8a 100644
--- a/native/Cargo.lock
+++ b/native/Cargo.lock
@@ -203,6 +203,15 @@ dependencies = [
  "zstd",
 ]
 
+[[package]]
+name = "ar_archive_writer"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348"
+dependencies = [
+ "object",
+]
+
 [[package]]
 name = "arc-swap"
 version = "1.9.1"
@@ -346,6 +355,34 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "arrow-flight"
+version = "58.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28abfe8bf9f124e5fc83b334af4fa58f8d0323ad25312ccb2d1da50178415704"
+dependencies = [
+ "arrow-arith",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-cast",
+ "arrow-data",
+ "arrow-ipc",
+ "arrow-ord",
+ "arrow-row",
+ "arrow-schema",
+ "arrow-select",
+ "arrow-string",
+ "base64",
+ "bytes",
+ "futures",
+ "once_cell",
+ "paste",
+ "prost",
+ "prost-types",
+ "tonic",
+ "tonic-prost",
+]
+
 [[package]]
 name = "arrow-ipc"
 version = "58.3.0"
@@ -520,6 +557,12 @@ dependencies = [
  "slab",
 ]
 
+[[package]]
+name = "async-ffi"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50"
+
 [[package]]
 name = "async-global-executor"
 version = "2.4.1"
@@ -1015,6 +1058,58 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "axum"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90"
+dependencies = [
+ "axum-core",
+ "bytes",
+ "form_urlencoded",
+ "futures-util",
+ "http 1.4.2",
+ "http-body 1.0.1",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "serde_core",
+ "serde_json",
+ "serde_path_to_error",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http 1.4.2",
+ "http-body 1.0.1",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "backon"
 version = "1.6.0"
@@ -1041,6 +1136,123 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "ballista"
+version = "53.0.0"
+source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca"
+dependencies = [
+ "async-trait",
+ "ballista-core",
+ "ballista-executor",
+ "ballista-scheduler",
+ "datafusion",
+ "log",
+ "tokio",
+ "url",
+]
+
+[[package]]
+name = "ballista-core"
+version = "53.0.0"
+source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca"
+dependencies = [
+ "arrow-flight",
+ "async-trait",
+ "aws-config",
+ "aws-credential-types",
+ "chrono",
+ "clap",
+ "datafusion",
+ "datafusion-proto",
+ "datafusion-proto-common",
+ "futures",
+ "itertools 0.15.0",
+ "log",
+ "md-5 0.11.0",
+ "object_store",
+ "parking_lot",
+ "prost",
+ "prost-types",
+ "rand 0.10.1",
+ "rustc_version",
+ "serde",
+ "tokio",
+ "tokio-stream",
+ "tonic",
+ "tonic-build",
+ "tonic-prost",
+ "tonic-prost-build",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "ballista-executor"
+version = "53.0.0"
+source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca"
+dependencies = [
+ "arrow",
+ "arrow-flight",
+ "async-trait",
+ "ballista-core",
+ "bytesize",
+ "clap",
+ "dashmap",
+ "datafusion",
+ "datafusion-proto",
+ "futures",
+ "libc",
+ "log",
+ "memory-stats",
+ "mimalloc",
+ "parking_lot",
+ "serde",
+ "sysinfo",
+ "tempfile",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tonic",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+ "uuid",
+]
+
+[[package]]
+name = "ballista-scheduler"
+version = "53.0.0"
+source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca"
+dependencies = [
+ "arrow-flight",
+ "async-trait",
+ "axum",
+ "ballista-core",
+ "clap",
+ "dashmap",
+ "datafusion",
+ "datafusion-proto",
+ "futures",
+ "http 1.4.2",
+ "insta",
+ "itertools 0.15.0",
+ "log",
+ "object_store",
+ "parking_lot",
+ "prost",
+ "prost-types",
+ "rand 0.10.1",
+ "serde",
+ "tokio",
+ "tokio-stream",
+ "tonic",
+ "tower-http 0.7.0",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+ "uuid",
+]
+
 [[package]]
 name = "base64"
 version = "0.22.1"
@@ -1299,6 +1511,12 @@ dependencies = [
  "either",
 ]
 
+[[package]]
+name = "bytesize"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49e78e506b9d7633710dab98996f22f95f3d0f488e8f1aa162830556ed9fc14d"
+
 [[package]]
 name = "bzip2"
 version = "0.6.1"
@@ -1442,7 +1660,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
 dependencies = [
  "glob",
  "libc",
- "libloading",
+ "libloading 0.8.9",
 ]
 
 [[package]]
@@ -1556,6 +1774,17 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "console"
+version = "0.16.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fe5f465a4f6fee88fad41b85d990f84c835335e85b5d9e6e63e0d06d28cba7c"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "const-oid"
 version = "0.9.6"
@@ -1891,6 +2120,7 @@ dependencies = [
  "arrow",
  "arrow-schema",
  "async-trait",
+ "bzip2",
  "chrono",
  "datafusion-catalog",
  "datafusion-catalog-listing",
@@ -1917,9 +2147,11 @@ dependencies = [
  "datafusion-physical-plan",
  "datafusion-session",
  "datafusion-sql",
+ "flate2",
  "futures",
  "indexmap 2.14.0",
  "itertools 0.14.0",
+ "liblzma",
  "log",
  "object_store",
  "parking_lot",
@@ -1929,6 +2161,7 @@ dependencies = [
  "tokio",
  "url",
  "uuid",
+ "zstd",
 ]
 
 [[package]]
@@ -1983,11 +2216,16 @@ dependencies = [
 name = "datafusion-comet"
 version = "1.0.0"
 dependencies = [
+ "anyhow",
  "arrow",
  "assertables",
  "async-trait",
  "aws-config",
  "aws-credential-types",
+ "ballista",
+ "ballista-core",
+ "ballista-executor",
+ "ballista-scheduler",
  "criterion",
  "datafusion",
  "datafusion-comet-common",
@@ -1997,8 +2235,10 @@ dependencies = [
  "datafusion-comet-shuffle",
  "datafusion-comet-spark-expr",
  "datafusion-datasource",
+ "datafusion-ffi",
  "datafusion-functions-nested",
  "datafusion-physical-expr-adapter",
+ "datafusion-proto",
  "datafusion-spark",
  "futures",
  "hdfs-sys",
@@ -2173,6 +2413,7 @@ dependencies = [
  "log",
  "object_store",
  "parquet",
+ "recursive",
  "sqlparser",
  "tokio",
  "uuid",
@@ -2374,6 +2615,7 @@ dependencies = [
  "datafusion-physical-expr-common",
  "indexmap 2.14.0",
  "itertools 0.14.0",
+ "recursive",
  "serde_json",
  "sqlparser",
 ]
@@ -2390,6 +2632,39 @@ dependencies = [
  "itertools 0.14.0",
 ]
 
+[[package]]
+name = "datafusion-ffi"
+version = "54.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5660e8fa79fd51e29ce46f3026b67317ef738ebd633e106beb1a1907a406152"
+dependencies = [
+ "arrow",
+ "arrow-schema",
+ "async-ffi",
+ "async-trait",
+ "chrono",
+ "datafusion-catalog",
+ "datafusion-common",
+ "datafusion-datasource",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-functions-aggregate-common",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-optimizer",
+ "datafusion-physical-plan",
+ "datafusion-proto",
+ "datafusion-proto-common",
+ "datafusion-session",
+ "futures",
+ "libloading 0.9.0",
+ "log",
+ "prost",
+ "semver",
+ "stabby",
+ "tokio",
+]
+
 [[package]]
 name = "datafusion-functions"
 version = "54.0.0"
@@ -2549,6 +2824,7 @@ dependencies = [
  "indexmap 2.14.0",
  "itertools 0.14.0",
  "log",
+ "recursive",
  "regex",
  "regex-syntax",
 ]
@@ -2571,6 +2847,7 @@ dependencies = [
  "itertools 0.14.0",
  "parking_lot",
  "petgraph",
+ "recursive",
  "tokio",
 ]
 
@@ -2622,6 +2899,7 @@ dependencies = [
  "datafusion-physical-plan",
  "datafusion-pruning",
  "itertools 0.14.0",
+ "recursive",
 ]
 
 [[package]]
@@ -2657,6 +2935,44 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "datafusion-proto"
+version = "54.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd15a1ba5d3af93808241065c6c44dbca8296a189845e8a587c45c07bf0ffae"
+dependencies = [
+ "arrow",
+ "chrono",
+ "datafusion-catalog",
+ "datafusion-catalog-listing",
+ "datafusion-common",
+ "datafusion-datasource",
+ "datafusion-datasource-arrow",
+ "datafusion-datasource-csv",
+ "datafusion-datasource-json",
+ "datafusion-datasource-parquet",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-functions-table",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-plan",
+ "datafusion-proto-common",
+ "object_store",
+ "prost",
+]
+
+[[package]]
+name = "datafusion-proto-common"
+version = "54.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90042982cf9462eb06a0b81f92efa4188dae871e7ea3ab8dc61aa9c9349b2530"
+dependencies = [
+ "arrow",
+ "datafusion-common",
+ "prost",
+]
+
 [[package]]
 name = "datafusion-pruning"
 version = "54.0.0"
@@ -2731,6 +3047,7 @@ dependencies = [
  "datafusion-functions-nested",
  "indexmap 2.14.0",
  "log",
+ "recursive",
  "regex",
  "sqlparser",
 ]
@@ -2923,6 +3240,12 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
 
+[[package]]
+name = "encode_unicode"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+
 [[package]]
 name = "equator"
 version = "0.4.2"
@@ -3494,6 +3817,12 @@ version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
 
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
 [[package]]
 name = "humantime"
 version = "2.3.0"
@@ -3523,6 +3852,7 @@ dependencies = [
  "http 1.4.2",
  "http-body 1.0.1",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "smallvec",
@@ -3546,6 +3876,19 @@ dependencies = [
  "tower-service",
 ]
 
+[[package]]
+name = "hyper-timeout"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
+dependencies = [
+ "hyper",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+]
+
 [[package]]
 name = "hyper-util"
 version = "0.1.20"
@@ -3828,6 +4171,18 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "insta"
+version = "1.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86f0f8fee8c926415c58d6ae43a08523a26faccb2323f5e6b644fe7dd4ef6b82"
+dependencies = [
+ "console",
+ "once_cell",
+ "similar",
+ "tempfile",
+]
+
 [[package]]
 name = "integer-encoding"
 version = "3.0.4"
@@ -3979,7 +4334,7 @@ dependencies = [
  "java-locator",
  "jni-macros",
  "jni-sys 0.4.1",
- "libloading",
+ "libloading 0.8.9",
  "log",
  "simd_cesu8",
  "thiserror 2.0.18",
@@ -4153,8 +4508,18 @@ dependencies = [
 ]
 
 [[package]]
-name = "liblzma"
-version = "0.4.6"
+name = "libloading"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+
+[[package]]
+name = "liblzma"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899"
 dependencies = [
@@ -4286,6 +4651,21 @@ dependencies = [
  "twox-hash",
 ]
 
+[[package]]
+name = "matchers"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
+dependencies = [
+ "regex-automata",
+]
+
+[[package]]
+name = "matchit"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
+
 [[package]]
 name = "md-5"
 version = "0.10.6"
@@ -4330,6 +4710,16 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "memory-stats"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c73f5c649995a115e1a0220b35e4df0a1294500477f97a91d0660fb5abeb574a"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "mimalloc"
 version = "0.1.52"
@@ -4339,6 +4729,12 @@ dependencies = [
  "libmimalloc-sys",
 ]
 
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -4425,6 +4821,24 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "ntapi"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "nu-ansi-term"
+version = "0.50.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
+dependencies = [
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "num"
 version = "0.4.3"
@@ -4532,6 +4946,25 @@ dependencies = [
  "libm",
 ]
 
+[[package]]
+name = "objc2-core-foundation"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
+dependencies = [
+ "bitflags 2.13.0",
+]
+
+[[package]]
+name = "objc2-io-kit"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15"
+dependencies = [
+ "libc",
+ "objc2-core-foundation",
+]
+
 [[package]]
 name = "object"
 version = "0.37.3"
@@ -5276,6 +5709,15 @@ dependencies = [
  "syn 2.0.118",
 ]
 
+[[package]]
+name = "proc-macro-crate"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f"
+dependencies = [
+ "toml_edit",
+]
+
 [[package]]
 name = "proc-macro-error-attr2"
 version = "2.0.0"
@@ -5355,6 +5797,8 @@ dependencies = [
  "prettyplease",
  "prost",
  "prost-types",
+ "pulldown-cmark",
+ "pulldown-cmark-to-cmark",
  "regex",
  "syn 2.0.118",
  "tempfile",
@@ -5382,6 +5826,36 @@ dependencies = [
  "prost",
 ]
 
+[[package]]
+name = "psm"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea"
+dependencies = [
+ "ar_archive_writer",
+ "cc",
+]
+
+[[package]]
+name = "pulldown-cmark"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9f068eba8e7071c5f9511831b44f32c740d5adf574e990f946ddb53db2f314e"
+dependencies = [
+ "bitflags 2.13.0",
+ "memchr",
+ "unicase",
+]
+
+[[package]]
+name = "pulldown-cmark-to-cmark"
+version = "22.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50793def1b900256624a709439404384204a5dc3a6ec580281bfaac35e882e90"
+dependencies = [
+ "pulldown-cmark",
+]
+
 [[package]]
 name = "quad-rand"
 version = "0.2.3"
@@ -5589,6 +6063,26 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "recursive"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e"
+dependencies = [
+ "recursive-proc-macro-impl",
+ "stacker",
+]
+
+[[package]]
+name = "recursive-proc-macro-impl"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
+dependencies = [
+ "quote",
+ "syn 2.0.118",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.5.18"
@@ -5800,7 +6294,7 @@ dependencies = [
  "tokio-rustls",
  "tokio-util",
  "tower",
- "tower-http",
+ "tower-http 0.6.11",
  "tower-service",
  "url",
  "wasm-bindgen",
@@ -5838,7 +6332,7 @@ dependencies = [
  "tokio-rustls",
  "tokio-util",
  "tower",
- "tower-http",
+ "tower-http 0.6.11",
  "tower-service",
  "url",
  "wasm-bindgen",
@@ -6228,6 +6722,17 @@ dependencies = [
  "zmij",
 ]
 
+[[package]]
+name = "serde_path_to_error"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457"
+dependencies = [
+ "itoa",
+ "serde",
+ "serde_core",
+]
+
 [[package]]
 name = "serde_repr"
 version = "0.1.20"
@@ -6340,6 +6845,21 @@ dependencies = [
  "digest 0.11.3",
 ]
 
+[[package]]
+name = "sha2-const-stable"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f179d4e11094a893b82fff208f74d448a7512f99f5a0acbd5c679b705f83ed9"
+
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -6394,6 +6914,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
 
+[[package]]
+name = "similar"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa"
+
 [[package]]
 name = "siphasher"
 version = "1.0.3"
@@ -6460,6 +6986,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13c6d1b651dc4edf07eead2a0c6c78016ce971bc2c10da5266861b13f25e7cec"
 dependencies = [
  "log",
+ "recursive",
  "sqlparser_derive",
 ]
 
@@ -6474,12 +7001,59 @@ dependencies = [
  "syn 2.0.118",
 ]
 
+[[package]]
+name = "stabby"
+version = "72.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7b834ec7ced12095fea1e4b07dcb7e8cf2b59b18afa3eac52494d835965a5ec"
+dependencies = [
+ "rustversion",
+ "stabby-abi",
+]
+
+[[package]]
+name = "stabby-abi"
+version = "72.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff1a4f477858a5bdf927c9fab7f579899de9b13e39f8b3b3b300c89fbab632f4"
+dependencies = [
+ "rustc_version",
+ "rustversion",
+ "sha2-const-stable",
+ "stabby-macros",
+]
+
+[[package]]
+name = "stabby-macros"
+version = "72.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b31c4b2434980b67ad83f300a58088ba14d59454dcd79ba3d87419bbd924d31e"
+dependencies = [
+ "proc-macro-crate",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
 
+[[package]]
+name = "stacker"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "libc",
+ "psm",
+ "windows-sys 0.60.2",
+]
+
 [[package]]
 name = "str_stack"
 version = "0.1.1"
@@ -6542,6 +7116,12 @@ dependencies = [
  "symbolic-common",
 ]
 
+[[package]]
+name = "symlink"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a"
+
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -6584,6 +7164,20 @@ dependencies = [
  "syn 2.0.118",
 ]
 
+[[package]]
+name = "sysinfo"
+version = "0.38.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f"
+dependencies = [
+ "libc",
+ "memchr",
+ "ntapi",
+ "objc2-core-foundation",
+ "objc2-io-kit",
+ "windows",
+]
+
 [[package]]
 name = "tagptr"
 version = "0.2.0"
@@ -6653,6 +7247,15 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "thread_local"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "thrift"
 version = "0.17.0"
@@ -6832,6 +7435,104 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "toml_datetime"
+version = "1.1.1+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.25.12+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7"
+dependencies = [
+ "indexmap 2.14.0",
+ "toml_datetime",
+ "toml_parser",
+ "winnow",
+]
+
+[[package]]
+name = "toml_parser"
+version = "1.1.2+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
+dependencies = [
+ "winnow",
+]
+
+[[package]]
+name = "tonic"
+version = "0.14.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef"
+dependencies = [
+ "async-trait",
+ "axum",
+ "base64",
+ "bytes",
+ "h2",
+ "http 1.4.2",
+ "http-body 1.0.1",
+ "http-body-util",
+ "hyper",
+ "hyper-timeout",
+ "hyper-util",
+ "percent-encoding",
+ "pin-project",
+ "socket2",
+ "sync_wrapper",
+ "tokio",
+ "tokio-stream",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic-build"
+version = "0.14.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c68f61875ac5293cf72e6c8cf0158086428c82c37229e98c840878f1706b0322"
+dependencies = [
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "tonic-prost"
+version = "0.14.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0"
+dependencies = [
+ "bytes",
+ "prost",
+ "tonic",
+]
+
+[[package]]
+name = "tonic-prost-build"
+version = "0.14.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "654e5643eff75d7f8c99197ce1440ed19a3474eada74c12bbac488b2cafdae27"
+dependencies = [
+ "prettyplease",
+ "proc-macro2",
+ "prost-build",
+ "prost-types",
+ "quote",
+ "syn 2.0.118",
+ "tempfile",
+ "tonic-build",
+]
+
 [[package]]
 name = "tower"
 version = "0.5.3"
@@ -6840,11 +7541,15 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
 dependencies = [
  "futures-core",
  "futures-util",
+ "indexmap 2.14.0",
  "pin-project-lite",
+ "slab",
  "sync_wrapper",
  "tokio",
+ "tokio-util",
  "tower-layer",
  "tower-service",
+ "tracing",
 ]
 
 [[package]]
@@ -6865,6 +7570,21 @@ dependencies = [
  "url",
 ]
 
+[[package]]
+name = "tower-http"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b11f75e912b0c2be01b63d8cf8057b8c3f97cf34abb3d431a3a4c8675498e233"
+dependencies = [
+ "bitflags 2.13.0",
+ "bytes",
+ "http 1.4.2",
+ "percent-encoding",
+ "pin-project-lite",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "tower-layer"
 version = "0.3.3"
@@ -6883,11 +7603,25 @@ version = "0.1.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
 dependencies = [
+ "log",
  "pin-project-lite",
  "tracing-attributes",
  "tracing-core",
 ]
 
+[[package]]
+name = "tracing-appender"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c"
+dependencies = [
+ "crossbeam-channel",
+ "symlink",
+ "thiserror 2.0.18",
+ "time",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "tracing-attributes"
 version = "0.1.31"
@@ -6906,6 +7640,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
 dependencies = [
  "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex-automata",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
 ]
 
 [[package]]
@@ -6988,6 +7752,12 @@ dependencies = [
  "syn 2.0.118",
 ]
 
+[[package]]
+name = "unicase"
+version = "2.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
@@ -7085,6 +7855,12 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
 [[package]]
 name = "value-bag"
 version = "1.12.0"
@@ -7290,6 +8066,27 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.62.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580"
+dependencies = [
+ "windows-collections",
+ "windows-core",
+ "windows-future",
+ "windows-numerics",
+]
+
+[[package]]
+name = "windows-collections"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610"
+dependencies = [
+ "windows-core",
+]
+
 [[package]]
 name = "windows-core"
 version = "0.62.2"
@@ -7303,6 +8100,17 @@ dependencies = [
  "windows-strings",
 ]
 
+[[package]]
+name = "windows-future"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb"
+dependencies = [
+ "windows-core",
+ "windows-link",
+ "windows-threading",
+]
+
 [[package]]
 name = "windows-implement"
 version = "0.60.2"
@@ -7331,6 +8139,16 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
 
+[[package]]
+name = "windows-numerics"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26"
+dependencies = [
+ "windows-core",
+ "windows-link",
+]
+
 [[package]]
 name = "windows-result"
 version = "0.4.1"
@@ -7442,6 +8260,15 @@ dependencies = [
  "windows_x86_64_msvc 0.53.1",
 ]
 
+[[package]]
+name = "windows-threading"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37"
+dependencies = [
+ "windows-link",
+]
+
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.42.2"
@@ -7580,6 +8407,15 @@ version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
 
+[[package]]
+name = "winnow"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "wit-bindgen"
 version = "0.57.1"
diff --git a/native/Cargo.toml b/native/Cargo.toml
index 3e797eb968..f159350197 100644
--- a/native/Cargo.toml
+++ b/native/Cargo.toml
@@ -61,6 +61,13 @@ aws-credential-types = "1.2.13"
 iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3" }
 iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3", features = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs", "opendal-oss", "opendal-azdls"] }
 reqsign-core = "3"
+# Ballista pinned to an apache/datafusion-ballista main rev that includes
+# execute_physical_plan + the PhysicalPlan(bytes) submission variant the
+# distributed offload needs (PR #1924, merged).
+ballista = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista" }
+ballista-core = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista-core" }
+ballista-scheduler = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista-scheduler" }
+ballista-executor = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista-executor" }
 
 [profile.release]
 debug = true
diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml
index e657879d33..252934849d 100644
--- a/native/core/Cargo.toml
+++ b/native/core/Cargo.toml
@@ -53,6 +53,13 @@ tempfile = "3.26.0"
 itertools = "0.15.0"
 paste = "1.0.14"
 datafusion = { workspace = true, features = ["parquet_encryption", "sql"] }
+# Only used by the optional `ballista` offload module (`execution::ffi` /
+# `execution::fragment`); activated by the `ballista` feature so the default
+# `libcomet` build links no `datafusion-ffi`.
+datafusion-ffi = { version = "54.0.0", optional = true }
+# Only used by the optional `ballista` offload module (extension codecs +
+# physical-plan (de)serialization); activated by the `ballista` feature.
+datafusion-proto = { version = "54.0.0", optional = true }
 datafusion-physical-expr-adapter = { workspace = true }
 datafusion-datasource = { workspace = true }
 datafusion-spark = { workspace = true }
@@ -78,6 +85,13 @@ reqsign-core = { workspace = true }
 serde_json = "1.0"
 uuid = "1.23.3"
 
+# Ballista offload deps — optional, activated by the `ballista` feature so the
+# default `libcomet` build stays Ballista-free (no ballista/tonic/second core).
+ballista = { workspace = true, optional = true }
+ballista-core = { workspace = true, optional = true }
+ballista-scheduler = { workspace = true, optional = true }
+ballista-executor = { workspace = true, optional = true }
+
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs = "0.18.0"
 
@@ -92,6 +106,10 @@ lazy_static = "1.4"
 assertables = "10"
 hex = "0.4.3"
 datafusion-functions-nested = { version = "54.0.0" }
+# `#[tokio::test]` macro + `anyhow` are used by the `ballista`-feature-gated
+# integration tests under `tests/ballista_*.rs`.
+tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
+anyhow = "1"
 
 [features]
 backtrace = ["datafusion/backtrace"]
@@ -99,6 +117,18 @@ default = ["hdfs-opendal"]
 hdfs = ["datafusion-comet-objectstore-hdfs"]
 hdfs-opendal = ["opendal", "object_store_opendal", "hdfs-sys"]
 jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"]
+# Folds the in-process Ballista offload (`execution::ballista`, incl. the
+# `Java_org_apache_comet_ballista_NativeBallista_*` JNI entries) into `libcomet`.
+# Default-OFF so the standard build pulls no ballista/tonic and links only one
+# copy of Comet core.
+ballista = [
+    "dep:ballista",
+    "dep:ballista-core",
+    "dep:ballista-scheduler",
+    "dep:ballista-executor",
+    "dep:datafusion-proto",
+    "dep:datafusion-ffi",
+]
 
 # exclude optional packages from cargo machete verifications
 [package.metadata.cargo-machete]
@@ -109,6 +139,21 @@ name = "comet"
 # "rlib" is for benchmarking with criterion.
 crate-type = ["cdylib", "rlib"]
 
+# Comet-flavored Ballista scheduler/executor binaries. They register Comet's
+# extension codecs (which the stock Ballista CLIs hardcode to None) so a
+# distributed Comet plan survives (de)serialization across a real external
+# cluster. Only built with `--features ballista`, so the default build stays
+# Ballista-free.
+[[bin]]
+name = "comet-scheduler"
+path = "src/bin/comet-scheduler.rs"
+required-features = ["ballista"]
+
+[[bin]]
+name = "comet-executor"
+path = "src/bin/comet-executor.rs"
+required-features = ["ballista"]
+
 [[bench]]
 name = "array_element_append"
 harness = false
diff --git a/native/core/src/bin/comet-executor.rs b/native/core/src/bin/comet-executor.rs
new file mode 100644
index 0000000000..b3c91a77ba
--- /dev/null
+++ b/native/core/src/bin/comet-executor.rs
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! A Comet-flavored Ballista **executor** process.
+//!
+//! Identical to the stock `ballista-executor` binary except that it registers
+//! Comet's extension codecs ([`CometLogicalCodec`] / [`CometPhysicalCodec`]) on
+//! the [`ExecutorProcessConfig`], so a `CometFragmentExec` shipped from the
+//! scheduler is reconstructed here (via Comet's planner over the proto) and run.
+//!
+//! This is the process that actually **executes** Comet fragments. It links
+//! `libcomet` (as an rlib) but is a plain Rust process with **no running JVM** —
+//! only `libjvm` is on the loader path. Comet fragments whose leaf is a
+//! self-contained `NativeScan` read Parquet directly and never touch `JAVA_VM`,
+//! so they run here JVM-less; this binary is the first place that is proven in a
+//! *separate* process rather than an in-process test.
+//!
+//! Only built with `--features ballista` (see `required-features` in
+//! `core/Cargo.toml`).
+//!
+//! Configuration (all optional, env-driven so a harness can place it):
+//! - `COMET_BALLISTA_EXECUTOR_BIND_HOST`  (default `127.0.0.1`)
+//! - `COMET_BALLISTA_EXECUTOR_PORT`       (flight port, default `50051`)
+//! - `COMET_BALLISTA_EXECUTOR_GRPC_PORT`  (default `50052`)
+//! - `COMET_BALLISTA_SCHEDULER_HOST`      (default `localhost`)
+//! - `COMET_BALLISTA_SCHEDULER_PORT`      (default `50050`)
+//! - `COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS` (default: available parallelism)
+
+use std::sync::Arc;
+
+use ballista_executor::executor_process::{start_executor_process, ExecutorProcessConfig};
+
+use comet::execution::ballista::{CometLogicalCodec, CometPhysicalCodec};
+
+/// Reads environment variable `key` as a `u16`.
+/// Falls back to `default` when the variable is unset or does not parse.
+fn env_u16(key: &str, default: u16) -> u16 {
+    let raw = std::env::var(key).unwrap_or_default();
+    raw.parse().unwrap_or(default)
+}
+
+/// Builds the executor config from `COMET_BALLISTA_*` env overrides and runs the executor.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Listen side: bind host, flight port, gRPC port.
+    let listen_host = std::env::var("COMET_BALLISTA_EXECUTOR_BIND_HOST")
+        .unwrap_or_else(|_| "127.0.0.1".to_string());
+    let port = env_u16("COMET_BALLISTA_EXECUTOR_PORT", 50051);
+    let grpc_port = env_u16("COMET_BALLISTA_EXECUTOR_GRPC_PORT", 50052);
+    // Scheduler this executor points at.
+    let sched_host =
+        std::env::var("COMET_BALLISTA_SCHEDULER_HOST").unwrap_or_else(|_| "localhost".to_string());
+    let sched_port = env_u16("COMET_BALLISTA_SCHEDULER_PORT", 50050);
+    // Task slots: a parsable env override, else the detected parallelism, else 1.
+    let task_slots = std::env::var("COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS")
+        .ok()
+        .and_then(|raw| raw.parse().ok())
+        .unwrap_or_else(|| std::thread::available_parallelism().map_or(1, |n| n.get()));
+
+    // Manual runtime so the default (feature-less) build needs no tokio `macros`.
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+
+    runtime.block_on(async move {
+        let config = ExecutorProcessConfig {
+            bind_host: listen_host,
+            port,
+            grpc_port,
+            scheduler_host: sched_host,
+            scheduler_port: sched_port,
+            concurrent_tasks: task_slots,
+            // The seam: Comet codecs so the executor can rebuild Comet fragments.
+            override_logical_codec: Some(Arc::new(CometLogicalCodec::default())),
+            override_physical_codec: Some(Arc::new(CometPhysicalCodec::default())),
+            ..Default::default()
+        };
+
+        eprintln!(
+            "[comet-executor] flight :{port} grpc :{grpc_port} -> scheduler {}:{}",
+            config.scheduler_host, config.scheduler_port
+        );
+
+        start_executor_process(Arc::new(config)).await?;
+        Ok::<(), Box<dyn std::error::Error>>(())
+    })
+}
diff --git a/native/core/src/bin/comet-scheduler.rs b/native/core/src/bin/comet-scheduler.rs
new file mode 100644
index 0000000000..89b8ee0eb8
--- /dev/null
+++ b/native/core/src/bin/comet-scheduler.rs
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! A Comet-flavored Ballista **scheduler** process.
+//!
+//! Identical to the stock `ballista-scheduler` binary except that it registers
+//! Comet's extension codecs ([`CometLogicalCodec`] / [`CometPhysicalCodec`]) on
+//! the [`SchedulerConfig`], so a submitted plan containing Comet nodes
+//! (`CometFragmentExec` / `CometScanExec`) survives (de)serialization on the
+//! scheduler. The stock CLI hardcodes those overrides to `None`, which is why
+//! this bespoke binary exists.
+//!
+//! Only built with `--features ballista` (see `required-features` in
+//! `core/Cargo.toml`). Runs a real, externally reachable gRPC scheduler that a
+//! separate `comet-executor` process connects to.
+//!
+//! Configuration (all optional, env-driven so a harness can place it):
+//! - `COMET_BALLISTA_SCHEDULER_BIND_HOST` (default `127.0.0.1`; IP literal or hostname)
+//! - `COMET_BALLISTA_SCHEDULER_BIND_PORT` (default `50050`)
+
+use std::net::{SocketAddr, ToSocketAddrs};
+use std::sync::Arc;
+
+use ballista_scheduler::cluster::BallistaCluster;
+use ballista_scheduler::config::SchedulerConfig;
+use ballista_scheduler::scheduler_process::start_server;
+
+use comet::execution::ballista::{CometLogicalCodec, CometPhysicalCodec};
+
+/// Process entry point: resolves the bind address from the environment, then serves the
+/// scheduler (with Comet's extension codecs installed) until it exits or fails.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let bind_host = std::env::var("COMET_BALLISTA_SCHEDULER_BIND_HOST")
+        .unwrap_or_else(|_| "127.0.0.1".to_string());
+    let bind_port: u16 = std::env::var("COMET_BALLISTA_SCHEDULER_BIND_PORT")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(50050);
+
+    // Resolve instead of `"{host}:{port}".parse::<SocketAddr>()`, which accepts only IP
+    // literals and so aborted startup for a hostname such as `localhost`.
+    // `ToSocketAddrs` for strings tries that same literal parse first and only then falls
+    // back to a host lookup, so every value that worked before yields the same address.
+    let endpoint = format!("{bind_host}:{bind_port}");
+    let addr: SocketAddr = endpoint
+        .to_socket_addrs()?
+        .next()
+        .ok_or_else(|| format!("bind address `{endpoint}` resolved to no socket address"))?;
+
+    // Manual runtime so the default (feature-less) build needs no tokio `macros`.
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+
+    runtime.block_on(async move {
+        let config = SchedulerConfig {
+            bind_host,
+            bind_port,
+            // The seam: Comet codecs so the scheduler can decode Comet plan nodes.
+            override_logical_codec: Some(Arc::new(CometLogicalCodec::default())),
+            override_physical_codec: Some(Arc::new(CometPhysicalCodec::default())),
+            ..Default::default()
+        };
+
+        eprintln!("[comet-scheduler] starting on {addr}");
+
+        let cluster = BallistaCluster::new_from_config(&config).await?;
+        start_server(cluster, addr, Arc::new(config)).await?;
+        Ok::<(), Box<dyn std::error::Error>>(())
+    })
+}
diff --git a/native/core/src/execution/ballista/codec.rs b/native/core/src/execution/ballista/codec.rs
new file mode 100644
index 0000000000..322c5a45f1
--- /dev/null
+++ b/native/core/src/execution/ballista/codec.rs
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::catalog::TableProvider;
+use datafusion::common::Result;
+use datafusion::execution::TaskContext;
+use datafusion::logical_expr::Extension;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::sql::TableReference;
+use datafusion_proto::logical_plan::LogicalExtensionCodec;
+use datafusion_proto::physical_plan::PhysicalExtensionCodec;
+
+use ballista_core::serde::{BallistaLogicalExtensionCodec, BallistaPhysicalExtensionCodec};
+
+use super::fragment::CometFragmentExec;
+use super::scan::CometScanExec;
+use super::table_provider::CometTableProvider;
+
+/// Tags a payload as Comet's own ([`CometScanExec`] in the physical codec,
+/// [`CometTableProvider`] in the logical one) rather than one to delegate.
+///
+/// Prefix-sniffing this is safe because Ballista/DataFusion codec payloads
+/// are protobuf tag streams that never begin with these bytes — the
+/// embedded NUL in particular makes a collision effectively impossible.
+pub const COMET_MAGIC: &[u8] = b"CMET1\0";
+
+/// Tags a payload as a [`CometFragmentExec`] (a Comet fragment fed by DataFusion
+/// children) rather than the childless [`CometScanExec`]; same collision-safety
+/// argument as `COMET_MAGIC`, from which it differs at the fifth byte.
+pub const COMET_FRAGMENT_MAGIC: &[u8] = b"CMETF\0";
+
+/// Physical codec for Comet's own plan nodes: a `CometFragmentExec` is written as
+/// `COMET_FRAGMENT_MAGIC` + its proto bytes, a `CometScanExec` as `COMET_MAGIC` + its proto
+/// bytes, and decoding passes those bytes back to the node's `try_new`. Every other node
+/// (including Ballista's own shuffle operators) is delegated to Ballista's codec.
+#[derive(Debug, Default)]
+pub struct CometPhysicalCodec {
+    inner: BallistaPhysicalExtensionCodec,
+}
+
+impl PhysicalExtensionCodec for CometPhysicalCodec {
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[Arc<dyn ExecutionPlan>],
+        ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        if let Some(proto) = buf.strip_prefix(COMET_FRAGMENT_MAGIC) {
+            // `inputs` are the already-decoded DataFusion children that feed the
+            // fragment's `Scan` input leaves.
+            let fragment = CometFragmentExec::try_new(proto.to_vec(), inputs.to_vec())?;
+            Ok(Arc::new(fragment))
+        } else if let Some(proto) = buf.strip_prefix(COMET_MAGIC) {
+            let scan = CometScanExec::try_new(proto.to_vec())?;
+            Ok(Arc::new(scan))
+        } else {
+            self.inner.try_decode(buf, inputs, ctx)
+        }
+    }
+
+    fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> {
+        // Comet nodes are written as tag + proto bytes only; this buffer carries no children.
+        if let Some(fragment) = node.downcast_ref::<CometFragmentExec>() {
+            buf.extend_from_slice(COMET_FRAGMENT_MAGIC);
+            buf.extend_from_slice(fragment.proto());
+        } else if let Some(scan) = node.downcast_ref::<CometScanExec>() {
+            buf.extend_from_slice(COMET_MAGIC);
+            buf.extend_from_slice(scan.proto());
+        } else {
+            return self.inner.try_encode(node, buf);
+        }
+        Ok(())
+    }
+}
+
+/// Logical codec: ships a `CometTableProvider` (client -> scheduler) as `COMET_MAGIC` + its proto
+/// bytes and rebuilds it on decode; everything else is delegated to Ballista's codec.
+#[derive(Debug, Default)]
+pub struct CometLogicalCodec {
+    inner: BallistaLogicalExtensionCodec,
+}
+
+impl LogicalExtensionCodec for CometLogicalCodec {
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[datafusion::logical_expr::LogicalPlan],
+        ctx: &TaskContext,
+    ) -> Result<Extension> {
+        self.inner.try_decode(buf, inputs, ctx)
+    }
+
+    fn try_encode(&self, node: &Extension, buf: &mut Vec<u8>) -> Result<()> {
+        self.inner.try_encode(node, buf)
+    }
+
+    fn try_decode_table_provider(
+        &self,
+        buf: &[u8],
+        table_ref: &TableReference,
+        schema: SchemaRef,
+        ctx: &TaskContext,
+    ) -> Result<Arc<dyn TableProvider>> {
+        match buf.strip_prefix(COMET_MAGIC) {
+            Some(proto) => Ok(Arc::new(CometTableProvider::new(proto.to_vec(), schema))),
+            None => self
+                .inner
+                .try_decode_table_provider(buf, table_ref, schema, ctx),
+        }
+    }
+
+    fn try_encode_table_provider(
+        &self,
+        table_ref: &TableReference,
+        node: Arc<dyn TableProvider>,
+        buf: &mut Vec<u8>,
+    ) -> Result<()> {
+        let Some(provider) = node.downcast_ref::<CometTableProvider>() else {
+            return self.inner.try_encode_table_provider(table_ref, node, buf);
+        };
+        buf.extend_from_slice(COMET_MAGIC);
+        buf.extend_from_slice(provider.proto());
+        Ok(())
+    }
+}
diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs
new file mode 100644
index 0000000000..52ec64f848
--- /dev/null
+++ b/native/core/src/execution/ballista/ffi_jni.rs
@@ -0,0 +1,460 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Driver-side "offload to Ballista" submission entry.
+//!
+//! The JVM hands us a serialized Comet `Operator` proto; we run it on an
+//! **in-process standalone Ballista** engine (no Spark executors) and hand the
+//! resulting Arrow batches back to the JVM over the Arrow C Data Interface —
+//! the same FFI mechanism Comet already uses in `jni_api::prepare_output`
+//! (`ArrayData` → caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema`).
+
+use std::sync::Arc;
+
+use ballista::prelude::{SessionConfigExt, SessionContextExt};
+use datafusion::arrow::array::RecordBatch;
+use datafusion::arrow::compute::concat_batches;
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
+use datafusion::execution::SessionStateBuilder;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::{SessionConfig, SessionContext};
+
+use datafusion_comet_proto::spark_operator::{CometBallistaOffloadPlan, Operator};
+use prost::Message;
+
+use super::scan::CometScanExec;
+use super::{CometFragmentExec, CometLogicalCodec, CometPhysicalCodec, CometTableProvider};
+
+/// Run a Comet `Operator` proto on an in-process standalone Ballista engine and
+/// return the result schema plus the collected Arrow batches.
+///
+/// This reuses the "proto → standalone Ballista → RecordBatches" recipe
+/// validated in `tests/distributed.rs`: `SELECT * FROM comet_t` (no shuffle)
+/// runs over a table provider that carries the whole Comet plan proto — so any
+/// operators above the scan (filter/project/aggregate) run natively too.
+///
+/// The result schema is derived from the **built** Comet plan's `schema()`
+/// (not from the scan proto's `required_schema`), so plans with operators above
+/// the scan report their true output schema rather than the raw scan schema.
+pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec<RecordBatch>), String> {
+    // Fail fast on an undecodable proto, before any runtime or engine exists.
+    Operator::decode(proto).map_err(|e| format!("failed to decode Operator proto: {e}"))?;
+
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .map_err(|e| format!("failed to build tokio runtime: {e}"))?;
+
+    runtime.block_on(async move {
+        // Build the whole Comet plan once, inside the Tokio runtime (which
+        // `CometScanExec::try_new` requires). It is used here only for its
+        // `schema()`: the result schema comes from the built plan, not the
+        // NativeScan proto's `required_schema`.
+        let built: Arc<dyn ExecutionPlan> = Arc::new(
+            CometScanExec::try_new(proto.to_vec())
+                .map_err(|e| format!("failed to build Comet plan: {e}"))?,
+        );
+        let schema = built.schema();
+
+        // In-process standalone Ballista cluster (scheduler + executor), one
+        // partition wide, with the Comet codecs registered so the leaf survives serde.
+        let config = SessionConfig::new_with_ballista()
+            .with_target_partitions(1)
+            .with_ballista_standalone_parallelism(1)
+            .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default()))
+            .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default()));
+        let state = SessionStateBuilder::new()
+            .with_config(config)
+            .with_default_features()
+            .build();
+        let ctx = SessionContext::standalone_with_state(state)
+            .await
+            .map_err(|e| format!("failed to start standalone Ballista: {e}"))?;
+
+        ctx.register_table(
+            "comet_t",
+            Arc::new(CometTableProvider::new(proto.to_vec(), Arc::clone(&schema))),
+        )
+        .map_err(|e| format!("failed to register Comet table: {e}"))?;
+
+        let df = ctx
+            .sql("SELECT * FROM comet_t")
+            .await
+            .map_err(|e| format!("failed to plan query: {e}"))?;
+        let batches = df
+            .collect()
+            .await
+            .map_err(|e| format!("failed to execute query: {e}"))?;
+        Ok((schema, batches))
+    })
+}
+
+// ---------------------------------------------------------------------------
+// R3: general DAG offload (`CometBallistaOffloadPlan`)
+// ---------------------------------------------------------------------------
+
+use std::time::Duration;
+
+use ballista_core::config::BallistaConfig;
+use ballista_core::execution_plans::execute_physical_plan;
+use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient;
+use datafusion::execution::SessionState;
+use datafusion::physical_expr::expressions::Column;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::repartition::RepartitionExec;
+use datafusion::physical_plan::Partitioning;
+use datafusion_proto::protobuf::PhysicalPlanNode;
+use futures::TryStreamExt;
+
+/// Number of `Scan` (#100) input leaves in a serialized Comet `Operator` block —
+/// the leaves for which `build_native_fragment`
+/// (`native/core/src/execution/fragment.rs`) expects one child stream each, in
+/// DFS order. `build_offload_plan` uses it to reject a mismatched
+/// `OffloadFragment.inputs` count up front instead of at execution time.
+fn comet_offload_scan_leaf_count(block_proto: &[u8]) -> Result<usize, String> {
+    use datafusion_comet_proto::spark_operator::{operator::OpStruct, Operator};
+    fn scan_leaves(node: &Operator) -> usize {
+        match node.op_struct {
+            Some(OpStruct::Scan(_)) => 1,
+            _ => node.children.iter().map(scan_leaves).sum(),
+        }
+    }
+    let root = Operator::decode(block_proto).map_err(|e| format!("decode block: {e}"))?;
+    Ok(scan_leaves(&root))
+}
+
+/// Fold a serialized `CometBallistaOffloadPlan` into a Ballista physical plan: a DAG
+/// of `CometFragmentExec` nodes whose inputs are `RepartitionExec(Hash)` over earlier
+/// fragments, all `num_partitions` (min 1) wide. Fragments must be in topological
+/// order; the last is the root. Ballista's planner then splits at each repartition.
+pub fn build_offload_plan(plan_bytes: &[u8]) -> Result<Arc<dyn ExecutionPlan>, String> {
+    let plan = CometBallistaOffloadPlan::decode(plan_bytes)
+        .map_err(|e| format!("failed to decode CometBallistaOffloadPlan: {e}"))?;
+    if plan.fragments.is_empty() {
+        return Err("CometBallistaOffloadPlan has no fragments".to_string());
+    }
+    let n = plan.num_partitions.max(1) as usize;
+    // `built[i]` is the finished plan for fragment `i`; producers are looked up in it.
+    let mut built: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(plan.fragments.len());
+    for (idx, frag) in plan.fragments.iter().enumerate() {
+        // Build-time guard: the block's actual `Scan`(#100) leaf count must equal
+        // the descriptor's declared input count, or `CometFragmentExec::execute`
+        // would fail lazily (or silently under-drive leaves) later.
+        let leaf_count = comet_offload_scan_leaf_count(&frag.block_proto)
+            .map_err(|e| format!("fragment {idx}: {e}"))?;
+        if leaf_count != frag.inputs.len() {
+            return Err(format!(
+                "fragment {idx}: block has {leaf_count} Scan input leaves but the descriptor \
+                 declares {} inputs",
+                frag.inputs.len()
+            ));
+        }
+
+        // Each input edge is a hash repartition over a producer built in an earlier pass.
+        let mut children: Vec<Arc<dyn ExecutionPlan>> = Vec::with_capacity(frag.inputs.len());
+        for input in &frag.inputs {
+            let producer_idx = input.producer as usize;
+            if producer_idx >= idx {
+                return Err(format!(
+                    "fragment {idx} references producer {producer_idx} that is not earlier in \
+                     topological order"
+                ));
+            }
+            let producer = Arc::clone(&built[producer_idx]);
+            let producer_schema = producer.schema();
+            let hash_exprs: Vec<Arc<dyn PhysicalExpr>> = input
+                .hash_key_ordinals
+                .iter()
+                .map(|&ord| {
+                    let ord = ord as usize;
+                    if ord >= producer_schema.fields().len() {
+                        return Err(format!(
+                            "fragment {idx} input hash key ordinal {ord} out of range for \
+                             producer {producer_idx} with {} columns",
+                            producer_schema.fields().len()
+                        ));
+                    }
+                    Ok(
+                        Arc::new(Column::new(producer_schema.field(ord).name(), ord))
+                            as Arc<dyn PhysicalExpr>,
+                    )
+                })
+                .collect::<Result<_, String>>()?;
+            let repart = RepartitionExec::try_new(producer, Partitioning::Hash(hash_exprs, n))
+                .map_err(|e| {
+                    format!("fragment {idx}: failed to build hash RepartitionExec: {e}")
+                })?;
+            children.push(Arc::new(repart));
+        }
+        let fragment = CometFragmentExec::try_new(frag.block_proto.clone(), children)
+            .map_err(|e| format!("fragment {idx}: failed to build CometFragmentExec: {e}"))?;
+        built.push(Arc::new(fragment));
+    }
+    Ok(built.pop().expect("fragments non-empty"))
+}
+
+/// Build and submit a general `CometBallistaOffloadPlan` DAG to a Ballista
+/// cluster, returning the result schema plus the collected Arrow result batches.
+///
+/// The plan is an arbitrary DAG of `CometFragmentExec` nodes (folded by
+/// [`build_offload_plan`]), not a fixed two-stage shape. The shuffle width `n`
+/// (at least 1) is read directly from the descriptor's `num_partitions` field,
+/// the same value `build_offload_plan` uses for every hash repartition, and is
+/// also applied as the session's target partitions and standalone parallelism.
+///
+/// An empty `scheduler_url` starts an in-process standalone cluster; a
+/// non-empty one submits to that external scheduler instead.
+pub fn execute_offload_plan(
+    plan_bytes: &[u8],
+    scheduler_url: &str,
+) -> Result<(SchemaRef, Vec<RecordBatch>), String> {
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .map_err(|e| format!("failed to build tokio runtime: {e}"))?;
+
+    runtime.block_on(async move {
+        // Decode the descriptor here just for `n`; `build_offload_plan` decodes it again.
+        let n = CometBallistaOffloadPlan::decode(plan_bytes)
+            .map_err(|e| format!("failed to decode CometBallistaOffloadPlan: {e}"))?
+            .num_partitions
+            .max(1) as usize;
+
+        // Build the plan inside the runtime: the fragments' NativeScan leaves
+        // build via Comet's planner, which requires an active Tokio runtime.
+        let plan = build_offload_plan(plan_bytes)?;
+        let config = SessionConfig::new_with_ballista()
+            .with_target_partitions(n)
+            .with_ballista_standalone_parallelism(n)
+            .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default()))
+            .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default()));
+        let state = SessionStateBuilder::new()
+            .with_config(config)
+            .with_default_features()
+            .build();
+        let schema = plan.schema();
+
+        // Empty URL => in-process standalone; non-empty => external cluster.
+        let scheduler_url = if scheduler_url.is_empty() {
+            log::debug!("[comet-ballista R3] submitting to in-process standalone cluster");
+            start_standalone_from_state(&state).await?
+        } else {
+            log::debug!("[comet-ballista R3] submitting to external cluster at {scheduler_url}");
+            scheduler_url.to_string()
+        };
+
+        let session_config = state.config().clone();
+        let codec = CometPhysicalCodec::default();
+        let session_id = state.session_id().to_string();
+
+        let stream = execute_physical_plan::<PhysicalPlanNode>(
+            scheduler_url,
+            &BallistaConfig::default(),
+            plan,
+            &codec,
+            session_id,
+            session_config,
+        )
+        .await
+        .map_err(|e| format!("failed to submit offload plan: {e}"))?;
+
+        let batches = stream
+            .try_collect::<Vec<_>>()
+            .await
+            .map_err(|e| format!("failed to collect distributed results: {e}"))?;
+
+        Ok((schema, batches))
+    })
+}
+
+/// Execute the general DAG offload plan, merge its result batches into one
+/// batch, and export it into the JVM-allocated FFI structs. Returns the row count.
+///
+/// # Safety
+/// See [`export_batch_to_addresses`].
+pub unsafe fn submit_and_export_offload(
+    plan_bytes: &[u8],
+    scheduler_url: &str,
+    array_addrs: &[i64],
+    schema_addrs: &[i64],
+) -> Result<i64, String> {
+    let (result_schema, result_batches) = execute_offload_plan(plan_bytes, scheduler_url)?;
+    // Merge so the JVM imports exactly one set of column structs (same as R1/R2).
+    let merged = concat_batches(&result_schema, &result_batches)
+        .map_err(|e| format!("failed to concatenate result batches: {e}"))?;
+    let row_count = merged.num_rows() as i64;
+    export_batch_to_addresses(&merged, array_addrs, schema_addrs)?;
+    Ok(row_count)
+}
+
+/// Bring up an in-process standalone Ballista cluster (scheduler, then executor)
+/// from `state`, so both sides see the Comet extension codecs registered on its
+/// config. Mirrors `ballista::extension`'s private `setup_standalone`, but hands
+/// back the scheduler URL for the direct physical-plan submission path.
+async fn start_standalone_from_state(state: &SessionState) -> Result<String, String> {
+    let bound = ballista_scheduler::standalone::new_standalone_scheduler_from_state(state)
+        .await
+        .map_err(|e| format!("failed to start standalone scheduler: {e}"))?;
+    let scheduler_url = format!("http://localhost:{}", bound.port());
+    // The scheduler may not accept connections yet: retry up to 50 times, 100 ms apart.
+    let mut retries_left = 50;
+    let client = loop {
+        let e = match SchedulerGrpcClient::connect(scheduler_url.clone()).await {
+            Ok(connected) => break connected,
+            Err(e) => e,
+        };
+        if retries_left == 0 {
+            return Err(format!("could not connect to standalone scheduler: {e}"));
+        }
+        retries_left -= 1;
+        tokio::time::sleep(Duration::from_millis(100)).await;
+    };
+
+    let concurrent_tasks = state.config().ballista_standalone_parallelism();
+    ballista_executor::new_standalone_executor_from_state(client, concurrent_tasks, state)
+        .await
+        .map_err(|e| format!("failed to start standalone executor: {e}"))?;
+
+    Ok(scheduler_url)
+}
+
+/// Export one Arrow batch into caller-allocated `FFI_ArrowArray` /
+/// `FFI_ArrowSchema` structs, one per column, whose addresses were allocated by
+/// the JVM (Arrow Java `ArrowArray.allocateNew` / `ArrowSchema.allocateNew`).
+/// As in `jni_api::prepare_output`, the JVM owns the structs and imports them
+/// after this returns. On any error (column-count mismatch, null address,
+/// unexportable column) nothing has been written.
+///
+/// # Safety
+/// `array_addrs[i]` / `schema_addrs[i]` must be valid, writable pointers to
+/// uninitialized `FFI_ArrowArray` / `FFI_ArrowSchema` for each column.
+unsafe fn export_batch_to_addresses(
+    batch: &RecordBatch,
+    array_addrs: &[i64],
+    schema_addrs: &[i64],
+) -> Result<(), String> {
+    let num_cols = batch.num_columns();
+    if array_addrs.len() != num_cols || schema_addrs.len() != num_cols {
+        return Err(format!(
+            "column count mismatch: batch has {num_cols}, got {} array / {} schema addresses",
+            array_addrs.len(),
+            schema_addrs.len()
+        ));
+    }
+    // A zero address would make the writes below undefined behaviour; fail cleanly.
+    if let Some(i) = (0..num_cols).find(|&i| array_addrs[i] == 0 || schema_addrs[i] == 0) {
+        return Err(format!("null FFI struct address for column {i}"));
+    }
+    // Stage every export before writing: exporting can fail mid-loop, and a partial
+    // write would leave structs the JVM never imports (and so never releases).
+    let mut exported = Vec::with_capacity(num_cols);
+    for i in 0..num_cols {
+        let data = batch.column(i).to_data();
+        let schema = FFI_ArrowSchema::try_from(data.data_type())
+            .map_err(|e| format!("failed to export schema for column {i}: {e}"))?;
+        let array = FFI_ArrowArray::new(&data);
+        exported.push((array, schema));
+    }
+    // Infallible write phase into the JVM-allocated structs: all-or-nothing.
+    for (i, (array, schema)) in exported.into_iter().enumerate() {
+        std::ptr::write(array_addrs[i] as *mut FFI_ArrowArray, array);
+        std::ptr::write(schema_addrs[i] as *mut FFI_ArrowSchema, schema);
+    }
+    Ok(())
+}
+
+/// Execute the proto, merge its result batches into one batch, and export it
+/// into the JVM-allocated FFI structs. Returns the row count, or `Err` with a message.
+///
+/// # Safety
+/// See [`export_batch_to_addresses`].
+pub unsafe fn submit_and_export(
+    proto: &[u8],
+    array_addrs: &[i64],
+    schema_addrs: &[i64],
+) -> Result<i64, String> {
+    let (result_schema, result_batches) = execute_comet_proto(proto)?;
+    // Merge so the JVM imports exactly one set of column structs.
+    let merged = concat_batches(&result_schema, &result_batches)
+        .map_err(|e| format!("failed to concatenate result batches: {e}"))?;
+    let row_count = merged.num_rows() as i64;
+    export_batch_to_addresses(&merged, array_addrs, schema_addrs)?;
+    Ok(row_count)
+}
+
+// ---------------------------------------------------------------------------
+// JNI entry point
+// ---------------------------------------------------------------------------
+
+mod jni_entry {
+    use super::submit_and_export_offload;
+    use crate::errors::{try_unwrap_or_throw, CometError};
+    use jni::objects::{JByteArray, JClass, JLongArray, JString, ReleaseMode};
+    use jni::sys::jlong;
+    use jni::EnvUnowned;
+
+    /// JVM entry: an empty function whose only purpose is symbol resolution. It is
+    /// compiled only into a `--features ballista` `libcomet`, so the JVM side can
+    /// detect whether the offload is present by resolving this symbol (see
+    /// `NativeBallista.isAvailable`); a feature-less library lacks it and yields an
+    /// `UnsatisfiedLinkError`.
+    ///
+    /// # Safety
+    /// Called from the JVM via JNI.
+    #[no_mangle]
+    pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_probeAvailable(
+        _e: EnvUnowned,
+        _class: JClass,
+    ) {
+    }
+
+    /// JVM entry: run a general DAG offload (R3), a `CometBallistaOffloadPlan`
+    /// describing an arbitrary DAG of `CometFragmentExec` nodes joined by hash
+    /// shuffles (folded by `build_offload_plan`). Submits it to a Ballista
+    /// cluster — in-process standalone if `schedulerUrl` is empty, or the named
+    /// external scheduler otherwise — and exports the concatenated result batch
+    /// into the JVM-allocated Arrow C Data structs, returning the number of rows.
+    ///
+    /// # Safety
+    /// Called from the JVM via JNI; the address arrays must reference valid
+    /// caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema` structs (one per
+    /// output column of the plan's final fragment).
+    #[no_mangle]
+    pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_executeOffloadPlan(
+        e: EnvUnowned,
+        _class: JClass,
+        plan: JByteArray,
+        array_addrs: JLongArray,
+        schema_addrs: JLongArray,
+        scheduler_url: JString,
+    ) -> jlong {
+        try_unwrap_or_throw(&e, |env| {
+            let plan_bytes = env.convert_byte_array(plan)?;
+            let scheduler_url: String = scheduler_url.try_to_string(env)?;
+            // The address arrays are only read here, hence `ReleaseMode::NoCopyBack`.
+            let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? };
+            let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? };
+            // Offload failures arrive as plain strings and become `CometError::Internal`.
+            let num_rows = unsafe {
+                submit_and_export_offload(&plan_bytes, &scheduler_url, &arrays, &schemas)
+            }
+            .map_err(CometError::Internal)?;
+            Ok(num_rows as jlong)
+        })
+    }
+}
diff --git a/native/core/src/execution/ballista/fragment.rs b/native/core/src/execution/ballista/fragment.rs
new file mode 100644
index 0000000000..e624737ddd
--- /dev/null
+++ b/native/core/src/execution/ballista/fragment.rs
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion::common::{DataFusionError, Result};
+use datafusion::execution::TaskContext;
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    SendableRecordBatchStream,
+};
+
+use crate::execution::fragment::{build_native_fragment, native_fragment_plan_properties};
+
+/// A DataFusion node that runs a Comet plan fragment (carried as `Operator`
+/// proto bytes) whose input-leaf `Scan` operators are fed by this node's
+/// DataFusion `children`. In a Ballista stage those children are shuffle
+/// readers; a childless fragment (whose leaf is a self-contained `NativeScan`)
+/// behaves like [`super::scan::CometScanExec`], but reached through the native
+/// (non-FFI) path since the executor and Comet share a DataFusion build.
+/// Serializable through [`super::codec::CometPhysicalCodec`] by its proto bytes;
+/// the children round-trip via datafusion-proto and are handed back on decode.
+#[derive(Debug)]
+pub struct CometFragmentExec {
+    /// Serialized Comet `Operator` proto for this fragment.
+    proto: Vec<u8>,
+    /// Input plans; `execute` opens one stream per child for the fragment.
+    children: Vec<Arc<dyn ExecutionPlan>>,
+    /// Plan properties computed once in `try_new`.
+    props: Arc<PlanProperties>,
+}
+
+impl CometFragmentExec {
+    /// Build from Comet proto bytes and the fragment's DataFusion children. The
+    /// schema/ordering are derived by building the fragment plan once (without
+    /// executing it or requiring the child streams).
+    ///
+    /// A Comet fragment is internally single-partition, but as a DataFusion node
+    /// it is a *per-partition* transform: [`execute`](Self::execute) runs it once
+    /// per output partition, feeding that partition of each child into the `Scan`
+    /// leaves. With children, the output partition count is therefore theirs, and
+    /// they must all agree (else `execute` would drop rows or over-index a child),
+    /// so a mismatch is an error. A childless fragment keeps the plan's own.
+    pub fn try_new(proto: Vec<u8>, children: Vec<Arc<dyn ExecutionPlan>>) -> Result<Self> {
+        let base = native_fragment_plan_properties(&proto).map_err(DataFusionError::Execution)?;
+        let count = |p: &Arc<dyn ExecutionPlan>| p.properties().partitioning.partition_count();
+        let props = match children.first().map(count) {
+            Some(n) if children.iter().any(|c| count(c) != n) => {
+                return Err(DataFusionError::Execution(format!(
+                    "CometFragmentExec children disagree on partition count (first child has {n})"
+                )));
+            }
+            Some(n) => Arc::new(PlanProperties::new(
+                base.eq_properties.clone(),
+                Partitioning::UnknownPartitioning(n),
+                base.emission_type,
+                base.boundedness,
+            )),
+            None => base,
+        };
+        Ok(Self {
+            proto,
+            children,
+            props,
+        })
+    }
+
+    pub fn proto(&self) -> &[u8] {
+        &self.proto
+    }
+}
+
+impl DisplayAs for CometFragmentExec {
+    // Every `DisplayFormatType` gets the same one-line summary.
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_fmt(format_args!(
+            "CometFragmentExec(proto={} bytes, children={})",
+            self.proto.len(),
+            self.children.len()
+        ))
+    }
+}
+
+impl ExecutionPlan for CometFragmentExec {
+    fn name(&self) -> &str {
+        "CometFragmentExec"
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.props
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        self.children.iter().collect()
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Go through `try_new` so the plan properties are recomputed for the
+        // new children instead of being copied from `self`.
+        let rebuilt = CometFragmentExec::try_new(self.proto.clone(), children)?;
+        Ok(Arc::new(rebuilt))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        // Open the requested output partition on every child, in child order.
+        // The resulting streams feed the fragment's `Scan` input leaves in that
+        // same order; the first child that fails to open aborts the call.
+        let mut inputs = Vec::with_capacity(self.children.len());
+        for child in &self.children {
+            inputs.push(child.execute(partition, Arc::clone(&context))?);
+        }
+
+        build_native_fragment(&self.proto, context, inputs).map_err(DataFusionError::Execution)
+    }
+}
diff --git a/native/core/src/execution/ballista/mod.rs b/native/core/src/execution/ballista/mod.rs
new file mode 100644
index 0000000000..3d60e475c0
--- /dev/null
+++ b/native/core/src/execution/ballista/mod.rs
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Runs Apache DataFusion Comet native plans as leaves inside Apache
+//! DataFusion Ballista.
+//!
+//! This module is compiled into the single `libcomet` cdylib only when the
+//! default-off `ballista` Cargo feature is enabled, so the offload entry lives
+//! in the same library (and shares the same Comet core state, e.g. `JAVA_VM`)
+//! as the rest of Comet — there is no separate `comet-ballista` cdylib.
+//!
+//! - [`scan::CometScanExec`]: a serializable DataFusion leaf that carries the
+//!   Comet proto bytes (the "recipe") and builds the FFI plan from them when it
+//!   is constructed. This is what Ballista ships to executors and rebuilds there.
+//! - [`codec::CometPhysicalCodec`] / [`codec::CometLogicalCodec`]: extension
+//!   codecs that (de)serialize Comet nodes as their proto bytes (tagged with
+//!   [`codec::COMET_MAGIC`]) and delegate everything else to Ballista's own
+//!   codecs — the seam that lets Ballista distribute Comet work without
+//!   linking Comet's translation code.
+//! - [`table_provider::CometTableProvider`]: a `TableProvider` that produces a
+//!   `CometScanExec`, so a Comet scan can participate in a DataFusion logical
+//!   plan.
+
+pub mod codec;
+pub mod ffi_jni;
+pub mod fragment;
+pub mod scan;
+pub mod table_provider;
+
+pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC};
+pub use ffi_jni::{
+    build_offload_plan, execute_comet_proto, execute_offload_plan, submit_and_export,
+    submit_and_export_offload,
+};
+pub use fragment::CometFragmentExec;
+pub use scan::CometScanExec;
+pub use table_provider::CometTableProvider;
diff --git a/native/core/src/execution/ballista/scan.rs b/native/core/src/execution/ballista/scan.rs
new file mode 100644
index 0000000000..8c3db67530
--- /dev/null
+++ b/native/core/src/execution/ballista/scan.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt;
+use std::sync::Arc;
+
+use datafusion::common::{DataFusionError, Result};
+use datafusion::execution::TaskContext;
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream,
+};
+use datafusion_ffi::execution_plan::ForeignExecutionPlan;
+use tokio::runtime::Handle;
+
+use crate::execution::ffi::comet_ffi_plan_from_proto;
+
+/// A DataFusion leaf that carries a Comet plan protobuf (`proto`) and executes the plan built
+/// from it (`inner`; `props` caches its properties) via the `datafusion-ffi` boundary.
+/// `CometPhysicalCodec` serializes it by its proto bytes, so Ballista can ship it to executors.
+#[derive(Debug)]
+pub struct CometScanExec {
+    proto: Vec<u8>,
+    inner: Arc<dyn ExecutionPlan>,
+    props: Arc<PlanProperties>,
+}
+
+impl CometScanExec {
+    /// Constructs the node from serialized Comet plan bytes: Comet's planner builds the
+    /// plan behind an FFI handle, which is then wrapped as a `ForeignExecutionPlan`
+    /// (forcing the real FFI vtable path).
+    ///
+    /// The Tokio handle given to the FFI plan is taken from `Handle::try_current()`;
+    /// outside a runtime `None` is passed instead of failing. Callers (`try_decode`
+    /// included) are expected to run on a Tokio runtime, as Ballista's executor does.
+    pub fn try_new(proto: Vec<u8>) -> Result<Self> {
+        let runtime = Handle::try_current().ok();
+        let ffi_plan =
+            comet_ffi_plan_from_proto(&proto, runtime).map_err(DataFusionError::Execution)?;
+        let foreign = ForeignExecutionPlan::try_from(ffi_plan)
+            .map_err(|e| DataFusionError::Execution(format!("ForeignExecutionPlan: {e}")))?;
+        let inner: Arc<dyn ExecutionPlan> = Arc::new(foreign);
+        let props = Arc::clone(inner.properties());
+        Ok(Self {
+            proto,
+            inner,
+            props,
+        })
+    }
+
+    pub fn proto(&self) -> &[u8] {
+        self.proto.as_slice()
+    }
+}
+
+impl DisplayAs for CometScanExec {
+    fn fmt_as(&self, _format: DisplayFormatType, out: &mut fmt::Formatter) -> fmt::Result {
+        out.write_fmt(format_args!("CometScanExec(proto={} bytes)", self.proto.len()))
+    }
+}
+
+impl ExecutionPlan for CometScanExec {
+    fn name(&self) -> &str {
+        "CometScanExec"
+    }
+    /// Properties copied from the wrapped plan when the node was built.
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.props
+    }
+    /// Always empty: the wrapped plan is not exposed as a child.
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        Vec::new()
+    }
+    /// Returns `self` unchanged; debug builds assert that no children were supplied.
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        debug_assert!(children.is_empty(), "CometScanExec is a leaf");
+        Ok(self)
+    }
+    /// Delegates to the wrapped plan with the same partition and task context.
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        self.inner.execute(partition, context)
+    }
+}
diff --git a/native/core/src/execution/ballista/table_provider.rs b/native/core/src/execution/ballista/table_provider.rs
new file mode 100644
index 0000000000..4009af3b02
--- /dev/null
+++ b/native/core/src/execution/ballista/table_provider.rs
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::catalog::{Session, TableProvider};
+use datafusion::common::Result;
+use datafusion::logical_expr::{Expr, TableType};
+use datafusion::physical_expr::expressions::Column;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::projection::ProjectionExec;
+use datafusion::physical_plan::ExecutionPlan;
+
+use super::scan::CometScanExec;
+
+/// A DataFusion `TableProvider` that produces a `CometScanExec`. Carries the
+/// Comet proto so the table can be reconstructed on the scheduler side via the
+/// logical codec (`CometLogicalCodec`, defined in the sibling `codec` module).
+#[derive(Debug)]
+pub struct CometTableProvider {
+    proto: Vec<u8>,
+    schema: SchemaRef,
+}
+
+impl CometTableProvider {
+    pub fn new(proto: Vec<u8>, schema: SchemaRef) -> Self {
+        CometTableProvider { proto, schema }
+    }
+    pub fn proto(&self) -> &[u8] {
+        self.proto.as_slice()
+    }
+}
+
+#[async_trait::async_trait]
+impl TableProvider for CometTableProvider {
+    /// The schema supplied at construction.
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+    /// Always `TableType::Base`.
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+    // Neither `_filters` nor `_limit` is pushed into the Comet scan: DataFusion
+    // re-applies both on top of whatever plan is returned here. A requested
+    // projection is applied by wrapping the scan in a `ProjectionExec`.
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let comet_scan = CometScanExec::try_new(self.proto.clone())?;
+        let plan: Arc<dyn ExecutionPlan> = Arc::new(comet_scan);
+        let indices = match projection {
+            None => return Ok(plan),
+            Some(indices) => indices,
+        };
+        // Each projected column is referenced by index and keeps its field name.
+        let mut exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = Vec::with_capacity(indices.len());
+        for &i in indices {
+            let field = self.schema.field(i);
+            let column: Arc<dyn PhysicalExpr> = Arc::new(Column::new(field.name(), i));
+            exprs.push((column, field.name().to_string()));
+        }
+        Ok(Arc::new(ProjectionExec::try_new(exprs, plan)?))
+    }
+}
diff --git a/native/core/src/execution/ffi.rs b/native/core/src/execution/ffi.rs
new file mode 100644
index 0000000000..f54d342b85
--- /dev/null
+++ b/native/core/src/execution/ffi.rs
@@ -0,0 +1,170 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Exposes a Comet native plan across a `datafusion-ffi` boundary so that a
+//! process compiled against a different DataFusion version (e.g. a Ballista
+//! executor) can execute it without linking Comet's Rust crates.
+//!
+//! The input is a serialized Comet `Operator` plan whose leaves are
+//! `NativeScan` (native Parquet), so no JVM-fed inputs are required.
+
+use std::sync::Arc;
+
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::SessionContext;
+use datafusion_comet_proto::spark_operator::Operator;
+use datafusion_ffi::execution_plan::FFI_ExecutionPlan;
+use prost::Message;
+use tokio::runtime::Handle;
+
+use super::planner::PhysicalPlanner;
+
+/// Decodes a serialized Comet `Operator`, has Comet's own planner build the native
+/// DataFusion plan from it, and wraps the plan root as an `FFI_ExecutionPlan`.
+///
+/// `runtime` (forwarded to `FFI_ExecutionPlan::new`) is the Tokio runtime handle the
+/// foreign consumer should use to drive async execution across the boundary.
+pub fn comet_ffi_plan_from_proto(
+    proto_bytes: &[u8],
+    runtime: Option<Handle>,
+) -> Result<FFI_ExecutionPlan, String> {
+    let operator = match Operator::decode(proto_bytes) {
+        Ok(decoded) => decoded,
+        Err(e) => return Err(format!("failed to decode Comet Operator proto: {e}")),
+    };
+
+    // Plan against a brand-new `SessionContext`: object-store settings then come
+    // solely from the proto's `object_store_options`, never from an ambient session.
+    // Fine for local `file://` scans; remote stores (S3, GCS, etc.) will need their
+    // configuration plumbed through here.
+    let planner = PhysicalPlanner::new(Arc::new(SessionContext::new()), 0);
+
+    // Leaves are `NativeScan`s reading Parquet directly, so the JVM input list stays empty.
+    let mut jvm_inputs = Vec::new();
+    let planned = planner.create_plan(&operator, &mut jvm_inputs, 1);
+    let (_scans, _shuffle_scans, spark_plan) =
+        planned.map_err(|e| format!("failed to build native plan: {e}"))?;
+    let native_plan: Arc<dyn ExecutionPlan> = Arc::clone(&spark_plan.native_plan);
+    Ok(FFI_ExecutionPlan::new(native_plan, runtime))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion::arrow::array::{Int32Array, RecordBatch};
+    use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+    use datafusion::parquet::arrow::ArrowWriter;
+    use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType};
+    use datafusion_comet_proto::spark_operator::{
+        operator::OpStruct, NativeScan, NativeScanCommon, SparkFilePartition, SparkPartitionedFile,
+        SparkStructField,
+    };
+    use datafusion_ffi::execution_plan::ForeignExecutionPlan;
+    use futures::StreamExt;
+
+    /// Write a Parquet file at `path` holding one nullable Int32 column `a` with values 1..=5.
+    fn write_test_parquet(path: &std::path::Path) {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "a",
+            ArrowDataType::Int32,
+            true,
+        )]));
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
+        )
+        .unwrap();
+        let file = std::fs::File::create(path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, schema, None).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+    }
+
+    /// Encode a childless Comet `Operator` whose only node is a `NativeScan` of `parquet_path`.
+    fn build_native_scan_proto(parquet_path: &std::path::Path) -> Vec<u8> {
+        let int32 = DataType {
+            type_id: DataTypeId::Int32 as i32,
+            type_info: None,
+        };
+        let field_a = SparkStructField {
+            name: "a".to_string(),
+            data_type: Some(int32),
+            nullable: true,
+            metadata: Default::default(),
+        };
+        let common = NativeScanCommon {
+            required_schema: vec![field_a.clone()],
+            data_schema: vec![field_a],
+            projection_vector: vec![0],
+            session_timezone: "UTC".to_string(),
+            source: "comet-ffi-test".to_string(),
+            ..Default::default()
+        };
+        let file_size = std::fs::metadata(parquet_path).unwrap().len() as i64;
+        let partitioned_file = SparkPartitionedFile {
+            file_path: format!("file://{}", parquet_path.display()),
+            start: 0,
+            length: file_size,
+            file_size,
+            partition_values: vec![],
+        };
+        let native_scan = NativeScan {
+            common: Some(common),
+            file_partition: Some(SparkFilePartition {
+                partitioned_file: vec![partitioned_file],
+            }),
+        };
+        let op = Operator {
+            children: vec![],
+            plan_id: 0,
+            op_struct: Some(OpStruct::NativeScan(native_scan)),
+        };
+        op.encode_to_vec()
+    }
+
+    #[tokio::test]
+    async fn ffi_export_executes_native_scan() {
+        let dir = tempfile::tempdir().unwrap();
+        let parquet_path = dir.path().join("ffi_export_test.parquet");
+        write_test_parquet(&parquet_path);
+
+        let proto = build_native_scan_proto(&parquet_path);
+
+        let ffi_plan = comet_ffi_plan_from_proto(&proto, Handle::try_current().ok())
+            .expect("failed to build FFI plan from proto");
+
+        // Wrap via `ForeignExecutionPlan` to force the real FFI vtable path,
+        // rather than datafusion-ffi's same-library short-circuit.
+        let plan: Arc<dyn ExecutionPlan> = Arc::new(
+            ForeignExecutionPlan::try_from(ffi_plan)
+                .expect("failed to wrap FFI plan as ForeignExecutionPlan"),
+        );
+
+        let session_ctx = SessionContext::new();
+        let mut stream = plan
+            .execute(0, session_ctx.task_ctx())
+            .expect("failed to execute plan");
+
+        let mut total = 0usize;
+        while let Some(batch) = stream.next().await {
+            let batch = batch.expect("failed to read batch");
+            total += batch.num_rows();
+        }
+
+        assert_eq!(total, 5);
+    }
+}
diff --git a/native/core/src/execution/fragment.rs b/native/core/src/execution/fragment.rs
new file mode 100644
index 0000000000..76378bea94
--- /dev/null
+++ b/native/core/src/execution/fragment.rs
@@ -0,0 +1,194 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Builds and drives a Comet plan fragment whose input-leaf `Scan` operators are
+//! fed by native DataFusion [`SendableRecordBatchStream`]s (e.g. a Ballista
+//! shuffle reader), rather than by JVM-exported Arrow streams.
+//!
+//! Unlike [`super::ffi`], this path stays entirely in-process: there is no
+//! `datafusion-ffi` boundary, because the consumer (a Ballista executor) and
+//! Comet resolve to the same DataFusion build, so `ExecutionPlan`s and
+//! `SendableRecordBatchStream`s are shared directly.
+//!
+//! `PhysicalPlanner::create_plan` builds a `Scan` (op #100) leaf with no input
+//! source and returns a handle to it. Its executable clone shares this handle's
+//! `batch` slot (an `Arc`), so [`super::operators::ScanExec::set_native_input`]
+//! injects the child stream into the handle and `NativeFragmentStream` drives
+//! `get_next_batch` on it — mirroring the JVM busy-poll in `jni_api` — to make
+//! child batches flow through the fragment.
+
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use arrow::array::RecordBatch;
+use arrow::datatypes::SchemaRef;
+use datafusion::common::{DataFusionError, Result as DataFusionResult};
+use datafusion::execution::{SendableRecordBatchStream, TaskContext};
+use datafusion::physical_plan::{ExecutionPlan, PlanProperties, RecordBatchStream};
+use datafusion::prelude::SessionContext;
+use datafusion_comet_proto::spark_operator::Operator;
+use futures::{Stream, StreamExt};
+use prost::Message;
+
+use super::operators::ScanExec;
+use super::planner::PhysicalPlanner;
+
+/// Execution-context id stamped onto natively-fed scans. No JVM context is involved on
+/// this path (the input is a native stream, not an `ArrowArrayStream`), so the value is
+/// immaterial as long as it is not `TEST_EXEC_CONTEXT_ID`, the test sentinel that makes
+/// `pull_next` short-circuit to EOF. NOTE(review): relies on that sentinel not being 0.
+const NATIVE_FRAGMENT_EXEC_ID: i64 = 0;
+
+/// Decodes the Comet `Operator` proto and plans it with Comet's planner. Yields the
+/// input-leaf `Scan` handles (in encounter order) together with the fragment root.
+/// Because the default (test) `exec_context_id` is used, every `Scan` leaf builds
+/// without consuming a JVM input; native inputs are attached afterwards through
+/// [`ScanExec::set_native_input`].
+fn plan_from_proto(proto_bytes: &[u8]) -> Result<(Vec<ScanExec>, Arc<dyn ExecutionPlan>), String> {
+    let operator = match Operator::decode(proto_bytes) {
+        Ok(decoded) => decoded,
+        Err(e) => return Err(format!("failed to decode Operator proto: {e}")),
+    };
+    // Planning uses its own new `SessionContext`, so only the proto supplies
+    // configuration; no ambient session leaks in (same choice as `super::ffi`).
+    let planner = PhysicalPlanner::new(Arc::new(SessionContext::new()), 0);
+
+    let mut no_jvm_inputs = Vec::new();
+    let planned = planner.create_plan(&operator, &mut no_jvm_inputs, 1);
+    let (leaf_scans, _shuffle_scans, spark_plan) =
+        planned.map_err(|e| format!("failed to build native plan: {e}"))?;
+    let root = Arc::clone(&spark_plan.native_plan);
+    Ok((leaf_scans, root))
+}
+
+/// Plans the fragment only to read its root's `PlanProperties` (schema, partitioning,
+/// ordering). Lets a `CometFragmentExec` fix its schema/properties at construction
+/// time, before anything executes and without needing the child streams.
+pub fn native_fragment_plan_properties(proto_bytes: &[u8]) -> Result<Arc<PlanProperties>, String> {
+    let planned = plan_from_proto(proto_bytes);
+    planned.map(|(_leaf_scans, root)| Arc::clone(root.properties()))
+}
+
+/// Plans the Comet fragment in `proto_bytes`, wires `inputs` to its input-leaf `Scan`
+/// operators (one stream per leaf, in encounter order; a count mismatch is an error)
+/// and executes the root. Polling the returned stream is what drives the child
+/// streams through the fragment.
+pub fn build_native_fragment(
+    proto_bytes: &[u8],
+    task_ctx: Arc<TaskContext>,
+    inputs: Vec<SendableRecordBatchStream>,
+) -> Result<SendableRecordBatchStream, String> {
+    let (mut leaf_scans, root) = plan_from_proto(proto_bytes)?;
+    let (leaf_count, stream_count) = (leaf_scans.len(), inputs.len());
+    if leaf_count != stream_count {
+        return Err(format!(
+            "Comet fragment has {} Scan input leaves but {} child streams were provided",
+            leaf_count, stream_count
+        ));
+    }
+
+    // Pair every `Scan` handle with its child stream, in encounter order. A handle and
+    // the executable leaf share one `batch` slot, so batches pulled through the handle
+    // are what the plan node sees.
+    leaf_scans
+        .iter_mut()
+        .zip(inputs)
+        .for_each(|(leaf, child)| leaf.set_native_input(NATIVE_FRAGMENT_EXEC_ID, child));
+
+    // Internally the fragment has a single partition, hence partition 0 for the root.
+    // Selecting the desired output partition already happened when the caller
+    // obtained the child streams.
+    let root_stream = match root.execute(0, task_ctx) {
+        Ok(stream) => stream,
+        Err(e) => return Err(format!("failed to execute Comet fragment root: {e}")),
+    };
+    let schema = root_stream.schema();
+    Ok(Box::pin(NativeFragmentStream {
+        root: root_stream,
+        scans: leaf_scans,
+        schema,
+    }))
+}
+
+/// Streams the fragment root while pumping its `Scan` leaves. When the root
+/// yields `Pending` because a leaf's `batch` slot is empty, that leaf handle is
+/// asked to pull its next batch and the root is re-polled — the same interleaving
+/// `jni_api` performs for JVM-fed scans, but with native child streams.
+///
+/// The root is only re-polled after new input was actually fed into a leaf. If it
+/// returns `Pending` while no leaf could be fed (every slot already holds a batch,
+/// or there are no leaves at all — e.g. a childless `NativeScan` fragment reading
+/// Parquet directly), the root is taken to be pending on its own async work with a
+/// waker registered on `cx`; `Poll::Pending` is returned so that waker reschedules
+/// us instead of hot-spinning the worker thread on every async-I/O `Pending`.
+/// NOTE(review): feeding a leaf blocks the polling thread until its batch arrives.
+struct NativeFragmentStream {
+    root: SendableRecordBatchStream,
+    scans: Vec<ScanExec>,
+    schema: SchemaRef,
+}
+
+impl Stream for NativeFragmentStream {
+    type Item = DataFusionResult<RecordBatch>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        loop {
+            match this.root.poll_next_unpin(cx) {
+                Poll::Ready(item) => return Poll::Ready(item),
+                Poll::Pending => {
+                    // Feed only leaves whose `batch` slot is empty; `get_next_batch`
+                    // blocks until it delivers a batch (or EOF), so a fed leaf then
+                    // holds input and re-polling the root makes progress. Track
+                    // whether we fed anything this iteration.
+                    let mut fed_new_input = false;
+                    for scan in this.scans.iter_mut() {
+                        // Peek the slot and release the lock before calling
+                        // `get_next_batch` (which takes it again). A slot that is
+                        // already `Some` needs no feeding; a contended `try_lock` also
+                        // counts as "not empty". NOTE(review): confirm a waker is pending then.
+                        let needs_input = scan
+                            .batch
+                            .try_lock()
+                            .map(|slot| slot.is_none())
+                            .unwrap_or(false);
+                        if needs_input {
+                            if let Err(e) = scan.get_next_batch() {
+                                return Poll::Ready(Some(Err(DataFusionError::Execution(
+                                    format!("Comet fragment scan input error: {e}"),
+                                ))));
+                            }
+                            fed_new_input = true;
+                        }
+                    }
+                    // Nothing new to feed: the root is pending on its own async work
+                    // and its waker (registered on `cx` above) will reschedule us.
+                    if !fed_new_input {
+                        return Poll::Pending;
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl RecordBatchStream for NativeFragmentStream {
+    fn schema(&self) -> SchemaRef {
+        SchemaRef::clone(&self.schema)
+    }
+}
diff --git a/native/core/src/execution/mod.rs b/native/core/src/execution/mod.rs
index ec247f72b7..17d5fa9f40 100644
--- a/native/core/src/execution/mod.rs
+++ b/native/core/src/execution/mod.rs
@@ -16,8 +16,17 @@
 // under the License.
 
 //! PoC of vectorization execution through JNI to Rust.
+#[cfg(feature = "ballista")]
+pub mod ballista;
 pub mod columnar_to_row;
 pub mod expressions;
+// `ffi` and `fragment` are used only by the feature-gated `ballista` offload
+// module (and their own `#[cfg(test)]` code). Gating them keeps the default
+// `libcomet` build from compiling the offload path or linking `datafusion-ffi`.
+#[cfg(feature = "ballista")]
+pub mod ffi;
+#[cfg(feature = "ballista")]
+pub mod fragment;
 pub mod jni_api;
 pub(crate) mod merge_as_partial;
 pub(crate) mod metrics;
diff --git a/native/core/src/execution/operators/scan.rs b/native/core/src/execution/operators/scan.rs
index 409d064284..a002345a70 100644
--- a/native/core/src/execution/operators/scan.rs
+++ b/native/core/src/execution/operators/scan.rs
@@ -20,7 +20,9 @@ use crate::{errors::CometError, execution::planner::TEST_EXEC_CONTEXT_ID};
 use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions};
 use arrow::compute::{cast_with_options, CastOptions};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::error::ArrowError;
 use datafusion::common::{arrow_datafusion_err, DataFusionError, Result as DataFusionResult};
+use datafusion::execution::SendableRecordBatchStream;
 use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::metrics::{
     BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, Time,
@@ -30,25 +32,88 @@ use datafusion::{
     physical_expr::*,
     physical_plan::{ExecutionPlan, *},
 };
-use futures::Stream;
+use futures::{Stream, StreamExt};
 use itertools::Itertools;
 use std::{
+    fmt::Debug,
     pin::Pin,
     sync::{Arc, Mutex},
     task::{Context, Poll},
 };
 
-/// `ScanExec` reads batches of data from Spark over the Arrow C Stream Interface. The
-/// `input_source` is moved out of the JVM-exported `ArrowArrayStream` at plan-construction time;
+/// Abstraction over the source that feeds batches into a [`ScanExec`]. `ScanExec` was originally
+/// hard-wired to a JVM-exported `ArrowArrayStream` (via [`AlignedArrowStreamReader`]); this trait
+/// lets it be driven by that reader OR by a purely native producer (e.g. a DataFusion
+/// `SendableRecordBatchStream`), with no JVM involved in the latter case. Mirrors
+/// `Iterator<Item = Result<RecordBatch, ArrowError>>`, which `AlignedArrowStreamReader` already
+/// implements, spelled out as `next_batch`. `Iterator` is dyn-compatible too; a dedicated trait
+/// is used because the trait object must also be `Debug`, which `dyn Iterator` cannot express.
+pub trait InputBatchStream: Send + Debug {
+    /// Pull the next batch. `Ok(None)` signals end of stream.
+    fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError>;
+}
+
+impl InputBatchStream for AlignedArrowStreamReader {
+    fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
+        self.next().map_or(Ok(None), |item| item.map(Some))
+    }
+}
+
+/// Feeds a `ScanExec` from a native DataFusion [`SendableRecordBatchStream`] instead of a
+/// JVM-exported `ArrowArrayStream`. This is what lets a `ScanExec` sit at the bottom of a plan
+/// fed purely natively, e.g. by a future Ballista shuffle-reader stream.
+///
+/// `SendableRecordBatchStream` is async, but `ScanExec` pulls its input synchronously (via
+/// `next_batch()`). To bridge that without restructuring `ScanExec`, each call blocks the
+/// current thread on the stream's next item via `futures::executor::block_on`, which parks the
+/// calling thread on a `Future`/`Waker` pair and — unlike `Runtime::block_on` — does not
+/// require (and will not panic inside) an existing Tokio runtime, so it composes with Comet's
+/// own executor threads. The calling thread is unavailable for other work while a batch is
+/// pending: fine for the in-memory / channel-backed producers this abstraction targets (e.g. a
+/// shuffle reader), a poor fit for a producer that needs the same thread to make progress.
+/// NOTE(review): if the caller is itself a task on a Tokio worker, parking that worker can
+/// stall a producer scheduled on it (certainly on a current-thread runtime) — confirm usage.
+pub struct NativeBatchStream {
+    stream: SendableRecordBatchStream,
+}
+
+impl NativeBatchStream {
+    pub fn new(stream: SendableRecordBatchStream) -> Self {
+        NativeBatchStream { stream }
+    }
+}
+
+impl Debug for NativeBatchStream {
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Formatter::debug_struct(out, "NativeBatchStream").finish_non_exhaustive()
+    }
+}
+
+impl InputBatchStream for NativeBatchStream {
+    fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
+        // Park this thread until the async stream yields its next item (or ends).
+        let polled = futures::executor::block_on(self.stream.next());
+        // `Option<Result<..>>` -> `Result<Option<..>>`, boxing the DataFusion error for Arrow.
+        let flipped = polled.transpose();
+        flipped.map_err(|e| ArrowError::ExternalError(Box::new(e)))
+    }
+}
+
+/// `ScanExec` reads batches of data from an upstream [`InputBatchStream`]. The common case is
+/// Spark, over the Arrow C Stream Interface: the `input_source` is moved out of the
+/// JVM-exported `ArrowArrayStream` at plan-construction time via `AlignedArrowStreamReader`;
 /// dropping the reader (when this exec drops) fires the stream's release callback, which closes
-/// the JVM-side `ArrowReader` and its `VectorSchemaRoot`.
+/// the JVM-side `ArrowReader` and its `VectorSchemaRoot`. `ScanExec` can equally be fed by a
+/// native `SendableRecordBatchStream` (see [`ScanExec::new_native`] / [`NativeBatchStream`]),
+/// with no JVM involved.
 #[derive(Debug, Clone)]
 pub struct ScanExec {
     /// JVM execution-context id used to look up the `JNIEnv` for callbacks.
     pub exec_context_id: i64,
-    /// The C Stream Interface reader. `None` only in unit tests that seed input via
-    /// `set_input_batch`.
-    pub input_source: Option<Arc<Mutex<AlignedArrowStreamReader>>>,
+    /// The batch source: the C Stream Interface reader for the JVM path, or a native
+    /// [`NativeBatchStream`] wrapping a `SendableRecordBatchStream`. `None` only in unit tests
+    /// that seed input via `set_input_batch`.
+    pub input_source: Option<Arc<Mutex<dyn InputBatchStream>>>,
     pub input_source_description: String,
     pub data_types: Vec<DataType>,
     pub schema: SchemaRef,
@@ -63,7 +128,7 @@ pub struct ScanExec {
 impl ScanExec {
     pub fn new(
         exec_context_id: i64,
-        input_source: Option<Arc<Mutex<AlignedArrowStreamReader>>>,
+        input_source: Option<Arc<Mutex<dyn InputBatchStream>>>,
         input_source_description: &str,
         data_types: Vec<DataType>,
     ) -> Result<Self, CometError> {
@@ -95,6 +160,42 @@ impl ScanExec {
         })
     }
 
+    /// Builds a `ScanExec` whose batches come from a native `SendableRecordBatchStream`
+    /// (no JVM involved), e.g. a Ballista shuffle-reader stream: `stream` is wrapped in a
+    /// [`NativeBatchStream`] and everything else is passed straight to [`ScanExec::new`].
+    pub fn new_native(
+        exec_context_id: i64,
+        stream: SendableRecordBatchStream,
+        input_source_description: &str,
+        data_types: Vec<DataType>,
+    ) -> Result<Self, CometError> {
+        let native = NativeBatchStream::new(stream);
+        let source: Arc<Mutex<dyn InputBatchStream>> = Arc::new(Mutex::new(native));
+        ScanExec::new(
+            exec_context_id,
+            Some(source),
+            input_source_description,
+            data_types,
+        )
+    }
+
+    /// Inject a native [`SendableRecordBatchStream`] into an already-constructed
+    /// `ScanExec` handle (e.g. one returned by `PhysicalPlanner::create_plan` for a
+    /// `Scan` leaf, which is built with `input_source = None`). This is what lets
+    /// `crate::execution::fragment` (only compiled with the `ballista` feature) feed the
+    /// fragment's `Scan` leaves from its DataFusion children after the plan has been built.
+    ///
+    /// A non-`TEST_EXEC_CONTEXT_ID` `exec_context_id` MUST be supplied so that
+    /// `pull_next` actually pulls from the stream instead of short-circuiting to
+    /// EOF (that short-circuit is reserved for unit tests that seed batches via
+    /// `set_input_batch`). Only the handle that `get_next_batch` is driven on needs
+    /// this — the executable leaf shares this handle's `batch` slot (an `Arc`), so
+    /// batches pulled here become visible to the plan node without touching it.
+    pub fn set_native_input(&mut self, exec_context_id: i64, stream: SendableRecordBatchStream) {
+        self.exec_context_id = exec_context_id;
+        self.input_source = Some(Arc::new(Mutex::new(NativeBatchStream::new(stream))));
+    }
+
     /// Unpack all dictionary types because some DataFusion operators
     /// and expressions do not support dictionary types
     fn unpack_dictionary_type(dt: &DataType) -> DataType {
@@ -133,25 +234,24 @@ impl ScanExec {
     /// columns are unpacked because Comet's downstream operators do not handle them.
     fn pull_next(
         exec_context_id: i64,
-        reader: &Arc<Mutex<AlignedArrowStreamReader>>,
+        reader: &Arc<Mutex<dyn InputBatchStream>>,
     ) -> Result<InputBatch, CometError> {
         if exec_context_id == TEST_EXEC_CONTEXT_ID {
             // Unit test path; input batches are seeded directly.
             return Ok(InputBatch::EOF);
         }
 
-        // The `Mutex` is for interior mutability (`next` needs `&mut`, but the exec holds the
-        // reader behind an `Arc`); access is already serialized by the `self.batch` lock held in
-        // `get_next_batch`, so a contended `try_lock` here would signal a caller bug, not races.
+        // The `Mutex` is for interior mutability (`next_batch` needs `&mut`, but the exec holds
+        // the reader behind an `Arc`); access is already serialized by the `self.batch` lock held
+        // in `get_next_batch`, so a contended `try_lock` here would signal a caller bug, not
+        // races.
         let mut reader = reader
             .try_lock()
-            .map_err(|_| CometError::Internal("AlignedArrowStreamReader contended".to_string()))?;
+            .map_err(|_| CometError::Internal("input batch stream contended".to_string()))?;
 
-        let next = reader.next();
-        match next {
+        match reader.next_batch()? {
             None => Ok(InputBatch::EOF),
-            Some(Err(e)) => Err(CometError::from(e)),
-            Some(Ok(record_batch)) => {
+            Some(record_batch) => {
                 let num_rows = record_batch.num_rows();
                 let columns = record_batch.columns();
                 let mut inputs: Vec<ArrayRef> = Vec::with_capacity(columns.len());
@@ -341,6 +441,80 @@ impl RecordBatchStream for ScanStream<'_> {
     }
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::Int32Array;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::execution::TaskContext;
+    use datafusion::physical_plan::memory::MemoryStream;
+
+    /// Feeds a `ScanExec` from a native `SendableRecordBatchStream` — no JVM in the loop — and
+    /// checks that every batch comes out unchanged, row-for-row and value-for-value. A future
+    /// non-JVM producer (e.g. a Ballista shuffle-reader stream) driving a Comet fragment's
+    /// `ScanExec` leaf depends on exactly this.
+    #[test]
+    fn scan_exec_reads_native_record_batch_stream() {
+        let schema: SchemaRef = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+
+        // Two single-column batches of different lengths (3 rows, then 2 rows).
+        let int32_batch = |values: Vec<i32>| {
+            let column = Arc::new(Int32Array::from(values)) as ArrayRef;
+            RecordBatch::try_new(Arc::clone(&schema), vec![column]).unwrap()
+        };
+        let batch1 = int32_batch(vec![1, 2, 3]);
+        let batch2 = int32_batch(vec![4, 5]);
+
+        // An in-memory stream replaying both batches stands in for the native producer.
+        let source_batches = vec![batch1.clone(), batch2.clone()];
+        let mem_stream = MemoryStream::try_new(source_batches, Arc::clone(&schema), None);
+        let native_stream: SendableRecordBatchStream = Box::pin(mem_stream.unwrap());
+
+        // The id must not be `TEST_EXEC_CONTEXT_ID`: for that id `pull_next` returns EOF
+        // right away (the path other unit tests rely on when they seed batches directly via
+        // `set_input_batch`), and the native stream would never be read.
+        let exec_context_id = TEST_EXEC_CONTEXT_ID + 1;
+        let mut scan = ScanExec::new_native(
+            exec_context_id,
+            native_stream,
+            "native-test-stream",
+            vec![DataType::Int32],
+        )
+        .unwrap();
+
+        // Partition 0 is the only partition this test executes.
+        let task_ctx = Arc::new(TaskContext::default());
+        let mut output_stream = scan.execute(0, task_ctx).unwrap();
+
+        // Drive the scan one step at a time: `get_next_batch` pulls from the native stream,
+        // then the output stream is polled for whatever that pull made available.
+        let mut collected = Vec::new();
+        loop {
+            scan.get_next_batch().unwrap();
+            let polled = futures::executor::block_on(output_stream.next());
+            match polled {
+                None => break,
+                Some(Ok(batch)) => collected.push(batch),
+                Some(Err(e)) => panic!("unexpected error polling ScanExec: {e}"),
+            }
+        }
+
+        assert_eq!(
+            collected.len(),
+            2,
+            "expected both native batches to pass through"
+        );
+
+        // Columns and row counts are compared instead of whole batches or schemas, because
+        // `ScanExec` rebuilds its schema from `data_types` with placeholder field names (see
+        // `schema_from_data_types`).
+        assert_eq!(collected[0].columns(), batch1.columns());
+        assert_eq!(collected[1].columns(), batch2.columns());
+        let total_rows = collected.iter().map(|b| b.num_rows()).sum::<usize>();
+        assert_eq!(total_rows, 5);
+    }
+}
+
 #[derive(Clone, Debug)]
 pub enum InputBatch {
     /// The end of input batches.
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 25162332fd..dfadbeace5 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -22,8 +22,8 @@ pub mod macros;
 pub mod operator_registry;
 
 use crate::execution::operators::init_csv_datasource_exec;
-use crate::execution::operators::AlignedArrowStreamReader;
 use crate::execution::operators::IcebergScanExec;
+use crate::execution::operators::{AlignedArrowStreamReader, InputBatchStream};
 use crate::execution::{
     expressions::list_positions::ListPositionsExpr,
     expressions::subquery::Subquery,
@@ -1533,8 +1533,13 @@ impl PhysicalPlanner {
 
                 // Consumes the first input source for the scan. The Java side passes an
                 // `org.apache.arrow.c.ArrowArrayStream` whose `memoryAddress` points at the C
-                // struct; native takes ownership via `AlignedArrowStreamReader::from_raw`.
-                let input_source = if self.exec_context_id == TEST_EXEC_CONTEXT_ID
+                // struct; native takes ownership via `AlignedArrowStreamReader::from_raw`. Wrapped
+                // as `dyn InputBatchStream` so `ScanExec` can equally be driven by a native
+                // `SendableRecordBatchStream` (see `ScanExec::new_native`); this JVM path's
+                // behavior is otherwise unchanged.
+                let input_source: Option<Arc<std::sync::Mutex<dyn InputBatchStream>>> = if self
+                    .exec_context_id
+                    == TEST_EXEC_CONTEXT_ID
                     && inputs.is_empty()
                 {
                     // For unit test, we will set input batch to scan directly by `set_input_batch`.
diff --git a/native/core/tests/ballista_codec_roundtrip.rs b/native/core/tests/ballista_codec_roundtrip.rs
new file mode 100644
index 0000000000..223e8904d0
--- /dev/null
+++ b/native/core/tests/ballista_codec_roundtrip.rs
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Proves a Comet-FFI leaf survives Ballista's physical-plan serialization.
+//
+// Ballista ships each stage's physical plan to executors as protobuf via a
+// PhysicalExtensionCodec. Here we take a plan `CoalesceBatchesExec(CometScanExec)`,
+// serialize the WHOLE tree with datafusion-proto + our CometPhysicalCodec exactly
+// as Ballista would, then deserialize it in a fresh context (simulating the
+// executor) and execute it. The Comet leaf travels as proto bytes and is rebuilt
+// on the far side by re-running Comet's planner over FFI.
+
+#![cfg(feature = "ballista")]
+
+use std::sync::Arc;
+
+use datafusion::arrow::array::{Int32Array, RecordBatch};
+use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+use datafusion::parquet::arrow::ArrowWriter;
+// `CoalesceBatchesExec` is deprecated upstream in favor of arrow-rs's
+// `BatchCoalescer`, but it's still a real, functional standard DataFusion
+// operator, which is exactly what this test needs on top of the Comet leaf.
+#[allow(deprecated)]
+use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
+use datafusion::physical_plan::{displayable, ExecutionPlan};
+use datafusion::prelude::SessionContext;
+use datafusion_proto::physical_plan::AsExecutionPlan;
+use datafusion_proto::protobuf::PhysicalPlanNode;
+use futures::StreamExt;
+use prost::Message;
+
+use comet::execution::ballista::{CometPhysicalCodec, CometScanExec};
+use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType};
+use datafusion_comet_proto::spark_operator::{
+    operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition,
+    SparkPartitionedFile, SparkStructField,
+};
+
+/// Creates (or truncates) `path` and writes a minimal Parquet file into it: a single
+/// nullable int32 column named `a` holding the five values 1 through 5, all in one record
+/// batch.
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> {
+    // Schema and data: one column, one batch.
+    let field_a = Field::new("a", ArrowDataType::Int32, true);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
+    let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)])?;
+
+    // The writer is closed explicitly so the Parquet footer is written out.
+    let file = std::fs::File::create(path)?;
+    let mut writer = ArrowWriter::try_new(file, schema, None)?;
+    writer.write(&batch)?;
+    writer.close()?;
+    Ok(())
+}
+
+/// Encodes a leaf Comet `Operator` (no children, `plan_id` 0) wrapping one `NativeScan` that
+/// reads `parquet_path` in full (byte 0 through its size) as one nullable int32 column `a`.
+fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result<Vec<u8>> {
+    let file_size = std::fs::metadata(parquet_path)?.len() as i64;
+    let whole_file = SparkPartitionedFile {
+        file_path: format!("file://{}", parquet_path.display()),
+        start: 0,
+        length: file_size,
+        file_size,
+        partition_values: vec![],
+    };
+    let column_a = SparkStructField {
+        name: "a".to_string(),
+        data_type: Some(DataType {
+            type_id: DataTypeId::Int32 as i32,
+            type_info: None,
+        }),
+        nullable: true,
+        metadata: Default::default(),
+    };
+    let scan_common = NativeScanCommon {
+        required_schema: vec![column_a.clone()],
+        data_schema: vec![column_a],
+        projection_vector: vec![0],
+        session_timezone: "UTC".to_string(),
+        source: "comet-ffi-ballista-test".to_string(),
+        ..Default::default()
+    };
+    let scan = NativeScan {
+        common: Some(scan_common),
+        file_partition: Some(SparkFilePartition {
+            partitioned_file: vec![whole_file],
+        }),
+    };
+    let operator = Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(OpStruct::NativeScan(scan)),
+    };
+    Ok(operator.encode_to_vec())
+}
+
+#[allow(deprecated)]
+#[tokio::test(flavor = "multi_thread")]
+async fn comet_leaf_survives_ballista_codec() -> anyhow::Result<()> {
+    let parquet_path = std::env::temp_dir().join("comet_ffi_ballista_codec_roundtrip.parquet");
+    write_test_parquet(&parquet_path)?;
+    let leaf_proto = build_native_scan_proto(&parquet_path)?;
+
+    // Original plan: a stock DataFusion operator wrapping the Comet leaf.
+    let leaf: Arc<dyn ExecutionPlan> = Arc::new(CometScanExec::try_new(leaf_proto)?);
+    let original: Arc<dyn ExecutionPlan> = Arc::new(CoalesceBatchesExec::new(leaf, 8192));
+    println!(
+        "original plan:\n{}",
+        displayable(original.as_ref()).indent(false)
+    );
+
+    // Scheduler side: encode the whole tree to protobuf bytes through the Comet codec.
+    let codec = CometPhysicalCodec::default();
+    let encoded = PhysicalPlanNode::try_from_physical_plan(Arc::clone(&original), &codec)?;
+    let wire_bytes = encoded.encode_to_vec();
+    println!("serialized physical plan: {} bytes", wire_bytes.len());
+
+    // Executor side: only the bytes cross over. They are decoded against a brand-new
+    // session context and the plan is rebuilt from them alone.
+    let ctx = SessionContext::new();
+    let task_ctx = ctx.task_ctx();
+    let decoded = PhysicalPlanNode::decode(&wire_bytes[..])?;
+    let rebuilt = decoded.try_into_physical_plan(task_ctx.as_ref(), &codec)?;
+    println!(
+        "reconstructed plan (executor side):\n{}",
+        displayable(rebuilt.as_ref()).indent(false)
+    );
+
+    // Run partition 0 of the rebuilt plan and count the rows it produces.
+    let mut stream = rebuilt.execute(0, task_ctx)?;
+    let mut total_rows = 0usize;
+    while let Some(item) = stream.next().await {
+        total_rows += item?.num_rows();
+    }
+    println!("\nTOTAL ROWS AFTER CODEC ROUND-TRIP: {total_rows}");
+    assert_eq!(total_rows, 5, "expected 5 rows");
+    Ok(())
+}
diff --git a/native/core/tests/ballista_distributed.rs b/native/core/tests/ballista_distributed.rs
new file mode 100644
index 0000000000..0230987f48
--- /dev/null
+++ b/native/core/tests/ballista_distributed.rs
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Distributes a Comet FFI scan across a real (in-process) Ballista cluster.
+//
+// A `CometTableProvider` exposes the Comet `NativeScan` as a SQL table. The
+// query `GROUP BY a` forces a hash repartition, which Ballista turns into a
+// shuffle boundary -> two stages. Stage 1 (the Comet scan + partial aggregate)
+// is serialized and shipped to the executor via our codecs; the Comet leaf is
+// rebuilt there by re-running Comet's planner over FFI. This proves Ballista
+// distributes Comet work end to end.
+//
+// Starts an in-process Ballista scheduler + executors, so it is heavier and
+// slower than a unit test. Run explicitly:
+//   cargo test -p datafusion-comet --features ballista --test ballista_distributed -- --ignored
+
+#![cfg(feature = "ballista")]
+
+use std::sync::Arc;
+
+use ballista::prelude::{SessionConfigExt, SessionContextExt};
+use datafusion::arrow::array::{Int32Array, RecordBatch};
+use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+use datafusion::arrow::util::pretty::pretty_format_batches;
+use datafusion::execution::SessionStateBuilder;
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::prelude::{SessionConfig, SessionContext};
+
+use comet::execution::ballista::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider};
+use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType};
+use datafusion_comet_proto::spark_operator::{
+    operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition,
+    SparkPartitionedFile, SparkStructField,
+};
+
+/// Creates (or truncates) `path` and fills it with a minimal Parquet file: one nullable
+/// int32 column named `a` whose rows are 1, 2, 3, 4, 5, written as a single record
+/// batch.
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> {
+    // The data: one column, five rows.
+    let field_a = Field::new("a", ArrowDataType::Int32, true);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
+    let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)])?;
+
+    // Closing the writer explicitly is what writes out the Parquet footer.
+    let file = std::fs::File::create(path)?;
+    let mut writer = ArrowWriter::try_new(file, schema, None)?;
+    writer.write(&batch)?;
+    writer.close()?;
+    Ok(())
+}
+
+/// Encodes a leaf Comet `Operator` (no children, `plan_id` 0) wrapping one `NativeScan` that
+/// reads `parquet_path` in full (byte 0 through its size) as one nullable int32 column `a`.
+fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result<Vec<u8>> {
+    use prost::Message;
+    let file_size = std::fs::metadata(parquet_path)?.len() as i64;
+    let whole_file = SparkPartitionedFile {
+        file_path: format!("file://{}", parquet_path.display()),
+        start: 0,
+        length: file_size,
+        file_size,
+        partition_values: vec![],
+    };
+    let column_a = SparkStructField {
+        name: "a".to_string(),
+        data_type: Some(DataType {
+            type_id: DataTypeId::Int32 as i32,
+            type_info: None,
+        }),
+        nullable: true,
+        metadata: Default::default(),
+    };
+    let scan_common = NativeScanCommon {
+        required_schema: vec![column_a.clone()],
+        data_schema: vec![column_a],
+        projection_vector: vec![0],
+        session_timezone: "UTC".to_string(),
+        source: "comet-ffi-ballista-test".to_string(),
+        ..Default::default()
+    };
+    let scan = NativeScan {
+        common: Some(scan_common),
+        file_partition: Some(SparkFilePartition {
+            partitioned_file: vec![whole_file],
+        }),
+    };
+    let operator = Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(OpStruct::NativeScan(scan)),
+    };
+    Ok(operator.encode_to_vec())
+}
+
+#[ignore = "starts an in-process Ballista cluster; run explicitly"]
+#[tokio::test]
+async fn comet_scan_distributed_with_shuffle() -> anyhow::Result<()> {
+    let parquet = std::env::temp_dir().join("comet_ffi_ballista_distributed.parquet");
+    write_test_parquet(&parquet)?;
+    let scan_proto = build_native_scan_proto(&parquet)?;
+    let field_a = Field::new("a", ArrowDataType::Int32, true);
+    let table_schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Both Comet codecs travel inside the `SessionConfig`, which is how the in-process
+    // scheduler and the executors alike end up with them registered.
+    let physical_codec = Arc::new(CometPhysicalCodec::default());
+    let logical_codec = Arc::new(CometLogicalCodec::default());
+    let config = SessionConfig::new_with_ballista()
+        .with_target_partitions(4)
+        .with_ballista_standalone_parallelism(2)
+        .with_ballista_physical_extension_codec(physical_codec)
+        .with_ballista_logical_extension_codec(logical_codec);
+    let state = SessionStateBuilder::new()
+        .with_config(config)
+        .with_default_features()
+        .build();
+    let ctx = SessionContext::standalone_with_state(state).await?;
+
+    let provider = CometTableProvider::new(scan_proto, table_schema);
+    ctx.register_table("comet_t", Arc::new(provider))?;
+
+    let sql = "SELECT a, count(*) AS c FROM comet_t GROUP BY a ORDER BY a";
+    println!("distributed query: {sql}\n");
+    let df = ctx.sql(sql).await?;
+    println!("logical plan:\n{}\n", df.logical_plan().display_indent());
+
+    let results = df.collect().await?;
+    println!("{}", pretty_format_batches(&results)?);
+    let groups = results.iter().map(|b| b.num_rows()).sum::<usize>();
+    println!("\nGROUP BY produced {groups} groups");
+    assert_eq!(groups, 5, "expected 5 groups (a = 1..=5)");
+    println!("PASS: Comet FFI scan distributed by Ballista (with shuffle) — correct results");
+    Ok(())
+}
diff --git a/native/core/tests/ballista_external_cluster.rs b/native/core/tests/ballista_external_cluster.rs
new file mode 100644
index 0000000000..b3d58b6539
--- /dev/null
+++ b/native/core/tests/ballista_external_cluster.rs
@@ -0,0 +1,344 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Distributes a Comet plan across a REAL external Ballista cluster: a separate
+// `comet-scheduler` process and a separate `comet-executor`
+// process, each spawned as a child of this test. This is unlike
+// `ballista_distributed.rs`, which runs an *in-process* standalone cluster
+// (scheduler + executor threads inside the test).
+//
+// The point it proves:
+//   1. The two Comet-flavored binaries (which inject Comet's extension codecs,
+//      unlike the stock Ballista CLIs) start, and the executor registers.
+//   2. A two-fragment Comet DAG offload plan (`CometFragment(NativeScan) ->
+//      hash-shuffle -> CometFragment(Filter over a Scan)`) submitted to the
+//      external scheduler is split into two stages, shipped to the *separate*
+//      executor process, reconstructed there via the codecs, and executed —
+//      returning correct results across the process boundary.
+//   3. Crucially, the executor is a plain Rust process with NO running JVM (only
+//      `libjvm` on the loader path). The Comet fragments must execute there
+//      without a "JAVA_VM not initialized" panic. A childless `NativeScan`
+//      fragment reads Parquet directly and never touches `JAVA_VM`, so this
+//      should hold — this test is the first proof of it in a *separate* process.
+//
+// Spawns child processes and binds ports, so it is `#[ignore]`. Run explicitly:
+//   export DYLD_LIBRARY_PATH="$JAVA_HOME/lib/server:$DYLD_LIBRARY_PATH"
+//   cargo test -p datafusion-comet --features ballista \
+//       --test ballista_external_cluster -- --ignored --nocapture
+
+#![cfg(feature = "ballista")]
+
+use std::net::{SocketAddr, TcpStream};
+use std::path::PathBuf;
+use std::process::{Child, Command};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use datafusion::arrow::array::{Int32Array, RecordBatch};
+use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, SchemaRef};
+use datafusion::parquet::arrow::ArrowWriter;
+use prost::Message;
+
+use comet::execution::ballista::execute_offload_plan;
+use datafusion_comet_proto::spark_expression::{
+    data_type::DataTypeId, expr::ExprStruct, literal, BinaryExpr, BoundReference, DataType, Expr,
+    Literal,
+};
+use datafusion_comet_proto::spark_operator::{
+    operator::OpStruct, CometBallistaOffloadPlan, Filter, NativeScan, NativeScanCommon,
+    OffloadFragment, OffloadInput, Operator, Scan, SparkFilePartition, SparkPartitionedFile,
+    SparkStructField,
+};
+
+// Non-default ports so this test does not collide with a real cluster on the
+// usual 50050/50051/50052.
+const SCHEDULER_PORT: u16 = 51050;
+const EXECUTOR_FLIGHT_PORT: u16 = 51051;
+const EXECUTOR_GRPC_PORT: u16 = 51052;
+
+/// Comet proto `DataType` for int32; `type_info` keeps its default (unset).
+fn int32_type() -> DataType {
+    DataType {
+        type_id: DataTypeId::Int32 as i32,
+        ..Default::default()
+    }
+}
+
+/// Arrow schema of the test data written by this harness: a single nullable int32 column
+/// named `a`.
+fn int32_schema() -> SchemaRef {
+    let column_a = Field::new("a", ArrowDataType::Int32, true);
+    Arc::new(Schema::new(vec![column_a]))
+}
+
+/// Creates (or truncates) `path` and writes a one-batch Parquet file into it: the
+/// `int32_schema()` column `a` holding the values 1 through 5.
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> {
+    let schema = int32_schema();
+    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
+    let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)])?;
+
+    // The writer is closed explicitly so the Parquet footer is written out.
+    let mut writer = ArrowWriter::try_new(std::fs::File::create(path)?, schema, None)?;
+    writer.write(&batch)?;
+    writer.close()?;
+    Ok(())
+}
+
+/// block1 of the offload plan: a leaf `NativeScan` fragment (no children) that reads int32
+/// `a` straight from `parquet_path`. This is the JVM-less leaf — no `JAVA_VM` involved.
+fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result<Vec<u8>> {
+    let file_size = std::fs::metadata(parquet_path)?.len() as i64;
+    let whole_file = SparkPartitionedFile {
+        file_path: format!("file://{}", parquet_path.display()),
+        start: 0,
+        length: file_size,
+        file_size,
+        partition_values: vec![],
+    };
+    let column_a = SparkStructField {
+        name: "a".to_string(),
+        data_type: Some(int32_type()),
+        nullable: true,
+        metadata: Default::default(),
+    };
+    let scan_common = NativeScanCommon {
+        required_schema: vec![column_a.clone()],
+        data_schema: vec![column_a],
+        projection_vector: vec![0],
+        session_timezone: "UTC".to_string(),
+        source: "comet-external-cluster-native-scan".to_string(),
+        ..Default::default()
+    };
+    let scan = NativeScan {
+        common: Some(scan_common),
+        file_partition: Some(SparkFilePartition {
+            partitioned_file: vec![whole_file],
+        }),
+    };
+    let operator = Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(OpStruct::NativeScan(scan)),
+    };
+    Ok(operator.encode_to_vec())
+}
+
+/// block2 of the offload plan: `Filter(col0 > 2)` on top of a `Scan` (#100) input leaf that
+/// the shuffle reader feeds. Of the rows 1..=5 it keeps a > 2, i.e. {3, 4, 5}.
+fn build_filter_over_scan_proto() -> Vec<u8> {
+    // Predicate operands: bound column 0 on the left, the int32 literal 2 on the right.
+    let column_0 = Expr {
+        expr_struct: Some(ExprStruct::Bound(BoundReference {
+            index: 0,
+            datatype: Some(int32_type()),
+        })),
+        ..Default::default()
+    };
+    let literal_two = Expr {
+        expr_struct: Some(ExprStruct::Literal(Literal {
+            value: Some(literal::Value::IntVal(2)),
+            datatype: Some(int32_type()),
+            is_null: false,
+        })),
+        ..Default::default()
+    };
+    let greater_than_two = Expr {
+        expr_struct: Some(ExprStruct::Gt(Box::new(BinaryExpr {
+            left: Some(Box::new(column_0)),
+            right: Some(Box::new(literal_two)),
+        }))),
+        ..Default::default()
+    };
+    // Operator tree: the filter (plan_id 2) over its single `Scan` child (plan_id 1).
+    let input_leaf = Operator {
+        children: vec![],
+        plan_id: 1,
+        op_struct: Some(OpStruct::Scan(Scan {
+            fields: vec![int32_type()],
+            source: "comet-external-cluster-shuffle-scan".to_string(),
+        })),
+    };
+    let filter = Operator {
+        children: vec![input_leaf],
+        plan_id: 2,
+        op_struct: Some(OpStruct::Filter(Filter {
+            predicate: Some(greater_than_two),
+        })),
+    };
+    filter.encode_to_vec()
+}
+
+/// Owns the spawned child processes. Dropping the guard kills and reaps each of them
+/// (errors ignored), so a panicking assertion or an early return still stops the cluster.
+struct ClusterGuard {
+    children: Vec<(&'static str, Child)>,
+}
+
+impl Drop for ClusterGuard {
+    fn drop(&mut self) {
+        for (name, child) in &mut self.children {
+            let _ = child.kill();
+            let _ = child.wait();
+            eprintln!("[harness] stopped {name}");
+        }
+    }
+}
+
+/// Value to export as `DYLD_LIBRARY_PATH` for the spawned binaries: `$JAVA_HOME/lib/server`
+/// (the `libjvm` directory), followed by `:` and the current `DYLD_LIBRARY_PATH` when that
+/// is non-empty. `None` when `JAVA_HOME` cannot be read. The inherited env usually has it
+/// already; it is set explicitly to be robust to macOS DYLD stripping.
+fn dyld_path() -> Option<String> {
+    let java_home = std::env::var("JAVA_HOME").ok()?;
+    let jvm_lib_dir = PathBuf::from(java_home).join("lib").join("server");
+    let jvm_lib_dir = jvm_lib_dir.display().to_string();
+    match std::env::var("DYLD_LIBRARY_PATH") {
+        Ok(inherited) if !inherited.is_empty() => Some(format!("{jvm_lib_dir}:{inherited}")),
+        _ => Some(jvm_lib_dir),
+    }
+}
+
+/// Retries a TCP connect to `127.0.0.1:port` (200 ms per attempt, 150 ms pause between
+/// attempts) until one succeeds; fails once `timeout` has passed without a connection.
+fn wait_for_port(port: u16, what: &str, timeout: Duration) -> anyhow::Result<()> {
+    let addr = SocketAddr::from(([127, 0, 0, 1], port));
+    let deadline = Instant::now() + timeout;
+    while Instant::now() < deadline {
+        if TcpStream::connect_timeout(&addr, Duration::from_millis(200)).is_ok() {
+            eprintln!("[harness] {what} is listening on {port}");
+            return Ok(());
+        }
+        std::thread::sleep(Duration::from_millis(150));
+    }
+    anyhow::bail!("timed out waiting for {what} on port {port}")
+}
+
+#[ignore = "spawns external scheduler + executor processes and binds ports; run explicitly"]
+#[test]
+fn comet_plan_on_external_cluster() -> anyhow::Result<()> {
+    // Values shared by both child processes: everything binds to loopback, each listener
+    // gets the same startup budget, and the scheduler port is passed to both commands.
+    let loopback = "127.0.0.1";
+    let startup_timeout = Duration::from_secs(30);
+    let scheduler_port = SCHEDULER_PORT.to_string();
+    let library_path = dyld_path();
+
+    // --- 1. Scheduler: spawned as its own process. The guard takes ownership of the child
+    // right away, so every early `?` return below still tears it down. ---
+    let mut scheduler_cmd = Command::new(env!("CARGO_BIN_EXE_comet-scheduler"));
+    scheduler_cmd
+        .env("COMET_BALLISTA_SCHEDULER_BIND_HOST", loopback)
+        .env("COMET_BALLISTA_SCHEDULER_BIND_PORT", &scheduler_port);
+    if let Some(path) = &library_path {
+        scheduler_cmd.env("DYLD_LIBRARY_PATH", path);
+    }
+    let mut guard = ClusterGuard {
+        children: vec![("comet-scheduler", scheduler_cmd.spawn()?)],
+    };
+    wait_for_port(SCHEDULER_PORT, "scheduler", startup_timeout)?;
+
+    // --- 2. Executor: a second, separate (JVM-less) process, pointed at the scheduler
+    // started above. ---
+    let mut executor_cmd = Command::new(env!("CARGO_BIN_EXE_comet-executor"));
+    executor_cmd
+        .env("COMET_BALLISTA_EXECUTOR_BIND_HOST", loopback)
+        .env(
+            "COMET_BALLISTA_EXECUTOR_PORT",
+            EXECUTOR_FLIGHT_PORT.to_string(),
+        )
+        .env(
+            "COMET_BALLISTA_EXECUTOR_GRPC_PORT",
+            EXECUTOR_GRPC_PORT.to_string(),
+        )
+        .env("COMET_BALLISTA_SCHEDULER_HOST", loopback)
+        .env("COMET_BALLISTA_SCHEDULER_PORT", &scheduler_port)
+        .env("COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS", "4");
+    if let Some(path) = &library_path {
+        executor_cmd.env("DYLD_LIBRARY_PATH", path);
+    }
+    let executor = executor_cmd.spawn()?;
+    guard.children.push(("comet-executor", executor));
+    wait_for_port(EXECUTOR_FLIGHT_PORT, "executor flight", startup_timeout)?;
+    wait_for_port(EXECUTOR_GRPC_PORT, "executor grpc", startup_timeout)?;
+    // Open ports only show the executor is listening; allow a grace period for it to
+    // complete registration with the scheduler.
+    std::thread::sleep(Duration::from_secs(3));
+
+    // --- 3. The two-fragment Comet DAG offload plan: fragment 0 scans the Parquet file,
+    // fragment 1 filters it, naming fragment 0 as its producer with column 0 as the hash
+    // key. ---
+    let parquet = std::env::temp_dir().join("comet_external_cluster.parquet");
+    write_test_parquet(&parquet)?;
+    let scan_fragment = OffloadFragment {
+        block_proto: build_native_scan_proto(&parquet)?, // NativeScan a=[1..5]
+        inputs: vec![],
+    };
+    let filter_fragment = OffloadFragment {
+        block_proto: build_filter_over_scan_proto(), // Filter(a > 2)
+        inputs: vec![OffloadInput {
+            producer: 0,
+            hash_key_ordinals: vec![0],
+        }],
+    };
+    let plan = CometBallistaOffloadPlan {
+        num_partitions: 4,
+        fragments: vec![scan_fragment, filter_fragment],
+    };
+
+    // --- 4. Submit to the EXTERNAL scheduler (non-empty URL => remote path) ---
+    let scheduler_url = format!("http://127.0.0.1:{SCHEDULER_PORT}");
+    eprintln!("[harness] submitting DAG offload plan to {scheduler_url}");
+    let (schema, batches) = execute_offload_plan(&plan.encode_to_vec(), &scheduler_url)
+        .map_err(|e| anyhow::anyhow!("external submission failed: {e}"))?;
+
+    // --- 5. Verify correctness across the process boundary: collect column 0 of every
+    // batch and sort it, so the comparison does not depend on batch order. ---
+    let mut values: Vec<i32> = Vec::new();
+    for batch in &batches {
+        let col = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .expect("int32 column");
+        values.extend(col.values().iter().copied());
+    }
+    values.sort_unstable();
+    // Only the field names of the returned schema go into the log line.
+    let field_names: Vec<_> = schema.fields().iter().map(|f| f.name()).collect();
+    eprintln!(
+        "[harness] external cluster returned {} rows: {:?} (schema: {:?})",
+        values.len(),
+        values,
+        field_names
+    );
+
+    assert_eq!(
+        values,
+        vec![3, 4, 5],
+        "distributed Comet plan on the external cluster must return {{3,4,5}} (a > 2)"
+    );
+
+    eprintln!(
+        "PASS: a distributed Comet plan ran on a SEPARATE scheduler+executor process pair \
+         (JVM-less executor) and returned correct results"
+    );
+
+    // Tear the cluster down explicitly: dropping the guard kills and reaps both child
+    // processes.
+    drop(guard);
+    Ok(())
+}
diff --git a/native/core/tests/ballista_ffi_roundtrip.rs b/native/core/tests/ballista_ffi_roundtrip.rs
new file mode 100644
index 0000000000..da773f9b09
--- /dev/null
+++ b/native/core/tests/ballista_ffi_roundtrip.rs
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Proves the driver-side offload result boundary: a Comet `Operator` proto is
+// run on in-process standalone Ballista and its result is exported over the
+// Arrow C Data Interface into *caller-allocated* FFI structs, exactly as the
+// JVM boundary does (Arrow Java allocates the structs, native writes into them,
+// Arrow Java imports them). Here the "caller" is this Rust test standing in for
+// the JVM: it allocates the FFI structs, calls `submit_and_export`, then
+// re-imports via `from_ffi` and asserts 5 rows come back.
+//
+//   cargo test -p datafusion-comet --features ballista --test ballista_ffi_roundtrip -- --ignored --nocapture
+
+#![cfg(feature = "ballista")]
+
+use std::sync::Arc;
+
+use datafusion::arrow::array::{make_array, Int32Array, RecordBatch};
+use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+use datafusion::arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema};
+use datafusion::parquet::arrow::ArrowWriter;
+
+use comet::execution::ballista::submit_and_export;
+use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType};
+use datafusion_comet_proto::spark_operator::{
+    operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition,
+    SparkPartitionedFile, SparkStructField,
+};
+
+/// Writes the test fixture at `path`: a Parquet file with one nullable Int32
+/// column `a` holding the five values 1..=5.
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> {
+    let column_a = Field::new("a", ArrowDataType::Int32, true);
+    let schema = Arc::new(Schema::new(vec![column_a]));
+    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
+    let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)])?;
+
+    // Create the file, write the single batch, then close the writer so the
+    // Parquet file is finalized before anything reads it back.
+    let sink = std::fs::File::create(path)?;
+    let mut parquet_writer = ArrowWriter::try_new(sink, schema, None)?;
+    parquet_writer.write(&batch)?;
+    parquet_writer.close()?;
+    Ok(())
+}
+
+/// Encodes a childless `NativeScan` `Operator` reading column `a` of `parquet_path`.
+fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result<Vec<u8>> {
+    use prost::Message;
+    // One partitioned file spanning the whole Parquet file (start 0, length = size).
+    let byte_len = std::fs::metadata(parquet_path)?.len() as i64;
+    let whole_file = SparkPartitionedFile {
+        file_path: format!("file://{}", parquet_path.display()),
+        start: 0,
+        length: byte_len,
+        file_size: byte_len,
+        partition_values: vec![],
+    };
+    let column_a = SparkStructField {
+        name: "a".to_string(),
+        data_type: Some(DataType {
+            type_id: DataTypeId::Int32 as i32,
+            type_info: None,
+        }),
+        nullable: true,
+        metadata: Default::default(),
+    };
+    let scan = NativeScan {
+        common: Some(NativeScanCommon {
+            required_schema: vec![column_a.clone()],
+            data_schema: vec![column_a],
+            projection_vector: vec![0],
+            session_timezone: "UTC".to_string(),
+            source: "comet-ffi-ballista-jvm-spike".to_string(),
+            ..Default::default()
+        }),
+        file_partition: Some(SparkFilePartition {
+            partitioned_file: vec![whole_file],
+        }),
+    };
+    let root = Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(OpStruct::NativeScan(scan)),
+    };
+    Ok(root.encode_to_vec())
+}
+
+#[ignore = "starts an in-process Ballista cluster; run explicitly"]
+#[test]
+fn offload_proto_and_import_over_c_data_interface() -> anyhow::Result<()> {
+    let parquet = std::env::temp_dir().join("comet_ffi_ballista_jvm_spike.parquet");
+    write_test_parquet(&parquet)?;
+    let proto = build_native_scan_proto(&parquet)?;
+
+    // Stand in for the JVM: allocate one empty (array, schema) C Data Interface
+    // struct pair per column (Arrow Java would do this via ArrowArray.allocateNew /
+    // ArrowSchema.allocateNew) and pass raw addresses; the Vecs must outlive the call.
+    const NUM_COLS: usize = 1;
+    let mut arrays: Vec<FFI_ArrowArray> = (0..NUM_COLS).map(|_| FFI_ArrowArray::empty()).collect();
+    let mut schemas: Vec<FFI_ArrowSchema> =
+        (0..NUM_COLS).map(|_| FFI_ArrowSchema::empty()).collect();
+    let array_addrs: Vec<i64> = arrays
+        .iter_mut()
+        .map(|a| a as *mut FFI_ArrowArray as i64)
+        .collect();
+    let schema_addrs: Vec<i64> = schemas
+        .iter_mut()
+        .map(|s| s as *mut FFI_ArrowSchema as i64)
+        .collect();
+
+    // Submit the proto; the result is exported into the structs whose addresses we pass.
+    let num_rows = unsafe { submit_and_export(&proto, &array_addrs, &schema_addrs) }
+        .map_err(anyhow::Error::msg)?;
+    assert_eq!(num_rows, 5, "expected 5 rows back from Ballista");
+
+    // JVM side: move each exported struct out and import it (mirrors Arrow Java ArrowImporter).
+    for i in 0..NUM_COLS {
+        let array = std::mem::replace(&mut arrays[i], FFI_ArrowArray::empty());
+        let schema = std::mem::replace(&mut schemas[i], FFI_ArrowSchema::empty());
+        let data = unsafe { from_ffi(array, &schema) }?;
+        let imported = make_array(data);
+        assert_eq!(imported.len(), 5, "imported column {i} should have 5 rows");
+        let ints = imported
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .expect("column a should be Int32");
+        let values: Vec<i32> = ints.values().to_vec();
+        assert_eq!(values, vec![1, 2, 3, 4, 5]);
+    }
+
+    println!(
+        "PASS: proto -> standalone Ballista -> {num_rows} rows exported and re-imported over the \
+         Arrow C Data Interface (the JVM boundary mechanism)"
+    );
+    Ok(())
+}
diff --git a/native/core/tests/ballista_fragment_child_input.rs b/native/core/tests/ballista_fragment_child_input.rs
new file mode 100644
index 0000000000..187575b125
--- /dev/null
+++ b/native/core/tests/ballista_fragment_child_input.rs
@@ -0,0 +1,325 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Proves `CometFragmentExec` runs a Comet plan fragment whose input-leaf `Scan`
+// is fed by the node's DataFusion child stream (the R2 shuffle-reader shape,
+// stood in for here by an in-memory child and a `CometScanExec` child), and
+// that such a fragment survives Ballista's physical-plan (de)serialization.
+
+#![cfg(feature = "ballista")]
+
+use std::sync::Arc;
+
+use datafusion::arrow::array::{Int32Array, RecordBatch};
+use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, SchemaRef};
+use datafusion::common::Result;
+use datafusion::execution::{SendableRecordBatchStream, TaskContext};
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion::physical_plan::memory::MemoryStream;
+use datafusion::physical_plan::{
+    displayable, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+};
+use datafusion::prelude::SessionContext;
+use datafusion_proto::physical_plan::AsExecutionPlan;
+use datafusion_proto::protobuf::PhysicalPlanNode;
+use futures::StreamExt;
+use prost::Message;
+
+use comet::execution::ballista::{CometFragmentExec, CometPhysicalCodec, CometScanExec};
+use datafusion_comet_proto::spark_expression::{
+    data_type::DataTypeId, expr::ExprStruct, literal, BinaryExpr, BoundReference, DataType, Expr,
+    Literal,
+};
+use datafusion_comet_proto::spark_operator::{
+    operator::OpStruct, Filter, NativeScan, NativeScanCommon, Operator, Scan, SparkFilePartition,
+    SparkPartitionedFile, SparkStructField,
+};
+
+/// Test-only leaf `ExecutionPlan` that replays a fixed list of batches. It stands
+/// in for a shuffle reader (or any other upstream DataFusion child) feeding the
+/// `Scan` input leaf of a `CometFragmentExec`.
+#[derive(Debug)]
+struct InMemoryChildExec {
+    batches: Vec<RecordBatch>,
+    schema: SchemaRef,
+    props: Arc<PlanProperties>,
+}
+
+impl InMemoryChildExec {
+    /// Wraps `batches`, which should all conform to `schema`, as a plan node with
+    /// one (unknown) partition that is bounded and emits incrementally.
+    fn new(batches: Vec<RecordBatch>, schema: SchemaRef) -> Self {
+        let equivalences = EquivalenceProperties::new(Arc::clone(&schema));
+        let partitioning = Partitioning::UnknownPartitioning(1);
+        let (emission, bounds) = (EmissionType::Incremental, Boundedness::Bounded);
+        let plan_properties = PlanProperties::new(equivalences, partitioning, emission, bounds);
+        Self {
+            batches,
+            schema,
+            props: Arc::new(plan_properties),
+        }
+    }
+}
+
+impl DisplayAs for InMemoryChildExec {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.write_str("InMemoryChildExec") // same label for every display format
+    }
+}
+
+impl ExecutionPlan for InMemoryChildExec {
+    fn name(&self) -> &str {
+        "InMemoryChildExec"
+    }
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.props
+    }
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        Vec::new()
+    }
+    /// This node never has children, so the replacements are ignored.
+    fn with_new_children(
+        self: Arc<Self>,
+        _replacements: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(self)
+    }
+    /// Replays a clone of the stored batches; the partition index is not consulted.
+    fn execute(
+        &self,
+        _partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let schema = Arc::clone(&self.schema);
+        let replay = MemoryStream::try_new(self.batches.clone(), schema, None)?;
+        Ok(Box::pin(replay))
+    }
+}
+
+/// The Comet proto `DataType` for Int32, with no extra type info attached.
+fn int32_type() -> DataType {
+    let type_id = DataTypeId::Int32 as i32;
+    let type_info = None;
+    DataType { type_id, type_info }
+}
+
+/// Encodes the Comet `Operator` tree `Filter(gt(col0, 2))` on top of a childless
+/// `Scan` leaf with a single Int32 column. The `Scan` (op #100) is the input leaf
+/// that a child stream feeds; the `Filter` shows an operator being applied to the
+/// child's rows.
+fn build_filter_over_scan_proto() -> Vec<u8> {
+    // Every expression node here is an `Expr` with only `expr_struct` populated.
+    let wrap = |expr_struct: ExprStruct| Expr {
+        expr_struct: Some(expr_struct),
+        ..Default::default()
+    };
+
+    // Predicate operands: a bound reference to column 0 and the Int32 literal 2.
+    let column_ref = ExprStruct::Bound(BoundReference {
+        index: 0,
+        datatype: Some(int32_type()),
+    });
+    let two = ExprStruct::Literal(Literal {
+        value: Some(literal::Value::IntVal(2)),
+        datatype: Some(int32_type()),
+        is_null: false,
+    });
+    let greater_than = ExprStruct::Gt(Box::new(BinaryExpr {
+        left: Some(Box::new(wrap(column_ref))),
+        right: Some(Box::new(wrap(two))),
+    }));
+
+    // Plan tree: the Filter (plan_id 2) sits directly over the Scan leaf (plan_id 1).
+    let leaf = Operator {
+        children: vec![],
+        plan_id: 1,
+        op_struct: Some(OpStruct::Scan(Scan {
+            fields: vec![int32_type()],
+            source: "fragment-child-test".to_string(),
+        })),
+    };
+
+    let root = Operator {
+        children: vec![leaf],
+        plan_id: 2,
+        op_struct: Some(OpStruct::Filter(Filter {
+            predicate: Some(wrap(greater_than)),
+        })),
+    };
+    root.encode_to_vec()
+}
+
+/// The Arrow schema shared by the fixtures in this file: a single nullable Int32
+/// column named `a`.
+fn int32_schema() -> SchemaRef {
+    let column_a = Field::new("a", ArrowDataType::Int32, true);
+    let fields = vec![column_a];
+    Arc::new(Schema::new(fields))
+}
+
+/// When the `Scan` leaf of a `CometFragmentExec` is fed by an in-memory
+/// DataFusion child, the child's rows must flow through the fragment and come
+/// out with the fragment's `Filter` (col0 > 2) applied. The child here is an
+/// `InMemoryChildExec` holding the values 1..=5.
+#[tokio::test(flavor = "multi_thread")]
+async fn fragment_scan_leaf_fed_by_child() -> anyhow::Result<()> {
+    let schema = int32_schema();
+    // Helper: one single-column batch of `values` in `schema`.
+    let make_batch = |values: Vec<i32>| {
+        RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(values)) as _],
+        )
+    };
+    // The child yields 1..=5, split across two batches.
+    let batches = vec![make_batch(vec![1, 2, 3])?, make_batch(vec![4, 5])?];
+    let child: Arc<dyn ExecutionPlan> = Arc::new(InMemoryChildExec::new(batches, schema));
+
+    // Wrap the child in a fragment whose plan is Filter(col0 > 2) over a Scan leaf.
+    let fragment_proto = build_filter_over_scan_proto();
+    let fragment = CometFragmentExec::try_new(fragment_proto, vec![child])?;
+    let fragment: Arc<dyn ExecutionPlan> = Arc::new(fragment);
+
+    // Drain partition 0 of the fragment, gathering column 0 of every output
+    // batch into one flat list.
+    let ctx = SessionContext::new();
+    let mut output = fragment.execute(0, ctx.task_ctx())?;
+    let mut survivors = Vec::<i32>::new();
+    while let Some(next) = output.next().await {
+        let batch = next?;
+        let ints = batch.column(0).as_any().downcast_ref::<Int32Array>();
+        survivors.extend(ints.expect("int32 column").values().iter().copied());
+    }
+
+    // Child produced 1..=5; only the rows with col0 > 2 survive the Filter.
+    assert_eq!(
+        survivors,
+        vec![3, 4, 5],
+        "child rows must flow through and be filtered"
+    );
+    Ok(())
+}
+
+/// Persists the five values 1..=5 as the lone int32 column `a` of a small
+/// Parquet file created (or truncated) at `path`.
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> {
+    let schema = int32_schema();
+    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
+    let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)])?;
+
+    let sink = std::fs::File::create(path)?;
+    let mut parquet_writer = ArrowWriter::try_new(sink, schema, None)?;
+    parquet_writer.write(&batch)?;
+    parquet_writer.close()?;
+    Ok(())
+}
+
+/// Encodes a Comet `Operator` proto consisting of one childless `NativeScan`
+/// that reads column `a` from the whole of `parquet_path`.
+fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result<Vec<u8>> {
+    let byte_len = std::fs::metadata(parquet_path)?.len() as i64;
+    let column_a = SparkStructField {
+        name: "a".to_string(),
+        data_type: Some(int32_type()),
+        nullable: true,
+        metadata: Default::default(),
+    };
+    // The file partition holds a single entry spanning the entire Parquet file.
+    let scan = NativeScan {
+        common: Some(NativeScanCommon {
+            required_schema: vec![column_a.clone()],
+            data_schema: vec![column_a],
+            projection_vector: vec![0],
+            session_timezone: "UTC".to_string(),
+            source: "comet-fragment-child-native-scan".to_string(),
+            ..Default::default()
+        }),
+        file_partition: Some(SparkFilePartition {
+            partitioned_file: vec![SparkPartitionedFile {
+                file_path: format!("file://{}", parquet_path.display()),
+                start: 0,
+                length: byte_len,
+                file_size: byte_len,
+                partition_values: vec![],
+            }],
+        }),
+    };
+    let root = Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(OpStruct::NativeScan(scan)),
+    };
+    Ok(root.encode_to_vec())
+}
+
+/// A `CometFragmentExec` tree must survive Ballista's physical-plan codec
+/// round-trip and yield the same filtered rows on the far side. The child is a
+/// `CometScanExec` so that the whole tree is serializable, and both encode and
+/// decode go through `CometPhysicalCodec`.
+#[tokio::test(flavor = "multi_thread")]
+async fn fragment_codec_roundtrip() -> anyhow::Result<()> {
+    let parquet_path = std::env::temp_dir().join("comet_fragment_child_codec_roundtrip.parquet");
+    write_test_parquet(&parquet_path)?;
+
+    // Child = CometScanExec over the parquet (round-trips via COMET_MAGIC);
+    // parent fragment = Filter(col0 > 2) over a Scan input leaf.
+    let scan_proto = build_native_scan_proto(&parquet_path)?;
+    let child: Arc<dyn ExecutionPlan> = Arc::new(CometScanExec::try_new(scan_proto)?);
+    let fragment = CometFragmentExec::try_new(build_filter_over_scan_proto(), vec![child])?;
+    let plan: Arc<dyn ExecutionPlan> = Arc::new(fragment);
+    println!(
+        "original plan:\n{}",
+        displayable(plan.as_ref()).indent(false)
+    );
+
+    // --- Encode (scheduler side) ---
+    // The same codec instance is reused for decoding further down.
+    let codec = CometPhysicalCodec::default();
+    let encoded = PhysicalPlanNode::try_from_physical_plan(Arc::clone(&plan), &codec)?;
+    let wire_bytes = encoded.encode_to_vec();
+
+    // --- Ship bytes, decode in a fresh context (executor side) ---
+    // From here on only `wire_bytes` is used; `plan` itself is not consulted again.
+    let ctx = SessionContext::new();
+    let task_ctx = ctx.task_ctx();
+    let decoded = PhysicalPlanNode::decode(wire_bytes.as_slice())?;
+    let rebuilt = decoded.try_into_physical_plan(task_ctx.as_ref(), &codec)?;
+    println!(
+        "reconstructed plan (executor side):\n{}",
+        displayable(rebuilt.as_ref()).indent(false)
+    );
+
+    // --- Execute the reconstructed plan ---
+    // Drain partition 0 and gather column 0 of every output batch.
+    let mut output = rebuilt.execute(0, task_ctx)?;
+    let mut survivors = Vec::<i32>::new();
+    while let Some(next) = output.next().await {
+        let batch = next?;
+        let ints = batch.column(0).as_any().downcast_ref::<Int32Array>();
+        survivors.extend(ints.expect("int32 column").values().iter().copied());
+    }
+
+    // The Parquet file holds 1..=5, so the Filter (col0 > 2) must leave exactly
+    // 3, 4 and 5.
+    assert_eq!(
+        survivors,
+        vec![3, 4, 5],
+        "fragment result must be identical after codec round-trip"
+    );
+    Ok(())
+}
diff --git a/native/core/tests/ballista_offload_dag.rs b/native/core/tests/ballista_offload_dag.rs
new file mode 100644
index 0000000000..1630b7cbf7
--- /dev/null
+++ b/native/core/tests/ballista_offload_dag.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Tests for the general DAG offload plan. The `build_offload_plan` tests start no
+// cluster: like `ballista_fragment_child_input.rs`, they only check the built plan
+// (shape, build-time leaf-count guard). The `#[ignore]`d smoke test does start one.
+
+#![cfg(feature = "ballista")]
+
+use comet::execution::ballista::{build_offload_plan, execute_offload_plan};
+use datafusion::arrow::util::pretty::pretty_format_batches;
+use datafusion::physical_plan::displayable;
+use datafusion_comet_proto::spark_operator::{
+    CometBallistaOffloadPlan, OffloadFragment, OffloadInput,
+};
+use prost::Message;
+
+mod common;
+use common::{build_native_scan_proto, build_scan_leaf_block_proto, write_test_parquet};
+
+/// Shape test for a two-fragment DAG. Fragment 0 is a `NativeScan` producer (no
+/// inputs) reading Parquet column `a`; fragment 1 is a consumer whose block is a
+/// `Scan`(#100) leaf fed by a hash exchange on fragment 0's column `a` (ordinal 0).
+/// `build_offload_plan` must fold this into
+/// `CometFragmentExec(consumer, [RepartitionExec::Hash([a@0], 4)(CometFragmentExec(producer, []))])`.
+#[test]
+fn two_stage_aggregate_builds_hash_repartition_dag() {
+    let parquet = std::env::temp_dir().join("comet_ffi_ballista_offload_dag.parquet");
+    write_test_parquet(&parquet).expect("write test parquet");
+    let scan_block = build_native_scan_proto(&parquet).expect("build NativeScan producer block");
+
+    // Fragment 0: the producer, a leaf block with no inputs.
+    let producer = OffloadFragment {
+        block_proto: scan_block,
+        inputs: vec![],
+    };
+    // The consumer's single `Scan` leaf reads fragment 0, hash-partitioned on ordinal 0.
+    let consumer = OffloadFragment {
+        block_proto: build_scan_leaf_block_proto(),
+        inputs: vec![OffloadInput {
+            producer: 0,
+            hash_key_ordinals: vec![0],
+        }],
+    };
+    let descriptor = CometBallistaOffloadPlan {
+        num_partitions: 4,
+        fragments: vec![producer, consumer],
+    };
+
+    let built = build_offload_plan(&descriptor.encode_to_vec()).expect("build_offload_plan");
+    let rendered = displayable(built.as_ref()).indent(false).to_string();
+    assert!(rendered.contains("CometFragmentExec"), "got:\n{rendered}");
+    assert!(
+        rendered.contains("RepartitionExec: partitioning=Hash([a@0], 4)"),
+        "got:\n{rendered}"
+    );
+}
+
+/// The number of `OffloadInput`s declared for a fragment must equal the number of
+/// `Scan`(#100) leaves in its block. Fragment 1 below is a `NativeScan` block (0
+/// leaves) that nevertheless declares 1 input, so `build_offload_plan` must fail
+/// at BUILD time rather than lazily inside `CometFragmentExec::execute`.
+#[test]
+fn leaf_count_mismatch_fails_fast() {
+    let parquet = std::env::temp_dir().join("comet_ffi_ballista_offload_dag_mismatch.parquet");
+    write_test_parquet(&parquet).expect("write test parquet");
+    let scan_block = build_native_scan_proto(&parquet).expect("build NativeScan producer block");
+    // A second NativeScan block: 0 `Scan` leaves, yet wired up below with 1 declared input.
+    let leafless_block = build_native_scan_proto(&parquet).expect("build NativeScan block");
+    let bogus_input = OffloadInput {
+        producer: 0,
+        hash_key_ordinals: vec![0],
+    };
+    let fragments = vec![
+        OffloadFragment {
+            block_proto: scan_block,
+            inputs: vec![],
+        },
+        OffloadFragment {
+            block_proto: leafless_block,
+            inputs: vec![bogus_input],
+        },
+    ];
+    let descriptor = CometBallistaOffloadPlan {
+        num_partitions: 2,
+        fragments,
+    };
+
+    let outcome = build_offload_plan(&descriptor.encode_to_vec());
+    let err = outcome.expect_err("must fail fast on leaf mismatch");
+    assert!(
+        err.contains("Scan input leaves"),
+        "expected leaf-count mismatch error, got: {err}"
+    );
+}
+
+/// Smoke test that really submits through `execute_offload_plan`: a descriptor
+/// with a single fragment (one `NativeScan` block, no inputs, no shuffle edges)
+/// is run on an in-process standalone Ballista cluster. This proves the
+/// descriptor -> `build_offload_plan` -> `execute_physical_plan` submission path
+/// works end to end, without hand-building a partial+final aggregate pair (see
+/// the comment below, which covers the multi-fragment hash-shuffle case and is
+/// deferred to the Scala E2E in Task 8).
+///
+/// This is a plain `#[test]`, not a `#[tokio::test]`: `execute_offload_plan`
+/// builds and drives its own Tokio runtime internally (it is called
+/// synchronously from JNI, with no ambient runtime), so calling it from a thread
+/// that is already driving one (e.g. inside `#[tokio::test]`) panics with
+/// "Cannot start a runtime from within a runtime".
+#[ignore = "starts an in-process Ballista cluster; run explicitly"]
+#[test]
+fn single_fragment_offload_plan_executes() {
+    let parquet = std::env::temp_dir().join("comet_ffi_ballista_offload_dag_smoke.parquet");
+    write_test_parquet(&parquet).expect("write test parquet");
+    let scan_block = build_native_scan_proto(&parquet).expect("build NativeScan producer block");
+    let only_fragment = OffloadFragment {
+        block_proto: scan_block,
+        inputs: vec![],
+    };
+    let descriptor = CometBallistaOffloadPlan {
+        num_partitions: 2,
+        fragments: vec![only_fragment],
+    };
+
+    let submitted = execute_offload_plan(&descriptor.encode_to_vec(), "");
+    let (_schema, batches) = submitted.expect("execute_offload_plan");
+    println!("{}", pretty_format_batches(&batches).unwrap());
+    let total_rows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
+    assert_eq!(total_rows, 5, "expected all 5 scanned rows (a = 1..=5)");
+}
+
+// Deferred to the Scala E2E in Task 8 (per the task-3 brief): hand-building a
+// partial+final aggregate `CometBallistaOffloadPlan` (two `NativeScan`/`Scan`
+// blocks with the right agg-state schema on each side of a hash shuffle) is
+// intricate proto plumbing that the Scala path exercises for free via the real
+// planner. `single_fragment_offload_plan_executes` above already proves the
+// `execute_offload_plan` submission path (session setup, in-process standalone
+// cluster, `execute_physical_plan` codecs) works end to end; multi-fragment DAG
+// *shape* is covered by `two_stage_aggregate_builds_hash_repartition_dag` above.
diff --git a/native/core/tests/common/mod.rs b/native/core/tests/common/mod.rs
new file mode 100644
index 0000000000..9c4a2c90dd
--- /dev/null
+++ b/native/core/tests/common/mod.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Shared helpers for Ballista offload tests: write a tiny Parquet file and build
+// Comet `Operator` proto blocks (`NativeScan` leaf, `Scan` leaf) used to assemble
+// offload descriptors without standing up a real cluster.
+//
+// `write_test_parquet` / `build_native_scan_proto` are copied verbatim from
+// `ballista_distributed.rs` so multiple test binaries can share them (each
+// `tests/*.rs` file is its own crate, so this lives in `tests/common/mod.rs` and
+// is pulled in via `mod common;`).
+
+#![allow(dead_code)]
+
+use std::sync::Arc;
+
+use datafusion::arrow::array::{Int32Array, RecordBatch};
+use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+use datafusion::parquet::arrow::ArrowWriter;
+use prost::Message;
+
+use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType};
+use datafusion_comet_proto::spark_operator::{
+    operator::OpStruct, NativeScan, NativeScanCommon, Operator, Scan, SparkFilePartition,
+    SparkPartitionedFile, SparkStructField,
+};
+
+/// Create (or overwrite) `path` as a tiny Parquet file whose only column is the
+/// nullable int32 `a`, holding the values 1 through 5 in a single batch.
+pub fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> {
+    // Schema: exactly one field, `a`.
+    let fields = vec![Field::new("a", ArrowDataType::Int32, true)];
+    let schema = Arc::new(Schema::new(fields));
+
+    // Data: one batch carrying all five rows.
+    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
+    let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)])?;
+
+    // Write the batch and close the writer so the file is complete on return.
+    let mut writer = ArrowWriter::try_new(std::fs::File::create(path)?, schema, None)?;
+    writer.write(&batch)?;
+    writer.close()?;
+    Ok(())
+}
+
+/// Serializes a Comet `Operator` proto holding one childless `NativeScan` that
+/// covers all of `parquet_path` and projects its single Int32 column `a`.
+pub fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result<Vec<u8>> {
+    let file_len = std::fs::metadata(parquet_path)?.len() as i64;
+    let file_partition = SparkFilePartition {
+        partitioned_file: vec![SparkPartitionedFile {
+            file_path: format!("file://{}", parquet_path.display()),
+            start: 0,
+            length: file_len,
+            file_size: file_len,
+            partition_values: vec![],
+        }],
+    };
+    let column_a = SparkStructField {
+        name: "a".to_string(),
+        data_type: Some(DataType {
+            type_id: DataTypeId::Int32 as i32,
+            type_info: None,
+        }),
+        nullable: true,
+        metadata: Default::default(),
+    };
+    let common = NativeScanCommon {
+        required_schema: vec![column_a.clone()],
+        data_schema: vec![column_a],
+        projection_vector: vec![0],
+        session_timezone: "UTC".to_string(),
+        source: "comet-ffi-ballista-test".to_string(),
+        ..Default::default()
+    };
+    let scan = NativeScan {
+        common: Some(common),
+        file_partition: Some(file_partition),
+    };
+    Ok(Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(OpStruct::NativeScan(scan)),
+    }
+    .encode_to_vec())
+}
+
+/// Build a Comet `Operator` proto consisting of a single childless `Scan` (#100)
+/// leaf that declares one Int32 field (the proto carries the field type only, no
+/// column name). This is the shape a DAG **consumer** fragment's block must have:
+/// the leaf is what `build_offload_plan`'s hash `RepartitionExec` child feeds. A
+/// `NativeScan` block, by contrast, reads Parquet directly and has zero `Scan`
+/// leaves, so it is only valid as a producer with no inputs.
+pub fn build_scan_leaf_block_proto() -> Vec<u8> {
+    let column_type = DataType {
+        type_id: DataTypeId::Int32 as i32,
+        type_info: None,
+    };
+    let leaf = OpStruct::Scan(Scan {
+        fields: vec![column_type],
+        source: "comet-offload-dag-test".to_string(),
+    });
+    Operator {
+        children: vec![],
+        plan_id: 0,
+        op_struct: Some(leaf),
+    }
+    .encode_to_vec()
+}
diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto
index 2fcfe7f25b..a428d07bb4 100644
--- a/native/proto/src/proto/operator.proto
+++ b/native/proto/src/proto/operator.proto
@@ -58,6 +58,31 @@ message Operator {
   }
 }
 
+// A distributed offload plan: a DAG of Comet native fragments joined by hash
+// exchanges. `fragments` is in topological order (producers before consumers);
+// the last entry is the root, i.e. the final stage whose output goes to the driver.
+message CometBallistaOffloadPlan {
+  repeated OffloadFragment fragments = 1;
+  // Partition count used for every hash exchange in the plan (one shared width).
+  uint32 num_partitions = 2;
+}
+
+message OffloadFragment {
+  // Serialized Comet `Operator` plan for this fragment. NativeScan leaves must
+  // already have their file partitions injected.
+  bytes block_proto = 1;
+  // Inputs listed in the DFS order of the block's `Scan` (#100) leaves (left
+  // subtree before right). Empty for a leaf fragment (a NativeScan block).
+  repeated OffloadInput inputs = 2;
+}
+
+message OffloadInput {
+  // Index of the producing fragment within `CometBallistaOffloadPlan.fragments`.
+  uint32 producer = 1;
+  // Column ordinals, in the PRODUCER fragment's output schema, to hash-partition on.
+  repeated uint32 hash_key_ordinals = 2;
+}
+
 message SparkPartitionedFile {
   string file_path = 1;
   int64 start = 2;
diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala
index 8e47151358..4bfef40f97 100644
--- a/spark/src/main/scala/org/apache/comet/CometConf.scala
+++ b/spark/src/main/scala/org/apache/comet/CometConf.scala
@@ -280,6 +280,35 @@ object CometConf extends ShimCometConf {
   val COMET_EXEC_LOCAL_TABLE_SCAN_ENABLED: ConfigEntry[Boolean] =
     createExecEnabledConfig("localTableScan", defaultValue = false)
 
+  val COMET_EXEC_BALLISTA_ENABLED: ConfigEntry[Boolean] =
+    conf(s"$COMET_EXEC_CONFIG_PREFIX.ballista.enabled")
+      .category(CATEGORY_EXEC)
+      .doc("EXPERIMENTAL: When enabled, a `collect()` on a supported Comet-accelerated query " +
+        "is offloaded from the Spark driver to an Apache DataFusion Ballista engine (in-process " +
+        "unless a scheduler URL is configured). The query is decomposed into native fragments " +
+        "joined by hash exchanges, submitted to Ballista, and the result rows are returned " +
+        "directly on the driver, with no Spark executor tasks launched for the offloaded plan. " +
+        "Supported shapes: a single native block, a linear chain of hash exchanges, or one " +
+        "co-partitioned join fed by two hash exchanges; anything else fails the query. Dynamic " +
+        "partition pruning and scalar subqueries are resolved on Spark first and can still " +
+        "launch executor tasks. Requires Adaptive Query Execution to be OFF: with AQE on, the " +
+        "collect root is an `AdaptiveSparkPlanExec` rather than a Comet node that carries the " +
+        "offload override, so this flag silently has no effect.")
+      .booleanConf
+      .createWithDefault(false)
+
+  val COMET_EXEC_BALLISTA_SCHEDULER_URL: ConfigEntry[String] =
+    conf(s"$COMET_EXEC_CONFIG_PREFIX.ballista.scheduler.url")
+      .category(CATEGORY_EXEC)
+      .doc("EXPERIMENTAL: When the Comet Ballista offload is enabled, the URL of an external " +
+        "Ballista scheduler (e.g. `http://host:50050`) to submit the distributed plan to. When " +
+        "empty (the default), the plan is submitted to an in-process standalone Ballista cluster " +
+        "on the driver instead. The external scheduler and its executors must be the " +
+        "Comet-flavored `comet-scheduler` / `comet-executor` binaries so the " +
+        "shipped Comet plan nodes can be decoded there.")
+      .stringConf
+      .createWithDefault("")
+
   val COMET_NATIVE_COLUMNAR_TO_ROW_ENABLED: ConfigEntry[Boolean] =
     conf(s"$COMET_EXEC_CONFIG_PREFIX.columnarToRow.native.enabled")
       .category(CATEGORY_EXEC)
diff --git a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala
new file mode 100644
index 0000000000..0a9ecd03c3
--- /dev/null
+++ b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
+import org.apache.spark.sql.comet.{CometBroadcastExchangeExec, CometBroadcastHashJoinExec, CometExec, CometHashJoinExec, CometNativeExec, CometSortMergeJoinExec}
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
+import org.apache.spark.sql.execution.SparkPlan
+
+import com.google.protobuf.ByteString
+
+import org.apache.comet.CometConf
+import org.apache.comet.serde.OperatorOuterClass.{OffloadFragment, OffloadInput}
+import org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan
+
+/**
+ * Driver-side DAG walker that decomposes a Comet physical plan into a fragment/hash-exchange DAG
+ * and serializes it as a `CometBallistaOffloadPlan` protobuf for submission to Ballista via
+ * `NativeBallista.executeOffloadPlan`.
+ *
+ * Currently supports:
+ *   - a single native block (no Comet exchange): one fragment, no inputs.
+ *   - an N-block LINEAR chain of native blocks connected by [[CometShuffleExchangeExec]] hash
+ *     exchanges (the R2 two-stage GROUP BY shape generalized to N stages).
+ *   - a co-partitioned join block (shuffle-hash or sort-merge) fed by exactly two Comet hash
+ *     exchanges, discovered via a generic DFS over the block that stops descent at each exchange.
+ *     DFS pre-order naturally visits a binary join's left input before its right input, matching
+ *     the join proto's `[left_leaf, right_leaf]` leaf order; no join-specific handling is
+ *     required.
+ *
+ * Other multi-input DAG shapes (e.g. a native block fed by more than two upstream fragments, or
+ * broadcast joins) are a future increment; the walker rejects anything it doesn't recognize with
+ * an [[UnsupportedOperationException]] rather than guessing.
+ */
+object BallistaOffloadPlanner {
+
+  /** A native block plus the exchanges directly feeding it, in the order they were resolved. */
+  private case class BlockNode(block: CometNativeExec, inputs: Seq[CometShuffleExchangeExec])
+
+  /**
+   * Decompose `root` into a topologically-ordered DAG of native blocks + hash exchanges and
+   * serialize it as a CometBallistaOffloadPlan. Producers precede consumers; the last fragment is
+   * the root. Throws [[UnsupportedOperationException]] for shapes not yet supported.
+   */
+  def buildOffloadPlan(root: SparkPlan, numPartitions: Int): Array[Byte] = {
+    // Assign a fragment index to every native block, discovered in producer-first order.
+    val ordered = mutable.ArrayBuffer.empty[BlockNode]
+    val indexOf = mutable.LinkedHashMap.empty[CometNativeExec, Int]
+
+    // Nearest serialized native block at or below `p`. `p` is usually that block itself, but
+    // the collect root may be a thin wrapper with no serialized plan of its own (e.g.
+    // `CometNativeColumnarToRowExec`, whose `executeCollect` override calls in here).
+    // `collectFirst` is a pre-order search, so the outermost matching `CometNativeExec` wins.
+    // NOTE(review): every node skipped above the match is dropped from the offloaded plan --
+    // presumably only row-conversion wrappers occur there; confirm no limit/sort node can.
+    def blockOf(p: SparkPlan): CometNativeExec =
+      p.collectFirst { case n: CometNativeExec if n.serializedPlanOpt.isDefined => n }
+        .getOrElse(
+          throw new UnsupportedOperationException(
+            "Comet Ballista offload: expected a serialized native block reachable from " +
+              s"${p.nodeName}:\n$root"))
+
+    // Collects, in DFS pre-order, the Comet hash exchanges that feed `p`'s subtree directly,
+    // i.e. those reachable without passing through another exchange. Called on a whole block
+    // (to find its inputs) and on a join's left/right child (to check how the two sides split
+    // those inputs). A broadcast exchange aborts the walk with an exception.
+    def directExchanges(p: SparkPlan): Seq[CometShuffleExchangeExec] = p match {
+      // An exchange is a fragment boundary: report it and do NOT look beneath it.
+      case exchange: CometShuffleExchangeExec => Seq(exchange)
+      case _: CometBroadcastExchangeExec =>
+        // A broadcast build side is not a Comet hash exchange and must never be silently
+        // walked through and ignored -- reject rather than mis-classify the block.
+        throw new UnsupportedOperationException(
+          "Comet Ballista offload: broadcast joins are not supported (found a " +
+            s"CometBroadcastExchangeExec build side); a future increment:\n$root")
+      case inner =>
+        // `foldLeft` is strict, so children are visited eagerly and in left-to-right order.
+        inner.children.foldLeft(Seq.empty[CometShuffleExchangeExec]) { (seen, child) =>
+          seen ++ directExchanges(child)
+        }
+    }
+
+    // Collects, in DFS pre-order, the binary Comet join nodes inside `p`'s subtree. Descent
+    // stops at exchanges (same rule as `directExchanges`) but continues through a join into
+    // its own children, so a fused MULTI-join block (two joins with no exchange between them)
+    // is still detected. A broadcast hash join aborts the walk with an exception.
+    def directJoins(p: SparkPlan): Seq[SparkPlan] = {
+      // Joins found beneath `node`; `foldLeft` is strict and visits children left to right.
+      def below(node: SparkPlan): Seq[SparkPlan] =
+        node.children.foldLeft(Seq.empty[SparkPlan])((acc, child) => acc ++ directJoins(child))
+      p match {
+        case _: CometShuffleExchangeExec => Seq.empty // do NOT descend past an exchange
+        case _: CometBroadcastHashJoinExec =>
+          throw new UnsupportedOperationException(
+            "Comet Ballista offload: broadcast joins (CometBroadcastHashJoinExec) are not " +
+              s"supported; a future increment:\n$root")
+        // A join is recorded first, then its inputs are searched for further joins.
+        case join @ (_: CometHashJoinExec | _: CometSortMergeJoinExec) => join +: below(join)
+        case other => below(other)
+      }
+    }
+
+    // Resolve a block's ordered DAG inputs (producer exchanges, in the proto's
+    // `[left_leaf, right_leaf]` order) from its direct exchanges and joins. A block may be a
+    // leaf (0 exchanges), a linear chain (1 exchange, no join -- e.g. partial->final
+    // aggregate), or a single co-partitioned join whose two sides each contribute exactly one
+    // of exactly two exchanges. Any other shape -- a fused multi-join block, a join with a
+    // broadcast (non-hash-exchange) side, or exchanges not cleanly split by a single join --
+    // throws rather than silently mis-pairing exchanges from different joins.
+    def resolveInputs(block: CometNativeExec): Seq[CometShuffleExchangeExec] = {
+      // A broadcast join is a supported-looking CometNativeExec (it IS a CometExec, so offload
+      // is attempted for it) but its build side is a CometBroadcastExchangeExec, not a Comet
+      // hash exchange. Reject it up front; directExchanges/directJoins below also throw on
+      // broadcast nodes, so this is an early explicit check rather than the only guard.
+      if (block.isInstanceOf[CometBroadcastHashJoinExec]) {
+        throw new UnsupportedOperationException(
+          "Comet Ballista offload: broadcast joins (CometBroadcastHashJoinExec) are not " +
+            s"supported; a future increment:\n$root")
+      }
+      val exchanges = directExchanges(block)
+      val joins = directJoins(block)
+      (exchanges.size, joins.size) match {
+        case (0, _) => Seq.empty
+        case (1, 0) => exchanges
+        case (2, 1) =>
+          val join = joins.head
+          val (leftPlan, rightPlan) = join match {
+            case j: CometHashJoinExec => (j.left, j.right)
+            case j: CometSortMergeJoinExec => (j.left, j.right)
+          }
+          val leftEx = directExchanges(leftPlan)
+          val rightEx = directExchanges(rightPlan)
+          if (leftEx.size == 1 && rightEx.size == 1 &&
+            (leftEx.toSet ++ rightEx.toSet) == exchanges.toSet) {
+            Seq(leftEx.head, rightEx.head)
+          } else {
+            throw new UnsupportedOperationException(
+              "Comet Ballista offload: join block's two Comet exchanges are not cleanly split " +
+                s"one-each across the join's left (${leftEx.size} exchange(s)) and right " +
+                s"(${rightEx.size} exchange(s)) sides -- a broadcast join side is a future " +
+                s"increment:\n$root")
+          }
+        case (n, m) =>
+          throw new UnsupportedOperationException(
+            s"Comet Ballista offload: block resolves to $n Comet exchange(s) and $m binary " +
+              "join(s); only a leaf block (0 exchanges), a linear chain (1 exchange, no " +
+              "join), or a single co-partitioned join (exactly 2 exchanges split one-each " +
+              "across exactly one join's inputs) are supported -- a fused multi-join block " +
+              s"is a future increment:\n$root")
+      }
+    }
+
+    def register(block: CometNativeExec): Int = indexOf.getOrElse(
+      block, {
+        val inputs = resolveInputs(block)
+        // Producers first (topological order). getOrElse, not getOrElseUpdate: we insert below.
+        inputs.foreach(ex => register(blockOf(ex.child)))
+        val idx = ordered.size
+        ordered += BlockNode(block, inputs)
+        indexOf.put(block, idx)
+        idx
+      })
+
+    register(blockOf(root))
+
+    // A multi-fragment plan feeds downstream fragments from the Ballista hash shuffle, which
+    // requires each consuming fragment's shuffle-input leaf to serialize as a plain `Scan`
+    // (#100), not a native `ShuffleScan` (#116) that expects to read Comet shuffle blocks
+    // directly. That requires direct read disabled (mirrors the old two-block R2 check).
+    if (ordered.size > 1 && CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.get()) {
+      throw new UnsupportedOperationException(
+        "Comet Ballista multi-fragment offload requires " +
+          s"${CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key}=false so each downstream " +
+          "fragment reads a plain Scan leaf (fed by the Ballista shuffle) rather than a " +
+          s"native ShuffleScan:\n$root")
+    }
+
+    val planBuilder = CometBallistaOffloadPlan.newBuilder().setNumPartitions(numPartitions)
+    ordered.foreach { node =>
+      // Co-partition check: a two-input (join) block's inputs must share one partition count.
+      // Spark's EnsureRequirements guarantees this; assert to fail fast otherwise.
+      if (node.inputs.size == 2) {
+        // Compare every input's width, whatever its partitioning; a non-hash exchange is
+        // rejected with a precise message by `hashKeyOrdinals` below.
+        val ns = node.inputs.map(_.outputPartitioning.numPartitions)
+        require(
+          ns.distinct.size == 1,
+          s"Comet Ballista offload: join inputs are not co-partitioned to the same width " +
+            s"($ns):\n$root")
+      }
+      val fragBuilder = OffloadFragment.newBuilder()
+      // Inject file partitions into NativeScan leaves (reuse the existing helper).
+      fragBuilder.setBlockProto(
+        ByteString.copyFrom(CometExec.injectScanFilesFor(root, node.block)))
+      node.inputs.foreach { ex =>
+        val producer = blockOf(ex.child)
+        val producerIdx = indexOf(producer)
+        val keyOrdinals = hashKeyOrdinals(ex, producer.output, root)
+        val inputBuilder = OffloadInput.newBuilder().setProducer(producerIdx)
+        keyOrdinals.foreach(o => inputBuilder.addHashKeyOrdinals(o))
+        fragBuilder.addInputs(inputBuilder)
+      }
+      planBuilder.addFragments(fragBuilder)
+    }
+    planBuilder.build().toByteArray
+  }
+
+  /** Resolve each hash-partitioning key of `ex` to its position in the producer's output. */
+  private def hashKeyOrdinals(
+      ex: CometShuffleExchangeExec,
+      producerOutput: Seq[Attribute],
+      root: SparkPlan): Seq[Int] = ex.outputPartitioning match {
+    case HashPartitioning(keys, _) =>
+      keys.map {
+        // A plain column reference: look it up in the producer's output by expression id.
+        case column: AttributeReference =>
+          producerOutput.indexWhere(_.exprId == column.exprId) match {
+            case -1 =>
+              throw new UnsupportedOperationException(
+                s"Comet Ballista offload: hash key $column not found in producer output " +
+                  s"${producerOutput.map(_.name)}:\n$root")
+            case ord => ord
+          }
+        // Any other key expression cannot be expressed as an ordinal, so it is rejected.
+        case other =>
+          throw new UnsupportedOperationException(
+            s"Comet Ballista offload: hash key is not a simple column ($other); not " +
+              s"supported:\n$root")
+      }
+    case other =>
+      throw new UnsupportedOperationException(
+        s"Comet Ballista offload: only HashPartitioning exchanges are supported; found $other " +
+          s"(range/single-partition sort is a future increment):\n$root")
+  }
+}
diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala
new file mode 100644
index 0000000000..73341afd7b
--- /dev/null
+++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import org.apache.comet.NativeBase
+
+/**
+ * JNI binding to the native driver-side Ballista submission entry.
+ *
+ * The offload code (the `execution::ballista` module and its
+ * `Java_org_apache_comet_ballista_NativeBallista_*` JNI entries) is compiled into the single
+ * `libcomet` cdylib when Comet's native crate is built with the default-off `ballista` Cargo
+ * feature (`cd native && cargo build --features ballista`). There is no separate
+ * `libdatafusion_comet_ballista` library anymore: folding the offload into core means it shares
+ * Comet core's single `JAVA_VM` static, so a Comet-on-executor query and an in-process offload
+ * can coexist in one JVM without the "JAVA_VM not initialized" panic.
+ *
+ * EXPERIMENTAL: used by [[org.apache.spark.sql.comet.CometExec.executeCollectViaBallista]] to
+ * offload a Comet query - a DAG of native fragments joined by hash exchanges, folded from the
+ * plan by `executeOffloadPlan` - to a Ballista engine (in-process standalone or an external
+ * cluster) on the Spark driver.
+ */
+class NativeBallista {
+
+  // Fail fast with IllegalStateException if libcomet cannot be loaded, before any native call.
+  NativeBallista.ensureLoaded()
+
+  /**
+   * No-op native entry used only to detect whether `libcomet` was built with the `ballista`
+   * feature: resolving this symbol succeeds only in a `--features ballista` build. See
+   * [[NativeBallista.isAvailable]].
+   */
+  @native def probeAvailable(): Unit
+
+  /**
+   * Run a serialized [[org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan]] (a
+   * DAG of Comet native fragments + hash exchanges) on Ballista and export the single
+   * (concatenated) result batch into the caller-allocated Arrow C Data structs.
+   *
+   * @param plan
+   *   serialized CometBallistaOffloadPlan
+   * @param arrayAddrs
+   *   one ArrowArray struct address per output column
+   * @param schemaAddrs
+   *   one ArrowSchema struct address per output column
+   * @param schedulerUrl
+   *   external Ballista scheduler URL; "" = in-process standalone
+   * @return
+   *   number of rows exported
+   */
+  @native def executeOffloadPlan(
+      plan: Array[Byte],
+      arrayAddrs: Array[Long],
+      schemaAddrs: Array[Long],
+      schedulerUrl: String): Long
+}
+
+object NativeBallista {
+
+  @volatile private var probed = false
+  @volatile private var available = false
+  @volatile private var loadError: Option[Throwable] = None
+
+  /**
+   * Ensure the single `libcomet` cdylib is loaded. [[NativeBase]] already loads it for every
+   * Comet native method; because the offload JNI entries are now compiled into `libcomet` (behind
+   * the `ballista` feature), loading libcomet also binds the `NativeBallista_*` entries. There is
+   * no separate library to `System.load`.
+   */
+  private def ensureCometLoaded(): Unit = synchronized {
+    if (loadError.isDefined) return
+    try {
+      NativeBase.isLoaded()
+    } catch {
+      case t: Throwable => loadError = Some(t)
+    }
+  }
+
+  /** Load libcomet, throwing if it cannot be loaded. */
+  def ensureLoaded(): Unit = {
+    ensureCometLoaded()
+    loadError.foreach { t =>
+      throw new IllegalStateException(s"failed to load native comet library: ${t.getMessage}", t)
+    }
+  }
+
+  /**
+   * True if the offload native entries are present - i.e. `libcomet` loaded AND was built with
+   * the `ballista` Cargo feature. Detected once by resolving a `NativeBallista_*` JNI symbol; a
+   * feature-less `libcomet` has no such symbol and yields `false`, so the offload suites
+   * `assume`-skip instead of hard-failing with an `UnsatisfiedLinkError`.
+   */
+  def isAvailable: Boolean = synchronized {
+    if (!probed) {
+      ensureCometLoaded()
+      available = loadError.isEmpty && {
+        try {
+          // Resolve a NativeBallista JNI entry; only a `--features ballista` libcomet has it.
+          new NativeBallista().probeAvailable()
+          true
+        } catch {
+          case t: Throwable =>
+            loadError = Some(t)
+            false
+        }
+      }
+      // Publish `probed` LAST: `loadFailure` reads it without the lock, so it must not turn
+      // true until `available` and `loadError` already hold the probe's outcome.
+      probed = true
+    }
+    available
+  }
+
+  /** The load/availability failure, if any (probes on first call). */
+  def loadFailure: Option[Throwable] = {
+    if (!probed) isAvailable
+    loadError
+  }
+}
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala
index 2fe870ed06..48a9b9aec4 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala
@@ -45,6 +45,8 @@ import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
 import org.apache.spark.util.{SparkFatalException, Utils}
 import org.apache.spark.util.io.ChunkedByteBuffer
 
+import org.apache.comet.CometConf
+
 /**
  * Copied from Spark `ColumnarToRowExec`. Comet needs the fix for SPARK-50235 but cannot wait for
  * the fix to be released in Spark versions. We copy the implementation here to apply the fix.
@@ -62,6 +64,17 @@ case class CometColumnarToRowExec(child: SparkPlan)
 
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 
+  override def executeCollect(): Array[InternalRow] = {
+    // EXPERIMENTAL (R1): with the Ballista flag on, this collect root hands its subtree's
+    // serialized native plan to the Ballista offload instead of launching a Spark job.
+    val offloadEnabled = CometConf.COMET_EXEC_BALLISTA_ENABLED.get()
+    if (!offloadEnabled) {
+      super.executeCollect()
+    } else {
+      CometExec.executeCollectViaBallista(this)
+    }
+  }
+
   // `ColumnarToRowExec` processes the input RDD directly, which is kind of a leaf node in the
   // codegen stage and needs to do the limit check.
   protected override def canCheckLimitNotReached: Boolean = true
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala
index 6fa220b728..0d39a5517f 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala
@@ -69,6 +69,17 @@ case class CometNativeColumnarToRowExec(child: SparkPlan)
 
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 
+  override def executeCollect(): Array[InternalRow] = {
+    // EXPERIMENTAL (R1): the regular Spark collect path is kept unless the Ballista flag is
+    // set; then this collect root's subtree is offloaded via CometExec.executeCollectViaBallista.
+    val useBallista = CometConf.COMET_EXEC_BALLISTA_ENABLED.get()
+    if (!useBallista) {
+      super.executeCollect()
+    } else {
+      CometExec.executeCollectViaBallista(this)
+    }
+  }
+
   override lazy val metrics: Map[String, SQLMetric] = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
     "numInputBatches" -> SQLMetrics.createMetric(sparkContext, "number of input batches"),
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index e4d6b53770..f269675c76 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -53,6 +53,7 @@ import com.google.protobuf.CodedOutputStream
 
 import org.apache.comet.{CometConf, CometExecIterator, CometRuntimeException, ConfigEntry}
 import org.apache.comet.CometSparkSessionExtensions.{isCometShuffleEnabled, withFallbackReason}
+import org.apache.comet.ballista.{BallistaOffloadPlanner, NativeBallista}
 import org.apache.comet.parquet.CometParquetUtils
 import org.apache.comet.rules.CometExecRule
 import org.apache.comet.serde.{CometOperatorSerde, Compatible, Incompatible, OperatorOuterClass, SupportLevel, Unsupported}
@@ -60,6 +61,7 @@ import org.apache.comet.serde.OperatorOuterClass.{AggregateMode => CometAggregat
 import org.apache.comet.serde.QueryPlanSerde
 import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, isStringCollationType, supportedSortType}
 import org.apache.comet.serde.operator.CometSink
+import org.apache.comet.vector.NativeUtil
 
 /**
  * Trait for injecting per-partition planning data into operator nodes.
@@ -271,8 +273,16 @@ abstract class CometExec extends CometPlan {
   override def doExecute(): RDD[InternalRow] =
     ColumnarToRowExec(this).doExecute()
 
-  override def executeCollect(): Array[InternalRow] =
-    ColumnarToRowExec(this).executeCollect()
+  override def executeCollect(): Array[InternalRow] = {
+    // EXPERIMENTAL (R1): by default collect through a ColumnarToRowExec wrapper; with the
+    // Ballista flag set, delegate to CometExec.executeCollectViaBallista for this plan instead.
+    val ballistaOffload = CometConf.COMET_EXEC_BALLISTA_ENABLED.get()
+    if (!ballistaOffload) {
+      ColumnarToRowExec(this).executeCollect()
+    } else {
+      CometExec.executeCollectViaBallista(this)
+    }
+  }
 
   override def outputOrdering: Seq[SortOrder] = originalPlan.outputOrdering
 
@@ -392,6 +402,116 @@ object CometExec {
       encryptedFilePaths)
   }
 
+  /**
+   * EXPERIMENTAL: offload a Comet query to an in-process Apache DataFusion Ballista engine on the
+   * Spark driver and return the collected rows, launching NO Spark executor tasks.
+   *
+   * Enabled by `spark.comet.exec.ballista.enabled`. The query is decomposed into a DAG of native
+   * fragments connected by hash exchanges by [[BallistaOffloadPlanner]] (currently: a single
+   * native block; an N-block linear chain of hash exchanges; or a single co-partitioned join fed
+   * by exactly two hash exchanges -- fused multi-join blocks and broadcast join sides are a
+   * future increment), serialized as a `CometBallistaOffloadPlan`, and submitted to Ballista via
+   * the general native `executeOffloadPlan` entry point.
+   *
+   * Throws [[IllegalStateException]] before any planning when libcomet is unavailable or lacks
+   * the `ballista` feature, and [[UnsupportedOperationException]] for unsupported plan shapes.
+   */
+  def executeCollectViaBallista(root: SparkPlan): Array[InternalRow] = {
+    if (!NativeBallista.isAvailable) {
+      throw new IllegalStateException(
+        "Comet Ballista offload: libcomet lacks the `ballista` feature or failed to load",
+        NativeBallista.loadFailure.orNull)
+    }
+    val planBytes = BallistaOffloadPlanner.buildOffloadPlan(root, root.conf.numShufflePartitions)
+    val schedulerUrl = CometConf.COMET_EXEC_BALLISTA_SCHEDULER_URL.get()
+    val nativeUtil = new NativeUtil()
+    try {
+      val nativeBallista = new NativeBallista
+      val exported = nativeUtil.getNextBatch(
+        root.output.length,
+        (arrayAddrs, schemaAddrs) =>
+          nativeBallista.executeOffloadPlan(planBytes, arrayAddrs, schemaAddrs, schedulerUrl))
+      exported match {
+        case Some(batch) =>
+          try batch.rowIterator().asScala.map(_.copy()).toArray
+          finally batch.close()
+        case None => Array.empty[InternalRow]
+      }
+    } finally {
+      nativeUtil.close()
+    }
+  }
+
+  /**
+   * Inject file partitions into a native block's serialized plan. The serialized template carries
+   * each `NativeScan`'s `common` metadata but NOT its file list (Comet normally injects files
+   * per-partition at task launch, see `NativeScanPlanDataInjector`). Since the offload runs a
+   * block as one native leaf, all partitions' files are merged into each scan so Ballista reads
+   * the complete table. When no `CometNativeScanExec` is found under `boundary`, the serialized
+   * bytes are returned unchanged.
+   */
+  private def injectScanFiles(root: SparkPlan, boundary: CometNativeExec): Array[Byte] = {
+    val serialized = boundary.serializedPlanOpt.plan.getOrElse(
+      throw new UnsupportedOperationException(
+        s"Comet Ballista offload: the native plan block carries no serialized plan:\n$root"))
+    // Only `CometNativeScanExec` leaves get files injected below. An Iceberg native scan carries
+    // its splits differently (see `CometIcebergNativeScanExec.serializedPartitionData`) and would
+    // reach Ballista with no files to read, silently returning zero rows -- so reject it.
+    val icebergCount = boundary.collect { case s: CometIcebergNativeScanExec => s }.size
+    if (icebergCount > 0) {
+      throw new UnsupportedOperationException(
+        "Comet Ballista offload does not support Iceberg native scans " +
+          s"($icebergCount CometIcebergNativeScanExec leaves found); only " +
+          s"CometNativeScanExec leaves can be offloaded:\n$root")
+    }
+    // NOTE(review): `collect` visits the whole subtree of `boundary`, so scans belonging to
+    // producer blocks beneath an exchange are gathered too -- presumably harmless; confirm.
+    boundary.collect { case s: CometNativeScanExec => s } match {
+      case Seq() => serialized
+      case scans =>
+        val commonByKey = scans.map { s =>
+          s.ensureSubqueriesResolved()
+          s.sourceKey -> s.commonData
+        }.toMap
+        val filesByKey =
+          scans.map(s => s.sourceKey -> mergeFilePartitions(s.perPartitionData)).toMap
+        val withData =
+          PlanDataInjector.injectPlanData(Operator.parseFrom(serialized), commonByKey, filesByKey)
+        PlanDataInjector.serializeOperator(withData)
+    }
+  }
+
+  /**
+   * Public entry point that forwards to the private [[injectScanFiles]] unchanged. It exists for
+   * `org.apache.comet.ballista.BallistaOffloadPlanner`, which calls it once per native block
+   * while serializing the offload DAG. Public (not `private[comet]`) because the planner lives
+   * in a different package tree (`org.apache.comet.ballista`, not `org.apache.spark.sql.comet`)
+   * that a `comet`-qualified private would not reach. `root` is only used in error messages;
+   * `boundary` is the native block whose serialized plan receives the injected files.
+   */
+  def injectScanFilesFor(root: SparkPlan, boundary: CometNativeExec): Array[Byte] =
+    injectScanFiles(root, boundary)
+
+  /**
+   * Merge the per-partition file lists of a native scan into a single `NativeScan` carrying every
+   * partition's files, serialized as the `partitionBytes` expected by
+   * [[NativeScanPlanDataInjector]] (a `NativeScan` whose `file_partition` holds all
+   * `partitioned_file`s). Used by the Ballista offload so the whole table is read by one native
+   * scan leaf.
+   */
+  private def mergeFilePartitions(perPartitionData: Array[Array[Byte]]): Array[Byte] = {
+    // Fold the files of every serialized per-partition `NativeScan` into one shared builder.
+    val merged =
+      perPartitionData.foldLeft(OperatorOuterClass.SparkFilePartition.newBuilder()) {
+        (acc, partitionBytes) =>
+          val parsed = OperatorOuterClass.NativeScan.parseFrom(partitionBytes)
+          acc.addAllPartitionedFile(parsed.getFilePartition.getPartitionedFileList)
+      }
+    // The result is a `NativeScan` carrying only the merged file partition.
+    val scan = OperatorOuterClass.NativeScan.newBuilder().setFilePartition(merged)
+    scan.build().toByteArray
+  }
+
   /**
    * Executes this Comet operator and serialized output ColumnarBatch into bytes.
    */
diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala
new file mode 100644
index 0000000000..46d86c4997
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.internal.SQLConf
+
+import org.apache.comet.CometConf
+import org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan
+
+/**
+ * Unit tests for [[BallistaOffloadPlanner]]: drives real Comet plans via SQL and asserts the
+ * `CometBallistaOffloadPlan` descriptor the walker emits, without requiring the native `ballista`
+ * feature to be built (no execution, only plan decomposition + serialization).
+ */
+class BallistaOffloadPlannerSuite extends CometTestBase {
+
+  test("two-stage GROUP BY builds a 2-fragment linear descriptor with a hash edge on the key") {
+    withParquetTable((0 until 100).map(i => (i % 5, i)), "t") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+        SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+        CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+        val executed = sql("SELECT _1, count(*) FROM t GROUP BY _1").queryExecution.executedPlan
+        val descriptor = CometBallistaOffloadPlan.parseFrom(
+          BallistaOffloadPlanner.buildOffloadPlan(executed, numPartitions = 4))
+        assert(descriptor.getFragmentsCount == 2)
+        assert(descriptor.getNumPartitions == 4)
+        // root = fragment 1, fed by one edge from fragment 0 hashed on ordinal 0 (the group key)
+        val rootEdges = descriptor.getFragments(1).getInputsList
+        assert(rootEdges.size == 1)
+        assert(rootEdges.get(0).getProducer == 0)
+        assert(rootEdges.get(0).getHashKeyOrdinalsList.contains(0))
+      }
+    }
+  }
+
+  test("single native block builds a 1-fragment descriptor with no inputs") {
+    withParquetTable((0 until 10).map(i => (i, i)), "t") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+        val executed = sql("SELECT _1 + 1 FROM t").queryExecution.executedPlan
+        val serialized = BallistaOffloadPlanner.buildOffloadPlan(executed, numPartitions = 4)
+        val fragments = CometBallistaOffloadPlan.parseFrom(serialized).getFragmentsList
+        assert(fragments.size == 1)
+        assert(fragments.get(0).getInputsCount == 0)
+      }
+    }
+  }
+
+  test("shuffle-hash join builds a join fragment with two hash inputs on the join keys") {
+    withParquetTable((0 until 50).map(i => (i, i * 10)), "l") {
+      withParquetTable((0 until 50).map(i => (i, i * 100)), "r") {
+        withSQLConf(
+          SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+          SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+          CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false",
+          SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
+          CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+          val joinQuery = "SELECT l._2, r._2 FROM l JOIN r ON l._1 = r._1"
+          val executed = sql(joinQuery).queryExecution.executedPlan
+          val serialized = BallistaOffloadPlanner.buildOffloadPlan(executed, numPartitions = 4)
+          val descriptor = CometBallistaOffloadPlan.parseFrom(serialized)
+          // The join is the last (root) fragment: a left and a right input, one key ordinal each.
+          val root = descriptor.getFragments(descriptor.getFragmentsCount - 1)
+          assert(root.getInputsCount == 2, s"expected 2 join inputs, got:\n$descriptor")
+          assert(root.getInputs(0).getHashKeyOrdinalsList.size == 1)
+          assert(root.getInputs(1).getHashKeyOrdinalsList.size == 1)
+        }
+      }
+    }
+  }
+
+  test("range/ORDER BY exchange is rejected with a clear message") {
+    withParquetTable((0 until 20).map(i => (i % 3, i)), "t") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+        val plan =
+          sql("SELECT _1, count(*) c FROM t GROUP BY _1 ORDER BY _1").queryExecution.executedPlan
+        val e = intercept[UnsupportedOperationException](
+          BallistaOffloadPlanner.buildOffloadPlan(plan, numPartitions = 4))
+        val msg = String.valueOf(e.getMessage).toLowerCase // null-safe, case-insensitive match
+        assert(msg.contains("hashpartitioning") || msg.contains("range"), s"got: ${e.getMessage}")
+      }
+    }
+  }
+
+  test("broadcast join is rejected with a clear message") {
+    // Auto-broadcast is deliberately left enabled (no AUTO_BROADCASTJOIN_THRESHOLD=-1 override):
+    // `r` is tiny, so Spark plans a `CometBroadcastHashJoinExec` whose build side is a
+    // `CometBroadcastExchangeExec` -- the out-of-scope shape the walker must reject.
+    withParquetTable((0 until 50).map(i => (i, i * 10)), "l") {
+      withParquetTable((0 until 5).map(i => (i, i * 100)), "r") {
+        withSQLConf(
+          SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+          SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+          CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false",
+          CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+          val joinQuery = "SELECT l._2, r._2 FROM l JOIN r ON l._1 = r._1"
+          val executed = sql(joinQuery).queryExecution.executedPlan
+          val rejection = intercept[UnsupportedOperationException] {
+            BallistaOffloadPlanner.buildOffloadPlan(executed, numPartitions = 4)
+          }
+          val message = rejection.getMessage
+          val mentionsBroadcast = message.toLowerCase.contains("broadcast")
+          assert(mentionsBroadcast, s"expected a message mentioning broadcast, got: $message")
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaDistributedSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaDistributedSuite.scala
new file mode 100644
index 0000000000..ef6baf519e
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaDistributedSuite.scala
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import java.util.concurrent.atomic.AtomicInteger
+
+import org.apache.spark.CometListenerBusUtils
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.comet.CometNativeExec
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.internal.SQLConf
+
+import org.apache.comet.CometConf
+
+/**
+ * The R2 milestone: a Spark `GROUP BY` runs DISTRIBUTED on an in-process Apache DataFusion
+ * Ballista engine on the Spark driver, across a hash shuffle, with Comet native fragments on both
+ * sides (partial aggregate -> Ballista hash shuffle -> final aggregate). The collected rows must
+ * be identical to the flag-off (Spark/Comet-on-executors) baseline, launching ZERO Spark executor
+ * tasks.
+ *
+ * Plan shape offloaded (with `spark.comet.exec.shuffle.directRead.enabled=false` so the final
+ * aggregate's input leaf serializes as a plain `Scan` fed by the Ballista shuffle):
+ * {{{
+ *   CometNativeExec[block2]  (final HashAggregate over a Scan leaf)
+ *     CometShuffleExchangeExec (HashPartitioning(k), N)
+ *       CometNativeExec[block1]  (partial HashAggregate over a NativeScan)
+ *         CometNativeScanExec
+ * }}}
+ *
+ * No `ORDER BY` in the offloaded query: a global sort would add a range-partition exchange (a
+ * third stage), which is out of the 2-block scope. The test sorts the collected rows itself.
+ */
+class CometBallistaDistributedSuite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  /**
+   * Evaluates `f` and returns how many Spark task-start events were observed meanwhile. The
+   * listener bus is drained before the listener is attached and again after `f`, so asynchronous
+   * task-start events are flushed. (Same apparatus as `CometBallistaOffloadSuite`.)
+   */
+  private def countTaskStarts(f: => Unit): Int = {
+    val sc = spark.sparkContext
+    val started = new AtomicInteger(0)
+    val counter = new SparkListener {
+      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit =
+        started.incrementAndGet()
+    }
+    // Drain first, so events still queued from earlier work are not counted for `f`.
+    CometListenerBusUtils.waitUntilEmpty(sc)
+    sc.addSparkListener(counter)
+    try {
+      f
+      // Drain again, so task starts posted during `f` are delivered before detaching.
+      CometListenerBusUtils.waitUntilEmpty(sc)
+    } finally sc.removeSparkListener(counter)
+    started.get()
+  }
+
+  test("two-stage GROUP BY is offloaded to distributed Ballista with no Spark executor tasks") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    withTempPath { dir =>
+      import testImplicits._
+
+      // Three `k` groups with distinct row counts (k=1 x3, k=2 x2, k=3 x4), written through
+      // `repartition(4)` so a GROUP BY needs a shuffle to aggregate across partitions.
+      Seq((1, 10), (1, 11), (1, 12), (2, 20), (2, 21), (3, 30), (3, 31), (3, 32), (3, 33))
+        .toDF("k", "v")
+        .repartition(4)
+        .write
+        .parquet(dir.getCanonicalPath)
+
+      // AQE off so the collect root is the Comet columnar-to-row node carrying the executeCollect
+      // override (not an AdaptiveSparkPlanExec wrapper), and the shuffle boundary is deterministic.
+      // Four shuffle partitions keep the in-process distributed run fast.
+      // directRead off so block2's input leaf serializes as a plain Scan (#100) fed by the Ballista
+      // shuffle, not a native ShuffleScan (#116). NOTE(review): spell out what #100/#116 refer to.
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+        SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+        CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false") {
+        spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t")
+        val query = "SELECT k, count(*) AS c FROM t GROUP BY k"
+
+        // Guard, before running anything: the flag-off plan must have the offloadable R2 shape --
+        // one Comet shuffle exchange and two serialized CometNativeExec blocks (partial + final).
+        val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+          spark.sql(query).queryExecution.executedPlan
+        }
+        val exchanges = executed.collect { case e: CometShuffleExchangeExec => e }
+        assert(
+          exchanges.size == 1,
+          s"expected exactly one Comet hash exchange (two stages), found ${exchanges.size}:\n" +
+            s"$executed")
+        val nativeBlocks = executed.collect {
+          case n: CometNativeExec if n.serializedPlanOpt.isDefined => n
+        }
+        assert(
+          nativeBlocks.size == 2,
+          s"expected exactly two serialized CometNativeExec blocks, found ${nativeBlocks.size}:\n" +
+            s"$executed")
+
+        // Baseline: normal Comet execution (offload off) through the same listener apparatus. As a
+        // positive control it proves the listener observes executor task starts, so the `== 0`
+        // assertion for the offloaded run is meaningful.
+        var baseline: Seq[Seq[Any]] = null
+        val baselineTaskStarts = countTaskStarts {
+          baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+            spark.sql(query).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+          }
+        }
+        assert(
+          baselineTaskStarts > 0,
+          "expected the flag-off baseline collect to launch at least one Spark executor task " +
+            s"(sanity check for the listener apparatus); got $baselineTaskStarts")
+
+        // Ballista offload: the same query with the flag on, again counting executor task starts.
+        var offloaded: Seq[Seq[Any]] = null
+        val offloadedTaskStarts = countTaskStarts {
+          offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") {
+            spark.sql(query).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+          }
+        }
+
+        def sortKey(r: Seq[Any]): String = r.map(v => s"$v").mkString(",")
+        val baselineSorted = baseline.sortBy(sortKey)
+        val offloadedSorted = offloaded.sortBy(sortKey)
+
+        // The distributed aggregate must compose across the shuffle: partial counts written to
+        // Ballista's IPC shuffle, read back, and merged by the final aggregate into correct totals.
+        assert(
+          offloadedSorted == baselineSorted,
+          "offloaded (distributed) rows do not match baseline\n" +
+            s"  baseline:  $baselineSorted\n  offloaded: $offloadedSorted")
+        assert(
+          baselineSorted == Seq(Seq(1, 3L), Seq(2, 2L), Seq(3, 4L)),
+          s"unexpected group counts: $baselineSorted")
+
+        // Crucially, NO Spark executor task may start for the offloaded (driver-side, distributed)
+        // collect.
+        assert(
+          offloadedTaskStarts == 0,
+          s"expected 0 Spark executor tasks for the Ballista-offloaded distributed collect, " +
+            s"but $offloadedTaskStarts started")
+      }
+    }
+  }
+}
diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala
new file mode 100644
index 0000000000..38952c1341
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala
@@ -0,0 +1,450 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import java.io.File
+import java.math.{BigDecimal => JBigDecimal}
+import java.net.{InetSocketAddress, Socket}
+import java.nio.file.Files
+import java.sql.Date
+import java.util.concurrent.atomic.AtomicInteger
+
+import scala.io.Source
+
+import org.apache.spark.CometListenerBusUtils
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
+import org.apache.spark.sql.{CometTestBase, Row}
+import org.apache.spark.sql.comet.CometNativeExec
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+import org.apache.comet.CometConf
+
+/**
+ * The R1b/R2 external-cluster milestone from the FULL Spark-driver side: a live Spark driver
+ * offloads TPC-H Q1's aggregate to a GENUINELY external, separately-spawned Ballista cluster
+ * (`comet-scheduler` + `comet-executor` child processes) and gets results identical to Spark,
+ * with ZERO Spark-executor tasks.
+ *
+ * How this differs from [[CometBallistaQ1Suite]] / [[CometBallistaDistributedSuite]]: those run
+ * the same two-block Q1 aggregate against an IN-PROCESS standalone Ballista engine on the driver
+ * (`scheduler.url` empty). Here `spark.comet.exec.ballista.scheduler.url` points at a real
+ * external scheduler process, so the plan is shipped over gRPC/Flight to a SEPARATE, JVM-less
+ * executor process, reconstructed there via the injected Comet codecs, and run. This is the first
+ * time the full Q1 aggregate fragment (partial-agg NativeScan leaf -> hash shuffle -> final-agg
+ * over a Scan leaf) runs on a separate executor PROCESS rather than in the test's own process.
+ *
+ * The `comet-scheduler` / `comet-executor` binaries and the `libcomet` this JVM loads must all be
+ * built with `--features ballista` (`make core-ballista`), so the offload + URL path and the
+ * Comet-flavored codecs exist. When the loaded `libcomet` lacks the feature, the suite
+ * `assume`-skips (same guard as the other offload suites). When the feature binaries are missing
+ * on disk, the suite fails with a build hint rather than silently passing.
+ *
+ * Spawns child processes and binds ports, so it mirrors the Rust harness
+ * (`native/core/tests/ballista_external_cluster.rs`): libjvm on the loader path, wait for ports,
+ * a short registration grace, and a teardown that kills the children even on failure.
+ */
+class CometBallistaExternalClusterQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  // Non-default ports: clear of a real cluster (50050-50052) and the Rust harness (51050-51052).
+  private val schedulerPort = 51150
+  private val executorFlightPort = schedulerPort + 1
+  private val executorGrpcPort = schedulerPort + 2
+
+  // Child processes and their logs; all remain null until `startCluster` assigns them.
+  private var scheduler: Process = _
+  private var executor: Process = _
+  private var logDir: File = _
+  private var schedulerLog: File = _
+  private var executorLog: File = _
+
+  /**
+   * Schema of the TPC-H `lineitem` fixture, restricted to the columns Q1 touches and using the
+   * correct Spark types: `decimal(12,2)` amounts, string flags and a real `date`; every column
+   * is non-nullable. Same fixture as [[CometBallistaQ1Suite]].
+   */
+  private val lineitemSchema: StructType = new StructType()
+    .add("l_quantity", DecimalType(12, 2), nullable = false)
+    .add("l_extendedprice", DecimalType(12, 2), nullable = false)
+    .add("l_discount", DecimalType(12, 2), nullable = false)
+    .add("l_tax", DecimalType(12, 2), nullable = false)
+    .add("l_returnflag", StringType, nullable = false)
+    .add("l_linestatus", StringType, nullable = false)
+    .add("l_shipdate", DateType, nullable = false)
+
+  private def dec(v: String): JBigDecimal = new JBigDecimal(v).setScale(2)
+
+  /** Same `lineitem` rows as [[CometBallistaQ1Suite]]: 8 pass the Q1 filter, forming 3 groups. */
+  private def lineitemRows: Seq[Row] = Seq(
+    Row(
+      dec("17.00"),
+      dec("21168.23"),
+      dec("0.04"),
+      dec("0.02"),
+      "A",
+      "F",
+      Date.valueOf("1998-08-01")),
+    Row(
+      dec("36.00"),
+      dec("45983.16"),
+      dec("0.09"),
+      dec("0.06"),
+      "A",
+      "F",
+      Date.valueOf("1998-07-15")),
+    Row(
+      dec("8.00"),
+      dec("13309.60"),
+      dec("0.10"),
+      dec("0.02"),
+      "A",
+      "F",
+      Date.valueOf("1998-09-01")),
+    Row(
+      dec("28.00"),
+      dec("28955.64"),
+      dec("0.05"),
+      dec("0.08"),
+      "N",
+      "O",
+      Date.valueOf("1998-06-10")),
+    Row(
+      dec("24.00"),
+      dec("32000.00"),
+      dec("0.00"),
+      dec("0.00"),
+      "N",
+      "O",
+      Date.valueOf("1998-08-20")),
+    Row(
+      dec("2.00"),
+      dec("2600.00"),
+      dec("0.06"),
+      dec("0.03"),
+      "N",
+      "O",
+      Date.valueOf("1998-09-02")),
+    Row(
+      dec("32.00"),
+      dec("41000.50"),
+      dec("0.07"),
+      dec("0.05"),
+      "R",
+      "F",
+      Date.valueOf("1998-05-05")),
+    Row(
+      dec("45.00"),
+      dec("60000.00"),
+      dec("0.02"),
+      dec("0.01"),
+      "R",
+      "F",
+      Date.valueOf("1998-08-31")),
+    // Shipped after the Q1 cutoff (1998-12-01 minus 90 days): both rows must be filtered out.
+    Row(
+      dec("50.00"),
+      dec("70000.00"),
+      dec("0.03"),
+      dec("0.04"),
+      "N",
+      "F",
+      Date.valueOf("1998-09-03")),
+    Row(
+      dec("99.00"),
+      dec("99999.99"),
+      dec("0.05"),
+      dec("0.05"),
+      "N",
+      "F",
+      Date.valueOf("1998-12-01")))
+
+  /**
+   * TPC-H Q1 minus its `ORDER BY`: 4 `sum`s, 3 `avg`s and a `count(*)` of the date-filtered rows
+   * per `(l_returnflag, l_linestatus)` group. Same query as [[CometBallistaQ1Suite]]'s R2 test.
+   */
+  private val q1FullAggregate =
+    """
+      |SELECT l_returnflag, l_linestatus,
+      |  sum(l_quantity) AS sum_qty,
+      |  sum(l_extendedprice) AS sum_base_price,
+      |  sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
+      |  sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
+      |  avg(l_quantity) AS avg_qty,
+      |  avg(l_extendedprice) AS avg_price,
+      |  avg(l_discount) AS avg_disc,
+      |  count(*) AS count_order
+      |FROM lineitem
+      |WHERE l_shipdate <= date '1998-12-01' - interval '90' day
+      |GROUP BY l_returnflag, l_linestatus
+      |""".stripMargin
+
+  /** Counts Spark executor task starts observed while `f` runs, draining the bus around it. */
+  private def countTaskStarts(f: => Unit): Int = {
+    val observed = new AtomicInteger(0)
+    val probe = new SparkListener {
+      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit =
+        observed.incrementAndGet()
+    }
+    def drainBus(): Unit = CometListenerBusUtils.waitUntilEmpty(spark.sparkContext)
+    drainBus()
+    spark.sparkContext.addSparkListener(probe)
+    try {
+      f
+      drainBus()
+    } finally {
+      spark.sparkContext.removeSparkListener(probe)
+    }
+    observed.get()
+  }
+
+  /**
+   * `$JAVA_HOME/lib/server` (libjvm, not a JVM) before any inherited `DYLD_LIBRARY_PATH`; `None`
+   * without `JAVA_HOME`. NOTE(review): macOS-only variable; Linux reads `LD_LIBRARY_PATH`.
+   */
+  private def dyldPath(): Option[String] =
+    for (javaHome <- Option(System.getenv("JAVA_HOME"))) yield {
+      val serverDir = new File(new File(javaHome, "lib"), "server").getAbsolutePath
+      val inherited = Option(System.getenv("DYLD_LIBRARY_PATH")).filter(_.nonEmpty)
+      inherited.fold(serverDir)(rest => s"$serverDir:$rest")
+    }
+
+  /**
+   * Finds the first directory holding executable `comet-scheduler` and `comet-executor` binaries.
+   * `COMET_BALLISTA_BIN_DIR` is tried first, then the debug and release target dirs of
+   * `../native` and `native` relative to `user.dir` (surefire's `user.dir` is the `spark/` module).
+   */
+  private def findBinDir(): Option[File] = {
+    val workDir = System.getProperty("user.dir")
+    val overrideDir = Option(System.getenv("COMET_BALLISTA_BIN_DIR")).map(new File(_))
+    val targetDirs = for {
+      root <- Seq("../native", "native")
+      profile <- Seq("debug", "release")
+    } yield new File(workDir, s"$root/target/$profile")
+    def hasBoth(dir: File): Boolean =
+      Seq("comet-scheduler", "comet-executor").forall(name => new File(dir, name).canExecute)
+    (overrideDir.toSeq ++ targetDirs).find(hasBoth)
+  }
+
+  /** Poll a TCP port until it accepts a connection, its child exits, or the deadline passes. */
+  private def waitForPort(port: Int, what: String, timeoutMillis: Long): Unit = {
+    def alive = Option(if (what.startsWith("scheduler")) scheduler else executor).forall(_.isAlive)
+    val deadline = System.nanoTime() + timeoutMillis * 1000000L
+    var connected = false
+    // Stop early once the child is dead (its port will never open); retry only connect failures.
+    while (!connected && alive && System.nanoTime() - deadline < 0) {
+      val socket = new Socket()
+      try {
+        socket.connect(new InetSocketAddress("127.0.0.1", port), 200)
+        connected = true
+      } catch {
+        case _: java.io.IOException => Thread.sleep(150)
+      } finally {
+        try socket.close() catch { case _: java.io.IOException => }
+      }
+    }
+    if (!connected) {
+      val why = if (alive) "timed out waiting for" else "child process exited while starting"
+      throw new IllegalStateException(s"$why $what on port $port\n${tailLog(what)}")
+    }
+    // scalastyle:off println
+    println(s"[external-cluster] $what is listening on $port")
+    // scalastyle:on println
+  }
+
+  /** Last 40 lines of the log of the child named by `what`, or "" if that log does not exist. */
+  private def tailLog(what: String): String =
+    Option(if (what.startsWith("scheduler")) schedulerLog else executorLog)
+      .filter(_.exists()).fold("") { log =>
+        val reader = Source.fromFile(log)
+        try s"--- $what log tail ---\n${reader.getLines().toSeq.takeRight(40).mkString("\n")}"
+        finally reader.close()
+      }
+
+  /** Spawn the external `comet-scheduler` + `comet-executor` child processes. */
+  private def startCluster(binDir: File): Unit = {
+    logDir = Files.createTempDirectory("comet-external-cluster-").toFile
+    schedulerLog = new File(logDir, "scheduler.log")
+    executorLog = new File(logDir, "executor.log")
+    // libjvm must be on the children's loader path: macOS reads DYLD_LIBRARY_PATH, Linux reads
+    // LD_LIBRARY_PATH, so export both (each platform ignores the other's variable).
+    val loaderEnv = dyldPath().toSeq.flatMap { lib =>
+      val ld = Option(System.getenv("LD_LIBRARY_PATH")).filter(_.nonEmpty)
+      Seq("DYLD_LIBRARY_PATH" -> lib, "LD_LIBRARY_PATH" -> (lib +: ld.toSeq).mkString(":"))
+    }
+    // Starts `binary` with its output in `log` and `settings` (plus common vars) in its env.
+    def spawn(binary: String, log: File)(settings: (String, String)*): Process = {
+      val pb = new ProcessBuilder(new File(binDir, binary).getAbsolutePath)
+      pb.redirectOutput(log).redirectErrorStream(true)
+      (settings ++ loaderEnv :+ ("RUST_LOG" -> "info")).foreach { case (k, v) =>
+        pb.environment().put(k, v)
+      }
+      pb.start()
+    }
+    scheduler = spawn("comet-scheduler", schedulerLog)(
+      "COMET_BALLISTA_SCHEDULER_BIND_HOST" -> "127.0.0.1",
+      "COMET_BALLISTA_SCHEDULER_BIND_PORT" -> schedulerPort.toString)
+    waitForPort(schedulerPort, "scheduler", 30000)
+    // The executor is a separate, JVM-less process.
+    executor = spawn("comet-executor", executorLog)(
+      "COMET_BALLISTA_EXECUTOR_BIND_HOST" -> "127.0.0.1",
+      "COMET_BALLISTA_EXECUTOR_PORT" -> executorFlightPort.toString,
+      "COMET_BALLISTA_EXECUTOR_GRPC_PORT" -> executorGrpcPort.toString,
+      "COMET_BALLISTA_SCHEDULER_HOST" -> "127.0.0.1",
+      "COMET_BALLISTA_SCHEDULER_PORT" -> schedulerPort.toString,
+      "COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS" -> "4")
+    waitForPort(executorFlightPort, "executor flight", 30000)
+    waitForPort(executorGrpcPort, "executor grpc", 30000)
+    Thread.sleep(3000) // grace for the executor to finish registering with the scheduler
+  }
+
+  /** Force-kills the children that were started, executor first, waiting for each to exit. */
+  private def stopCluster(): Unit = {
+    val children = Seq("comet-executor" -> executor, "comet-scheduler" -> scheduler)
+    // A child that was never spawned is still null and is skipped.
+    for ((name, proc) <- children if proc != null) {
+      proc.destroyForcibly()
+      proc.waitFor()
+      // scalastyle:off println
+      println(s"[external-cluster] stopped $name")
+      // scalastyle:on println
+    }
+  }
+
+  // Stop the children first; the base-class teardown runs even if stopping them throws.
+  override def afterAll(): Unit =
+    try stopCluster()
+    finally super.afterAll()
+
+  test(
+    "TPC-H Q1 full aggregate offloads to a LIVE external comet-scheduler/comet-executor cluster " +
+      "with identical results and no executor tasks") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available (build with `make core-ballista`): " +
+        s"${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    val binDir = findBinDir().getOrElse {
+      fail(
+        "could not find feature-built comet-scheduler/comet-executor binaries; " +
+          "build them with `make core-ballista` (they require --features ballista), or set " +
+          "COMET_BALLISTA_BIN_DIR")
+    }
+    // scalastyle:off println
+    println(s"[external-cluster] using binaries from ${binDir.getAbsolutePath}")
+    // scalastyle:on println
+
+    startCluster(binDir)
+
+    withTempPath { dir =>
+      // `repartition(3)` spreads the rows over several input files so same-group rows land in
+      // different partitions: the shuffle must combine partial-aggregate states across them.
+      spark
+        .createDataFrame(spark.sparkContext.parallelize(lineitemRows), lineitemSchema)
+        .repartition(3)
+        .write
+        .parquet(dir.getCanonicalPath)
+
+      // AQE off (the collect root then carries the executeCollect override); direct-read off so
+      // block2's input leaf serializes as a plain Scan fed by the Ballista shuffle; 4 shuffle
+      // partitions keep the run fast. scheduler.url points at the scheduler spawned above.
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+        SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+        CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_BALLISTA_SCHEDULER_URL.key -> s"http://127.0.0.1:$schedulerPort") {
+        spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("lineitem")
+
+        // Guard, before running anything: the flag-off plan must have the offloadable R2 shape --
+        // one Comet shuffle exchange (two stages), two serialized CometNativeExec blocks.
+        val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+          spark.sql(q1FullAggregate).queryExecution.executedPlan
+        }
+        val exchanges = executed.collect { case e: CometShuffleExchangeExec => e }
+        assert(
+          exchanges.size == 1,
+          s"expected exactly one Comet hash exchange (two stages), found ${exchanges.size}:\n" +
+            s"$executed")
+        val nativeBlocks = executed.collect {
+          case n: CometNativeExec if n.serializedPlanOpt.isDefined => n
+        }
+        assert(
+          nativeBlocks.size == 2,
+          s"expected exactly two serialized CometNativeExec blocks, found ${nativeBlocks.size}:\n" +
+            s"$executed")
+
+        // Baseline oracle: Q1 with Comet on and the offload off. It is the positive control for
+        // the listener (it must launch executor tasks) and the row-for-row reference.
+        var baseline: Seq[Seq[Any]] = null
+        val baselineTaskStarts = countTaskStarts {
+          baseline = withSQLConf(
+            CometConf.COMET_ENABLED.key -> "true",
+            CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+            spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+          }
+        }
+        assert(
+          baselineTaskStarts > 0,
+          "expected the Spark baseline collect to launch at least one Spark executor task " +
+            s"(sanity check for the listener apparatus); got $baselineTaskStarts")
+
+        // Ballista offload to the LIVE external cluster: same query, flag on, URL set.
+        var offloaded: Seq[Seq[Any]] = null
+        val offloadedTaskStarts = countTaskStarts {
+          offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") {
+            spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+          }
+        }
+
+        // Sort both sides by (returnflag, linestatus) on the driver (Q1's ORDER BY is not
+        // offloaded). Compare full rows using the exact values Spark produced: decimals keep their
+        // computed scale, so a wrong decimal scale from avg/sum composition fails the assertion.
+        def sortKey(r: Seq[Any]): (String, String) = (s"${r.head}", s"${r(1)}")
+        val baselineSorted = baseline.sortBy(sortKey)
+        val offloadedSorted = offloaded.sortBy(sortKey)
+        assert(
+          offloadedSorted == baselineSorted,
+          "external-cluster-offloaded Q1 aggregate rows do not match Spark's own Q1\n" +
+            s"  spark:     $baselineSorted\n  offloaded: $offloadedSorted\n" +
+            s"${tailLog("scheduler")}\n${tailLog("executor")}")
+
+        // Sanity: three surviving groups after the Q1 date filter.
+        assert(
+          baselineSorted.map(r => (s"${r.head}", s"${r(1)}")) ==
+            Seq(("A", "F"), ("N", "O"), ("R", "F")),
+          s"unexpected Q1 groups: ${baselineSorted.map(r => (r.head, r(1)))}")
+
+        // Crucially, NO Spark executor task may start for the offloaded collect: the external
+        // Ballista cluster has to serve it.
+        assert(
+          offloadedTaskStarts == 0,
+          s"expected 0 Spark executor tasks for the external-cluster-offloaded collect, " +
+            s"but $offloadedTaskStarts started")
+
+        // scalastyle:off println
+        println(s"[external-cluster] PASS: live Spark driver ran full Q1 on the external cluster")
+        println(tailLog("scheduler"))
+        println(tailLog("executor"))
+        // scalastyle:on println
+      }
+    }
+  }
+}
diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala
new file mode 100644
index 0000000000..e923620ae5
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import java.util.concurrent.atomic.AtomicInteger
+
+import org.apache.spark.CometListenerBusUtils
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.comet.{CometHashJoinExec, CometNativeExec, CometSortMergeJoinExec}
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.internal.SQLConf
+
+import org.apache.comet.CometConf
+
+/**
+ * Task 8 (proof milestone): a distributed shuffle-hash/sort-merge JOIN and an N-stage aggregate
+ * (>2 native blocks / 2 hash exchanges) both run DISTRIBUTED on the in-process Ballista engine,
+ * producing results identical to the flag-off (Spark/Comet-on-executors) baseline while launching
+ * ZERO Spark executor tasks.
+ *
+ * The join fragment shape (see [[BallistaOffloadPlanner]]): a single co-partitioned join block
+ * fed by exactly two Comet hash exchanges, one per side.
+ * `spark.sql.autoBroadcastJoinThreshold=-1` forces a shuffle-hash / sort-merge join instead of a
+ * broadcast join, since a broadcast join side is not yet a supported offload shape (the walker
+ * rejects it).
+ */
+class CometBallistaJoinSuite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  /**
+   * Runs `f`, counting Spark executor task starts during it. Drains the listener bus before
+   * attaching and after running so asynchronous task-start events are flushed. (Same apparatus as
+   * `CometBallistaDistributedSuite` / `CometBallistaOffloadSuite`.)
+   */
+  private def countTaskStarts(f: => Unit): Int = {
+    val taskStarts = new AtomicInteger(0)
+    val listener = new SparkListener {
+      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
+        taskStarts.incrementAndGet()
+      }
+    }
+    CometListenerBusUtils.waitUntilEmpty(spark.sparkContext)
+    spark.sparkContext.addSparkListener(listener)
+    try {
+      f
+      CometListenerBusUtils.waitUntilEmpty(spark.sparkContext)
+    } finally {
+      spark.sparkContext.removeSparkListener(listener)
+    }
+    taskStarts.get()
+  }
+
+  /**
+   * Runs `body` under the one SQL configuration shared by every plan inspection and execution in
+   * this suite, so a pre-flight plan-shape check always inspects the plan `runWith` executes. AQE
+   * is off, shuffle partitions are pinned small, shuffle direct-read is off (so a downstream
+   * fragment's input leaf serializes as a plain `Scan` fed by the Ballista shuffle), and the
+   * broadcast-join threshold is disabled so joins plan as shuffle-hash / sort-merge rather than
+   * broadcast. `ballista` is the value given to the Ballista offload flag.
+   */
+  private def withOffloadConf[T](ballista: Boolean)(body: => T): T =
+    withSQLConf(
+      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+      SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+      CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false",
+      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
+      CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> ballista.toString)(body)
+
+  /**
+   * Collects `query` under [[withOffloadConf]] with the offload flag set to `ballista`, and
+   * returns the collected rows together with the number of Spark executor task starts observed
+   * while collecting.
+   */
+  private def runWith(ballista: Boolean, query: String): (Seq[Seq[Any]], Int) = {
+    var rows: Seq[Seq[Any]] = null
+    val taskStarts = countTaskStarts {
+      rows = withOffloadConf(ballista) {
+        spark.sql(query).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+      }
+    }
+    (rows, taskStarts)
+  }
+
+  private def sortKey(r: Seq[Any]): String = r.map(v => s"$v").mkString(",")
+
+  test("distributed shuffle-hash join offloads with zero Spark tasks") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    withParquetTable((0 until 200).map(i => (i, i * 10)), "l") {
+      withParquetTable((0 until 200).map(i => (i % 50, i * 100)), "r") {
+        val query = "SELECT l._1, l._2, r._2 FROM l JOIN r ON l._1 = r._1"
+
+        // Pre-flight: confirm the plan is the co-partitioned join shape (two Comet hash
+        // exchanges feeding a shuffle-hash/sort-merge join) BEFORE running it.
+        val executed = withOffloadConf(ballista = false) {
+          spark.sql(query).queryExecution.executedPlan
+        }
+        val exchanges = executed.collect { case e: CometShuffleExchangeExec => e }
+        assert(
+          exchanges.size == 2,
+          s"expected exactly two Comet hash exchanges (one per join side), found " +
+            s"${exchanges.size}:\n$executed")
+        val joins = executed.collect {
+          case j: CometHashJoinExec => j
+          case j: CometSortMergeJoinExec => j
+        }
+        assert(
+          joins.size == 1,
+          s"expected exactly one shuffle-hash/sort-merge join, found ${joins.size}:\n$executed")
+
+        // Baseline: normal Comet execution (offload off), a positive control proving the
+        // listener observes executor task starts.
+        val (baseline, baselineTaskStarts) = runWith(ballista = false, query)
+        assert(
+          baselineTaskStarts > 0,
+          "expected the flag-off baseline collect to launch at least one Spark executor task " +
+            s"(sanity check for the listener apparatus); got $baselineTaskStarts")
+
+        // Ballista offload: same query, flag on.
+        val (offloaded, offloadedTaskStarts) = runWith(ballista = true, query)
+
+        val baselineSorted = baseline.sortBy(sortKey)
+        val offloadedSorted = offloaded.sortBy(sortKey)
+        assert(
+          offloadedSorted == baselineSorted,
+          "offloaded (distributed) rows do not match baseline\n" +
+            s"  baseline:  $baselineSorted\n  offloaded: $offloadedSorted")
+
+        assert(
+          offloadedTaskStarts == 0,
+          "expected 0 Spark executor tasks for the Ballista-offloaded distributed join, " +
+            s"but $offloadedTaskStarts started")
+      }
+    }
+  }
+
+  test("three-stage aggregate (two hash exchanges) offloads with zero Spark tasks") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    // _1 ranges over 200 distinct values (0..199), each appearing twice, so the inner
+    // `GROUP BY _1` needs a real shuffle to de-duplicate across partitions. k = _1 % 10 then
+    // buckets those 200 distinct values into 10 groups of 20, and the outer `GROUP BY k` needs a
+    // second shuffle to merge partial counts. Two shuffles => three native blocks (partial
+    // dedupe -> exchange -> final dedupe + partial count -> exchange -> final count).
+    withParquetTable((0 until 400).map(i => (i % 200, i)), "t") {
+      val query = "SELECT k, count(*) AS c FROM (SELECT _1 % 10 AS k FROM t GROUP BY _1) " +
+        "GROUP BY k"
+
+      // Pre-flight: confirm the plan is the >=3-block / 2-exchange shape BEFORE running it.
+      val executed = withOffloadConf(ballista = false) {
+        spark.sql(query).queryExecution.executedPlan
+      }
+      val exchanges = executed.collect { case e: CometShuffleExchangeExec => e }
+      assert(
+        exchanges.size == 2,
+        s"expected exactly two Comet hash exchanges (three stages), found ${exchanges.size}:\n" +
+          s"$executed")
+      val nativeBlocks = executed.collect {
+        case n: CometNativeExec if n.serializedPlanOpt.isDefined => n
+      }
+      assert(
+        nativeBlocks.size == 3,
+        s"expected exactly three serialized CometNativeExec blocks, found " +
+          s"${nativeBlocks.size}:\n$executed")
+
+      // Baseline: normal Comet execution (offload off), a positive control proving the listener
+      // observes executor task starts.
+      val (baseline, baselineTaskStarts) = runWith(ballista = false, query)
+      assert(
+        baselineTaskStarts > 0,
+        "expected the flag-off baseline collect to launch at least one Spark executor task " +
+          s"(sanity check for the listener apparatus); got $baselineTaskStarts")
+      assert(
+        baseline.sortBy(sortKey) == (0 until 10).map(k => Seq(k, 20L)),
+        s"unexpected group counts: ${baseline.sortBy(sortKey)}")
+
+      // Ballista offload: same query, flag on.
+      val (offloaded, offloadedTaskStarts) = runWith(ballista = true, query)
+
+      val baselineSorted = baseline.sortBy(sortKey)
+      val offloadedSorted = offloaded.sortBy(sortKey)
+      assert(
+        offloadedSorted == baselineSorted,
+        "offloaded (distributed) rows do not match baseline\n" +
+          s"  baseline:  $baselineSorted\n  offloaded: $offloadedSorted")
+
+      assert(
+        offloadedTaskStarts == 0,
+        "expected 0 Spark executor tasks for the Ballista-offloaded distributed aggregate, " +
+          s"but $offloadedTaskStarts started")
+    }
+  }
+}
diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaOffloadSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaOffloadSuite.scala
new file mode 100644
index 0000000000..5ce5eca1dc
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaOffloadSuite.scala
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import java.util.concurrent.atomic.AtomicInteger
+
+import org.apache.spark.CometListenerBusUtils
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.internal.SQLConf
+
+import org.apache.comet.CometConf
+
+/**
+ * Proves the driver-side "offload to Ballista" collect path (R1): when
+ * `spark.comet.exec.ballista.enabled=true`, a `collect()` on a single-stage Comet query runs on
+ * the Spark driver via an in-process Ballista engine and returns the same rows as the normal
+ * path, launching NO Spark executor tasks.
+ */
+class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  /**
+   * Runs `f`, counting Spark executor task starts that occur during it. Drains the listener bus
+   * before attaching (so events from prior setup don't leak in) and after running `f` (so
+   * asynchronously-dispatched task-start events are flushed before we read the counter).
+   */
+  private def countTaskStarts(f: => Unit): Int = {
+    val taskStarts = new AtomicInteger(0)
+    val listener = new SparkListener {
+      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
+        taskStarts.incrementAndGet()
+      }
+    }
+    CometListenerBusUtils.waitUntilEmpty(spark.sparkContext)
+    spark.sparkContext.addSparkListener(listener)
+    try {
+      f
+      CometListenerBusUtils.waitUntilEmpty(spark.sparkContext)
+    } finally {
+      spark.sparkContext.removeSparkListener(listener)
+    }
+    taskStarts.get()
+  }
+
+  test("single-stage collect is offloaded to Ballista with no Spark executor tasks") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    withTempPath { dir =>
+      import testImplicits._
+
+      // A single Parquet file (coalesce(1)) with two int columns, so the offloaded plan is a
+      // clean single-stage scan with no exchange.
+      Seq((1, 10), (2, 20), (3, 30), (4, 40), (5, 50))
+        .toDF("a", "b")
+        .coalesce(1)
+        .write
+        .parquet(dir.getCanonicalPath)
+
+      // AQE off: the collect root must be the Comet columnar-to-row node (which carries our
+      // executeCollect override), not an AdaptiveSparkPlanExec. The view is dropped on exit.
+      withTempView("t") {
+        withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") {
+          spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t")
+          val query = "SELECT a, b FROM t WHERE a > 2"
+
+          // Baseline: normal Comet execution (offload off) through the same listener apparatus
+          // as the offloaded run below. It is a positive control: it proves the listener does
+          // observe executor task starts, so the `== 0` assertion further down is meaningful.
+          var baseline: Set[Seq[Any]] = null
+          val baselineTaskStarts = countTaskStarts {
+            baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+              spark.sql(query).collect().map(_.toSeq).toSet
+            }
+          }
+          assert(
+            baseline == Set(Seq(3, 30), Seq(4, 40), Seq(5, 50)),
+            s"unexpected baseline: $baseline")
+          assert(
+            baselineTaskStarts > 0,
+            "expected the flag-off baseline collect to launch at least one Spark executor task " +
+              "(sanity check that the listener/waitUntilEmpty apparatus catches task starts); " +
+              s"got $baselineTaskStarts")
+
+          // Ballista offload: count executor task starts around the collect.
+          var offloaded: Set[Seq[Any]] = null
+          val offloadedTaskStarts = countTaskStarts {
+            offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") {
+              spark.sql(query).collect().map(_.toSeq).toSet
+            }
+          }
+
+          // Same rows via the offloaded path...
+          assert(offloaded == baseline, s"offloaded rows $offloaded != baseline $baseline")
+          // ...and crucially, NO Spark executor tasks ran for the offloaded collect.
+          assert(
+            offloadedTaskStarts == 0,
+            s"expected 0 Spark executor tasks for the Ballista-offloaded collect, " +
+              s"but $offloadedTaskStarts started")
+        }
+      }
+    }
+  }
+
+  test("two-stage GROUP BY with shuffle direct read on throws under Ballista offload") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    withTempPath { dir =>
+      import testImplicits._
+
+      // Several partition files with repeated keys, so a `GROUP BY` requires a shuffle
+      // (exchange) to aggregate across partitions -> two CometNativeExec boundaries. This is the
+      // R2 two-stage shape, but with shuffle direct read left ON (the default) the final block's
+      // input leaf serializes as a native ShuffleScan, which the Ballista shuffle-fed fragment
+      // cannot consume, so the offload is rejected with a clear error.
+      Seq((1, 10), (1, 20), (2, 30), (2, 40), (3, 50), (3, 60), (4, 70), (4, 80))
+        .toDF("k", "v")
+        .repartition(4)
+        .write
+        .parquet(dir.getCanonicalPath)
+
+      // Disable AQE so the shuffle boundary/plan shape is deterministic (no runtime coalescing
+      // of the exchange back down to a single stage). Leave directRead at its default (true).
+      // withTempView drops the view on exit, so it cannot outlive the directory it reads from.
+      withTempView("t2") {
+        withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") {
+          spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t2")
+          val query = "SELECT k, count(*) FROM t2 GROUP BY k"
+
+          withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") {
+            val df = spark.sql(query)
+            // Sanity: the plan contains an exchange, so the two-stage path is really exercised.
+            val hasExchange = df.queryExecution.executedPlan.collect {
+              case e: org.apache.spark.sql.execution.exchange.Exchange => e
+            }.nonEmpty
+            assert(
+              hasExchange,
+              s"expected an exchange in the plan:\n${df.queryExecution.executedPlan}")
+
+            val ex = intercept[UnsupportedOperationException](df.collect())
+            assert(
+              ex.getMessage.contains("directRead"),
+              s"unexpected exception message: ${ex.getMessage}")
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaQ1Suite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaQ1Suite.scala
new file mode 100644
index 0000000000..1b30f42889
--- /dev/null
+++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaQ1Suite.scala
@@ -0,0 +1,442 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.ballista
+
+import java.math.{BigDecimal => JBigDecimal}
+import java.sql.Date
+import java.util.concurrent.atomic.AtomicInteger
+
+import org.apache.spark.CometListenerBusUtils
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
+import org.apache.spark.sql.{CometTestBase, Row}
+import org.apache.spark.sql.comet.CometNativeExec
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.execution.exchange.Exchange
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+import org.apache.comet.CometConf
+
+/**
+ * The milestone demonstration for the R1 driver-side Ballista offload, using TPC-H Q1's
+ * `lineitem` data and per-row semantics: run the query with
+ * `spark.comet.exec.ballista.enabled=true` and prove the collected rows are identical to the
+ * flag-off (Spark/Comet-on-executors) baseline, while launching ZERO Spark executor tasks.
+ *
+ * Scope note — why this offloads the pre-aggregation slice of Q1, not the full aggregate:
+ *
+ * The R1 offload path only accepts a plan with exactly ONE serialized `CometNativeExec` block (a
+ * single native leaf reading Parquet directly). Full Q1's `GROUP BY` cannot be squeezed into one
+ * such block under this machinery:
+ *   - Plain read: Spark plans partial-agg -> `CometExchange` -> final-agg, i.e. a shuffle
+ *     boundary \=> two serialized blocks (the guard rejects it as multi-stage). The Parquet scan
+ *     reports `UnknownPartitioning`, which never satisfies the aggregate's
+ *     `ClusteredDistribution`, so the exchange is unavoidable regardless of how many
+ *     files/partitions the input has.
+ *   - `.coalesce(1)`: this removes the exchange (the single-partition child satisfies the
+ *     distribution) but inserts a `CometCoalesce` *sink*, which is itself a native-block boundary
+ *     \=> still two serialized blocks, still rejected. So no arrangement of full Q1 is a single
+ *     exchange-free `CometNativeExec` with a Parquet leaf in R1; the multi-block/aggregate case
+ *     is R2 and explicitly out of scope here (see task brief).
+ *
+ * We therefore offload the largest single-block subset of Q1: its scan + `WHERE` date filter +
+ * the exact decimal arithmetic projections that feed the aggregate (`disc_price`, `charge`). This
+ * exercises the parts that matter for offload correctness — Parquet native scan, date filtering
+ * against the Q1 cutoff, and Q1's decimal multiplications — as ONE exchange-free native block.
+ * The test asserts the plan really is single-block before offloading, and compares full result
+ * rows flag-on vs flag-off using the exact decimal types Spark produces.
+ *
+ * Coexistence (single-core global state): both tests in this suite run in one JVM. Since the
+ * offload was folded into `libcomet` behind the `ballista` Cargo feature, there is now exactly
+ * ONE copy of Comet core — and one `JAVA_VM` static — shared by both the Comet-on-executor path
+ * (`Native.executePlan` on a `tokio-rt-worker`) and the in-process offload. The old dual-library
+ * hazard (a second, uninitialized `JAVA_VM` in `libdatafusion_comet_ballista` causing a `JAVA_VM
+ * not initialized` panic once an offload had run) is therefore gone. The second test's reference
+ * oracle deliberately runs with Comet ENABLED (the executor native path) AFTER the first test's
+ * offload has already run in this JVM: it is the coexistence acceptance check — a
+ * Comet-on-executor native query and an in-process offload sharing one JVM without panicking.
+ */
+class CometBallistaQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  /**
+   * TPC-H Q1 cut down to what fits in one native block: the `lineitem` scan, Q1's `WHERE`
+   * shipdate filter, and Q1's per-row decimal expressions (`l_extendedprice * (1 - l_discount)`
+   * and that product times `(1 + l_tax)`). It stops short of the `GROUP BY`, which would force a
+   * shuffle boundary (see the class doc), and has no `ORDER BY` (a global sort would also need a
+   * range-partition exchange), so the test orders the collected rows itself.
+   */
+  private val q1: String =
+    """
+      |SELECT l_returnflag, l_linestatus,
+      |  l_quantity,
+      |  l_extendedprice,
+      |  l_extendedprice * (1 - l_discount) AS disc_price,
+      |  l_extendedprice * (1 - l_discount) * (1 + l_tax) AS charge
+      |FROM lineitem
+      |WHERE l_shipdate <= date '1998-12-01' - interval '90' day
+      |""".stripMargin
+
+  /**
+   * Schema of the test `lineitem` table: only the columns Q1 touches, every one non-nullable.
+   * The four numeric columns are `decimal(12,2)` (the classic TPC-H decimal), the two flag
+   * columns are strings, and `l_shipdate` is a real `date`.
+   */
+  private val lineitemSchema: StructType = new StructType()
+    .add("l_quantity", DecimalType(12, 2), nullable = false)
+    .add("l_extendedprice", DecimalType(12, 2), nullable = false)
+    .add("l_discount", DecimalType(12, 2), nullable = false)
+    .add("l_tax", DecimalType(12, 2), nullable = false)
+    .add("l_returnflag", StringType, nullable = false)
+    .add("l_linestatus", StringType, nullable = false)
+    .add("l_shipdate", DateType, nullable = false)
+
+  private def dec(v: String): JBigDecimal = new JBigDecimal(v).setScale(2) // forces scale 2
+
+  /**
+   * A small synthetic `lineitem`: ten rows, eight of which span three `(returnflag, linestatus)`
+   * groups, with shipdates straddling the Q1 cutoff (`1998-12-01 - 90 days = 1998-09-02`) so the
+   * `WHERE` filter actually removes rows (the two past-cutoff rows). A range of discount/tax
+   * values gives the decimal projections non-trivial products.
+   */
+  private def lineitemRows: Seq[Row] = Seq(
+    // group (A, F) -- all kept
+    Row(
+      dec("17.00"),
+      dec("21168.23"),
+      dec("0.04"),
+      dec("0.02"),
+      "A",
+      "F",
+      Date.valueOf("1998-08-01")),
+    Row(
+      dec("36.00"),
+      dec("45983.16"),
+      dec("0.09"),
+      dec("0.06"),
+      "A",
+      "F",
+      Date.valueOf("1998-07-15")),
+    Row(
+      dec("8.00"),
+      dec("13309.60"),
+      dec("0.10"),
+      dec("0.02"),
+      "A",
+      "F",
+      Date.valueOf("1998-09-01")),
+    // group (N, O) -- all kept; the 1998-09-02 row lies exactly on the cutoff (filter is `<=`)
+    Row(
+      dec("28.00"),
+      dec("28955.64"),
+      dec("0.05"),
+      dec("0.08"),
+      "N",
+      "O",
+      Date.valueOf("1998-06-10")),
+    Row(
+      dec("24.00"),
+      dec("32000.00"),
+      dec("0.00"),
+      dec("0.00"),
+      "N",
+      "O",
+      Date.valueOf("1998-08-20")),
+    Row(
+      dec("2.00"),
+      dec("2600.00"),
+      dec("0.06"),
+      dec("0.03"),
+      "N",
+      "O",
+      Date.valueOf("1998-09-02")),
+    // group (R, F) -- all kept
+    Row(
+      dec("32.00"),
+      dec("41000.50"),
+      dec("0.07"),
+      dec("0.05"),
+      "R",
+      "F",
+      Date.valueOf("1998-05-05")),
+    Row(
+      dec("45.00"),
+      dec("60000.00"),
+      dec("0.02"),
+      dec("0.01"),
+      "R",
+      "F",
+      Date.valueOf("1998-08-31")),
+    // rows PAST the cutoff -- must be filtered out (would form a (N, F) group if kept)
+    Row(
+      dec("50.00"),
+      dec("70000.00"),
+      dec("0.03"),
+      dec("0.04"),
+      "N",
+      "F",
+      Date.valueOf("1998-09-03")),
+    Row(
+      dec("99.00"),
+      dec("99999.99"),
+      dec("0.05"),
+      dec("0.05"),
+      "N",
+      "F",
+      Date.valueOf("1998-12-01")))
+
+  /**
+   * Returns how many Spark executor tasks start while `f` runs. The listener bus is drained
+   * before the counting listener is attached and again after `f`, so asynchronously delivered
+   * task-start events are flushed before the count is read. (As in `CometBallistaOffloadSuite`.)
+   */
+  private def countTaskStarts(f: => Unit): Int = {
+    val sc = spark.sparkContext
+    val started = new AtomicInteger(0)
+    val counter = new SparkListener {
+      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit =
+        started.incrementAndGet()
+    }
+    CometListenerBusUtils.waitUntilEmpty(sc)
+    sc.addSparkListener(counter)
+    try {
+      f
+      CometListenerBusUtils.waitUntilEmpty(sc)
+    } finally {
+      sc.removeSparkListener(counter)
+    }
+    started.get()
+  }
+
+  test(
+    "TPC-H Q1 (pre-aggregation slice) offloads to Ballista single-block with identical results " +
+      "and no executor tasks") {
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    withTempPath { dir =>
+      // Single Parquet file (coalesce(1)) so the offloaded plan reads one native scan leaf.
+      spark
+        .createDataFrame(spark.sparkContext.parallelize(lineitemRows), lineitemSchema)
+        .coalesce(1)
+        .write
+        .parquet(dir.getCanonicalPath)
+
+      // AQE off so the physical plan is stable (no AdaptiveSparkPlanExec collect root wrapping the
+      // Comet columnar-to-row node that carries our executeCollect offload hook).
+      withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") {
+        spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("lineitem")
+
+        // Confirm the plan is offloadable BEFORE running it: no exchange, and exactly one
+        // CometNativeExec block carrying a serialized plan (the R1 single-block requirement).
+        val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+          spark.sql(q1).queryExecution.executedPlan
+        }
+        val exchanges = executed.collect { case e: Exchange => e }
+        assert(
+          exchanges.isEmpty,
+          s"expected no exchange (single-stage) in the plan, found ${exchanges.size}:\n$executed")
+        val nativeBlocks = executed.collect {
+          case n: CometNativeExec if n.serializedPlanOpt.isDefined => n
+        }
+        assert(
+          nativeBlocks.size == 1,
+          s"expected exactly one serialized CometNativeExec block, found ${nativeBlocks.size}:\n" +
+            s"$executed")
+
+        // Baseline: normal Comet execution (offload off), run through the same listener apparatus.
+        // This is a positive control proving the listener actually observes executor task starts,
+        // so the `== 0` assertion for the offloaded run is meaningful.
+        var baseline: Seq[Seq[Any]] = null
+        val baselineTaskStarts = countTaskStarts {
+          baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+            spark.sql(q1).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+          }
+        }
+        assert(
+          baselineTaskStarts > 0,
+          "expected the flag-off baseline collect to launch at least one Spark executor task " +
+            s"(sanity check for the listener apparatus); got $baselineTaskStarts")
+
+        // Ballista offload: run the same query with the flag on, counting executor task starts.
+        var offloaded: Seq[Seq[Any]] = null
+        val offloadedTaskStarts = countTaskStarts {
+          offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") {
+            spark.sql(q1).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+          }
+        }
+
+        // Compare full rows, ordered by their string form (comma-joined so adjacent fields cannot
+        // blur into one key), using the exact values/types Spark produced, decimal scale included.
+        def sortKey(r: Seq[Any]): String = r.map(v => s"$v").mkString(",")
+        val baselineSorted = baseline.sortBy(sortKey)
+        val offloadedSorted = offloaded.sortBy(sortKey)
+        assert(
+          offloadedSorted == baselineSorted,
+          "offloaded rows do not match baseline\n" +
+            s"  baseline:  $baselineSorted\n  offloaded: $offloadedSorted")
+
+        // The 8 rows on/before the Q1 cutoff are kept; the two past-cutoff rows are filtered out.
+        assert(
+          baselineSorted.size == 8,
+          s"expected 8 rows after the Q1 date filter, got ${baselineSorted.size}: $baselineSorted")
+
+        // Crucially, NO Spark executor tasks ran for the offloaded collect.
+        assert(
+          offloadedTaskStarts == 0,
+          s"expected 0 Spark executor tasks for the Ballista-offloaded collect, " +
+            s"but $offloadedTaskStarts started")
+      }
+    }
+  }
+
+  /**
+   * The complete TPC-H Q1 aggregate without its `ORDER BY`: `sum`, `avg` and `count` over
+   * decimal columns, grouped by `(l_returnflag, l_linestatus)`. This is the R2 milestone query:
+   * it runs as the two-block shape (Comet partial aggregate -> `CometShuffleExchangeExec` ->
+   * Comet final aggregate) across a Ballista hash shuffle, and the test asserts that the
+   * collected rows equal Spark's own Q1 while ZERO Spark executor tasks are launched.
+   *
+   * It covers the composition risks that the single-block R1 slice and the `count(*)` R2 test
+   * left unverified: `avg`'s partial state (sum + count) and decimal partial sums must
+   * round-trip through Ballista's Arrow IPC shuffle and then compose in the Comet final
+   * aggregate, across two group keys.
+   *
+   * The `ORDER BY` is left out because a global sort would add a third, range-partition stage,
+   * which is outside the 2-block scope; both result sets are instead sorted on the driver by
+   * `(returnflag, linestatus)` before they are compared.
+   */
+  private val q1FullAggregate: String =
+    """
+      |SELECT l_returnflag, l_linestatus,
+      |  sum(l_quantity) AS sum_qty,
+      |  sum(l_extendedprice) AS sum_base_price,
+      |  sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
+      |  sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
+      |  avg(l_quantity) AS avg_qty,
+      |  avg(l_extendedprice) AS avg_price,
+      |  avg(l_discount) AS avg_disc,
+      |  count(*) AS count_order
+      |FROM lineitem
+      |WHERE l_shipdate <= date '1998-12-01' - interval '90' day
+      |GROUP BY l_returnflag, l_linestatus
+      |""".stripMargin
+
+  test(
+    "TPC-H Q1 full aggregate (sum/avg/count over decimals, two group keys) distributes across a " +
+      "Ballista shuffle with identical results and no executor tasks") {
+    // Skip (cancel) rather than fail when the native Ballista library is unavailable.
+    assume(
+      NativeBallista.isAvailable,
+      s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}")
+
+    withTempPath { dir =>
+      val parquetDir = dir.getCanonicalPath
+
+      // Repartition the fixture into 3 before writing so that rows of one (returnflag,
+      // linestatus) group can end up in different input files; the final totals are then only
+      // correct if the hash shuffle really merges partial aggregate states across partitions.
+      val fixture =
+        spark.createDataFrame(spark.sparkContext.parallelize(lineitemRows), lineitemSchema)
+      fixture.repartition(3).write.parquet(parquetDir)
+
+      // Session settings: AQE off, so the collect root carries our executeCollect override;
+      // direct-read off, so block2's input leaf serializes as a plain Scan (#100) fed by the
+      // Ballista shuffle; 4 shuffle partitions, keeping the in-process distributed run fast.
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
+        SQLConf.SHUFFLE_PARTITIONS.key -> "4",
+        CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false") {
+        spark.read.parquet(parquetDir).createOrReplaceTempView("lineitem")
+
+        // Runs Q1 and materializes each result row as an indexed sequence of column values.
+        def runQ1(): Seq[Seq[Any]] =
+          spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq
+
+        // Shape check, done up front with the offload flag off: the executed plan must contain
+        // exactly one Comet hash exchange (two stages) and exactly two serialized CometNativeExec
+        // blocks (partial + final aggregate) -- the offloadable R2 shape.
+        val plan = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") {
+          spark.sql(q1FullAggregate).queryExecution.executedPlan
+        }
+        val exchangeCount = plan.collect { case x: CometShuffleExchangeExec => x }.size
+        assert(
+          exchangeCount == 1,
+          s"expected exactly one Comet hash exchange (two stages), found $exchangeCount:\n" +
+            s"$plan")
+        val nativeBlockCount = plan.collect {
+          case b: CometNativeExec if b.serializedPlanOpt.isDefined => b
+        }.size
+        assert(
+          nativeBlockCount == 2,
+          s"expected exactly two serialized CometNativeExec blocks, found $nativeBlockCount:\n" +
+            s"$plan")
+
+        // Baseline oracle: Q1 on the Comet-on-executor native path (COMET_ENABLED=true, offload
+        // off). It doubles as the coexistence acceptance check: the first test of this suite has
+        // already run an in-process Ballista offload in this JVM, so this collect drives
+        // `Native.executePlan` (tokio `with_env`) AFTER an offload -- the scenario that used to
+        // panic with `JAVA_VM not initialized` under the old dual-library layout and must run
+        // cleanly now that the offload lives in the single `libcomet` (one shared `JAVA_VM`).
+        // Counting its task starts is also the positive control for the listener apparatus,
+        // which is what makes the `== 0` assertion on the offloaded run meaningful.
+        var expectedRows: Seq[Seq[Any]] = null
+        val baselineTasks = countTaskStarts {
+          expectedRows = withSQLConf(
+            CometConf.COMET_ENABLED.key -> "true",
+            CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { runQ1() }
+        }
+        assert(
+          baselineTasks > 0,
+          "expected the Spark baseline collect to launch at least one Spark executor task " +
+            s"(sanity check for the listener apparatus); got $baselineTasks")
+
+        // Offloaded run: identical query, Ballista flag on, executor task starts counted again.
+        var actualRows: Seq[Seq[Any]] = null
+        val offloadTasks = countTaskStarts {
+          actualRows = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { runQ1() }
+        }
+
+        // Q1's trailing ORDER BY is not offloaded, so order both sides on the driver by their
+        // (returnflag, linestatus) key. Rows are compared whole, with the values exactly as
+        // collected, so a decimal whose scale differs between the two runs fails the assertion.
+        val groupKey: Seq[Any] => (String, String) = row => (s"${row.head}", s"${row(1)}")
+        val expectedSorted = expectedRows.sortBy(groupKey)
+        val actualSorted = actualRows.sortBy(groupKey)
+        assert(
+          actualSorted == expectedSorted,
+          "offloaded (distributed) Q1 aggregate rows do not match Spark's own Q1\n" +
+            s"  spark:     $expectedSorted\n  offloaded: $actualSorted")
+
+        // Sanity: exactly three groups of the synthetic lineitem survive the Q1 date filter.
+        assert(
+          expectedSorted.map(groupKey) == Seq(("A", "F"), ("N", "O"), ("R", "F")),
+          s"unexpected Q1 groups: ${expectedSorted.map(r => (r.head, r(1)))}")
+
+        // Finally, the offloaded (distributed) collect must not have started any executor task.
+        assert(
+          offloadTasks == 0,
+          s"expected 0 Spark executor tasks for the Ballista-offloaded distributed collect, " +
+            s"but $offloadTasks started")
+      }
+    }
+  }
+}