From 04abf3f69057dd6305bb89728e1e20584bb8232f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 13:56:15 -0600 Subject: [PATCH 01/42] feat(native): execute native plans across a datafusion-ffi boundary Add an FFI export that decodes a Comet `Operator` protobuf, builds the native plan with the existing `PhysicalPlanner`, and wraps the root as an `FFI_ExecutionPlan`. A process compiled against a different DataFusion version (for example a Ballista executor) can then execute Comet's native operators without linking Comet's Rust crates. The plan must use `NativeScan` leaves so no JVM-fed input sources are required. Relates to #4796 --- native/Cargo.lock | 180 ++++++++++++++++++++++++++++++- native/core/Cargo.toml | 1 + native/core/src/execution/ffi.rs | 59 ++++++++++ native/core/src/execution/mod.rs | 1 + 4 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 native/core/src/execution/ffi.rs diff --git a/native/Cargo.lock b/native/Cargo.lock index adb764fbfb..3c92bd71d3 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -520,6 +520,12 @@ dependencies = [ "slab", ] +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" + [[package]] name = "async-global-executor" version = "2.4.1" @@ -1442,7 +1448,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading", + "libloading 0.8.9", ] [[package]] @@ -1997,6 +2003,7 @@ dependencies = [ "datafusion-comet-shuffle", "datafusion-comet-spark-expr", "datafusion-datasource", + "datafusion-ffi", "datafusion-functions-nested", "datafusion-physical-expr-adapter", "datafusion-spark", @@ -2390,6 +2397,39 @@ dependencies = [ "itertools 0.14.0", ] +[[package]] +name = "datafusion-ffi" +version = "54.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e5660e8fa79fd51e29ce46f3026b67317ef738ebd633e106beb1a1907a406152" +dependencies = [ + "arrow", + "arrow-schema", + "async-ffi", + "async-trait", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-proto", + "datafusion-proto-common", + "datafusion-session", + "futures", + "libloading 0.9.0", + "log", + "prost", + "semver", + "stabby", + "tokio", +] + [[package]] name = "datafusion-functions" version = "54.0.0" @@ -2657,6 +2697,44 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-proto" +version = "54.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd15a1ba5d3af93808241065c6c44dbca8296a189845e8a587c45c07bf0ffae" +dependencies = [ + "arrow", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-proto-common" +version = "54.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90042982cf9462eb06a0b81f92efa4188dae871e7ea3ab8dc61aa9c9349b2530" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + [[package]] name = "datafusion-pruning" version = "54.0.0" @@ -3979,7 +4057,7 @@ dependencies = [ "java-locator", "jni-macros", "jni-sys 0.4.1", - "libloading", + "libloading 0.8.9", "log", "simd_cesu8", "thiserror 2.0.18", @@ -4152,6 +4230,16 @@ dependencies = 
[ "windows-link", ] +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "liblzma" version = "0.4.6" @@ -5276,6 +5364,15 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "proc-macro-crate" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro-error-attr2" version = "2.0.0" @@ -6340,6 +6437,12 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "sha2-const-stable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f179d4e11094a893b82fff208f74d448a7512f99f5a0acbd5c679b705f83ed9" + [[package]] name = "shlex" version = "1.3.0" @@ -6474,6 +6577,40 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "stabby" +version = "72.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7b834ec7ced12095fea1e4b07dcb7e8cf2b59b18afa3eac52494d835965a5ec" +dependencies = [ + "rustversion", + "stabby-abi", +] + +[[package]] +name = "stabby-abi" +version = "72.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff1a4f477858a5bdf927c9fab7f579899de9b13e39f8b3b3b300c89fbab632f4" +dependencies = [ + "rustc_version", + "rustversion", + "sha2-const-stable", + "stabby-macros", +] + +[[package]] +name = "stabby-macros" +version = "72.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b31c4b2434980b67ad83f300a58088ba14d59454dcd79ba3d87419bbd924d31e" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.118", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -6832,6 +6969,36 @@ dependencies = [ "tokio", ] 
+[[package]] +name = "toml_datetime" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_edit" +version = "0.25.12+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7" +dependencies = [ + "indexmap 2.14.0", + "toml_datetime", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +dependencies = [ + "winnow", +] + [[package]] name = "tower" version = "0.5.3" @@ -7580,6 +7747,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.57.1" diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index e657879d33..f9876037ae 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -53,6 +53,7 @@ tempfile = "3.26.0" itertools = "0.15.0" paste = "1.0.14" datafusion = { workspace = true, features = ["parquet_encryption", "sql"] } +datafusion-ffi = "54.0.0" datafusion-physical-expr-adapter = { workspace = true } datafusion-datasource = { workspace = true } datafusion-spark = { workspace = true } diff --git a/native/core/src/execution/ffi.rs b/native/core/src/execution/ffi.rs new file mode 100644 index 0000000000..f71cf369e6 --- /dev/null +++ b/native/core/src/execution/ffi.rs @@ -0,0 +1,59 @@ +// Licensed 
to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Exposes a Comet native plan across a `datafusion-ffi` boundary so that a +//! process compiled against a different DataFusion version (e.g. a Ballista +//! executor) can execute it without linking Comet's Rust crates. +//! +//! The input is a serialized Comet `Operator` plan whose leaves are +//! `NativeScan` (native Parquet), so no JVM-fed inputs are required. + +use std::sync::Arc; + +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use datafusion_comet_proto::spark_operator::Operator; +use datafusion_ffi::execution_plan::FFI_ExecutionPlan; +use prost::Message; +use tokio::runtime::Handle; + +use super::planner::PhysicalPlanner; + +/// Decode a Comet `Operator` protobuf, build the native DataFusion plan with +/// Comet's own planner, and wrap the root as an `FFI_ExecutionPlan`. +/// +/// `runtime` is the Tokio runtime handle the foreign consumer should use to +/// drive async execution across the boundary. 
+pub fn comet_ffi_plan_from_proto( + proto_bytes: &[u8], + runtime: Option<Handle>, +) -> Result<FFI_ExecutionPlan, String> { + let op = Operator::decode(proto_bytes) + .map_err(|e| format!("failed to decode Comet Operator proto: {e}"))?; + + let session_ctx = Arc::new(SessionContext::new()); + let planner = PhysicalPlanner::new(session_ctx, 0); + + // NativeScan leaves read Parquet directly, so no JVM input sources are needed. + let mut inputs = Vec::new(); + let (_scans, _shuffle_scans, spark_plan) = planner + .create_plan(&op, &mut inputs, 1) + .map_err(|e| format!("failed to build native plan: {e}"))?; + + let plan: Arc<dyn ExecutionPlan> = Arc::clone(&spark_plan.native_plan); + Ok(FFI_ExecutionPlan::new(plan, runtime)) +} diff --git a/native/core/src/execution/mod.rs b/native/core/src/execution/mod.rs index ec247f72b7..3e85715920 100644 --- a/native/core/src/execution/mod.rs +++ b/native/core/src/execution/mod.rs @@ -18,6 +18,7 @@ //! PoC of vectorization execution through JNI to Rust. pub mod columnar_to_row; pub mod expressions; +pub mod ffi; pub mod jni_api; pub(crate) mod merge_as_partial; pub(crate) mod metrics; From e6f056cfbc5067c8ac1fb2fb42e35b415c744118 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 14:18:16 -0600 Subject: [PATCH 02/42] test(native): cover the datafusion-ffi plan export end to end Relates to #4796. 
--- native/core/src/execution/ffi.rs | 107 +++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/native/core/src/execution/ffi.rs b/native/core/src/execution/ffi.rs index f71cf369e6..b8487261dc 100644 --- a/native/core/src/execution/ffi.rs +++ b/native/core/src/execution/ffi.rs @@ -57,3 +57,110 @@ pub fn comet_ffi_plan_from_proto( let plan: Arc<dyn ExecutionPlan> = Arc::clone(&spark_plan.native_plan); Ok(FFI_ExecutionPlan::new(plan, runtime)) } + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::arrow::array::{Int32Array, RecordBatch}; + use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; + use datafusion::parquet::arrow::ArrowWriter; + use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; + use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, NativeScan, NativeScanCommon, SparkFilePartition, SparkPartitionedFile, + SparkStructField, + }; + use datafusion_ffi::execution_plan::ForeignExecutionPlan; + use futures::StreamExt; + + /// Write a tiny Parquet file with a single int32 column `a` = [1..=5]. + fn write_test_parquet(path: &std::path::Path) { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + ) + .unwrap(); + let file = std::fs::File::create(path).unwrap(); + let mut writer = ArrowWriter::try_new(file, schema, None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + } + + /// Build a Comet `Operator` proto: a single `NativeScan` over `parquet_path`. 
+ fn build_native_scan_proto(parquet_path: &std::path::Path) -> Vec<u8> { + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-ffi-test".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path).unwrap().len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + }; + op.encode_to_vec() + } + + #[tokio::test] + async fn ffi_export_executes_native_scan() { + let dir = tempfile::tempdir().unwrap(); + let parquet_path = dir.path().join("ffi_export_test.parquet"); + write_test_parquet(&parquet_path); + + let proto = build_native_scan_proto(&parquet_path); + + let ffi_plan = comet_ffi_plan_from_proto(&proto, Handle::try_current().ok()) + .expect("failed to build FFI plan from proto"); + + // Wrap via `ForeignExecutionPlan` to force the real FFI vtable path, + // rather than datafusion-ffi's same-library short-circuit. 
+ let plan: Arc<dyn ExecutionPlan> = Arc::new( + ForeignExecutionPlan::try_from(ffi_plan) + .expect("failed to wrap FFI plan as ForeignExecutionPlan"), + ); + + let session_ctx = SessionContext::new(); + let mut stream = plan + .execute(0, session_ctx.task_ctx()) + .expect("failed to execute plan"); + + let mut total = 0usize; + while let Some(batch) = stream.next().await { + let batch = batch.expect("failed to read batch"); + total += batch.num_rows(); + } + + assert_eq!(total, 5); + } +} From a1db30a6b042c4ede9c1afb4614fa7371600d680 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 14:25:15 -0600 Subject: [PATCH 03/42] feat(ballista): add datafusion-comet-ballista crate Add a new native workspace member that ports the comet-ffi-consumer PoC's CometScanExec, CometPhysicalCodec/CometLogicalCodec, and CometTableProvider so Comet native scans can run as leaves inside Ballista distributed plans. Wire the Ballista dependency as a git dep pinned to DF54 rev a8b3c79c, and depend on Comet's core crate via a workspace path dep instead of the PoC's path deps into sibling checkouts. 
--- native/Cargo.lock | 572 +++++++++++++++++++++++++- native/Cargo.toml | 4 +- native/ballista/Cargo.toml | 48 +++ native/ballista/src/codec.rs | 135 ++++++ native/ballista/src/lib.rs | 39 ++ native/ballista/src/scan.rs | 97 +++++ native/ballista/src/table_provider.rs | 63 +++ 7 files changed, 955 insertions(+), 3 deletions(-) create mode 100644 native/ballista/Cargo.toml create mode 100644 native/ballista/src/codec.rs create mode 100644 native/ballista/src/lib.rs create mode 100644 native/ballista/src/scan.rs create mode 100644 native/ballista/src/table_provider.rs diff --git a/native/Cargo.lock b/native/Cargo.lock index 3c92bd71d3..ecaee0db5b 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -203,6 +203,15 @@ dependencies = [ "zstd", ] +[[package]] +name = "ar_archive_writer" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" +dependencies = [ + "object", +] + [[package]] name = "arc-swap" version = "1.9.1" @@ -346,6 +355,34 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-flight" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28abfe8bf9f124e5fc83b334af4fa58f8d0323ad25312ccb2d1da50178415704" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", + "base64", + "bytes", + "futures", + "once_cell", + "paste", + "prost", + "prost-types", + "tonic", + "tonic-prost", +] + [[package]] name = "arrow-ipc" version = "58.3.0" @@ -1021,6 +1058,58 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" +dependencies = [ + "axum-core", + "bytes", + "form_urlencoded", + 
"futures-util", + "http 1.4.2", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.2", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "backon" version = "1.6.0" @@ -1047,6 +1136,110 @@ dependencies = [ "windows-link", ] +[[package]] +name = "ballista" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" +dependencies = [ + "async-trait", + "ballista-core", + "ballista-executor", + "ballista-scheduler", + "datafusion", + "log", + "tokio", + "url", +] + +[[package]] +name = "ballista-core" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" +dependencies = [ + "arrow-flight", + "async-trait", + "chrono", + "datafusion", + "datafusion-proto", + "datafusion-proto-common", + "futures", + "itertools 0.15.0", + "log", + "md-5 0.11.0", + "parking_lot", + "prost", + "prost-types", + "rand 0.10.1", + "rustc_version", + "serde", + "tokio", + "tokio-stream", + "tonic", + "tonic-build", + "tonic-prost", + "tonic-prost-build", + "url", + "uuid", +] + +[[package]] +name = "ballista-executor" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" +dependencies = [ + "arrow", + "arrow-flight", + 
"async-trait", + "ballista-core", + "bytesize", + "dashmap", + "datafusion", + "datafusion-proto", + "futures", + "libc", + "log", + "memory-stats", + "parking_lot", + "serde", + "sysinfo", + "tempfile", + "tokio", + "tokio-stream", + "tokio-util", + "tonic", + "uuid", +] + +[[package]] +name = "ballista-scheduler" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" +dependencies = [ + "arrow-flight", + "async-trait", + "axum", + "ballista-core", + "dashmap", + "datafusion", + "datafusion-proto", + "futures", + "http 1.4.2", + "insta", + "itertools 0.15.0", + "log", + "object_store", + "parking_lot", + "prost", + "prost-types", + "rand 0.10.1", + "serde", + "tokio", + "tokio-stream", + "tonic", + "tower-http 0.7.0", + "uuid", +] + [[package]] name = "base64" version = "0.22.1" @@ -1305,6 +1498,12 @@ dependencies = [ "either", ] +[[package]] +name = "bytesize" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e78e506b9d7633710dab98996f22f95f3d0f488e8f1aa162830556ed9fc14d" + [[package]] name = "bzip2" version = "0.6.1" @@ -1562,6 +1761,17 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "console" +version = "0.16.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fe5f465a4f6fee88fad41b85d990f84c835335e85b5d9e6e63e0d06d28cba7c" +dependencies = [ + "encode_unicode", + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -1897,6 +2107,7 @@ dependencies = [ "arrow", "arrow-schema", "async-trait", + "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -1923,9 +2134,11 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "datafusion-sql", + "flate2", "futures", "indexmap 2.14.0", "itertools 0.14.0", + "liblzma", "log", "object_store", "parking_lot", @@ -1935,6 +2148,7 @@ dependencies = [ "tokio", "url", "uuid", + "zstd", ] 
[[package]] @@ -2042,6 +2256,24 @@ dependencies = [ "uuid", ] +[[package]] +name = "datafusion-comet-ballista" +version = "1.0.0" +dependencies = [ + "anyhow", + "async-trait", + "ballista", + "ballista-core", + "datafusion", + "datafusion-comet", + "datafusion-comet-proto", + "datafusion-ffi", + "datafusion-proto", + "futures", + "prost", + "tokio", +] + [[package]] name = "datafusion-comet-common" version = "1.0.0" @@ -2180,6 +2412,7 @@ dependencies = [ "log", "object_store", "parquet", + "recursive", "sqlparser", "tokio", "uuid", @@ -2381,6 +2614,7 @@ dependencies = [ "datafusion-physical-expr-common", "indexmap 2.14.0", "itertools 0.14.0", + "recursive", "serde_json", "sqlparser", ] @@ -2589,6 +2823,7 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "log", + "recursive", "regex", "regex-syntax", ] @@ -2611,6 +2846,7 @@ dependencies = [ "itertools 0.14.0", "parking_lot", "petgraph", + "recursive", "tokio", ] @@ -2662,6 +2898,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-pruning", "itertools 0.14.0", + "recursive", ] [[package]] @@ -2809,6 +3046,7 @@ dependencies = [ "datafusion-functions-nested", "indexmap 2.14.0", "log", + "recursive", "regex", "sqlparser", ] @@ -3001,6 +3239,12 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "equator" version = "0.4.2" @@ -3572,6 +3816,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.3.0" @@ -3601,6 +3851,7 @@ dependencies = [ "http 1.4.2", "http-body 1.0.1", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -3624,6 +3875,19 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.20" @@ -3906,6 +4170,18 @@ dependencies = [ "generic-array", ] +[[package]] +name = "insta" +version = "1.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f0f8fee8c926415c58d6ae43a08523a26faccb2323f5e6b644fe7dd4ef6b82" +dependencies = [ + "console", + "once_cell", + "similar", + "tempfile", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -4374,6 +4650,12 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "md-5" version = "0.10.6" @@ -4418,6 +4700,16 @@ dependencies = [ "libc", ] +[[package]] +name = "memory-stats" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c73f5c649995a115e1a0220b35e4df0a1294500477f97a91d0660fb5abeb574a" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "mimalloc" version = "0.1.52" @@ -4427,6 +4719,12 @@ dependencies = [ "libmimalloc-sys", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = 
"minimal-lexical" version = "0.2.1" @@ -4513,6 +4811,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "num" version = "0.4.3" @@ -4620,6 +4927,25 @@ dependencies = [ "libm", ] +[[package]] +name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags 2.13.0", +] + +[[package]] +name = "objc2-io-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" +dependencies = [ + "libc", + "objc2-core-foundation", +] + [[package]] name = "object" version = "0.37.3" @@ -5452,6 +5778,8 @@ dependencies = [ "prettyplease", "prost", "prost-types", + "pulldown-cmark", + "pulldown-cmark-to-cmark", "regex", "syn 2.0.118", "tempfile", @@ -5479,6 +5807,36 @@ dependencies = [ "prost", ] +[[package]] +name = "psm" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "pulldown-cmark" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9f068eba8e7071c5f9511831b44f32c740d5adf574e990f946ddb53db2f314e" +dependencies = [ + "bitflags 2.13.0", + "memchr", + "unicase", +] + +[[package]] +name = "pulldown-cmark-to-cmark" +version = "22.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50793def1b900256624a709439404384204a5dc3a6ec580281bfaac35e882e90" +dependencies = [ + "pulldown-cmark", +] + [[package]] name = "quad-rand" 
version = "0.2.3" @@ -5686,6 +6044,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.118", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -5897,7 +6275,7 @@ dependencies = [ "tokio-rustls", "tokio-util", "tower", - "tower-http", + "tower-http 0.6.11", "tower-service", "url", "wasm-bindgen", @@ -5935,7 +6313,7 @@ dependencies = [ "tokio-rustls", "tokio-util", "tower", - "tower-http", + "tower-http 0.6.11", "tower-service", "url", "wasm-bindgen", @@ -6325,6 +6703,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -6497,6 +6886,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" +[[package]] +name = "similar" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + [[package]] name = "siphasher" version = "1.0.3" @@ -6563,6 +6958,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c6d1b651dc4edf07eead2a0c6c78016ce971bc2c10da5266861b13f25e7cec" dependencies = [ "log", + "recursive", "sqlparser_derive", ] 
@@ -6617,6 +7013,19 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.60.2", +] + [[package]] name = "str_stack" version = "0.1.1" @@ -6721,6 +7130,20 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "sysinfo" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f" +dependencies = [ + "libc", + "memchr", + "ntapi", + "objc2-core-foundation", + "objc2-io-kit", + "windows", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -6999,6 +7422,74 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http 1.4.2", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c68f61875ac5293cf72e6c8cf0158086428c82c37229e98c840878f1706b0322" +dependencies = [ + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tonic-prost-build" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "654e5643eff75d7f8c99197ce1440ed19a3474eada74c12bbac488b2cafdae27" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "prost-types", + "quote", + "syn 2.0.118", + "tempfile", + "tonic-build", +] + [[package]] name = "tower" version = "0.5.3" @@ -7007,11 +7498,15 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -7032,6 +7527,21 @@ dependencies = [ "url", ] +[[package]] +name = "tower-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b11f75e912b0c2be01b63d8cf8057b8c3f97cf34abb3d431a3a4c8675498e233" +dependencies = [ + "bitflags 2.13.0", + "bytes", + "http 1.4.2", + "percent-encoding", + "pin-project-lite", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.3" @@ -7050,6 +7560,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -7155,6 +7666,12 @@ dependencies = [ "syn 2.0.118", ] +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -7457,6 +7974,27 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -7470,6 +8008,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -7498,6 +8047,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies = [ + "windows-core", + "windows-link", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -7609,6 +8168,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" diff --git a/native/Cargo.toml 
b/native/Cargo.toml index 3e797eb968..07c2936c0f 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -17,7 +17,7 @@ [workspace] default-members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle"] -members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "hdfs", "fs-hdfs"] +members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "hdfs", "fs-hdfs", "ballista"] resolver = "2" [workspace.package] @@ -61,6 +61,8 @@ aws-credential-types = "1.2.13" iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3" } iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3", features = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs", "opendal-oss", "opendal-azdls"] } reqsign-core = "3" +ballista = { git = "https://github.com/apache/datafusion-ballista", rev = "a8b3c79c", package = "ballista" } +ballista-core = { git = "https://github.com/apache/datafusion-ballista", rev = "a8b3c79c", package = "ballista-core" } [profile.release] debug = true diff --git a/native/ballista/Cargo.toml b/native/ballista/Cargo.toml new file mode 100644 index 0000000000..12b4b90721 --- /dev/null +++ b/native/ballista/Cargo.toml @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-comet-ballista" +version = { workspace = true } +homepage = "https://datafusion.apache.org/comet" +repository = "https://github.com/apache/datafusion-comet" +authors = ["Apache DataFusion "] +description = "Runs Apache DataFusion Comet native plans as leaves in Apache DataFusion Ballista" +readme = "README.md" +license = "Apache-2.0" +edition = "2021" + +# this crate is a proof-of-concept and does not contain public Rust APIs so we do not publish this crate +publish = false + +[dependencies] +datafusion-comet = { path = "../core" } +datafusion-comet-proto = { workspace = true } +datafusion = { workspace = true, features = ["parquet"] } +datafusion-ffi = "54.0.0" +datafusion-proto = "54.0.0" +async-trait = { workspace = true } +tokio = { version = "1", features = ["rt-multi-thread"] } +prost = "0.14.3" +futures = { workspace = true } +anyhow = "1" + +ballista = { workspace = true } +ballista-core = { workspace = true } + +[dev-dependencies] +tokio = { version = "1", features = ["rt-multi-thread", "macros"] } diff --git a/native/ballista/src/codec.rs b/native/ballista/src/codec.rs new file mode 100644 index 0000000000..6be590a7d2 --- /dev/null +++ b/native/ballista/src/codec.rs @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::catalog::TableProvider; +use datafusion::common::Result; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::Extension; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::sql::TableReference; +use datafusion_proto::logical_plan::LogicalExtensionCodec; +use datafusion_proto::physical_plan::PhysicalExtensionCodec; + +use ballista_core::serde::{BallistaLogicalExtensionCodec, BallistaPhysicalExtensionCodec}; + +use crate::scan::CometScanExec; +use crate::table_provider::CometTableProvider; + +/// Marks a payload as a Comet node so the codec can tell it apart from a +/// Ballista/DataFusion node it should delegate. +pub const COMET_MAGIC: &[u8] = b"CMET1\0"; + +/// Serializes `CometScanExec` as its Comet proto bytes (tagged with `COMET_MAGIC`) +/// and reconstructs it on decode by re-running Comet's planner via FFI. All other +/// nodes (including Ballista's own shuffle operators) delegate to Ballista's codec. 
+#[derive(Debug)] +pub struct CometPhysicalCodec { + inner: BallistaPhysicalExtensionCodec, +} + +impl Default for CometPhysicalCodec { + fn default() -> Self { + Self { + inner: BallistaPhysicalExtensionCodec::default(), + } + } +} + +impl PhysicalExtensionCodec for CometPhysicalCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + ctx: &TaskContext, + ) -> Result> { + if let Some(rest) = buf.strip_prefix(COMET_MAGIC) { + return Ok(Arc::new(CometScanExec::try_new(rest.to_vec())?)); + } + self.inner.try_decode(buf, inputs, ctx) + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + if let Some(scan) = node.downcast_ref::() { + buf.extend_from_slice(COMET_MAGIC); + buf.extend_from_slice(scan.proto()); + return Ok(()); + } + self.inner.try_encode(node, buf) + } +} + +/// Serializes `CometTableProvider` (as its Comet proto bytes, tagged with +/// `COMET_MAGIC`) so a query's logical plan can be shipped client -> scheduler +/// and reconstructed there. Everything else delegates to Ballista's codec. 
+#[derive(Debug)] +pub struct CometLogicalCodec { + inner: BallistaLogicalExtensionCodec, +} + +impl Default for CometLogicalCodec { + fn default() -> Self { + Self { + inner: BallistaLogicalExtensionCodec::default(), + } + } +} + +impl LogicalExtensionCodec for CometLogicalCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[datafusion::logical_expr::LogicalPlan], + ctx: &TaskContext, + ) -> Result { + self.inner.try_decode(buf, inputs, ctx) + } + + fn try_encode(&self, node: &Extension, buf: &mut Vec) -> Result<()> { + self.inner.try_encode(node, buf) + } + + fn try_decode_table_provider( + &self, + buf: &[u8], + table_ref: &TableReference, + schema: SchemaRef, + ctx: &TaskContext, + ) -> Result> { + if let Some(rest) = buf.strip_prefix(COMET_MAGIC) { + return Ok(Arc::new(CometTableProvider::new(rest.to_vec(), schema))); + } + self.inner + .try_decode_table_provider(buf, table_ref, schema, ctx) + } + + fn try_encode_table_provider( + &self, + table_ref: &TableReference, + node: Arc, + buf: &mut Vec, + ) -> Result<()> { + if let Some(provider) = node.downcast_ref::() { + buf.extend_from_slice(COMET_MAGIC); + buf.extend_from_slice(provider.proto()); + return Ok(()); + } + self.inner.try_encode_table_provider(table_ref, node, buf) + } +} diff --git a/native/ballista/src/lib.rs b/native/ballista/src/lib.rs new file mode 100644 index 0000000000..f8a7085154 --- /dev/null +++ b/native/ballista/src/lib.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Runs Apache DataFusion Comet native plans as leaves inside Apache +//! DataFusion Ballista. +//! +//! - [`scan::CometScanExec`]: a serializable DataFusion leaf that carries the +//! Comet proto bytes (the "recipe") and builds the FFI plan at execute() +//! time. This is what Ballista ships to executors and reconstructs there. +//! - [`codec::CometPhysicalCodec`] / [`codec::CometLogicalCodec`]: extension +//! codecs that (de)serialize Comet nodes as their proto bytes (tagged with +//! [`codec::COMET_MAGIC`]) and delegate everything else to Ballista's own +//! codecs — the seam that lets Ballista distribute Comet work without +//! linking Comet's translation code. +//! - [`table_provider::CometTableProvider`]: a `TableProvider` that produces a +//! `CometScanExec`, so a Comet scan can participate in a DataFusion logical +//! plan. + +pub mod codec; +pub mod scan; +pub mod table_provider; + +pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_MAGIC}; +pub use scan::CometScanExec; +pub use table_provider::CometTableProvider; diff --git a/native/ballista/src/scan.rs b/native/ballista/src/scan.rs new file mode 100644 index 0000000000..1b530964fd --- /dev/null +++ b/native/ballista/src/scan.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fmt; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, +}; +use datafusion_ffi::execution_plan::ForeignExecutionPlan; +use tokio::runtime::Handle; + +use comet::execution::ffi::comet_ffi_plan_from_proto; + +/// A DataFusion leaf that carries a Comet plan protobuf and executes it via the +/// `datafusion-ffi` boundary. Serializable through `CometPhysicalCodec` by its +/// proto bytes, so Ballista can ship it to executors. +#[derive(Debug)] +pub struct CometScanExec { + proto: Vec, + inner: Arc, + props: Arc, +} + +impl CometScanExec { + /// Build from Comet proto bytes: run Comet's planner via FFI to get the plan, + /// wrap it as a `ForeignExecutionPlan` (forcing the real FFI vtable path). 
+ pub fn try_new(proto: Vec) -> Result { + let ffi = comet_ffi_plan_from_proto(&proto, Handle::try_current().ok()) + .map_err(DataFusionError::Execution)?; + let inner: Arc = Arc::new( + ForeignExecutionPlan::try_from(ffi) + .map_err(|e| DataFusionError::Execution(format!("ForeignExecutionPlan: {e}")))?, + ); + let props = Arc::clone(inner.properties()); + Ok(Self { + proto, + inner, + props, + }) + } + + pub fn proto(&self) -> &[u8] { + &self.proto + } +} + +impl DisplayAs for CometScanExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CometScanExec(proto={} bytes)", self.proto.len()) + } +} + +impl ExecutionPlan for CometScanExec { + fn name(&self) -> &str { + "CometScanExec" + } + + fn properties(&self) -> &Arc { + &self.props + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Ok(self) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + self.inner.execute(partition, context) + } +} diff --git a/native/ballista/src/table_provider.rs b/native/ballista/src/table_provider.rs new file mode 100644 index 0000000000..daa53a6d45 --- /dev/null +++ b/native/ballista/src/table_provider.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::catalog::{Session, TableProvider}; +use datafusion::common::Result; +use datafusion::logical_expr::{Expr, TableType}; +use datafusion::physical_plan::ExecutionPlan; + +use crate::scan::CometScanExec; + +/// A DataFusion `TableProvider` that produces a `CometScanExec`. Carries the +/// Comet proto so the table can be reconstructed on the scheduler side via the +/// logical codec below. +#[derive(Debug)] +pub struct CometTableProvider { + proto: Vec, + schema: SchemaRef, +} + +impl CometTableProvider { + pub fn new(proto: Vec, schema: SchemaRef) -> Self { + Self { proto, schema } + } + pub fn proto(&self) -> &[u8] { + &self.proto + } +} + +#[async_trait::async_trait] +impl TableProvider for CometTableProvider { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + fn table_type(&self) -> TableType { + TableType::Base + } + async fn scan( + &self, + _state: &dyn Session, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(Arc::new(CometScanExec::try_new(self.proto.clone())?)) + } +} From d56bb328fa9a1a0fa10d412367b5ae22c3a44a55 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 14:25:15 -0600 Subject: [PATCH 04/42] test(ballista): codec round-trip for the Comet FFI leaf Adapt the c2a PoC binary into an integration test: build a CoalesceBatchesExec(CometScanExec) plan, encode it with CometPhysicalCodec exactly as Ballista's scheduler would, decode it in a fresh SessionContext to simulate an executor, and execute it. Asserts the Comet leaf survives Ballista's physical-plan serialization and still produces the expected 5 rows. 
Relates to #4796 --- native/ballista/tests/codec_roundtrip.rs | 130 +++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 native/ballista/tests/codec_roundtrip.rs diff --git a/native/ballista/tests/codec_roundtrip.rs b/native/ballista/tests/codec_roundtrip.rs new file mode 100644 index 0000000000..1e2245ad3a --- /dev/null +++ b/native/ballista/tests/codec_roundtrip.rs @@ -0,0 +1,130 @@ +// Proves a Comet-FFI leaf survives Ballista's physical-plan serialization. +// +// Ballista ships each stage's physical plan to executors as protobuf via a +// PhysicalExtensionCodec. Here we take a plan `CoalesceBatchesExec(CometScanExec)`, +// serialize the WHOLE tree with datafusion-proto + our CometPhysicalCodec exactly +// as Ballista would, then deserialize it in a fresh context (simulating the +// executor) and execute it. The Comet leaf travels as proto bytes and is rebuilt +// on the far side by re-running Comet's planner over FFI. + +use std::sync::Arc; + +use datafusion::arrow::array::{Int32Array, RecordBatch}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; +use datafusion::parquet::arrow::ArrowWriter; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::{displayable, ExecutionPlan}; +use datafusion::prelude::SessionContext; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; + +use datafusion_comet_ballista::{CometPhysicalCodec, CometScanExec}; +use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; +use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition, + SparkPartitionedFile, SparkStructField, +}; + +/// Write a tiny Parquet file with a single int32 column `a` = [1..=5]. 
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let file = std::fs::File::create(path)?; + let mut writer = ArrowWriter::try_new(file, schema, None)?; + writer.write(&batch)?; + writer.close()?; + Ok(()) +} + +/// Build a Comet `Operator` proto: a single `NativeScan` over `parquet_path`. +fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result> { + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-ffi-ballista-test".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path)?.len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + }; + Ok(op.encode_to_vec()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn comet_leaf_survives_ballista_codec() -> anyhow::Result<()> { + let parquet_path = std::env::temp_dir().join("comet_ffi_ballista_codec_roundtrip.parquet"); + write_test_parquet(&parquet_path)?; + let proto = build_native_scan_proto(&parquet_path)?; + + // Build a plan with a standard DataFusion 
operator on top of the Comet leaf. + let comet_scan: Arc = Arc::new(CometScanExec::try_new(proto)?); + let plan: Arc = Arc::new(CoalesceBatchesExec::new(comet_scan, 8192)); + println!( + "original plan:\n{}", + displayable(plan.as_ref()).indent(false) + ); + + // --- Encode (scheduler side) --- + let codec = CometPhysicalCodec::default(); + let node = PhysicalPlanNode::try_from_physical_plan(Arc::clone(&plan), &codec)?; + let bytes = node.encode_to_vec(); + println!("serialized physical plan: {} bytes", bytes.len()); + + // --- Ship bytes, decode in a fresh context (executor side) --- + let ctx = SessionContext::new(); + let task_ctx = ctx.task_ctx(); + let node2 = PhysicalPlanNode::decode(&bytes[..])?; + let plan2 = node2.try_into_physical_plan(task_ctx.as_ref(), &codec)?; + println!( + "reconstructed plan (executor side):\n{}", + displayable(plan2.as_ref()).indent(false) + ); + + // --- Execute the reconstructed plan --- + let mut stream = plan2.execute(0, task_ctx)?; + let mut total_rows = 0usize; + while let Some(batch) = stream.next().await { + let batch = batch?; + total_rows += batch.num_rows(); + } + println!("\nTOTAL ROWS AFTER CODEC ROUND-TRIP: {total_rows}"); + assert_eq!(total_rows, 5, "expected 5 rows"); + Ok(()) +} From ccd024f241d216022b9c72d649ab452e4b0e9428 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 14:31:51 -0600 Subject: [PATCH 05/42] test(ballista): end-to-end distributed shuffle over the Comet scan Relates to #4796 --- native/ballista/tests/distributed.rs | 131 +++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 native/ballista/tests/distributed.rs diff --git a/native/ballista/tests/distributed.rs b/native/ballista/tests/distributed.rs new file mode 100644 index 0000000000..1523f0c9ad --- /dev/null +++ b/native/ballista/tests/distributed.rs @@ -0,0 +1,131 @@ +// Distributes a Comet FFI scan across a real (in-process) Ballista cluster. 
+// +// A `CometTableProvider` exposes the Comet `NativeScan` as a SQL table. The +// query `GROUP BY a` forces a hash repartition, which Ballista turns into a +// shuffle boundary -> two stages. Stage 1 (the Comet scan + partial aggregate) +// is serialized and shipped to the executor via our codecs; the Comet leaf is +// rebuilt there by re-running Comet's planner over FFI. This proves Ballista +// distributes Comet work end to end. +// +// Starts an in-process Ballista scheduler + executors, so it is heavier and +// slower than a unit test. Run explicitly: +// cargo test -p datafusion-comet-ballista --test distributed -- --ignored + +use std::sync::Arc; + +use ballista::prelude::{SessionConfigExt, SessionContextExt}; +use datafusion::arrow::array::{Int32Array, RecordBatch}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; +use datafusion::arrow::util::pretty::pretty_format_batches; +use datafusion::execution::SessionStateBuilder; +use datafusion::parquet::arrow::ArrowWriter; +use datafusion::prelude::{SessionConfig, SessionContext}; + +use datafusion_comet_ballista::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; +use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; +use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition, + SparkPartitionedFile, SparkStructField, +}; + +/// Write a tiny Parquet file with a single int32 column `a` = [1..=5]. 
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let file = std::fs::File::create(path)?; + let mut writer = ArrowWriter::try_new(file, schema, None)?; + writer.write(&batch)?; + writer.close()?; + Ok(()) +} + +/// Build a Comet `Operator` proto: a single `NativeScan` over `parquet_path`. +fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result> { + use prost::Message; + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-ffi-ballista-test".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path)?.len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + }; + Ok(op.encode_to_vec()) +} + +#[ignore = "starts an in-process Ballista cluster; run explicitly"] +#[tokio::test] +async fn comet_scan_distributed_with_shuffle() -> anyhow::Result<()> { + let parquet = std::env::temp_dir().join("comet_ffi_ballista_distributed.parquet"); + write_test_parquet(&parquet)?; + let proto = build_native_scan_proto(&parquet)?; + 
let schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + + // In-process Ballista cluster with our Comet codecs registered on both the + // scheduler and executor sides (they flow via SessionConfig). + let config = SessionConfig::new_with_ballista() + .with_target_partitions(4) + .with_ballista_standalone_parallelism(2) + .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default())) + .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default())); + let state = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .build(); + let ctx = SessionContext::standalone_with_state(state).await?; + + ctx.register_table("comet_t", Arc::new(CometTableProvider::new(proto, schema)))?; + + let sql = "SELECT a, count(*) AS c FROM comet_t GROUP BY a ORDER BY a"; + println!("distributed query: {sql}\n"); + let df = ctx.sql(sql).await?; + println!("logical plan:\n{}\n", df.logical_plan().display_indent()); + + let results = df.collect().await?; + println!("{}", pretty_format_batches(&results)?); + let groups: usize = results.iter().map(|b| b.num_rows()).sum(); + println!("\nGROUP BY produced {groups} groups"); + assert_eq!(groups, 5, "expected 5 groups (a = 1..=5)"); + println!("PASS: Comet FFI scan distributed by Ballista (with shuffle) — correct results"); + Ok(()) +} From a22255ae60092d42ad63f1c40b90d0cd1d271821 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 14:46:06 -0600 Subject: [PATCH 06/42] =?UTF-8?q?fix(ballista):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20honor=20scan=20projection,=20derive=20Default,=20ti?= =?UTF-8?q?dy=20deps/docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Relates to #4796 --- native/ballista/Cargo.toml | 4 ++-- native/ballista/README.md | 15 ++++++++++++++ native/ballista/src/codec.rs | 24 ++++++----------------- native/ballista/src/scan.rs | 6 ++++++ 
native/ballista/src/table_provider.rs | 25 ++++++++++++++++++++++-- native/ballista/tests/codec_roundtrip.rs | 5 +++++ native/core/src/execution/ffi.rs | 4 ++++ 7 files changed, 61 insertions(+), 22 deletions(-) create mode 100644 native/ballista/README.md diff --git a/native/ballista/Cargo.toml b/native/ballista/Cargo.toml index 12b4b90721..32a2aa4fad 100644 --- a/native/ballista/Cargo.toml +++ b/native/ballista/Cargo.toml @@ -26,7 +26,7 @@ readme = "README.md" license = "Apache-2.0" edition = "2021" -# this crate is a proof-of-concept and does not contain public Rust APIs so we do not publish this crate +# this crate does not contain public Rust APIs so we do not publish it publish = false [dependencies] @@ -39,10 +39,10 @@ async-trait = { workspace = true } tokio = { version = "1", features = ["rt-multi-thread"] } prost = "0.14.3" futures = { workspace = true } -anyhow = "1" ballista = { workspace = true } ballista-core = { workspace = true } [dev-dependencies] tokio = { version = "1", features = ["rt-multi-thread", "macros"] } +anyhow = "1" diff --git a/native/ballista/README.md b/native/ballista/README.md new file mode 100644 index 0000000000..f2fbd59db3 --- /dev/null +++ b/native/ballista/README.md @@ -0,0 +1,15 @@ +# datafusion-comet-ballista + +Lets Apache DataFusion Ballista execute Apache DataFusion Comet native plans +that are handed across a `datafusion-ffi` boundary, so a Ballista executor can +run Comet's native scans without linking Comet's Rust crates directly. + +- [`scan::CometScanExec`] — a serializable DataFusion leaf that carries a + Comet plan's proto bytes and rebuilds the FFI plan at `execute()` time. This + is what Ballista ships to executors and reconstructs there. +- [`codec::CometPhysicalCodec`] / [`codec::CometLogicalCodec`] — extension + codecs that (de)serialize Comet nodes as their proto bytes and delegate + everything else to Ballista's own codecs. 
+- [`table_provider::CometTableProvider`] — a `TableProvider` that produces a + `CometScanExec`, so a Comet scan can participate in a DataFusion logical + plan and be distributed by Ballista like any other table. diff --git a/native/ballista/src/codec.rs b/native/ballista/src/codec.rs index 6be590a7d2..2ae06b31f1 100644 --- a/native/ballista/src/codec.rs +++ b/native/ballista/src/codec.rs @@ -34,24 +34,20 @@ use crate::table_provider::CometTableProvider; /// Marks a payload as a Comet node so the codec can tell it apart from a /// Ballista/DataFusion node it should delegate. +/// +/// Prefix-sniffing this is safe because Ballista/DataFusion codec payloads +/// are protobuf tag streams that never begin with these bytes — the +/// embedded NUL in particular makes a collision effectively impossible. pub const COMET_MAGIC: &[u8] = b"CMET1\0"; /// Serializes `CometScanExec` as its Comet proto bytes (tagged with `COMET_MAGIC`) /// and reconstructs it on decode by re-running Comet's planner via FFI. All other /// nodes (including Ballista's own shuffle operators) delegate to Ballista's codec. -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CometPhysicalCodec { inner: BallistaPhysicalExtensionCodec, } -impl Default for CometPhysicalCodec { - fn default() -> Self { - Self { - inner: BallistaPhysicalExtensionCodec::default(), - } - } -} - impl PhysicalExtensionCodec for CometPhysicalCodec { fn try_decode( &self, @@ -78,19 +74,11 @@ impl PhysicalExtensionCodec for CometPhysicalCodec { /// Serializes `CometTableProvider` (as its Comet proto bytes, tagged with /// `COMET_MAGIC`) so a query's logical plan can be shipped client -> scheduler /// and reconstructed there. Everything else delegates to Ballista's codec. 
-#[derive(Debug)] +#[derive(Debug, Default)] pub struct CometLogicalCodec { inner: BallistaLogicalExtensionCodec, } -impl Default for CometLogicalCodec { - fn default() -> Self { - Self { - inner: BallistaLogicalExtensionCodec::default(), - } - } -} - impl LogicalExtensionCodec for CometLogicalCodec { fn try_decode( &self, diff --git a/native/ballista/src/scan.rs b/native/ballista/src/scan.rs index 1b530964fd..fe417c9844 100644 --- a/native/ballista/src/scan.rs +++ b/native/ballista/src/scan.rs @@ -41,6 +41,11 @@ pub struct CometScanExec { impl CometScanExec { /// Build from Comet proto bytes: run Comet's planner via FFI to get the plan, /// wrap it as a `ForeignExecutionPlan` (forcing the real FFI vtable path). + /// + /// The Tokio runtime handle is captured here (via `Handle::try_current()`), + /// so `try_new` (and `try_decode`, which calls it) must run inside a Tokio + /// runtime — true for Ballista's executor, which drives all task execution + /// on a Tokio runtime. pub fn try_new(proto: Vec) -> Result { let ffi = comet_ffi_plan_from_proto(&proto, Handle::try_current().ok()) .map_err(DataFusionError::Execution)?; @@ -84,6 +89,7 @@ impl ExecutionPlan for CometScanExec { self: Arc, _children: Vec>, ) -> Result> { + debug_assert!(_children.is_empty(), "CometScanExec is a leaf"); Ok(self) } diff --git a/native/ballista/src/table_provider.rs b/native/ballista/src/table_provider.rs index daa53a6d45..6e70b6ba20 100644 --- a/native/ballista/src/table_provider.rs +++ b/native/ballista/src/table_provider.rs @@ -21,6 +21,9 @@ use datafusion::arrow::datatypes::SchemaRef; use datafusion::catalog::{Session, TableProvider}; use datafusion::common::Result; use datafusion::logical_expr::{Expr, TableType}; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::ExecutionPlan; use crate::scan::CometScanExec; @@ -51,13 +54,31 @@ impl 
TableProvider for CometTableProvider { fn table_type(&self) -> TableType { TableType::Base } + // `_filters` and `_limit` are intentionally not pushed down into the Comet + // scan; DataFusion re-applies them on top of the returned plan. async fn scan( &self, _state: &dyn Session, - _projection: Option<&Vec>, + projection: Option<&Vec>, _filters: &[Expr], _limit: Option, ) -> Result> { - Ok(Arc::new(CometScanExec::try_new(self.proto.clone())?)) + let scan: Arc = Arc::new(CometScanExec::try_new(self.proto.clone())?); + match projection { + Some(indices) => { + let exprs: Vec<(Arc, String)> = indices + .iter() + .map(|&i| { + let f = self.schema.field(i); + ( + Arc::new(Column::new(f.name(), i)) as Arc, + f.name().to_string(), + ) + }) + .collect(); + Ok(Arc::new(ProjectionExec::try_new(exprs, scan)?)) + } + None => Ok(scan), + } } } diff --git a/native/ballista/tests/codec_roundtrip.rs b/native/ballista/tests/codec_roundtrip.rs index 1e2245ad3a..b00518bc75 100644 --- a/native/ballista/tests/codec_roundtrip.rs +++ b/native/ballista/tests/codec_roundtrip.rs @@ -12,6 +12,10 @@ use std::sync::Arc; use datafusion::arrow::array::{Int32Array, RecordBatch}; use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; use datafusion::parquet::arrow::ArrowWriter; +// `CoalesceBatchesExec` is deprecated upstream in favor of arrow-rs's +// `BatchCoalescer`, but it's still a real, functional standard DataFusion +// operator, which is exactly what this test needs on top of the Comet leaf. 
+#[allow(deprecated)] use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::prelude::SessionContext; @@ -87,6 +91,7 @@ fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result anyhow::Result<()> { let parquet_path = std::env::temp_dir().join("comet_ffi_ballista_codec_roundtrip.parquet"); diff --git a/native/core/src/execution/ffi.rs b/native/core/src/execution/ffi.rs index b8487261dc..f54d342b85 100644 --- a/native/core/src/execution/ffi.rs +++ b/native/core/src/execution/ffi.rs @@ -45,6 +45,10 @@ pub fn comet_ffi_plan_from_proto( let op = Operator::decode(proto_bytes) .map_err(|e| format!("failed to decode Comet Operator proto: {e}"))?; + // A fresh `SessionContext` means object-store configuration comes only + // from the proto's `object_store_options`, not from any ambient session. + // That's sufficient for local `file://` scans; remote object stores + // (S3, GCS, etc.) will need this revisited to plumb their config through. let session_ctx = Arc::new(SessionContext::new()); let planner = PhysicalPlanner::new(session_ctx, 0); From 1a612bbdad998393b881b3eda5c6b71d302df612 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 15:47:02 -0600 Subject: [PATCH 07/42] feat(ballista): prove JVM -> native -> in-process Ballista -> JVM Arrow round-trip Add a driver-side "offload to Ballista" submission entry that lets the JVM run a serialized Comet Operator proto on an in-process standalone Ballista engine (no Spark executors) and receive the result back over the Arrow C Data Interface. - Build datafusion-comet-ballista as a cdylib (plus rlib) carrying a JNI entry Java_org_apache_comet_ballista_NativeBallista_executeQuery. It reuses the existing proto -> standalone Ballista -> RecordBatches recipe and exports each result column into JVM-allocated FFI_ArrowArray/FFI_ArrowSchema structs, mirroring jni_api::prepare_output. 
A buildTestProto entry returns the fixed test proto so the JVM side needs no generated proto classes. - Add a Scala NativeBallista binding and a ScalaTest that loads the new lib, runs the proto, imports the result via Arrow Java, and asserts 5 rows come back with no Spark cluster. - Add a Rust integration test that exercises the same C Data export/import boundary end to end. Relates to #4796. --- native/Cargo.lock | 1 + native/ballista/Cargo.toml | 7 + native/ballista/src/ffi_jni.rs | 312 ++++++++++++++++++ native/ballista/src/lib.rs | 2 + native/ballista/tests/ffi_roundtrip.rs | 132 ++++++++ .../ballista/CometBallistaFfiSpikeSuite.scala | 131 ++++++++ .../comet/ballista/NativeBallista.scala | 54 +++ 7 files changed, 639 insertions(+) create mode 100644 native/ballista/src/ffi_jni.rs create mode 100644 native/ballista/tests/ffi_roundtrip.rs create mode 100644 spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala create mode 100644 spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala diff --git a/native/Cargo.lock b/native/Cargo.lock index ecaee0db5b..3d09166ad9 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -2270,6 +2270,7 @@ dependencies = [ "datafusion-ffi", "datafusion-proto", "futures", + "jni 0.22.4", "prost", "tokio", ] diff --git a/native/ballista/Cargo.toml b/native/ballista/Cargo.toml index 32a2aa4fad..a86328bd63 100644 --- a/native/ballista/Cargo.toml +++ b/native/ballista/Cargo.toml @@ -29,7 +29,14 @@ edition = "2021" # this crate does not contain public Rust APIs so we do not publish it publish = false +# `cdylib` builds libcomet_ballista.{dylib,so} carrying the JNI submission entry +# (loaded by the JVM alongside libcomet); `rlib` keeps the crate usable from +# Rust integration tests. 
+[lib] +crate-type = ["cdylib", "rlib"] + [dependencies] +jni = "0.22.4" datafusion-comet = { path = "../core" } datafusion-comet-proto = { workspace = true } datafusion = { workspace = true, features = ["parquet"] } diff --git a/native/ballista/src/ffi_jni.rs b/native/ballista/src/ffi_jni.rs new file mode 100644 index 0000000000..6ffb1223e2 --- /dev/null +++ b/native/ballista/src/ffi_jni.rs @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Driver-side "offload to Ballista" submission entry. +//! +//! The JVM hands us a serialized Comet `Operator` proto; we run it on an +//! **in-process standalone Ballista** engine (no Spark executors) and hand the +//! resulting Arrow batches back to the JVM over the Arrow C Data Interface — +//! the same FFI mechanism Comet already uses in `jni_api::prepare_output` +//! (`ArrayData` → caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema`). +//! +//! This is a SPIKE. It proves the round trip JVM → native → Ballista → JVM. 
+ +use std::sync::Arc; + +use ballista::prelude::{SessionConfigExt, SessionContextExt}; +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::compute::concat_batches; +use datafusion::arrow::datatypes::{Field, Schema, SchemaRef}; +use datafusion::arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; +use datafusion::execution::SessionStateBuilder; +use datafusion::prelude::{SessionConfig, SessionContext}; + +use comet::execution::serde::to_arrow_datatype; +use datafusion_comet_proto::spark_operator::{operator::OpStruct, Operator}; +use prost::Message; + +/// Build the fixed spike test proto Rust-side: a single `NativeScan` over a +/// freshly written Parquet file with one int32 column `a` = [1..=5]. Returned to +/// the JVM so the JVM test can hand it straight back to [`Java_org_apache_comet_ballista_NativeBallista_executeQuery`] +/// without needing the generated proto Java classes. This is the same proto the +/// Rust `tests/` build. +pub fn build_test_proto() -> Result, String> { + use datafusion::arrow::array::Int32Array; + use datafusion::arrow::datatypes::DataType as ArrowDataType; + use datafusion::parquet::arrow::ArrowWriter; + use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; + use datafusion_comet_proto::spark_operator::{ + NativeScan, NativeScanCommon, SparkFilePartition, SparkPartitionedFile, SparkStructField, + }; + + let parquet = std::env::temp_dir().join("comet_ffi_ballista_jvm_spike.parquet"); + let arrow_schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&arrow_schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + ) + .map_err(|e| format!("failed to build test batch: {e}"))?; + let file = + std::fs::File::create(&parquet).map_err(|e| format!("failed to create parquet: {e}"))?; + let mut writer = ArrowWriter::try_new(file, arrow_schema, None) + .map_err(|e| format!("failed to open parquet writer: {e}"))?; + 
writer + .write(&batch) + .map_err(|e| format!("failed to write parquet: {e}"))?; + writer + .close() + .map_err(|e| format!("failed to close parquet: {e}"))?; + + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-ffi-ballista-jvm-spike".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(&parquet) + .map_err(|e| format!("failed to stat parquet: {e}"))? + .len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + }; + Ok(op.encode_to_vec()) +} + +use crate::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; + +/// Derive the Arrow result schema from the `NativeScan` leaf carried in the +/// proto. For this spike the offloaded plan is a single `NativeScan`, so its +/// `required_schema` is the query's output schema. 
+fn schema_from_proto(op: &Operator) -> Result { + let native_scan = match op.op_struct.as_ref() { + Some(OpStruct::NativeScan(scan)) => scan, + _ => return Err("expected a NativeScan operator at the plan root".to_string()), + }; + let common = native_scan + .common + .as_ref() + .ok_or_else(|| "NativeScan is missing NativeScanCommon".to_string())?; + let fields: Vec = common + .required_schema + .iter() + .map(|f| { + let dt = f + .data_type + .as_ref() + .ok_or_else(|| format!("field {} has no data type", f.name))?; + Ok(Field::new(&f.name, to_arrow_datatype(dt), f.nullable)) + }) + .collect::>()?; + Ok(Arc::new(Schema::new(fields))) +} + +/// Run a Comet `Operator` proto on an in-process standalone Ballista engine and +/// return the collected Arrow batches plus the result schema. +/// +/// This reuses the exact "proto → standalone Ballista → RecordBatches" recipe +/// validated in `tests/distributed.rs`, but runs `SELECT * FROM t` (no shuffle) +/// because the spike only needs to prove the result boundary. +pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec), String> { + let op = + Operator::decode(proto).map_err(|e| format!("failed to decode Operator proto: {e}"))?; + let schema = schema_from_proto(&op)?; + + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .map_err(|e| format!("failed to build tokio runtime: {e}"))?; + + runtime.block_on(async move { + // In-process standalone Ballista cluster (scheduler + executor) with the + // Comet codecs registered so the Comet leaf survives serialization. 
+ let config = SessionConfig::new_with_ballista() + .with_target_partitions(1) + .with_ballista_standalone_parallelism(1) + .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default())) + .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default())); + let state = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .build(); + let ctx = SessionContext::standalone_with_state(state) + .await + .map_err(|e| format!("failed to start standalone Ballista: {e}"))?; + + ctx.register_table( + "comet_t", + Arc::new(CometTableProvider::new(proto.to_vec(), Arc::clone(&schema))), + ) + .map_err(|e| format!("failed to register Comet table: {e}"))?; + + let df = ctx + .sql("SELECT * FROM comet_t") + .await + .map_err(|e| format!("failed to plan query: {e}"))?; + let batches = df + .collect() + .await + .map_err(|e| format!("failed to execute query: {e}"))?; + Ok((schema, batches)) + }) +} + +/// Export one Arrow batch into caller-allocated `FFI_ArrowArray` / +/// `FFI_ArrowSchema` structs, one per column, whose addresses were allocated by +/// the JVM (Arrow Java `ArrowArray.allocateNew` / `ArrowSchema.allocateNew`). +/// +/// This mirrors `jni_api::prepare_output`: the JVM owns the C Data structs and +/// imports them with its `ArrowImporter` after this call returns. +/// +/// # Safety +/// `array_addrs[i]` / `schema_addrs[i]` must be valid, writable pointers to +/// uninitialized `FFI_ArrowArray` / `FFI_ArrowSchema` for each column. 
+unsafe fn export_batch_to_addresses( + batch: &RecordBatch, + array_addrs: &[i64], + schema_addrs: &[i64], +) -> Result<(), String> { + let num_cols = batch.num_columns(); + if array_addrs.len() != num_cols || schema_addrs.len() != num_cols { + return Err(format!( + "column count mismatch: batch has {num_cols}, got {} array / {} schema addresses", + array_addrs.len(), + schema_addrs.len() + )); + } + for i in 0..num_cols { + let data = batch.column(i).to_data(); + let schema = FFI_ArrowSchema::try_from(data.data_type()) + .map_err(|e| format!("failed to export schema for column {i}: {e}"))?; + let array = FFI_ArrowArray::new(&data); + // The JVM allocated these structs; write the exported values into them. + std::ptr::write(array_addrs[i] as *mut FFI_ArrowArray, array); + std::ptr::write(schema_addrs[i] as *mut FFI_ArrowSchema, schema); + } + Ok(()) +} + +/// Run the proto and export the (single) result batch into the JVM-allocated +/// FFI structs. Returns the row count, or `Err` with a message. +/// +/// # Safety +/// See [`export_batch_to_addresses`]. +pub unsafe fn submit_and_export( + proto: &[u8], + array_addrs: &[i64], + schema_addrs: &[i64], +) -> Result { + let (schema, batches) = execute_comet_proto(proto)?; + // The spike offloads a single small scan; concatenate to one batch so the + // JVM imports exactly one set of column structs. 
+ let batch = concat_batches(&schema, &batches) + .map_err(|e| format!("failed to concatenate result batches: {e}"))?; + export_batch_to_addresses(&batch, array_addrs, schema_addrs)?; + Ok(batch.num_rows() as i64) +} + +// --------------------------------------------------------------------------- +// JNI entry point +// --------------------------------------------------------------------------- + +mod jni_entry { + use super::{build_test_proto, submit_and_export}; + use comet::errors::{try_unwrap_or_throw, CometError}; + use jni::objects::{JByteArray, JClass, JLongArray, ReleaseMode}; + use jni::sys::{jbyteArray, jlong}; + use jni::EnvUnowned; + + /// JVM entry: build the fixed spike test proto Rust-side and return its + /// bytes, so the JVM test does not need the generated proto Java classes. + /// + /// # Safety + /// Called from the JVM via JNI. + #[no_mangle] + pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_buildTestProto( + e: EnvUnowned, + _class: JClass, + ) -> jbyteArray { + try_unwrap_or_throw(&e, |env| { + let bytes = build_test_proto().map_err(CometError::Internal)?; + let arr = env.byte_array_from_slice(&bytes)?; + Ok(arr.into_raw()) + }) + } + + /// JVM entry: run a Comet `Operator` proto on in-process standalone Ballista + /// and export the result batch into the JVM-allocated Arrow C Data structs + /// (`FFI_ArrowArray`/`FFI_ArrowSchema`), returning the number of rows. This + /// mirrors `Java_org_apache_comet_Native_executePlan`'s use of + /// `prepare_output` — the JVM allocates the structs and imports them after + /// this call returns. + /// + /// # Safety + /// Called from the JVM via JNI; the address arrays must reference valid + /// caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema` structs. 
+ #[no_mangle] + pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_executeQuery( + e: EnvUnowned, + _class: JClass, + proto: JByteArray, + array_addrs: JLongArray, + schema_addrs: JLongArray, + ) -> jlong { + try_unwrap_or_throw(&e, |env| { + let proto_bytes = env.convert_byte_array(proto)?; + + let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; + let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; + + // SAFETY: the JVM allocated these FFI structs (Arrow Java + // ArrowArray/ArrowSchema.allocateNew); we write the exported values + // into them and the JVM imports them after this returns. + let num_rows = unsafe { submit_and_export(&proto_bytes, &arrays, &schemas) } + .map_err(CometError::Internal)?; + Ok(num_rows as jlong) + }) + } +} diff --git a/native/ballista/src/lib.rs b/native/ballista/src/lib.rs index f8a7085154..a75dd93923 100644 --- a/native/ballista/src/lib.rs +++ b/native/ballista/src/lib.rs @@ -31,9 +31,11 @@ //! plan. pub mod codec; +pub mod ffi_jni; pub mod scan; pub mod table_provider; pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_MAGIC}; +pub use ffi_jni::{build_test_proto, execute_comet_proto, submit_and_export}; pub use scan::CometScanExec; pub use table_provider::CometTableProvider; diff --git a/native/ballista/tests/ffi_roundtrip.rs b/native/ballista/tests/ffi_roundtrip.rs new file mode 100644 index 0000000000..082aa814f2 --- /dev/null +++ b/native/ballista/tests/ffi_roundtrip.rs @@ -0,0 +1,132 @@ +// Proves the driver-side offload result boundary: a Comet `Operator` proto is +// run on in-process standalone Ballista and its result is exported over the +// Arrow C Data Interface into *caller-allocated* FFI structs, exactly as the +// JVM boundary does (Arrow Java allocates the structs, native writes into them, +// Arrow Java imports them). 
Here the "caller" is this Rust test standing in for +// the JVM: it allocates the FFI structs, calls `submit_and_export`, then +// re-imports via `from_ffi` and asserts 5 rows come back. +// +// cargo test -p datafusion-comet-ballista --test ffi_roundtrip -- --ignored --nocapture + +use std::sync::Arc; + +use datafusion::arrow::array::{make_array, Int32Array, RecordBatch}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; +use datafusion::arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; +use datafusion::parquet::arrow::ArrowWriter; + +use datafusion_comet_ballista::submit_and_export; +use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; +use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition, + SparkPartitionedFile, SparkStructField, +}; + +fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let file = std::fs::File::create(path)?; + let mut writer = ArrowWriter::try_new(file, schema, None)?; + writer.write(&batch)?; + writer.close()?; + Ok(()) +} + +fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result> { + use prost::Message; + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-ffi-ballista-jvm-spike".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path)?.len() as i64; + 
let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + }; + Ok(op.encode_to_vec()) +} + +#[ignore = "starts an in-process Ballista cluster; run explicitly"] +#[test] +fn offload_proto_and_import_over_c_data_interface() -> anyhow::Result<()> { + let parquet = std::env::temp_dir().join("comet_ffi_ballista_jvm_spike.parquet"); + write_test_parquet(&parquet)?; + let proto = build_native_scan_proto(&parquet)?; + + // Stand in for the JVM: allocate one (array, schema) FFI struct per column. + // Arrow Java's ArrowArray.allocateNew / ArrowSchema.allocateNew produce the + // exact same C Data structs; we hand their addresses to native code. + const NUM_COLS: usize = 1; + let mut arrays: Vec<FFI_ArrowArray> = (0..NUM_COLS).map(|_| FFI_ArrowArray::empty()).collect(); + let mut schemas: Vec<FFI_ArrowSchema> = + (0..NUM_COLS).map(|_| FFI_ArrowSchema::empty()).collect(); + let array_addrs: Vec<i64> = arrays + .iter_mut() + .map(|a| a as *mut FFI_ArrowArray as i64) + .collect(); + let schema_addrs: Vec<i64> = schemas + .iter_mut() + .map(|s| s as *mut FFI_ArrowSchema as i64) + .collect(); + + // JVM → native → in-process Ballista → export into the caller structs. + let num_rows = unsafe { submit_and_export(&proto, &array_addrs, &schema_addrs) } + .map_err(anyhow::Error::msg)?; + assert_eq!(num_rows, 5, "expected 5 rows back from Ballista"); + + // JVM side: import the exported structs (mirrors Arrow Java ArrowImporter).
+ for i in 0..NUM_COLS { + let array = std::mem::replace(&mut arrays[i], FFI_ArrowArray::empty()); + let schema = std::mem::replace(&mut schemas[i], FFI_ArrowSchema::empty()); + let data = unsafe { from_ffi(array, &schema) }?; + let imported = make_array(data); + assert_eq!(imported.len(), 5, "imported column {i} should have 5 rows"); + let ints = imported + .as_any() + .downcast_ref::<Int32Array>() + .expect("column a should be Int32"); + let values: Vec<i32> = ints.values().to_vec(); + assert_eq!(values, vec![1, 2, 3, 4, 5]); + } + + println!( + "PASS: proto -> standalone Ballista -> {num_rows} rows exported and re-imported over the \ + Arrow C Data Interface (the JVM boundary mechanism)" + ); + Ok(()) +} diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala new file mode 100644 index 0000000000..bb46e10533 --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.apache.comet.ballista + +import java.io.File +import java.nio.file.{Files, Paths} + +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.arrow.c.{ArrowArray, ArrowSchema, CDataDictionaryProvider, Data} +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.IntVector + +/** + * SPIKE: proves the driver-side "offload to Ballista" round trip across the JVM boundary. + * + * The JVM asks native code to build a fixed Comet `Operator` proto, hands those proto bytes back + * to native code, which runs them on an in-process standalone Ballista engine (no Spark + * executors) and exports the result batch back to the JVM over the Arrow C Data Interface. The + * JVM imports the result and asserts 5 rows come back. + * + * The native entry points live in `libdatafusion_comet_ballista` (the `datafusion-comet-ballista` + * crate built as a cdylib), loaded here alongside — but independently of — `libcomet`. + */ +class CometBallistaFfiSpikeSuite extends AnyFunSuite { + + test("JVM -> native -> in-process Ballista -> JVM returns 5 rows over Arrow FFI") { + CometBallistaFfiSpikeSuite.assumeLibraryLoaded() + // Arrow's C Data JNI helper (arrow_cdata_jni) extracts itself into java.io.tmpdir; the surefire + // config points that at target/tmp, which may not exist yet when this suite runs alone. + Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir"))) + val native = new NativeBallista + + // 1. Native builds the fixed test proto (single NativeScan over a = [1..5]) and returns bytes. + val proto: Array[Byte] = native.buildTestProto() + assert(proto.nonEmpty, "native buildTestProto returned no bytes") + + val allocator = new RootAllocator(Long.MaxValue) + val provider = new CDataDictionaryProvider() + // One output column (`a`): allocate the C Data structs the JVM owns. + val arrowArray = ArrowArray.allocateNew(allocator) + val arrowSchema = ArrowSchema.allocateNew(allocator) + try { + // 2. 
JVM hands proto + struct addresses back to native, which runs Ballista and exports. + val numRows = native.executeQuery( + proto, + Array(arrowArray.memoryAddress()), + Array(arrowSchema.memoryAddress())) + assert(numRows == 5, s"expected 5 rows from Ballista, got $numRows") + + // 3. JVM imports the exported column over the Arrow C Data Interface. + val vector = Data.importVector(allocator, arrowArray, arrowSchema, provider) + try { + assert( + vector.getValueCount == 5, + s"expected 5 imported values, got ${vector.getValueCount}") + val ints = vector.asInstanceOf[IntVector] + val values = (0 until ints.getValueCount).map(ints.get) + assert(values == Seq(1, 2, 3, 4, 5), s"unexpected values: $values") + } finally { + vector.close() + } + } finally { + arrowArray.close() + arrowSchema.close() + provider.close() + allocator.close() + } + } +} + +object CometBallistaFfiSpikeSuite { + + @volatile private var loaded = false + @volatile private var loadError: Option[Throwable] = None + + /** + * Load `libdatafusion_comet_ballista` by absolute path (it is not on `java.library.path`). We + * try the `COMET_BALLISTA_LIB` env var first, then the debug/release build outputs relative to + * the `spark` module directory (surefire's working dir). 
+ */ + private def load(): Unit = synchronized { + if (loaded || loadError.isDefined) return + val libName = System.mapLibraryName("datafusion_comet_ballista") + val moduleDir = new File(System.getProperty("user.dir")) + val candidates = Seq( + sys.env.get("COMET_BALLISTA_LIB"), + Some(new File(moduleDir, s"../native/target/debug/$libName").getPath), + Some(new File(moduleDir, s"../native/target/release/$libName").getPath), + Some(new File(moduleDir, s"native/target/debug/$libName").getPath)).flatten + val found = candidates.find(p => Files.exists(Paths.get(p))) + found match { + case Some(path) => + try { + System.load(new File(path).getAbsolutePath) + loaded = true + } catch { + case t: Throwable => loadError = Some(t) + } + case None => + loadError = Some( + new UnsatisfiedLinkError( + s"could not find $libName in any of: ${candidates.mkString(", ")}")) + } + } + + def assumeLibraryLoaded(): Unit = { + load() + loadError.foreach { t => + org.scalatest.Assertions + .cancel(s"native ballista library not available: ${t.getMessage}", t) + } + } +} diff --git a/spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala new file mode 100644 index 0000000000..72192f07c5 --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.ballista + +/** + * JNI binding to the native driver-side Ballista submission entry, implemented in the + * `datafusion-comet-ballista` crate (`libdatafusion_comet_ballista`). + * + * SPIKE: kept in the test tree while the offload-to-Ballista mode is being proven out. + */ +class NativeBallista { + + /** + * Build the fixed spike test proto (a single `NativeScan` over a Parquet file with one int32 + * column `a` = [1..5]) native-side and return its serialized bytes. Lets the JVM test exercise + * the proto boundary without depending on the generated proto Java classes. + */ + @native def buildTestProto(): Array[Byte] + + /** + * Run a serialized Comet `Operator` proto on an in-process standalone Ballista engine (no Spark + * executors) and export the single result batch into the caller-allocated Arrow C Data structs. 
+ * + * @param proto + * serialized Comet `Operator` proto + * @param arrayAddrs + * memory addresses of one `ArrowArray` struct per output column + * @param schemaAddrs + * memory addresses of one `ArrowSchema` struct per output column + * @return + * the number of rows exported + */ + @native def executeQuery( + proto: Array[Byte], + arrayAddrs: Array[Long], + schemaAddrs: Array[Long]): Long +} From 19b4937694708ffbc3a607d2994123c607f13ff0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 15:48:49 -0600 Subject: [PATCH 08/42] docs(contributor-guide): add experimental Ballista execution mode page Overview, architecture, components, config, and a roadmap with task tracking for the driver-side Comet->Ballista offload work. Relates to #4796 --- .../contributor-guide/ballista_execution.md | 135 ++++++++++++++++++ docs/source/contributor-guide/index.md | 8 ++ 2 files changed, 143 insertions(+) create mode 100644 docs/source/contributor-guide/ballista_execution.md diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md new file mode 100644 index 0000000000..e818daa149 --- /dev/null +++ b/docs/source/contributor-guide/ballista_execution.md @@ -0,0 +1,135 @@ + + +# Experimental: Native Execution on Apache DataFusion Ballista + +> **Status: experimental / research.** This is an in-progress exploration, not a supported +> feature. Design discussion is tracked in +> [issue #4796](https://github.com/apache/datafusion-comet/issues/4796) and the initial code +> in draft PR [#4800](https://github.com/apache/datafusion-comet/pull/4800). Interfaces, +> configuration, and behavior described here are subject to change. + +## Overview + +Today Comet accelerates Spark by running native operators **inside Spark executors** via JNI. 
+This page describes an additional, optional deployment mode being prototyped: running Comet's +native operators on a distributed **Apache DataFusion Ballista** data plane instead of inside +Spark executors. + +In this mode the Spark cluster acts as a lightweight **control plane** — the driver plans the +query and hands it off — while all computation happens on Ballista. The existing in-Spark +accelerator is unchanged; this is purely additive. + +The feature is **not tied to Spark Connect**. The native side only consumes a serialized Comet +plan; it does not care how the plan was produced. Because Comet's `QueryPlanSerde` emits that +plan from *any* Spark physical plan, the same mechanism supports whole-query offload from a +regular Spark application as well as from a Spark Connect client. Execution is **all-or-nothing**: +a query offloaded to Ballista runs entirely on Ballista and its result terminates at the driver +(or is written out by Ballista). Interleaving Ballista execution with Spark's own distributed +execution within a single job is out of scope. 
+ +## Architecture + +``` + Spark app / Spark Connect client + │ + ▼ + ┌─────────────────────────────┐ CONTROL PLANE (Spark driver only) + │ Spark driver │ • Catalyst + Comet driver-side plan rules + │ + Comet plan rules │ • serialize the Comet Operator protobuf + │ + Ballista client │ • submit to Ballista; collect results here + └──────────────┬──────────────┘ • NO Spark executor tasks run + │ Comet plan protobuf + ▼ + ┌─────────────────────────────┐ DATA PLANE (Ballista, fully native) + │ Ballista scheduler │ • splits into stages / owns shuffle + │ Ballista executors ×N │ • rebuild the Comet plan over datafusion-ffi + │ (Comet-flavored) │ • run Comet operators + expressions + └─────────────────────────────┘ +``` + +Two design choices make this mostly integration rather than new invention: + +- **`datafusion-ffi` boundary, not co-linking.** Comet and Ballista track DataFusion on + independent schedules, so their crates are **not** linked together. Comet exposes a native + plan (built by its own `PhysicalPlanner`) as an `FFI_ExecutionPlan`; the Ballista side consumes + it as a `ForeignExecutionPlan`. They share only a stable C ABI (compatible within a DataFusion + major version). This means Comet's `planner.rs`, operators, and expressions are reused as-is, + with no reimplementation of plan translation. +- **Driver-side offload.** Comet's driver-side rules already build a root `CometNativeExec` that + holds the whole-query serialized plan. In Ballista mode the driver submits that plan to Ballista + and returns results at the driver, instead of dispatching an RDD job to Spark executors. + +## Components + +**Rust** (`native/`): +- `comet_ffi_plan_from_proto` (`datafusion-comet` core) — decodes a Comet `Operator` proto, builds + the plan with the existing `PhysicalPlanner`, returns an `FFI_ExecutionPlan`. +- `datafusion-comet-ballista` crate: + - `CometScanExec` — a serializable DataFusion leaf that rebuilds the plan over FFI at execute time. 
+ - `CometPhysicalCodec` / `CometLogicalCodec` — extension codecs that compose with Ballista's own + (delegating non-Comet nodes) so Comet plans can be shipped to Ballista executors. + - `CometTableProvider` — exposes a Comet plan to Ballista as a table. + +**JVM** (`spark/`): +- Driver-side offload hook and configuration (see below). + +## Configuration (planned / experimental) + +| Config | Default | Description | +| --- | --- | --- | +| `spark.comet.exec.ballista.enabled` | `false` | Offload Comet plans to Ballista at the driver instead of executing in Spark executors. | +| `spark.comet.exec.ballista.scheduler.url` | _(unset)_ | External Ballista scheduler to submit to. When unset, an in-process Ballista engine is used. | + +## Roadmap + +Legend: ✅ done · 🔨 in progress · ⬜ planned + +- ✅ **Rust core** — FFI plan export + `datafusion-comet-ballista` crate (`CometScanExec`, composed + codecs, `CometTableProvider`) with codec round-trip and standalone distributed tests. +- 🔨 **R1 — driver-side offload (single-stage).** A Spark app runs a query with + `spark.comet.exec.ballista.enabled=true`; the driver submits the whole Comet plan to Ballista and + returns results, with zero Spark-executor tasks. First target query: TPC-H Q1. + - 🔨 R1-T1 — JVM → native → in-process Ballista → JVM Arrow round-trip (spike). + - ⬜ R1-T2 — config flag + driver `executeCollect` override. + - ⬜ R1-T3 — end-to-end TPC-H Q1 via Ballista, results verified against Spark. + - ⬜ R1-T4 (R1b) — submit to an external Ballista scheduler + executor cluster. +- ⬜ **R2 — multi-stage distribution.** Map Comet's per-stage native fragments onto Ballista stages + with shuffle, so plans with exchanges (aggregations, joins) distribute across executors. +- ⬜ **JVM-free executor.** Feature-gate the JNI bridge so the native execution crates build without + `libjvm`, enabling a standalone Ballista executor binary. 
+- ⬜ **Multi-partition scans.** Map a scan's file groups to multiple partitions (currently a + `NativeScan` proto encodes a single partition). +- ⬜ **Wider coverage.** Broaden operator/expression coverage; capture real plans from the JVM + `QueryPlanSerde` for validation. +- ⬜ **Spark Connect front-end.** Package the driver as a Spark Connect server for unchanged clients. + +## Known limitations + +- Single-stage only in R1 (no distribution yet); plans containing an exchange are rejected. +- Scans are single-partition today. +- The FFI boundary requires Comet and Ballista to be built against the same DataFusion **major** + version. +- Comet core links the JNI bridge, so `libjvm` must be present at runtime even where JNI is unused. + +## References + +- Proposal and discussion: [issue #4796](https://github.com/apache/datafusion-comet/issues/4796) +- Prototype code: draft PR [#4800](https://github.com/apache/datafusion-comet/pull/4800) +- [Arrow FFI Usage in Comet](ffi.md) diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 5236d03b9d..95a9c1e85e 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -52,6 +52,14 @@ ANSI Error Propagation S3 Credential Provider Design ``` +```{toctree} +:maxdepth: 2 +:caption: Experimental +:hidden: + +Native Execution on Ballista +``` + ```{toctree} :maxdepth: 2 :caption: Adding Functionality From 8fdc72b056afdd2e5cd8cd55ea9dc6281c98d3f9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 16:06:17 -0600 Subject: [PATCH 09/42] feat(ballista): offload single-stage Comet collect() to in-process Ballista Add a driver-side seam so a `collect()` on a single-stage Comet query runs on the Spark driver via an in-process Ballista engine, launching no Spark executor tasks. Gated by `spark.comet.exec.ballista.enabled` (default false). - CometConf: add COMET_EXEC_BALLISTA_ENABLED. 
- CometExec.executeCollect and the Comet columnar-to-row nodes (the real collect roots) dispatch to CometExec.executeCollectViaBallista when the flag is on. It locates the single CometNativeExec boundary carrying the serialized plan, injects each NativeScan's file partitions into the template proto (merging all partitions into one native scan leaf), submits it via NativeBallista, imports the exported Arrow batch on the driver, and materializes the rows. Plans with more than one native block (an exchange) throw UnsupportedOperationException. - NativeBallista: promote from the test tree to main and load libdatafusion_comet_ballista after libcomet so core JNI symbols bind to libcomet (the ballista cdylib re-exports them) rather than a divergent copy. - ffi_jni: derive the result schema from the built plan's schema() instead of the NativeScan proto, so plans with operators above the scan report their true output schema. - Tests: add CometBallistaOffloadSuite proving identical rows flag-on vs -off and zero executor task starts (SparkListener) for the offloaded collect. Relates to #4796. 
--- native/ballista/src/ffi_jni.rs | 55 +++---- .../scala/org/apache/comet/CometConf.scala | 11 ++ .../comet/ballista/NativeBallista.scala | 141 ++++++++++++++++++ .../sql/comet/CometColumnarToRowExec.scala | 13 ++ .../comet/CometNativeColumnarToRowExec.scala | 11 ++ .../apache/spark/sql/comet/operators.scala | 110 +++++++++++++- .../ballista/CometBallistaFfiSpikeSuite.scala | 37 +---- .../ballista/CometBallistaOffloadSuite.scala | 104 +++++++++++++ .../comet/ballista/NativeBallista.scala | 54 ------- 9 files changed, 411 insertions(+), 125 deletions(-) create mode 100644 spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala create mode 100644 spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala delete mode 100644 spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala diff --git a/native/ballista/src/ffi_jni.rs b/native/ballista/src/ffi_jni.rs index 6ffb1223e2..ceb5b8a412 100644 --- a/native/ballista/src/ffi_jni.rs +++ b/native/ballista/src/ffi_jni.rs @@ -33,9 +33,9 @@ use datafusion::arrow::compute::concat_batches; use datafusion::arrow::datatypes::{Field, Schema, SchemaRef}; use datafusion::arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; use datafusion::execution::SessionStateBuilder; +use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{SessionConfig, SessionContext}; -use comet::execution::serde::to_arrow_datatype; use datafusion_comet_proto::spark_operator::{operator::OpStruct, Operator}; use prost::Message; @@ -117,44 +117,23 @@ pub fn build_test_proto() -> Result, String> { Ok(op.encode_to_vec()) } +use crate::scan::CometScanExec; use crate::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; -/// Derive the Arrow result schema from the `NativeScan` leaf carried in the -/// proto. For this spike the offloaded plan is a single `NativeScan`, so its -/// `required_schema` is the query's output schema. 
-fn schema_from_proto(op: &Operator) -> Result<SchemaRef, String> { - let native_scan = match op.op_struct.as_ref() { - Some(OpStruct::NativeScan(scan)) => scan, - _ => return Err("expected a NativeScan operator at the plan root".to_string()), - }; - let common = native_scan - .common - .as_ref() - .ok_or_else(|| "NativeScan is missing NativeScanCommon".to_string())?; - let fields: Vec<Field> = common - .required_schema - .iter() - .map(|f| { - let dt = f - .data_type - .as_ref() - .ok_or_else(|| format!("field {} has no data type", f.name))?; - Ok(Field::new(&f.name, to_arrow_datatype(dt), f.nullable)) - }) - .collect::<Result<_, String>>()?; - Ok(Arc::new(Schema::new(fields))) -} - /// Run a Comet `Operator` proto on an in-process standalone Ballista engine and /// return the collected Arrow batches plus the result schema. /// -/// This reuses the exact "proto → standalone Ballista → RecordBatches" recipe -/// validated in `tests/distributed.rs`, but runs `SELECT * FROM t` (no shuffle) -/// because the spike only needs to prove the result boundary. +/// This reuses the "proto → standalone Ballista → RecordBatches" recipe +/// validated in `tests/distributed.rs`, running `SELECT * FROM t` (no shuffle) +/// over a table provider that carries the whole Comet plan proto — so any +/// operators above the scan (filter/project/aggregate) run natively too. +/// +/// The result schema is derived from the **built** Comet plan's `schema()` +/// (not from the scan proto's `required_schema`), so plans with operators above +/// the scan report their true output schema rather than the raw scan schema. pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec<RecordBatch>), String> { - let op = - Operator::decode(proto).map_err(|e| format!("failed to decode Operator proto: {e}"))?; - let schema = schema_from_proto(&op)?; + // Validate the proto decodes before spinning up the engine.
+ Operator::decode(proto).map_err(|e| format!("failed to decode Operator proto: {e}"))?; let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -162,6 +141,16 @@ pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec<RecordBatch>) .map_err(|e| format!("failed to build tokio runtime: {e}"))?; runtime.block_on(async move { + // Build the whole Comet plan once (inside the Tokio runtime, which + // `CometScanExec::try_new` requires) so we can read its true output + // schema. This is the fix for the T1 spike's scan-schema shortcut: the + // result schema now comes from the plan, not the NativeScan proto. + let built: Arc<dyn ExecutionPlan> = Arc::new( + CometScanExec::try_new(proto.to_vec()) + .map_err(|e| format!("failed to build Comet plan: {e}"))?, + ); + let schema = built.schema(); + // In-process standalone Ballista cluster (scheduler + executor) with the // Comet codecs registered so the Comet leaf survives serialization. let config = SessionConfig::new_with_ballista() diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index 8e47151358..4340fa590e 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -280,6 +280,17 @@ object CometConf extends ShimCometConf { val COMET_EXEC_LOCAL_TABLE_SCAN_ENABLED: ConfigEntry[Boolean] = createExecEnabledConfig("localTableScan", defaultValue = false) + val COMET_EXEC_BALLISTA_ENABLED: ConfigEntry[Boolean] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.ballista.enabled") + .category(CATEGORY_EXEC) + .doc("EXPERIMENTAL: When enabled, a `collect()` on a single-stage Comet-accelerated query " + + "is offloaded from the Spark driver to an in-process Apache DataFusion Ballista engine. " + + "The already-serialized whole-query Comet plan is submitted to Ballista and the result " + + "rows are returned directly on the driver, with no Spark executor tasks launched.
Only " + + "single-stage plans (no exchange) are supported.") + .booleanConf + .createWithDefault(false) + val COMET_NATIVE_COLUMNAR_TO_ROW_ENABLED: ConfigEntry[Boolean] = conf(s"$COMET_EXEC_CONFIG_PREFIX.columnarToRow.native.enabled") .category(CATEGORY_EXEC) diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala new file mode 100644 index 0000000000..33f295eadf --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.ballista + +import java.io.File +import java.nio.file.{Files, Paths} + +import org.apache.comet.NativeBase + +/** + * JNI binding to the native driver-side Ballista submission entry, implemented in the + * `datafusion-comet-ballista` crate (`libdatafusion_comet_ballista`). + * + * EXPERIMENTAL (R1): used by [[org.apache.spark.sql.comet.CometExec.executeCollectViaBallista]] + * to offload a single-stage Comet query to an in-process Ballista engine on the Spark driver. + */ +class NativeBallista { + + // Ensure the native library is loaded before any native method is invoked. 
+ NativeBallista.ensureLoaded() + + /** + * Build the fixed spike test proto (a single `NativeScan` over a Parquet file with one int32 + * column `a` = [1..5]) native-side and return its serialized bytes. Lets tests exercise the + * proto boundary without depending on the generated proto Java classes. + */ + @native def buildTestProto(): Array[Byte] + + /** + * Run a serialized Comet `Operator` proto on an in-process standalone Ballista engine (no Spark + * executors) and export the single (concatenated) result batch into the caller-allocated Arrow + * C Data structs. + * + * @param proto + * serialized Comet `Operator` proto + * @param arrayAddrs + * memory addresses of one `ArrowArray` struct per output column + * @param schemaAddrs + * memory addresses of one `ArrowSchema` struct per output column + * @return + * the number of rows exported + */ + @native def executeQuery( + proto: Array[Byte], + arrayAddrs: Array[Long], + schemaAddrs: Array[Long]): Long +} + +object NativeBallista { + + @volatile private var loaded = false + @volatile private var loadError: Option[Throwable] = None + + /** + * Load `libdatafusion_comet_ballista`. + * + * Symbol ownership: the ballista cdylib statically links Comet core and therefore re-exports + * core's `Java_org_apache_comet_Native_*` JNI symbols in addition to its own distinct + * `Java_org_apache_comet_ballista_NativeBallista_*` entries. We force `libcomet` (via + * [[NativeBase]]) to load FIRST so the JVM binds every core native method to `libcomet`; + * loading the ballista library afterwards contributes only the distinct `NativeBallista_*` + * symbols. This keeps all Comet core state in a single library and avoids two divergent copies. + * + * The library is not on `java.library.path`, so we resolve it by absolute path: the + * `COMET_BALLISTA_LIB` env var first, then the debug/release build outputs relative to the + * module working directory. 
+ */ + private def load(): Unit = synchronized { + if (loaded || loadError.isDefined) return + + // Load libcomet first so core JNI symbols bind to it, not to the ballista cdylib's re-exports. + try { + NativeBase.isLoaded() + } catch { + case t: Throwable => + loadError = Some(t) + return + } + + val libName = System.mapLibraryName("datafusion_comet_ballista") + val moduleDir = new File(System.getProperty("user.dir")) + val candidates = Seq( + sys.env.get("COMET_BALLISTA_LIB"), + Some(new File(moduleDir, s"../native/target/debug/$libName").getPath), + Some(new File(moduleDir, s"../native/target/release/$libName").getPath), + Some(new File(moduleDir, s"native/target/debug/$libName").getPath), + Some(new File(moduleDir, s"native/target/release/$libName").getPath)).flatten + candidates.find(p => Files.exists(Paths.get(p))) match { + case Some(path) => + try { + System.load(new File(path).getAbsolutePath) + loaded = true + } catch { + case t: Throwable => loadError = Some(t) + } + case None => + loadError = Some( + new UnsatisfiedLinkError( + s"could not find $libName in any of: ${candidates.mkString(", ")}")) + } + } + + /** Load the library, throwing if it cannot be loaded. */ + def ensureLoaded(): Unit = { + load() + loadError.foreach { t => + throw new IllegalStateException( + s"failed to load native ballista library: ${t.getMessage}", + t) + } + } + + /** True if the native ballista library is available (loads it on first call). */ + def isAvailable: Boolean = { + load() + loaded + } + + /** The load failure, if any (loads the library on first call). 
*/ + def loadFailure: Option[Throwable] = { + load() + loadError + } +} diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala index 2fe870ed06..48a9b9aec4 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometColumnarToRowExec.scala @@ -45,6 +45,8 @@ import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.util.{SparkFatalException, Utils} import org.apache.spark.util.io.ChunkedByteBuffer +import org.apache.comet.CometConf + /** * Copied from Spark `ColumnarToRowExec`. Comet needs the fix for SPARK-50235 but cannot wait for * the fix to be released in Spark versions. We copy the implementation here to apply the fix. @@ -62,6 +64,17 @@ case class CometColumnarToRowExec(child: SparkPlan) override def outputOrdering: Seq[SortOrder] = child.outputOrdering + override def executeCollect(): Array[InternalRow] = { + if (CometConf.COMET_EXEC_BALLISTA_ENABLED.get()) { + // EXPERIMENTAL (R1): offload the whole-query native plan to an in-process Ballista engine on + // the driver instead of launching a Spark job. This ColumnarToRow node is the collect root, + // so the CometNativeExec boundary carrying the serialized plan is in its subtree. + CometExec.executeCollectViaBallista(this) + } else { + super.executeCollect() + } + } + // `ColumnarToRowExec` processes the input RDD directly, which is kind of a leaf node in the // codegen stage and needs to do the limit check. 
protected override def canCheckLimitNotReached: Boolean = true diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala index 6fa220b728..0d39a5517f 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeColumnarToRowExec.scala @@ -69,6 +69,17 @@ case class CometNativeColumnarToRowExec(child: SparkPlan) override def outputOrdering: Seq[SortOrder] = child.outputOrdering + override def executeCollect(): Array[InternalRow] = { + if (CometConf.COMET_EXEC_BALLISTA_ENABLED.get()) { + // EXPERIMENTAL (R1): offload the whole-query native plan to an in-process Ballista engine on + // the driver instead of launching a Spark job. This ColumnarToRow node is the collect root, + // so the CometNativeExec boundary carrying the serialized plan is in its subtree. + CometExec.executeCollectViaBallista(this) + } else { + super.executeCollect() + } + } + override lazy val metrics: Map[String, SQLMetric] = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numInputBatches" -> SQLMetrics.createMetric(sparkContext, "number of input batches"), diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index e4d6b53770..7547e626db 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -53,6 +53,7 @@ import com.google.protobuf.CodedOutputStream import org.apache.comet.{CometConf, CometExecIterator, CometRuntimeException, ConfigEntry} import org.apache.comet.CometSparkSessionExtensions.{isCometShuffleEnabled, withFallbackReason} +import org.apache.comet.ballista.NativeBallista import org.apache.comet.parquet.CometParquetUtils import 
org.apache.comet.rules.CometExecRule import org.apache.comet.serde.{CometOperatorSerde, Compatible, Incompatible, OperatorOuterClass, SupportLevel, Unsupported} @@ -60,6 +61,7 @@ import org.apache.comet.serde.OperatorOuterClass.{AggregateMode => CometAggregat import org.apache.comet.serde.QueryPlanSerde import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, isStringCollationType, supportedSortType} import org.apache.comet.serde.operator.CometSink +import org.apache.comet.vector.NativeUtil /** * Trait for injecting per-partition planning data into operator nodes. @@ -271,8 +273,16 @@ abstract class CometExec extends CometPlan { override def doExecute(): RDD[InternalRow] = ColumnarToRowExec(this).doExecute() - override def executeCollect(): Array[InternalRow] = - ColumnarToRowExec(this).executeCollect() + override def executeCollect(): Array[InternalRow] = { + if (CometConf.COMET_EXEC_BALLISTA_ENABLED.get()) { + // EXPERIMENTAL (R1): offload the whole-query native plan to an in-process + // Ballista engine on the driver and return the rows directly, launching no + // Spark executor tasks. See CometExec.executeCollectViaBallista. + CometExec.executeCollectViaBallista(this) + } else { + ColumnarToRowExec(this).executeCollect() + } + } override def outputOrdering: Seq[SortOrder] = originalPlan.outputOrdering @@ -392,6 +402,102 @@ object CometExec { encryptedFilePaths) } + /** + * EXPERIMENTAL (R1): offload a single-stage Comet query to an in-process Apache DataFusion + * Ballista engine on the Spark driver and return the collected rows, launching NO Spark + * executor tasks. + * + * Enabled by `spark.comet.exec.ballista.enabled`. The whole-query native plan is already + * serialized on the boundary [[CometNativeExec]] (`serializedPlanOpt.plan`, produced by + * `convertBlock`). 
We hand those proto bytes to [[NativeBallista.executeQuery]], which runs + * them on Ballista and exports the (single, concatenated) result batch back over the Arrow C + * Data Interface; we import it via [[NativeUtil]] and materialize the rows on the driver. + * + * Only single-stage plans are supported: exactly one native block (no exchange). Anything else + * throws [[UnsupportedOperationException]]. + */ + def executeCollectViaBallista(root: SparkPlan): Array[InternalRow] = { + // Every boundary node (top of a native block) carries a serialized plan. More than one means + // the plan spans a shuffle boundary -> multiple stages, which R1 does not support. + val boundaries = root.collect { + case n: CometNativeExec if n.serializedPlanOpt.isDefined => n + } + val boundary = boundaries match { + case Seq(single) => single + case _ => + throw new UnsupportedOperationException( + "Comet Ballista offload (R1) supports single-stage plans only; found " + + s"${boundaries.size} serialized native plan blocks in:\n$root") + } + val planBytes = boundary.serializedPlanOpt.plan.getOrElse( + throw new UnsupportedOperationException( + "Comet Ballista offload (R1) supports single-stage plans only; " + + s"the native plan block carries no serialized plan:\n$root")) + + // The serialized template plan carries each NativeScan's `common` metadata but NOT its file + // list: Comet normally injects file partitions per-partition at task launch (see + // NativeScanPlanDataInjector). Since the offload runs the whole plan as a single native leaf, + // inject all partitions' files into one scan so Ballista reads the complete table. 
+ val nativeScans = boundary.collect { case s: CometNativeScanExec => s } + val injectedPlanBytes = if (nativeScans.isEmpty) { + planBytes + } else { + val commonByKey = nativeScans.map { scan => + scan.ensureSubqueriesResolved() + scan.sourceKey -> scan.commonData + }.toMap + val partitionByKey = nativeScans.map { scan => + scan.sourceKey -> mergeFilePartitions(scan.perPartitionData) + }.toMap + val template = Operator.parseFrom(planBytes) + val injected = PlanDataInjector.injectPlanData(template, commonByKey, partitionByKey) + PlanDataInjector.serializeOperator(injected) + } + + val numCols = boundary.output.length + val nativeUtil = new NativeUtil() + try { + val nativeBallista = new NativeBallista + // Ballista concatenates the whole result into a single exported batch, so one import is + // sufficient for R1's single-stage plans. + nativeUtil.getNextBatch( + numCols, + (arrayAddrs, schemaAddrs) => + nativeBallista.executeQuery(injectedPlanBytes, arrayAddrs, schemaAddrs)) match { + case Some(batch) => + try { + batch.rowIterator().asScala.map(_.copy()).toArray + } finally { + batch.close() + } + case None => + Array.empty[InternalRow] + } + } finally { + nativeUtil.close() + } + } + + /** + * Merge the per-partition file lists of a native scan into a single `NativeScan` carrying every + * partition's files, serialized as the `partitionBytes` expected by + * [[NativeScanPlanDataInjector]] (a `NativeScan` whose `file_partition` holds all + * `partitioned_file`s). Used by the Ballista offload so the whole table is read by one native + * scan leaf. 
+ */ + private def mergeFilePartitions(perPartitionData: Array[Array[Byte]]): Array[Byte] = { + val filePartition = OperatorOuterClass.SparkFilePartition.newBuilder() + perPartitionData.foreach { bytes => + val scan = OperatorOuterClass.NativeScan.parseFrom(bytes) + filePartition.addAllPartitionedFile(scan.getFilePartition.getPartitionedFileList) + } + OperatorOuterClass.NativeScan + .newBuilder() + .setFilePartition(filePartition) + .build() + .toByteArray + } + /** * Executes this Comet operator and serialized output ColumnarBatch into bytes. */ diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala index bb46e10533..834230e385 100644 --- a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala @@ -19,7 +19,6 @@ package org.apache.comet.ballista -import java.io.File import java.nio.file.{Files, Paths} import org.scalatest.funsuite.AnyFunSuite @@ -88,42 +87,8 @@ class CometBallistaFfiSpikeSuite extends AnyFunSuite { object CometBallistaFfiSpikeSuite { - @volatile private var loaded = false - @volatile private var loadError: Option[Throwable] = None - - /** - * Load `libdatafusion_comet_ballista` by absolute path (it is not on `java.library.path`). We - * try the `COMET_BALLISTA_LIB` env var first, then the debug/release build outputs relative to - * the `spark` module directory (surefire's working dir). 
- */ - private def load(): Unit = synchronized { - if (loaded || loadError.isDefined) return - val libName = System.mapLibraryName("datafusion_comet_ballista") - val moduleDir = new File(System.getProperty("user.dir")) - val candidates = Seq( - sys.env.get("COMET_BALLISTA_LIB"), - Some(new File(moduleDir, s"../native/target/debug/$libName").getPath), - Some(new File(moduleDir, s"../native/target/release/$libName").getPath), - Some(new File(moduleDir, s"native/target/debug/$libName").getPath)).flatten - val found = candidates.find(p => Files.exists(Paths.get(p))) - found match { - case Some(path) => - try { - System.load(new File(path).getAbsolutePath) - loaded = true - } catch { - case t: Throwable => loadError = Some(t) - } - case None => - loadError = Some( - new UnsatisfiedLinkError( - s"could not find $libName in any of: ${candidates.mkString(", ")}")) - } - } - def assumeLibraryLoaded(): Unit = { - load() - loadError.foreach { t => + NativeBallista.loadFailure.foreach { t => org.scalatest.Assertions .cancel(s"native ballista library not available: ${t.getMessage}", t) } diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala new file mode 100644 index 0000000000..fac7e0214d --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.ballista + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.CometListenerBusUtils +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.internal.SQLConf + +import org.apache.comet.CometConf + +/** + * Proves the driver-side "offload to Ballista" collect path (R1): when + * `spark.comet.exec.ballista.enabled=true`, a `collect()` on a single-stage Comet query runs on + * the Spark driver via an in-process Ballista engine and returns the same rows as the normal + * path, launching NO Spark executor tasks. + */ +class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelper { + + test("single-stage collect is offloaded to Ballista with no Spark executor tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + withTempPath { dir => + import testImplicits._ + + // A single Parquet file (coalesce(1)) with two int columns, so the offloaded plan is a + // clean single-stage scan with no exchange. + Seq((1, 10), (2, 20), (3, 30), (4, 40), (5, 50)) + .toDF("a", "b") + .coalesce(1) + .write + .parquet(dir.getCanonicalPath) + + // Disable AQE so the collect root is the Comet columnar-to-row node (which carries our + // executeCollect override) rather than an AdaptiveSparkPlanExec wrapper. 
+ withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t") + val query = "SELECT a, b FROM t WHERE a > 2" + + // Baseline: normal Comet execution (offload off). + val baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(query).collect().map(_.toSeq).toSet + } + assert( + baseline == Set(Seq(3, 30), Seq(4, 40), Seq(5, 50)), + s"unexpected baseline: $baseline") + + // Ballista offload: count executor task starts around the collect. + val taskStarts = new AtomicInteger(0) + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + taskStarts.incrementAndGet() + } + } + // Drain any events from setup/baseline before attaching, so the counter only sees the + // offloaded collect. + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + spark.sparkContext.addSparkListener(listener) + + val offloaded = + try { + withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + val df = spark.sql(query) + val rows = df.collect().map(_.toSeq).toSet + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + rows + } + } finally { + spark.sparkContext.removeSparkListener(listener) + } + + // Same rows via the offloaded path... + assert(offloaded == baseline, s"offloaded rows $offloaded != baseline $baseline") + // ...and crucially, NO Spark executor tasks ran for the offloaded collect. 
+ assert( + taskStarts.get() == 0, + s"expected 0 Spark executor tasks for the Ballista-offloaded collect, " + + s"but ${taskStarts.get()} started") + } + } + } +} diff --git a/spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala deleted file mode 100644 index 72192f07c5..0000000000 --- a/spark/src/test/scala/org/apache/comet/ballista/NativeBallista.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.comet.ballista - -/** - * JNI binding to the native driver-side Ballista submission entry, implemented in the - * `datafusion-comet-ballista` crate (`libdatafusion_comet_ballista`). - * - * SPIKE: kept in the test tree while the offload-to-Ballista mode is being proven out. - */ -class NativeBallista { - - /** - * Build the fixed spike test proto (a single `NativeScan` over a Parquet file with one int32 - * column `a` = [1..5]) native-side and return its serialized bytes. Lets the JVM test exercise - * the proto boundary without depending on the generated proto Java classes. 
- */ - @native def buildTestProto(): Array[Byte] - - /** - * Run a serialized Comet `Operator` proto on an in-process standalone Ballista engine (no Spark - * executors) and export the single result batch into the caller-allocated Arrow C Data structs. - * - * @param proto - * serialized Comet `Operator` proto - * @param arrayAddrs - * memory addresses of one `ArrowArray` struct per output column - * @param schemaAddrs - * memory addresses of one `ArrowSchema` struct per output column - * @return - * the number of rows exported - */ - @native def executeQuery( - proto: Array[Byte], - arrayAddrs: Array[Long], - schemaAddrs: Array[Long]): Long -} From 238439a6605e4e443e80d79242f614eb19d71d44 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 16:17:53 -0600 Subject: [PATCH 10/42] test(ballista): strengthen offload guard + zero-task proofs, note DPP caveat Relates to #4796. - Add a negative test that runs a multi-partition GROUP BY (forces an exchange, disabling AQE for determinism) with the offload flag on and asserts the single-stage guard in executeCollectViaBallista throws UnsupportedOperationException, after confirming the plan actually contains an exchange. - Add a positive control to the existing zero-task test: run the flag-off baseline collect() through the same SparkListener/waitUntilEmpty apparatus and assert task starts > 0, proving the listener genuinely observes executor tasks so the == 0 assertion for the offloaded collect is meaningful. - Document in COMET_EXEC_BALLISTA_ENABLED that R1 targets single-stage queries without dynamic partition pruning or correlated scalar subqueries, since resolving those via waitForSubqueries()/ updateResult() can still launch Spark executor tasks under offload. 
--- .../scala/org/apache/comet/CometConf.scala | 5 +- .../ballista/CometBallistaOffloadSuite.scala | 113 ++++++++++++++---- 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index 4340fa590e..d9c511d3df 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -287,7 +287,10 @@ object CometConf extends ShimCometConf { "is offloaded from the Spark driver to an in-process Apache DataFusion Ballista engine. " + "The already-serialized whole-query Comet plan is submitted to Ballista and the result " + "rows are returned directly on the driver, with no Spark executor tasks launched. Only " + - "single-stage plans (no exchange) are supported.") + "single-stage plans (no exchange) are supported. R1 targets single-stage queries without " + + "dynamic partition pruning or correlated scalar subqueries: resolving those inputs " + + "(via `waitForSubqueries()`/`updateResult()` before the plan is handed to Ballista) can " + + "still transitively launch Spark executor tasks even with this flag enabled.") .booleanConf .createWithDefault(false) diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala index fac7e0214d..ec45f63567 100644 --- a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala @@ -37,6 +37,29 @@ import org.apache.comet.CometConf */ class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelper { + /** + * Runs `f`, counting Spark executor task starts that occur during it. 
Drains the listener bus + * before attaching (so events from prior setup don't leak in) and after running `f` (so + * asynchronously-dispatched task-start events are flushed before we read the counter). + */ + private def countTaskStarts(f: => Unit): Int = { + val taskStarts = new AtomicInteger(0) + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + taskStarts.incrementAndGet() + } + } + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + spark.sparkContext.addSparkListener(listener) + try { + f + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + taskStarts.get() + } + test("single-stage collect is offloaded to Ballista with no Spark executor tasks") { assume( NativeBallista.isAvailable, @@ -59,45 +82,85 @@ class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelp spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t") val query = "SELECT a, b FROM t WHERE a > 2" - // Baseline: normal Comet execution (offload off). - val baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { - spark.sql(query).collect().map(_.toSeq).toSet + // Baseline: normal Comet execution (offload off), run through the same + // listener/waitUntilEmpty apparatus used for the offloaded case below. This is a + // positive control: it proves the listener actually observes executor task starts (i.e. + // it isn't a broken apparatus that would report 0 regardless), so the `== 0` assertion + // for the offloaded collect is meaningful. 
+ var baseline: Set[Seq[Any]] = null + val baselineTaskStarts = countTaskStarts { + baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(query).collect().map(_.toSeq).toSet + } } assert( baseline == Set(Seq(3, 30), Seq(4, 40), Seq(5, 50)), s"unexpected baseline: $baseline") + assert( + baselineTaskStarts > 0, + "expected the flag-off baseline collect to launch at least one Spark executor task " + + "(sanity check that the listener/waitUntilEmpty apparatus catches task starts); " + + s"got $baselineTaskStarts") // Ballista offload: count executor task starts around the collect. - val taskStarts = new AtomicInteger(0) - val listener = new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { - taskStarts.incrementAndGet() + var offloaded: Set[Seq[Any]] = null + val offloadedTaskStarts = countTaskStarts { + offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + spark.sql(query).collect().map(_.toSeq).toSet } } - // Drain any events from setup/baseline before attaching, so the counter only sees the - // offloaded collect. - CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) - spark.sparkContext.addSparkListener(listener) - - val offloaded = - try { - withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { - val df = spark.sql(query) - val rows = df.collect().map(_.toSeq).toSet - CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) - rows - } - } finally { - spark.sparkContext.removeSparkListener(listener) - } // Same rows via the offloaded path... assert(offloaded == baseline, s"offloaded rows $offloaded != baseline $baseline") // ...and crucially, NO Spark executor tasks ran for the offloaded collect. 
assert( - taskStarts.get() == 0, + offloadedTaskStarts == 0, s"expected 0 Spark executor tasks for the Ballista-offloaded collect, " + - s"but ${taskStarts.get()} started") + s"but $offloadedTaskStarts started") + } + } + } + + test("multi-stage collect (exchange present) throws under Ballista offload") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + withTempPath { dir => + import testImplicits._ + + // Several partition files with repeated keys, so a `GROUP BY` requires a shuffle + // (exchange) to aggregate across partitions -> more than one CometNativeExec boundary. + Seq((1, 10), (1, 20), (2, 30), (2, 40), (3, 50), (3, 60), (4, 70), (4, 80)) + .toDF("k", "v") + .repartition(4) + .write + .parquet(dir.getCanonicalPath) + + // Disable AQE so the shuffle boundary/plan shape is deterministic (no runtime coalescing + // of the exchange back down to a single stage). + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t2") + val query = "SELECT k, count(*) FROM t2 GROUP BY k" + + withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + val df = spark.sql(query) + // Sanity check: the plan does contain an exchange, i.e. the single-stage guard is + // actually exercised and not vacuously satisfied. 
+ val hasExchange = df.queryExecution.executedPlan.collect { + case e: org.apache.spark.sql.execution.exchange.Exchange => e + }.nonEmpty + assert( + hasExchange, + s"expected an exchange in the plan:\n${df.queryExecution.executedPlan}") + + val ex = intercept[UnsupportedOperationException] { + df.collect() + } + assert( + ex.getMessage.contains("single-stage plans only"), + s"unexpected exception message: ${ex.getMessage}") + } } } } From 51782f989918f8d21b330e2c175428ba986c6992 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 16:19:00 -0600 Subject: [PATCH 11/42] docs(contributor-guide): mark R1-T1/T2 done, note DPP/subquery caveat Relates to #4796 --- docs/source/contributor-guide/ballista_execution.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index e818daa149..0a7fef8f22 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -106,8 +106,8 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - 🔨 **R1 — driver-side offload (single-stage).** A Spark app runs a query with `spark.comet.exec.ballista.enabled=true`; the driver submits the whole Comet plan to Ballista and returns results, with zero Spark-executor tasks. First target query: TPC-H Q1. - - 🔨 R1-T1 — JVM → native → in-process Ballista → JVM Arrow round-trip (spike). - - ⬜ R1-T2 — config flag + driver `executeCollect` override. + - ✅ R1-T1 — JVM → native → in-process Ballista → JVM Arrow round-trip (spike). + - ✅ R1-T2 — config flag + driver `executeCollect` override. - ⬜ R1-T3 — end-to-end TPC-H Q1 via Ballista, results verified against Spark. - ⬜ R1-T4 (R1b) — submit to an external Ballista scheduler + executor cluster. 
- ⬜ **R2 — multi-stage distribution.** Map Comet's per-stage native fragments onto Ballista stages @@ -124,6 +124,8 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - Single-stage only in R1 (no distribution yet); plans containing an exchange are rejected. - Scans are single-partition today. +- Queries with dynamic partition pruning or correlated scalar subqueries may still launch Spark + executor tasks to resolve those inputs, even in Ballista mode. - The FFI boundary requires Comet and Ballista to be built against the same DataFusion **major** version. - Comet core links the JNI bridge, so `libjvm` must be present at runtime even where JNI is unused. From f4c5d00908cf5621299f16afae686a7d3159771e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 16:29:16 -0600 Subject: [PATCH 12/42] test(ballista): offload TPC-H Q1 pre-aggregation slice via in-process Ballista Add CometBallistaQ1Suite, the milestone demonstration for the R1 driver-side Ballista offload using TPC-H Q1 lineitem data and per-row semantics. A synthetic single-file lineitem (correct decimal(12,2)/date/string types, rows straddling the Q1 date cutoff) is queried with spark.comet.exec.ballista.enabled=true and its collected rows are asserted identical to the flag-off Comet baseline, with zero Spark executor tasks (SparkListener), reusing CometBallistaOffloadSuite's apparatus and positive control. Full Q1 cannot be offloaded under R1's single-serialized-block machinery: the GROUP BY forces either a partial->exchange->final shuffle (two blocks) or, with coalesce(1), a CometCoalesce sink boundary (still two blocks); the file scan's UnknownPartitioning never satisfies the aggregate's ClusteredDistribution, so the exchange is unavoidable. The multi-block/aggregate case is R2. 
The suite therefore offloads the largest single-block subset of Q1 -- scan + WHERE date filter + Q1's decimal arithmetic projections (disc_price, charge) -- as one exchange-free CometNativeExec, exercising the native Parquet scan, date filtering, and decimal multiplication that matter for offload correctness. The plan is asserted single-block before offloading. Relates to #4796. --- .../comet/ballista/CometBallistaQ1Suite.scala | 299 ++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala new file mode 100644 index 0000000000..fdc010f10a --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.ballista + +import java.math.{BigDecimal => JBigDecimal} +import java.sql.Date +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.CometListenerBusUtils +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.sql.{CometTestBase, Row} +import org.apache.spark.sql.comet.CometNativeExec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.exchange.Exchange +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.comet.CometConf + +/** + * The milestone demonstration for the R1 driver-side Ballista offload, using TPC-H Q1's + * `lineitem` data and per-row semantics: run the query with + * `spark.comet.exec.ballista.enabled=true` and prove the collected rows are identical to the + * flag-off (Spark/Comet-on-executors) baseline, while launching ZERO Spark executor tasks. + * + * Scope note — why this offloads the pre-aggregation slice of Q1, not the full aggregate: + * + * The R1 offload path only accepts a plan with exactly ONE serialized `CometNativeExec` block (a + * single native leaf reading Parquet directly). Full Q1's `GROUP BY` cannot be squeezed into one + * such block under this machinery: + * - Plain read: Spark plans partial-agg -> `CometExchange` -> final-agg, i.e. a shuffle + * boundary \=> two serialized blocks (the guard rejects it as multi-stage). The Parquet scan + * reports `UnknownPartitioning`, which never satisfies the aggregate's + * `ClusteredDistribution`, so the exchange is unavoidable regardless of how many + * files/partitions the input has. + * - `.coalesce(1)`: this removes the exchange (the single-partition child satisfies the + * distribution) but inserts a `CometCoalesce` *sink*, which is itself a native-block boundary + * \=> still two serialized blocks, still rejected. 
So no arrangement of full Q1 is a single + * exchange-free `CometNativeExec` with a Parquet leaf in R1; the multi-block/aggregate case + * is R2 and explicitly out of scope here (see task brief). + * + * We therefore offload the largest single-block subset of Q1: its scan + `WHERE` date filter + + * the exact decimal arithmetic projections that feed the aggregate (`disc_price`, `charge`). This + * exercises the parts that matter for offload correctness — Parquet native scan, date filtering + * against the Q1 cutoff, and Q1's decimal multiplications — as ONE exchange-free native block. + * The test asserts the plan really is single-block before offloading, and compares full result + * rows flag-on vs flag-off using the exact decimal types Spark produces. + */ +class CometBallistaQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper { + + /** + * The single-block, pre-aggregation slice of TPC-H Q1: the scan, the Q1 `WHERE` date filter, + * and Q1's per-row decimal projections (`l_extendedprice * (1 - l_discount)` and `... * (1 + + * l_tax)`) — everything up to, but not including, the `GROUP BY` (which would force a shuffle + * boundary, see the class doc). No `ORDER BY` (a global sort would also need a range-partition + * exchange); the test sorts the collected rows itself. + */ + private val q1 = + """ + |SELECT l_returnflag, l_linestatus, + | l_quantity, + | l_extendedprice, + | l_extendedprice * (1 - l_discount) AS disc_price, + | l_extendedprice * (1 - l_discount) * (1 + l_tax) AS charge + |FROM lineitem + |WHERE l_shipdate <= date '1998-12-01' - interval '90' day + |""".stripMargin + + /** + * TPC-H `lineitem`, restricted to the columns Q1 touches, with the correct Spark types. + * Decimals use the classic TPC-H `decimal(12,2)`; `l_shipdate` is a real `date`. 
+ */ + private val lineitemSchema: StructType = StructType( + Seq( + StructField("l_quantity", DecimalType(12, 2), nullable = false), + StructField("l_extendedprice", DecimalType(12, 2), nullable = false), + StructField("l_discount", DecimalType(12, 2), nullable = false), + StructField("l_tax", DecimalType(12, 2), nullable = false), + StructField("l_returnflag", StringType, nullable = false), + StructField("l_linestatus", StringType, nullable = false), + StructField("l_shipdate", DateType, nullable = false))) + + private def dec(v: String): JBigDecimal = new JBigDecimal(v).setScale(2) + + /** + * A small synthetic `lineitem`: a handful of rows spanning three `(returnflag, linestatus)` + * groups, with shipdates straddling the Q1 cutoff (`1998-12-01 - 90 days = 1998-09-02`) so the + * `WHERE` filter actually removes rows (the two past-cutoff rows). A range of discount/tax + * values gives the decimal projections non-trivial products. + */ + private def lineitemRows: Seq[Row] = Seq( + // group (A, F) -- all kept + Row( + dec("17.00"), + dec("21168.23"), + dec("0.04"), + dec("0.02"), + "A", + "F", + Date.valueOf("1998-08-01")), + Row( + dec("36.00"), + dec("45983.16"), + dec("0.09"), + dec("0.06"), + "A", + "F", + Date.valueOf("1998-07-15")), + Row( + dec("8.00"), + dec("13309.60"), + dec("0.10"), + dec("0.02"), + "A", + "F", + Date.valueOf("1998-09-01")), + // group (N, O) -- all kept + Row( + dec("28.00"), + dec("28955.64"), + dec("0.05"), + dec("0.08"), + "N", + "O", + Date.valueOf("1998-06-10")), + Row( + dec("24.00"), + dec("32000.00"), + dec("0.00"), + dec("0.00"), + "N", + "O", + Date.valueOf("1998-08-20")), + Row( + dec("2.00"), + dec("2600.00"), + dec("0.06"), + dec("0.03"), + "N", + "O", + Date.valueOf("1998-09-02")), + // group (R, F) -- all kept + Row( + dec("32.00"), + dec("41000.50"), + dec("0.07"), + dec("0.05"), + "R", + "F", + Date.valueOf("1998-05-05")), + Row( + dec("45.00"), + dec("60000.00"), + dec("0.02"), + dec("0.01"), + "R", + "F", + 
Date.valueOf("1998-08-31")), + // rows PAST the cutoff -- must be filtered out (would form a (N, F) group if kept) + Row( + dec("50.00"), + dec("70000.00"), + dec("0.03"), + dec("0.04"), + "N", + "F", + Date.valueOf("1998-09-03")), + Row( + dec("99.00"), + dec("99999.99"), + dec("0.05"), + dec("0.05"), + "N", + "F", + Date.valueOf("1998-12-01"))) + + /** + * Runs `f`, counting Spark executor task starts during it. Drains the listener bus before + * attaching and after running so asynchronous task-start events are flushed. (Same apparatus as + * `CometBallistaOffloadSuite`.) + */ + private def countTaskStarts(f: => Unit): Int = { + val taskStarts = new AtomicInteger(0) + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + taskStarts.incrementAndGet() + } + } + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + spark.sparkContext.addSparkListener(listener) + try { + f + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + taskStarts.get() + } + + test( + "TPC-H Q1 (pre-aggregation slice) offloads to Ballista single-block with identical results " + + "and no executor tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + withTempPath { dir => + // Single Parquet file (coalesce(1)) so the offloaded plan reads one native scan leaf. + spark + .createDataFrame(spark.sparkContext.parallelize(lineitemRows), lineitemSchema) + .coalesce(1) + .write + .parquet(dir.getCanonicalPath) + + // AQE off so the physical plan is stable (no AdaptiveSparkPlanExec collect root wrapping the + // Comet columnar-to-row node that carries our executeCollect offload hook). 
+ withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("lineitem") + + // Confirm the plan is offloadable BEFORE running it: no exchange, and exactly one + // CometNativeExec block carrying a serialized plan (the R1 single-block requirement). + val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(q1).queryExecution.executedPlan + } + val exchanges = executed.collect { case e: Exchange => e } + assert( + exchanges.isEmpty, + s"expected no exchange (single-stage) in the plan, found ${exchanges.size}:\n$executed") + val nativeBlocks = executed.collect { + case n: CometNativeExec if n.serializedPlanOpt.isDefined => n + } + assert( + nativeBlocks.size == 1, + s"expected exactly one serialized CometNativeExec block, found ${nativeBlocks.size}:\n" + + s"$executed") + + // Baseline: normal Comet execution (offload off), run through the same listener apparatus. + // This is a positive control proving the listener actually observes executor task starts, + // so the `== 0` assertion for the offloaded run is meaningful. + var baseline: Seq[Seq[Any]] = null + val baselineTaskStarts = countTaskStarts { + baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(q1).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + assert( + baselineTaskStarts > 0, + "expected the flag-off baseline collect to launch at least one Spark executor task " + + s"(sanity check for the listener apparatus); got $baselineTaskStarts") + + // Ballista offload: run the same query with the flag on, counting executor task starts. 
+ var offloaded: Seq[Seq[Any]] = null + val offloadedTaskStarts = countTaskStarts { + offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + spark.sql(q1).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + + // Compare full rows (each sorted into a stable total order by its string form) using the + // exact values/types Spark produced -- decimals stay decimals with their computed scale. + def sortKey(r: Seq[Any]): String = r.map(v => s"$v").mkString("") + val baselineSorted = baseline.sortBy(sortKey) + val offloadedSorted = offloaded.sortBy(sortKey) + assert( + offloadedSorted == baselineSorted, + "offloaded rows do not match baseline\n" + + s" baseline: $baselineSorted\n offloaded: $offloadedSorted") + + // The 8 rows on/before the Q1 cutoff are kept; the two past-cutoff rows are filtered out. + assert( + baselineSorted.size == 8, + s"expected 8 rows after the Q1 date filter, got ${baselineSorted.size}: $baselineSorted") + + // Crucially, NO Spark executor tasks ran for the offloaded collect. + assert( + offloadedTaskStarts == 0, + s"expected 0 Spark executor tasks for the Ballista-offloaded collect, " + + s"but $offloadedTaskStarts started") + } + } + } +} From bb3971fb305810ce5f6db02dec31937c3d1501ec Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 16:31:16 -0600 Subject: [PATCH 13/42] docs(contributor-guide): record R1-T3 result (Q1 single-stage subset; full Q1 needs R2) Relates to #4796 --- docs/source/contributor-guide/ballista_execution.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index 0a7fef8f22..12ededdb0c 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -108,7 +108,7 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned returns results, with zero Spark-executor tasks. First target query: TPC-H Q1. 
- ✅ R1-T1 — JVM → native → in-process Ballista → JVM Arrow round-trip (spike). - ✅ R1-T2 — config flag + driver `executeCollect` override. - - ⬜ R1-T3 — end-to-end TPC-H Q1 via Ballista, results verified against Spark. + - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2. - ⬜ R1-T4 (R1b) — submit to an external Ballista scheduler + executor cluster. - ⬜ **R2 — multi-stage distribution.** Map Comet's per-stage native fragments onto Ballista stages with shuffle, so plans with exchanges (aggregations, joins) distribute across executors. From bc57f7caf33da9da95ac76f83c38a3ab89327ca2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 17:01:53 -0600 Subject: [PATCH 14/42] feat(scan): let ScanExec pull from a native RecordBatch stream Introduce InputBatchStream, a small dyn-compatible trait (next_batch) that abstracts ScanExec's input source. AlignedArrowStreamReader (the JVM/FFI path via planner.rs) implements it unchanged; a new NativeBatchStream wraps a DataFusion SendableRecordBatchStream and implements it by blocking each pull on futures::executor::block_on, which is safe outside a Tokio runtime and composes with Comet's executor threads. ScanExec.input_source now holds Arc<Mutex<dyn InputBatchStream>> instead of being hard-wired to AlignedArrowStreamReader, and ScanExec::new_native gives a constructor for the native case. The JVM path in planner.rs's OpStruct::Scan handling is otherwise unchanged, just wraps its reader in the trait object. This is the enabling change for a later Comet fragment Scan/ShuffleScan leaf to be fed by a Ballista shuffle-reader stream with no JVM in the data path. Relates to #4796.
--- native/core/src/execution/operators/scan.rs | 191 ++++++++++++++++++-- native/core/src/execution/planner.rs | 11 +- 2 files changed, 182 insertions(+), 20 deletions(-) diff --git a/native/core/src/execution/operators/scan.rs b/native/core/src/execution/operators/scan.rs index 409d064284..5ce5d18241 100644 --- a/native/core/src/execution/operators/scan.rs +++ b/native/core/src/execution/operators/scan.rs @@ -20,7 +20,9 @@ use crate::{errors::CometError, execution::planner::TEST_EXEC_CONTEXT_ID}; use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions}; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::error::ArrowError; use datafusion::common::{arrow_datafusion_err, DataFusionError, Result as DataFusionResult}; +use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::metrics::{ BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, Time, @@ -30,25 +32,88 @@ use datafusion::{ physical_expr::*, physical_plan::{ExecutionPlan, *}, }; -use futures::Stream; +use futures::{Stream, StreamExt}; use itertools::Itertools; use std::{ + fmt::Debug, pin::Pin, sync::{Arc, Mutex}, task::{Context, Poll}, }; -/// `ScanExec` reads batches of data from Spark over the Arrow C Stream Interface. The -/// `input_source` is moved out of the JVM-exported `ArrowArrayStream` at plan-construction time; +/// Abstraction over the source that feeds batches into a [`ScanExec`]. `ScanExec` was originally +/// hard-wired to a JVM-exported `ArrowArrayStream` (via [`AlignedArrowStreamReader`]); this trait +/// lets it be driven by that reader OR by a purely native producer (e.g. a DataFusion +/// `SendableRecordBatchStream`), with no JVM involved in the latter case. 
Mirrors +/// `Iterator<Item = Result<RecordBatch, ArrowError>>`, which `AlignedArrowStreamReader` already +/// implements, but is spelled out as `next_batch` on a dedicated trait so the trait object can also +/// require `Send + Debug` (`dyn Iterator` is dyn-compatible but cannot carry the non-auto `Debug` bound). +pub trait InputBatchStream: Send + Debug { + /// Pull the next batch. `Ok(None)` signals end of stream. + fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError>; +} + +impl InputBatchStream for AlignedArrowStreamReader { + fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError> { + self.next().transpose() + } +} + +/// Feeds a `ScanExec` from a native DataFusion [`SendableRecordBatchStream`] instead of a +/// JVM-exported `ArrowArrayStream`. This is what lets a `ScanExec` sit at the bottom of a plan +/// fed purely natively, e.g. by a future Ballista shuffle-reader stream. +/// +/// `SendableRecordBatchStream` is async, but `ScanExec` pulls its input synchronously (via +/// `reader.next()`/`next_batch()` from `poll_next`, itself invoked off the batch producer thread +/// rather than awaited). To bridge that without restructuring `ScanExec`, each call blocks the +/// current thread on the stream's next item via `futures::executor::block_on`. This is safe here +/// because `block_on` merely parks the calling thread on a `Future`/`Waker` pair — unlike +/// `Runtime::block_on`, it does not require (and will not panic inside) an existing Tokio +/// runtime, so it composes with Comet's own executor threads. It does mean the calling thread is +/// unavailable for other work while a batch is pending, which is fine for the in-memory / +/// channel-backed producers this abstraction targets (e.g. a shuffle reader), but would be a poor +/// fit for a producer that itself does blocking I/O on the same thread.
+pub struct NativeBatchStream { + stream: SendableRecordBatchStream, +} + +impl NativeBatchStream { + pub fn new(stream: SendableRecordBatchStream) -> Self { + Self { stream } + } +} + +impl Debug for NativeBatchStream { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("NativeBatchStream").finish_non_exhaustive() + } +} + +impl InputBatchStream for NativeBatchStream { + fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError> { + match futures::executor::block_on(self.stream.next()) { + None => Ok(None), + Some(Ok(batch)) => Ok(Some(batch)), + Some(Err(e)) => Err(ArrowError::ExternalError(Box::new(e))), + } + } +} + +/// `ScanExec` reads batches of data from an upstream [`InputBatchStream`]. The common case is +/// Spark, over the Arrow C Stream Interface: the `input_source` is moved out of the +/// JVM-exported `ArrowArrayStream` at plan-construction time via `AlignedArrowStreamReader`; /// dropping the reader (when this exec drops) fires the stream's release callback, which closes -/// the JVM-side `ArrowReader` and its `VectorSchemaRoot`. +/// the JVM-side `ArrowReader` and its `VectorSchemaRoot`. `ScanExec` can equally be fed by a +/// native `SendableRecordBatchStream` (see [`ScanExec::new_native`] / [`NativeBatchStream`]), +/// with no JVM involved. #[derive(Debug, Clone)] pub struct ScanExec { /// JVM execution-context id used to look up the `JNIEnv` for callbacks. pub exec_context_id: i64, - /// The C Stream Interface reader. `None` only in unit tests that seed input via - /// `set_input_batch`. - pub input_source: Option<Arc<Mutex<AlignedArrowStreamReader>>>, + /// The batch source: the C Stream Interface reader for the JVM path, or a native + /// [`NativeBatchStream`] wrapping a `SendableRecordBatchStream`. `None` only in unit tests + /// that seed input via `set_input_batch`.
+ pub input_source: Option>>, pub input_source_description: String, pub data_types: Vec, pub schema: SchemaRef, @@ -63,7 +128,7 @@ pub struct ScanExec { impl ScanExec { pub fn new( exec_context_id: i64, - input_source: Option>>, + input_source: Option>>, input_source_description: &str, data_types: Vec, ) -> Result { @@ -95,6 +160,25 @@ impl ScanExec { }) } + /// Convenience constructor for a `ScanExec` fed by a native `SendableRecordBatchStream` + /// (no JVM involved), e.g. a Ballista shuffle-reader stream. Wraps `stream` in a + /// [`NativeBatchStream`] and delegates to [`ScanExec::new`]. + pub fn new_native( + exec_context_id: i64, + stream: SendableRecordBatchStream, + input_source_description: &str, + data_types: Vec, + ) -> Result { + let input_source: Arc> = + Arc::new(Mutex::new(NativeBatchStream::new(stream))); + Self::new( + exec_context_id, + Some(input_source), + input_source_description, + data_types, + ) + } + /// Unpack all dictionary types because some DataFusion operators /// and expressions do not support dictionary types fn unpack_dictionary_type(dt: &DataType) -> DataType { @@ -133,25 +217,24 @@ impl ScanExec { /// columns are unpacked because Comet's downstream operators do not handle them. fn pull_next( exec_context_id: i64, - reader: &Arc>, + reader: &Arc>, ) -> Result { if exec_context_id == TEST_EXEC_CONTEXT_ID { // Unit test path; input batches are seeded directly. return Ok(InputBatch::EOF); } - // The `Mutex` is for interior mutability (`next` needs `&mut`, but the exec holds the - // reader behind an `Arc`); access is already serialized by the `self.batch` lock held in - // `get_next_batch`, so a contended `try_lock` here would signal a caller bug, not races. 
+ // The `Mutex` is for interior mutability (`next_batch` needs `&mut`, but the exec holds + // the reader behind an `Arc`); access is already serialized by the `self.batch` lock held + // in `get_next_batch`, so a contended `try_lock` here would signal a caller bug, not + // races. let mut reader = reader .try_lock() - .map_err(|_| CometError::Internal("AlignedArrowStreamReader contended".to_string()))?; + .map_err(|_| CometError::Internal("input batch stream contended".to_string()))?; - let next = reader.next(); - match next { + match reader.next_batch()? { None => Ok(InputBatch::EOF), - Some(Err(e)) => Err(CometError::from(e)), - Some(Ok(record_batch)) => { + Some(record_batch) => { let num_rows = record_batch.num_rows(); let columns = record_batch.columns(); let mut inputs: Vec = Vec::with_capacity(columns.len()); @@ -341,6 +424,80 @@ impl RecordBatchStream for ScanStream<'_> { } } +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion::execution::TaskContext; + use datafusion::physical_plan::memory::MemoryStream; + + /// A `ScanExec` fed by a native `SendableRecordBatchStream` (no JVM involved) must pass + /// batches through unchanged, row-for-row and value-for-value. This is the enabling case for + /// a future non-JVM producer (e.g. a Ballista shuffle-reader stream) driving a Comet + /// fragment's `ScanExec` leaf. 
+ #[test] + fn scan_exec_reads_native_record_batch_stream() { + let schema: SchemaRef = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let batch1 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef], + ) + .unwrap(); + + let mem_stream = MemoryStream::try_new( + vec![batch1.clone(), batch2.clone()], + Arc::clone(&schema), + None, + ) + .unwrap(); + let native_stream: SendableRecordBatchStream = Box::pin(mem_stream); + + // Any id other than `TEST_EXEC_CONTEXT_ID` so `pull_next` actually pulls from the native + // stream instead of short-circuiting to EOF (that short-circuit is what lets *other* + // unit tests seed batches directly via `set_input_batch`). + let exec_context_id = TEST_EXEC_CONTEXT_ID + 1; + let mut scan = ScanExec::new_native( + exec_context_id, + native_stream, + "native-test-stream", + vec![DataType::Int32], + ) + .unwrap(); + + let task_ctx = Arc::new(TaskContext::default()); + let mut output_stream = scan.execute(0, task_ctx).unwrap(); + + let mut collected = Vec::new(); + loop { + scan.get_next_batch().unwrap(); + match futures::executor::block_on(output_stream.next()) { + Some(Ok(batch)) => collected.push(batch), + Some(Err(e)) => panic!("unexpected error polling ScanExec: {e}"), + None => break, + } + } + + assert_eq!( + collected.len(), + 2, + "expected both native batches to pass through" + ); + // `ScanExec` rebuilds the schema from `data_types` with placeholder field names (see + // `schema_from_data_types`), so compare columns/row counts rather than full batch/schema + // equality. 
+ assert_eq!(collected[0].columns(), batch1.columns()); + assert_eq!(collected[1].columns(), batch2.columns()); + let total_rows: usize = collected.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 5); + } +} + #[derive(Clone, Debug)] pub enum InputBatch { /// The end of input batches. diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 25162332fd..dfadbeace5 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -22,8 +22,8 @@ pub mod macros; pub mod operator_registry; use crate::execution::operators::init_csv_datasource_exec; -use crate::execution::operators::AlignedArrowStreamReader; use crate::execution::operators::IcebergScanExec; +use crate::execution::operators::{AlignedArrowStreamReader, InputBatchStream}; use crate::execution::{ expressions::list_positions::ListPositionsExpr, expressions::subquery::Subquery, @@ -1533,8 +1533,13 @@ impl PhysicalPlanner { // Consumes the first input source for the scan. The Java side passes an // `org.apache.arrow.c.ArrowArrayStream` whose `memoryAddress` points at the C - // struct; native takes ownership via `AlignedArrowStreamReader::from_raw`. - let input_source = if self.exec_context_id == TEST_EXEC_CONTEXT_ID + // struct; native takes ownership via `AlignedArrowStreamReader::from_raw`. Wrapped + // as `dyn InputBatchStream` so `ScanExec` can equally be driven by a native + // `SendableRecordBatchStream` (see `ScanExec::new_native`); this JVM path's + // behavior is otherwise unchanged. + let input_source: Option>> = if self + .exec_context_id + == TEST_EXEC_CONTEXT_ID && inputs.is_empty() { // For unit test, we will set input batch to scan directly by `set_input_batch`. 
From c942df9ed7d0bf7335f11b1a39b310ea99219dba Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 17:20:59 -0600 Subject: [PATCH 15/42] feat(ballista): add CometFragmentExec fed by DataFusion child streams Add a CometFragmentExec that runs a Comet plan fragment (serialized Operator proto) whose input-leaf Scan operators are fed by the node's DataFusion children, reusing T2's native ScanExec input path. A childless fragment behaves like CometScanExec. Core gains a native (non-FFI) fragment builder in execution::fragment: it builds the plan via the planner, injects each child stream into the returned Scan handle through a new ScanExec::set_native_input setter (the handle shares its batch slot with the executable leaf), and drives get_next_batch while streaming the root, mirroring jni_api's busy-poll. CometPhysicalCodec (de)serializes the fragment via a distinct COMET_FRAGMENT_MAGIC tag, with children round-tripping through datafusion-proto. Relates to #4796. --- native/ballista/src/codec.rs | 19 ++ native/ballista/src/fragment.rs | 114 +++++++ native/ballista/src/lib.rs | 6 +- native/ballista/tests/fragment_child_input.rs | 298 ++++++++++++++++++ native/core/src/execution/fragment.rs | 164 ++++++++++ native/core/src/execution/mod.rs | 1 + native/core/src/execution/operators/scan.rs | 21 ++ 7 files changed, 622 insertions(+), 1 deletion(-) create mode 100644 native/ballista/src/fragment.rs create mode 100644 native/ballista/tests/fragment_child_input.rs create mode 100644 native/core/src/execution/fragment.rs diff --git a/native/ballista/src/codec.rs b/native/ballista/src/codec.rs index 2ae06b31f1..dde2f18396 100644 --- a/native/ballista/src/codec.rs +++ b/native/ballista/src/codec.rs @@ -29,6 +29,7 @@ use datafusion_proto::physical_plan::PhysicalExtensionCodec; use ballista_core::serde::{BallistaLogicalExtensionCodec, BallistaPhysicalExtensionCodec}; +use crate::fragment::CometFragmentExec; use crate::scan::CometScanExec; use 
crate::table_provider::CometTableProvider; @@ -40,6 +41,11 @@ use crate::table_provider::CometTableProvider; /// embedded NUL in particular makes a collision effectively impossible. pub const COMET_MAGIC: &[u8] = b"CMET1\0"; +/// Marks a payload as a [`CometFragmentExec`] (a Comet fragment fed by +/// DataFusion children), distinct from [`COMET_MAGIC`] for the childless +/// [`CometScanExec`]. Same collision-safety argument as `COMET_MAGIC`. +pub const COMET_FRAGMENT_MAGIC: &[u8] = b"CMETF\0"; + /// Serializes `CometScanExec` as its Comet proto bytes (tagged with `COMET_MAGIC`) /// and reconstructs it on decode by re-running Comet's planner via FFI. All other /// nodes (including Ballista's own shuffle operators) delegate to Ballista's codec. @@ -55,6 +61,14 @@ impl PhysicalExtensionCodec for CometPhysicalCodec { inputs: &[Arc], ctx: &TaskContext, ) -> Result> { + if let Some(rest) = buf.strip_prefix(COMET_FRAGMENT_MAGIC) { + // `inputs` are the already-decoded DataFusion children that feed the + // fragment's `Scan` input leaves. + return Ok(Arc::new(CometFragmentExec::try_new( + rest.to_vec(), + inputs.to_vec(), + )?)); + } if let Some(rest) = buf.strip_prefix(COMET_MAGIC) { return Ok(Arc::new(CometScanExec::try_new(rest.to_vec())?)); } @@ -62,6 +76,11 @@ impl PhysicalExtensionCodec for CometPhysicalCodec { } fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + if let Some(fragment) = node.downcast_ref::() { + buf.extend_from_slice(COMET_FRAGMENT_MAGIC); + buf.extend_from_slice(fragment.proto()); + return Ok(()); + } if let Some(scan) = node.downcast_ref::() { buf.extend_from_slice(COMET_MAGIC); buf.extend_from_slice(scan.proto()); diff --git a/native/ballista/src/fragment.rs b/native/ballista/src/fragment.rs new file mode 100644 index 0000000000..3fd878d8a1 --- /dev/null +++ b/native/ballista/src/fragment.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fmt; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, +}; + +use comet::execution::fragment::{build_native_fragment, native_fragment_plan_properties}; + +/// A DataFusion node that runs a Comet plan fragment (carried as `Operator` +/// proto bytes) whose input-leaf `Scan` operators are fed by this node's +/// DataFusion `children`. +/// +/// In a Ballista stage those children are shuffle readers; a childless fragment +/// (whose leaf is a self-contained `NativeScan`) behaves like [`super::scan::CometScanExec`], +/// but reached through the native (non-FFI) path since the executor and Comet +/// share a DataFusion build. +/// +/// Serializable through [`super::codec::CometPhysicalCodec`] by its proto bytes; +/// the children round-trip via datafusion-proto and are handed back on decode. +#[derive(Debug)] +pub struct CometFragmentExec { + proto: Vec, + children: Vec>, + props: Arc, +} + +impl CometFragmentExec { + /// Build from Comet proto bytes and the fragment's DataFusion children. 
The + /// schema/properties are derived by building the fragment plan once (without + /// executing it or requiring the child streams). + pub fn try_new(proto: Vec, children: Vec>) -> Result { + let props = native_fragment_plan_properties(&proto).map_err(DataFusionError::Execution)?; + Ok(Self { + proto, + children, + props, + }) + } + + pub fn proto(&self) -> &[u8] { + &self.proto + } +} + +impl DisplayAs for CometFragmentExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "CometFragmentExec(proto={} bytes, children={})", + self.proto.len(), + self.children.len() + ) + } +} + +impl ExecutionPlan for CometFragmentExec { + fn name(&self) -> &str { + "CometFragmentExec" + } + + fn properties(&self) -> &Arc { + &self.props + } + + fn children(&self) -> Vec<&Arc> { + self.children.iter().collect() + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(CometFragmentExec::try_new( + self.proto.clone(), + children, + )?)) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + // Materialize one input stream per child for the requested output + // partition; these feed the fragment's `Scan` input leaves in order. 
+ let inputs = self + .children + .iter() + .map(|child| child.execute(partition, Arc::clone(&context))) + .collect::>>()?; + + build_native_fragment(&self.proto, context, inputs).map_err(DataFusionError::Execution) + } +} diff --git a/native/ballista/src/lib.rs b/native/ballista/src/lib.rs index a75dd93923..110f678de0 100644 --- a/native/ballista/src/lib.rs +++ b/native/ballista/src/lib.rs @@ -32,10 +32,14 @@ pub mod codec; pub mod ffi_jni; +pub mod fragment; pub mod scan; pub mod table_provider; -pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_MAGIC}; +pub use codec::{ + CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC, +}; pub use ffi_jni::{build_test_proto, execute_comet_proto, submit_and_export}; +pub use fragment::CometFragmentExec; pub use scan::CometScanExec; pub use table_provider::CometTableProvider; diff --git a/native/ballista/tests/fragment_child_input.rs b/native/ballista/tests/fragment_child_input.rs new file mode 100644 index 0000000000..3d9eec4eda --- /dev/null +++ b/native/ballista/tests/fragment_child_input.rs @@ -0,0 +1,298 @@ +// Proves `CometFragmentExec` runs a Comet plan fragment whose input-leaf `Scan` +// is fed by the node's DataFusion child stream (the R2 shuffle-reader shape, +// stood in for here by an in-memory child and a `CometScanExec` child), and +// that such a fragment survives Ballista's physical-plan (de)serialization. 
+ +use std::sync::Arc; + +use datafusion::arrow::array::{Int32Array, RecordBatch}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, SchemaRef}; +use datafusion::common::Result; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::parquet::arrow::ArrowWriter; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::memory::MemoryStream; +use datafusion::physical_plan::{ + displayable, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, +}; +use datafusion::prelude::SessionContext; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; + +use datafusion_comet_ballista::{CometFragmentExec, CometPhysicalCodec, CometScanExec}; +use datafusion_comet_proto::spark_expression::{ + data_type::DataTypeId, expr::ExprStruct, literal, BinaryExpr, BoundReference, DataType, Expr, + Literal, +}; +use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, Filter, NativeScan, NativeScanCommon, Operator, Scan, SparkFilePartition, + SparkPartitionedFile, SparkStructField, +}; + +/// A minimal in-memory DataFusion leaf yielding a fixed set of batches, standing +/// in for a shuffle reader (or any upstream DataFusion child) that feeds a +/// `CometFragmentExec`'s `Scan` input leaf. 
+#[derive(Debug)] +struct InMemoryChildExec { + batches: Vec, + schema: SchemaRef, + props: Arc, +} + +impl InMemoryChildExec { + fn new(batches: Vec, schema: SchemaRef) -> Self { + let props = Arc::new(PlanProperties::new( + EquivalenceProperties::new(Arc::clone(&schema)), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + )); + Self { + batches, + schema, + props, + } + } +} + +impl DisplayAs for InMemoryChildExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "InMemoryChildExec") + } +} + +impl ExecutionPlan for InMemoryChildExec { + fn name(&self) -> &str { + "InMemoryChildExec" + } + fn properties(&self) -> &Arc { + &self.props + } + fn children(&self) -> Vec<&Arc> { + vec![] + } + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Ok(self) + } + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin(MemoryStream::try_new( + self.batches.clone(), + Arc::clone(&self.schema), + None, + )?)) + } +} + +fn int32_type() -> DataType { + DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + } +} + +/// Build a Comet `Operator` proto: `Filter(gt(col0, 2))` over a `Scan` leaf with +/// one Int32 column. The `Scan` (op #100) is the input leaf fed by a child +/// stream; the `Filter` proves an operator is applied on top of the child rows. 
+fn build_filter_over_scan_proto() -> Vec { + let scan = Scan { + fields: vec![int32_type()], + source: "fragment-child-test".to_string(), + }; + let scan_op = Operator { + children: vec![], + plan_id: 1, + op_struct: Some(OpStruct::Scan(scan)), + }; + + let col0 = Expr { + expr_struct: Some(ExprStruct::Bound(BoundReference { + index: 0, + datatype: Some(int32_type()), + })), + ..Default::default() + }; + let lit2 = Expr { + expr_struct: Some(ExprStruct::Literal(Literal { + value: Some(literal::Value::IntVal(2)), + datatype: Some(int32_type()), + is_null: false, + })), + ..Default::default() + }; + let predicate = Expr { + expr_struct: Some(ExprStruct::Gt(Box::new(BinaryExpr { + left: Some(Box::new(col0)), + right: Some(Box::new(lit2)), + }))), + ..Default::default() + }; + let filter_op = Operator { + children: vec![scan_op], + plan_id: 2, + op_struct: Some(OpStruct::Filter(Filter { + predicate: Some(predicate), + })), + }; + filter_op.encode_to_vec() +} + +fn int32_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("a", ArrowDataType::Int32, true)])) +} + +/// A `CometFragmentExec` whose `Scan` leaf is fed by an in-memory DataFusion +/// child must pass the child rows through the fragment and apply the fragment's +/// `Filter` (col0 > 2) to them. 
+#[tokio::test(flavor = "multi_thread")] +async fn fragment_scan_leaf_fed_by_child() -> anyhow::Result<()> { + let schema = int32_schema(); + let batch1 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as _], + )?; + let batch2 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![4, 5])) as _], + )?; + let child: Arc = + Arc::new(InMemoryChildExec::new(vec![batch1, batch2], schema)); + + let proto = build_filter_over_scan_proto(); + let fragment: Arc = + Arc::new(CometFragmentExec::try_new(proto, vec![child])?); + + let ctx = SessionContext::new(); + let mut stream = fragment.execute(0, ctx.task_ctx())?; + let mut values: Vec = Vec::new(); + while let Some(batch) = stream.next().await { + let batch = batch?; + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("int32 column"); + values.extend(col.values().iter().copied()); + } + + // Child produced 1..=5; the fragment's Filter keeps col0 > 2. + assert_eq!(values, vec![3, 4, 5], "child rows must flow through and be filtered"); + Ok(()) +} + +/// Write a tiny Parquet file with a single int32 column `a` = [1..=5]. +fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> { + let schema = int32_schema(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let file = std::fs::File::create(path)?; + let mut writer = ArrowWriter::try_new(file, schema, None)?; + writer.write(&batch)?; + writer.close()?; + Ok(()) +} + +/// Build a Comet `Operator` proto: a single `NativeScan` over `parquet_path`. 
+fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result> { + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32_type()), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-fragment-child-native-scan".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path)?.len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + Ok(Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + } + .encode_to_vec()) +} + +/// A `CometFragmentExec` (with a `CometScanExec` child, so the whole tree is +/// serializable) must survive Ballista's physical-plan codec round-trip and +/// produce the same filtered result on the far side. +#[tokio::test(flavor = "multi_thread")] +async fn fragment_codec_roundtrip() -> anyhow::Result<()> { + let parquet_path = std::env::temp_dir().join("comet_fragment_child_codec_roundtrip.parquet"); + write_test_parquet(&parquet_path)?; + + // Child = CometScanExec over the parquet (round-trips via COMET_MAGIC); + // parent fragment = Filter(col0 > 2) over a Scan input leaf. 
+ let child: Arc = + Arc::new(CometScanExec::try_new(build_native_scan_proto(&parquet_path)?)?); + let fragment_proto = build_filter_over_scan_proto(); + let plan: Arc = + Arc::new(CometFragmentExec::try_new(fragment_proto, vec![child])?); + println!( + "original plan:\n{}", + displayable(plan.as_ref()).indent(false) + ); + + // --- Encode (scheduler side) --- + let codec = CometPhysicalCodec::default(); + let node = PhysicalPlanNode::try_from_physical_plan(Arc::clone(&plan), &codec)?; + let bytes = node.encode_to_vec(); + + // --- Ship bytes, decode in a fresh context (executor side) --- + let ctx = SessionContext::new(); + let task_ctx = ctx.task_ctx(); + let node2 = PhysicalPlanNode::decode(&bytes[..])?; + let plan2 = node2.try_into_physical_plan(task_ctx.as_ref(), &codec)?; + println!( + "reconstructed plan (executor side):\n{}", + displayable(plan2.as_ref()).indent(false) + ); + + // --- Execute the reconstructed plan --- + let mut stream = plan2.execute(0, task_ctx)?; + let mut values: Vec = Vec::new(); + while let Some(batch) = stream.next().await { + let batch = batch?; + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("int32 column"); + values.extend(col.values().iter().copied()); + } + + assert_eq!( + values, + vec![3, 4, 5], + "fragment result must be identical after codec round-trip" + ); + Ok(()) +} diff --git a/native/core/src/execution/fragment.rs b/native/core/src/execution/fragment.rs new file mode 100644 index 0000000000..9142320a0b --- /dev/null +++ b/native/core/src/execution/fragment.rs @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Builds and drives a Comet plan fragment whose input-leaf `Scan` operators are +//! fed by native DataFusion [`SendableRecordBatchStream`]s (e.g. a Ballista +//! shuffle reader), rather than by JVM-exported Arrow streams. +//! +//! Unlike [`super::ffi`], this path stays entirely in-process: there is no +//! `datafusion-ffi` boundary, because the consumer (a Ballista executor) and +//! Comet resolve to the same DataFusion build, so `ExecutionPlan`s and +//! `SendableRecordBatchStream`s are shared directly. +//! +//! `PhysicalPlanner::create_plan` builds a `Scan` (op #100) leaf with no input +//! source and returns a handle to it. Its executable clone shares this handle's +//! `batch` slot (an `Arc`), so [`super::operators::ScanExec::set_native_input`] +//! injects the child stream into the handle and [`NativeFragmentStream`] drives +//! `get_next_batch` on it — mirroring the JVM busy-poll in `jni_api` — to make +//! child batches flow through the fragment. 
+ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::array::RecordBatch; +use arrow::datatypes::SchemaRef; +use datafusion::common::{DataFusionError, Result as DataFusionResult}; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::{ExecutionPlan, PlanProperties, RecordBatchStream}; +use datafusion::prelude::SessionContext; +use datafusion_comet_proto::spark_operator::Operator; +use futures::{Stream, StreamExt}; +use prost::Message; + +use super::operators::ScanExec; +use super::planner::PhysicalPlanner; + +/// A non-`TEST_EXEC_CONTEXT_ID` execution-context id for natively-fed scans. No +/// JVM context is involved on this path (the input is a native stream, not an +/// `ArrowArrayStream`), so the concrete value is immaterial as long as it is not +/// the test sentinel, which would make `pull_next` short-circuit to EOF. +const NATIVE_FRAGMENT_EXEC_ID: i64 = 0; + +/// Decode the Comet `Operator` proto and build the DataFusion plan with Comet's +/// planner, returning the input-leaf `Scan` handles (in encounter order) and the +/// fragment root. The default (test) `exec_context_id` is used so the `Scan` op +/// builds without consuming a JVM input; native inputs are injected afterwards +/// via [`ScanExec::set_native_input`]. +fn plan_from_proto(proto_bytes: &[u8]) -> Result<(Vec, Arc), String> { + let op = + Operator::decode(proto_bytes).map_err(|e| format!("failed to decode Operator proto: {e}"))?; + + // A fresh `SessionContext` means configuration comes only from the proto, + // not from any ambient session (see `super::ffi`). 
+ let session_ctx = Arc::new(SessionContext::new()); + let planner = PhysicalPlanner::new(session_ctx, 0); + + let mut jvm_inputs = Vec::new(); + let (scans, _shuffle_scans, spark_plan) = planner + .create_plan(&op, &mut jvm_inputs, 1) + .map_err(|e| format!("failed to build native plan: {e}"))?; + + Ok((scans, Arc::clone(&spark_plan.native_plan))) +} + +/// The `PlanProperties` (schema, partitioning, ordering) of the fragment root, +/// used to establish a `CometFragmentExec`'s schema/properties at construction +/// time without executing it or requiring the child streams. +pub fn native_fragment_plan_properties(proto_bytes: &[u8]) -> Result, String> { + let (_scans, root) = plan_from_proto(proto_bytes)?; + Ok(Arc::clone(root.properties())) +} + +/// Build the Comet fragment described by `proto_bytes`, feeding its input-leaf +/// `Scan` operators from `inputs` (one stream per leaf, in encounter order), and +/// return the fragment root's output stream. Executing that stream drives the +/// child streams through the fragment. +pub fn build_native_fragment( + proto_bytes: &[u8], + task_ctx: Arc, + inputs: Vec, +) -> Result { + let (mut scans, root) = plan_from_proto(proto_bytes)?; + + if scans.len() != inputs.len() { + return Err(format!( + "Comet fragment has {} Scan input leaves but {} child streams were provided", + scans.len(), + inputs.len() + )); + } + + // Inject each child stream into the matching `Scan` handle. The handle shares + // its `batch` slot with the executable leaf, so pulling here delivers batches + // to the plan node. + for (scan, input) in scans.iter_mut().zip(inputs) { + scan.set_native_input(NATIVE_FRAGMENT_EXEC_ID, input); + } + + // The Comet fragment is internally single-partition; execute its root at + // partition 0. The child streams were already obtained for the desired output + // partition by the caller. 
+ let root_stream = root + .execute(0, task_ctx) + .map_err(|e| format!("failed to execute Comet fragment root: {e}"))?; + let schema = root_stream.schema(); + + Ok(Box::pin(NativeFragmentStream { + root: root_stream, + scans, + schema, + })) +} + +/// Streams the fragment root while pumping its `Scan` leaves. When the root +/// yields `Pending` (a leaf's `batch` slot is empty), each leaf handle is asked +/// to pull its next batch, then the root is polled again — the same interleaving +/// `jni_api` performs for JVM-fed scans, but with native child streams. +struct NativeFragmentStream { + root: SendableRecordBatchStream, + scans: Vec, + schema: SchemaRef, +} + +impl Stream for NativeFragmentStream { + type Item = DataFusionResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + loop { + match this.root.poll_next_unpin(cx) { + Poll::Ready(item) => return Poll::Ready(item), + Poll::Pending => { + for scan in this.scans.iter_mut() { + if let Err(e) = scan.get_next_batch() { + return Poll::Ready(Some(Err(DataFusionError::Execution(format!( + "Comet fragment scan input error: {e}" + ))))); + } + } + } + } + } + } +} + +impl RecordBatchStream for NativeFragmentStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} diff --git a/native/core/src/execution/mod.rs b/native/core/src/execution/mod.rs index 3e85715920..d67d74118c 100644 --- a/native/core/src/execution/mod.rs +++ b/native/core/src/execution/mod.rs @@ -19,6 +19,7 @@ pub mod columnar_to_row; pub mod expressions; pub mod ffi; +pub mod fragment; pub mod jni_api; pub(crate) mod merge_as_partial; pub(crate) mod metrics; diff --git a/native/core/src/execution/operators/scan.rs b/native/core/src/execution/operators/scan.rs index 5ce5d18241..da239973ed 100644 --- a/native/core/src/execution/operators/scan.rs +++ b/native/core/src/execution/operators/scan.rs @@ -179,6 +179,27 @@ impl ScanExec { ) } + /// Inject a native 
[`SendableRecordBatchStream`] into an already-constructed + /// `ScanExec` handle (e.g. one returned by `PhysicalPlanner::create_plan` for a + /// `Scan` leaf, which is built with `input_source = None`). This is what lets a + /// [`crate::execution::fragment`] feed the fragment's `Scan` leaves from its + /// DataFusion children after the plan has been built. + /// + /// A non-`TEST_EXEC_CONTEXT_ID` `exec_context_id` MUST be supplied so that + /// `pull_next` actually pulls from the stream instead of short-circuiting to + /// EOF (that short-circuit is reserved for unit tests that seed batches via + /// `set_input_batch`). Only the handle that `get_next_batch` is driven on needs + /// this — the executable leaf shares this handle's `batch` slot (an `Arc`), so + /// batches pulled here become visible to the plan node without touching it. + pub fn set_native_input( + &mut self, + exec_context_id: i64, + stream: SendableRecordBatchStream, + ) { + self.exec_context_id = exec_context_id; + self.input_source = Some(Arc::new(Mutex::new(NativeBatchStream::new(stream)))); + } + /// Unpack all dictionary types because some DataFusion operators /// and expressions do not support dictionary types fn unpack_dictionary_type(dt: &DataType) -> DataType { From 23c9ab4baa20d445c914f1b6ff0f599524130ebe Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 17:52:32 -0600 Subject: [PATCH 16/42] feat(ballista): distribute a GROUP BY offload across a Ballista hash shuffle Offload a two-stage Comet GROUP BY from the Spark driver to an in-process Ballista engine, running it distributed across a hash shuffle with Comet native fragments on both sides: partial aggregate -> Ballista IPC shuffle -> final aggregate, results returned at the driver with zero Spark-executor tasks. 
Native side (executeQueryDistributed): assemble CometFragmentExec(block2, [Hash-Repartition(CometFragmentExec(block1))]), serialize with CometPhysicalCodec, and submit via execute_physical_plan to a standalone cluster started from a SessionState carrying the Comet codecs (so both scheduler and executor can rebuild the fragments). Ballista splits at the RepartitionExec(Hash) into two stages and the push-mode fetch returns all final partitions, concatenated and exported over Arrow-FFI. Fix CometFragmentExec partitioning: a fragment with children is a per-partition transform, so it must report its children's partition count (not the block's internal single partition). Without this only output partition 0 was executed, silently dropping the other hash buckets' groups. JVM driver: relax the offload guard to dispatch on plan shape -- one native block + no exchange stays the R1 single-stage path; two native blocks + one CometShuffleExchangeExec become the R2 distributed path, reading the group-key count and shuffle-partition count from the exchange's HashPartitioning. R2 requires shuffle directRead disabled so the final block's leaf serializes as a plain Scan the shuffle-fed fragment can consume. The partial-aggregate count state composes across the IPC shuffle: block2's Scan leaf schema is derived from the exchange output (block1's schema), so the shuffle write and final-aggregate read schemas match by construction. Point the Ballista dependency at the local worktree (physical-plan submission branch) and add ballista-scheduler/ballista-executor for the standalone start. Relates to #4796. 
--- native/Cargo.lock | 110 +++++++- native/Cargo.toml | 9 +- native/ballista/Cargo.toml | 2 + native/ballista/src/ffi_jni.rs | 252 +++++++++++++++++- native/ballista/src/fragment.rs | 29 +- native/ballista/src/lib.rs | 5 +- .../comet/ballista/NativeBallista.scala | 34 +++ .../apache/spark/sql/comet/operators.scala | 181 ++++++++++--- .../CometBallistaDistributedSuite.scala | 169 ++++++++++++ .../ballista/CometBallistaOffloadSuite.scala | 15 +- 10 files changed, 746 insertions(+), 60 deletions(-) create mode 100644 spark/src/test/scala/org/apache/comet/ballista/CometBallistaDistributedSuite.scala diff --git a/native/Cargo.lock b/native/Cargo.lock index 3d09166ad9..56c5432c52 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1139,7 +1139,6 @@ dependencies = [ [[package]] name = "ballista" version = "53.0.0" -source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" dependencies = [ "async-trait", "ballista-core", @@ -1154,11 +1153,13 @@ dependencies = [ [[package]] name = "ballista-core" version = "53.0.0" -source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" dependencies = [ "arrow-flight", "async-trait", + "aws-config", + "aws-credential-types", "chrono", + "clap", "datafusion", "datafusion-proto", "datafusion-proto-common", @@ -1166,6 +1167,7 @@ dependencies = [ "itertools 0.15.0", "log", "md-5 0.11.0", + "object_store", "parking_lot", "prost", "prost-types", @@ -1185,13 +1187,13 @@ dependencies = [ [[package]] name = "ballista-executor" version = "53.0.0" -source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" dependencies = [ "arrow", "arrow-flight", "async-trait", "ballista-core", "bytesize", + "clap", "dashmap", "datafusion", "datafusion-proto", @@ -1199,6 +1201,7 @@ dependencies = [ "libc", "log", "memory-stats", + "mimalloc", "parking_lot", "serde", "sysinfo", @@ -1207,18 
+1210,21 @@ dependencies = [ "tokio-stream", "tokio-util", "tonic", + "tracing", + "tracing-appender", + "tracing-subscriber", "uuid", ] [[package]] name = "ballista-scheduler" version = "53.0.0" -source = "git+https://github.com/apache/datafusion-ballista?rev=a8b3c79c#a8b3c79c80f7f1c0aa862ed1a76dee7a1ac67265" dependencies = [ "arrow-flight", "async-trait", "axum", "ballista-core", + "clap", "dashmap", "datafusion", "datafusion-proto", @@ -1237,6 +1243,9 @@ dependencies = [ "tokio-stream", "tonic", "tower-http 0.7.0", + "tracing", + "tracing-appender", + "tracing-subscriber", "uuid", ] @@ -2264,6 +2273,8 @@ dependencies = [ "async-trait", "ballista", "ballista-core", + "ballista-executor", + "ballista-scheduler", "datafusion", "datafusion-comet", "datafusion-comet-proto", @@ -4651,6 +4662,15 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "matchit" version = "0.8.4" @@ -4821,6 +4841,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "num" version = "0.4.3" @@ -6833,6 +6862,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f179d4e11094a893b82fff208f74d448a7512f99f5a0acbd5c679b705f83ed9" +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -7089,6 +7127,12 @@ dependencies = [ "symbolic-common", ] 
+[[package]] +name = "symlink" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" + [[package]] name = "syn" version = "1.0.109" @@ -7214,6 +7258,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + [[package]] name = "thrift" version = "0.17.0" @@ -7567,6 +7620,19 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-appender" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c" +dependencies = [ + "crossbeam-channel", + "symlink", + "thiserror 2.0.18", + "time", + "tracing-subscriber", +] + [[package]] name = "tracing-attributes" version = "0.1.31" @@ -7585,6 +7651,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", ] [[package]] @@ -7770,6 +7866,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" 
+version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "value-bag" version = "1.12.0" diff --git a/native/Cargo.toml b/native/Cargo.toml index 07c2936c0f..7685b1b617 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -61,8 +61,13 @@ aws-credential-types = "1.2.13" iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3" } iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3", features = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs", "opendal-oss", "opendal-azdls"] } reqsign-core = "3" -ballista = { git = "https://github.com/apache/datafusion-ballista", rev = "a8b3c79c", package = "ballista" } -ballista-core = { git = "https://github.com/apache/datafusion-ballista", rev = "a8b3c79c", package = "ballista-core" } +# Path deps to the local Ballista worktree (branch feat/physical-plan-submission) +# so T1's execute_physical_plan + PhysicalPlan(bytes) submission variant are +# available. Revert to a fork git-rev pin once T1 lands upstream. 
+ballista = { path = "/Users/andy/git/ballista-ffi-poc/ballista/client", package = "ballista" } +ballista-core = { path = "/Users/andy/git/ballista-ffi-poc/ballista/core", package = "ballista-core" } +ballista-scheduler = { path = "/Users/andy/git/ballista-ffi-poc/ballista/scheduler", package = "ballista-scheduler" } +ballista-executor = { path = "/Users/andy/git/ballista-ffi-poc/ballista/executor", package = "ballista-executor" } [profile.release] debug = true diff --git a/native/ballista/Cargo.toml b/native/ballista/Cargo.toml index a86328bd63..c02d82f129 100644 --- a/native/ballista/Cargo.toml +++ b/native/ballista/Cargo.toml @@ -49,6 +49,8 @@ futures = { workspace = true } ballista = { workspace = true } ballista-core = { workspace = true } +ballista-scheduler = { workspace = true } +ballista-executor = { workspace = true } [dev-dependencies] tokio = { version = "1", features = ["rt-multi-thread", "macros"] } diff --git a/native/ballista/src/ffi_jni.rs b/native/ballista/src/ffi_jni.rs index ceb5b8a412..e2de4b17db 100644 --- a/native/ballista/src/ffi_jni.rs +++ b/native/ballista/src/ffi_jni.rs @@ -118,7 +118,7 @@ pub fn build_test_proto() -> Result, String> { } use crate::scan::CometScanExec; -use crate::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; +use crate::{CometFragmentExec, CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; /// Run a Comet `Operator` proto on an in-process standalone Ballista engine and /// return the collected Arrow batches plus the result schema. 
@@ -184,6 +184,208 @@ pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec) }) } +// --------------------------------------------------------------------------- +// R2: two-stage (distributed) GROUP BY offload +// --------------------------------------------------------------------------- + +use std::time::Duration; + +use ballista_core::config::BallistaConfig; +use ballista_core::execution_plans::execute_physical_plan; +use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient; +use datafusion::execution::SessionState; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::Partitioning; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::TryStreamExt; + +/// Build the R2 two-stage physical plan for a distributed GROUP BY: +/// +/// ```text +/// CometFragmentExec(block2, children=[ +/// RepartitionExec::Hash( CometFragmentExec(block1, children=[]), keys=0..num_group_keys, N ) +/// ]) +/// ``` +/// +/// `block1` is the partial aggregate (self-contained `NativeScan` leaf); its +/// output is `[group_keys..., agg_states...]`, so the group keys are columns +/// `0..num_group_keys`. Hash-repartitioning on those columns co-locates every +/// row for a group key on one downstream task, which is what lets the final +/// aggregate in `block2` compose across the shuffle. Ballista's +/// `DistributedPlanner` splits this plan at the `RepartitionExec(Hash)` into two +/// stages (block1 -> ShuffleWriter; ShuffleReader -> block2), and at stage 2 the +/// ShuffleReader becomes `block2`'s `Scan` (#100) input leaf. 
+fn build_two_stage_plan( + block1_proto: &[u8], + block2_proto: &[u8], + num_group_keys: usize, + num_partitions: usize, +) -> Result, String> { + let block1: Arc = + Arc::new(CometFragmentExec::try_new(block1_proto.to_vec(), vec![]).map_err(|e| { + format!("failed to build block1 (partial-agg) fragment: {e}") + })?); + + let schema1 = block1.schema(); + if num_group_keys == 0 || num_group_keys > schema1.fields().len() { + return Err(format!( + "invalid num_group_keys {num_group_keys}: block1 output has {} columns ({:?})", + schema1.fields().len(), + schema1.fields().iter().map(|f| f.name()).collect::>() + )); + } + + // Investigation aid: the schema of the batches that cross Ballista's IPC + // shuffle. block2's `Scan` (#100) leaf schema (derived from the exchange + // output on the JVM side) must match this for the aggregate to compose. + eprintln!( + "[comet-ballista R2] block1 (partial-agg) output schema = {:?}", + schema1 + ); + + let hash_exprs: Vec> = (0..num_group_keys) + .map(|i| Arc::new(Column::new(schema1.field(i).name(), i)) as Arc) + .collect(); + + let repart: Arc = Arc::new( + RepartitionExec::try_new( + block1, + Partitioning::Hash(hash_exprs, num_partitions.max(1)), + ) + .map_err(|e| format!("failed to build hash RepartitionExec: {e}"))?, + ); + + let block2: Arc = + Arc::new(CometFragmentExec::try_new(block2_proto.to_vec(), vec![repart]).map_err( + |e| format!("failed to build block2 (final-agg) fragment: {e}"), + )?); + + eprintln!( + "[comet-ballista R2] block2 (final-agg) output schema = {:?}", + block2.schema() + ); + + Ok(block2) +} + +/// Start an in-process standalone Ballista cluster (scheduler + executor) from +/// `state`, so the Comet extension codecs registered on the state's config reach +/// both sides. Mirrors `ballista::extension`'s private `setup_standalone`, but +/// returns the scheduler URL for the direct physical-plan submission path. 
+async fn start_standalone_from_state(state: &SessionState) -> Result { + let addr = ballista_scheduler::standalone::new_standalone_scheduler_from_state(state) + .await + .map_err(|e| format!("failed to start standalone scheduler: {e}"))?; + let scheduler_url = format!("http://localhost:{}", addr.port()); + + let mut retries = 50; + let scheduler = loop { + match SchedulerGrpcClient::connect(scheduler_url.clone()).await { + Ok(s) => break s, + Err(e) if retries > 0 => { + retries -= 1; + tokio::time::sleep(Duration::from_millis(100)).await; + let _ = e; + } + Err(e) => return Err(format!("could not connect to standalone scheduler: {e}")), + } + }; + + let concurrent_tasks = state.config().ballista_standalone_parallelism(); + ballista_executor::new_standalone_executor_from_state(scheduler, concurrent_tasks, state) + .await + .map_err(|e| format!("failed to start standalone executor: {e}"))?; + + Ok(scheduler_url) +} + +/// Build and submit the R2 two-stage plan on an in-process standalone Ballista +/// cluster, returning the collected Arrow result batches plus the result schema. +/// +/// The Comet logical+physical codecs are registered on the SessionConfig so both +/// the `CometFragmentExec` nodes and Ballista's own shuffle operators survive +/// serialization to the scheduler/executor. The plan is submitted through T1's +/// `execute_physical_plan`, which fetches **all** output partitions of the final +/// stage (not just partition 0), so the returned batches cover every group. 
+pub fn execute_two_stage( + block1_proto: &[u8], + block2_proto: &[u8], + num_group_keys: usize, + num_partitions: usize, +) -> Result<(SchemaRef, Vec), String> { + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .map_err(|e| format!("failed to build tokio runtime: {e}"))?; + + runtime.block_on(async move { + let n = num_partitions.max(1); + let config = SessionConfig::new_with_ballista() + .with_target_partitions(n) + .with_ballista_standalone_parallelism(n) + .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default())) + .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default())); + let state = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .build(); + + // Build the plan inside the runtime: the fragments' NativeScan leaf builds + // via Comet's planner, which requires an active Tokio runtime. + let plan = build_two_stage_plan(block1_proto, block2_proto, num_group_keys, n)?; + let schema = plan.schema(); + + let scheduler_url = start_standalone_from_state(&state).await?; + + let session_config = state.config().clone(); + let codec = CometPhysicalCodec::default(); + let session_id = state.session_id().to_string(); + + let stream = execute_physical_plan::( + scheduler_url, + &BallistaConfig::default(), + plan, + &codec, + session_id, + session_config, + ) + .await + .map_err(|e| format!("failed to submit two-stage physical plan: {e}"))?; + + let batches = stream + .try_collect::>() + .await + .map_err(|e| format!("failed to collect distributed results: {e}"))?; + + Ok((schema, batches)) + }) +} + +/// Run the R2 two-stage plan and export the (single, concatenated) result batch +/// into the JVM-allocated FFI structs. Returns the row count. +/// +/// # Safety +/// See [`export_batch_to_addresses`]. 
+pub unsafe fn submit_and_export_distributed( + block1_proto: &[u8], + block2_proto: &[u8], + num_group_keys: usize, + num_partitions: usize, + array_addrs: &[i64], + schema_addrs: &[i64], +) -> Result { + let (schema, batches) = + execute_two_stage(block1_proto, block2_proto, num_group_keys, num_partitions)?; + // The final stage's partitions are concatenated into one batch so the JVM + // imports exactly one set of column structs (same contract as R1). + let batch = concat_batches(&schema, &batches) + .map_err(|e| format!("failed to concatenate result batches: {e}"))?; + export_batch_to_addresses(&batch, array_addrs, schema_addrs)?; + Ok(batch.num_rows() as i64) +} + /// Export one Arrow batch into caller-allocated `FFI_ArrowArray` / /// `FFI_ArrowSchema` structs, one per column, whose addresses were allocated by /// the JVM (Arrow Java `ArrowArray.allocateNew` / `ArrowSchema.allocateNew`). @@ -243,10 +445,10 @@ pub unsafe fn submit_and_export( // --------------------------------------------------------------------------- mod jni_entry { - use super::{build_test_proto, submit_and_export}; + use super::{build_test_proto, submit_and_export, submit_and_export_distributed}; use comet::errors::{try_unwrap_or_throw, CometError}; use jni::objects::{JByteArray, JClass, JLongArray, ReleaseMode}; - use jni::sys::{jbyteArray, jlong}; + use jni::sys::{jbyteArray, jint, jlong}; use jni::EnvUnowned; /// JVM entry: build the fixed spike test proto Rust-side and return its @@ -298,4 +500,48 @@ mod jni_entry { Ok(num_rows as jlong) }) } + + /// JVM entry: run a distributed (R2) two-stage GROUP BY offload. 
Builds + /// `CometFragmentExec(block2, [Hash-Repartition(CometFragmentExec(block1))])`, + /// submits it to an in-process standalone Ballista cluster (which splits it at + /// the hash-repartition into a partial-agg stage and a final-agg stage over a + /// shuffle), and exports the concatenated result batch into the JVM-allocated + /// Arrow C Data structs, returning the number of rows. + /// + /// # Safety + /// Called from the JVM via JNI; the address arrays must reference valid + /// caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema` structs (one per output + /// column of `block2`). + #[no_mangle] + pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_executeQueryDistributed( + e: EnvUnowned, + _class: JClass, + block1: JByteArray, + block2: JByteArray, + num_group_keys: jint, + num_partitions: jint, + array_addrs: JLongArray, + schema_addrs: JLongArray, + ) -> jlong { + try_unwrap_or_throw(&e, |env| { + let block1_bytes = env.convert_byte_array(block1)?; + let block2_bytes = env.convert_byte_array(block2)?; + + let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; + let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? 
}; + + let num_rows = unsafe { + submit_and_export_distributed( + &block1_bytes, + &block2_bytes, + num_group_keys as usize, + num_partitions as usize, + &arrays, + &schemas, + ) + } + .map_err(CometError::Internal)?; + Ok(num_rows as jlong) + }) + } } diff --git a/native/ballista/src/fragment.rs b/native/ballista/src/fragment.rs index 3fd878d8a1..8e16da320a 100644 --- a/native/ballista/src/fragment.rs +++ b/native/ballista/src/fragment.rs @@ -21,7 +21,8 @@ use std::sync::Arc; use datafusion::common::{DataFusionError, Result}; use datafusion::execution::TaskContext; use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, }; use comet::execution::fragment::{build_native_fragment, native_fragment_plan_properties}; @@ -46,10 +47,32 @@ pub struct CometFragmentExec { impl CometFragmentExec { /// Build from Comet proto bytes and the fragment's DataFusion children. The - /// schema/properties are derived by building the fragment plan once (without + /// schema/ordering are derived by building the fragment plan once (without /// executing it or requiring the child streams). + /// + /// A Comet fragment is internally single-partition, but as a DataFusion node + /// it is a *per-partition* transform: [`execute`](Self::execute) runs the + /// fragment once for each output partition, feeding that partition's child + /// streams into the fragment's `Scan` leaves. So when the fragment has + /// children (e.g. a Ballista shuffle reader with `N` partitions), its output + /// partition count must match the children's — otherwise consumers (and the + /// distributed planner / result fetch) would only ever drive partition 0 and + /// silently drop the other `N-1` partitions' rows. A childless fragment + /// (self-contained `NativeScan` leaf) keeps the built plan's own partitioning. 
pub fn try_new(proto: Vec, children: Vec>) -> Result { - let props = native_fragment_plan_properties(&proto).map_err(DataFusionError::Execution)?; + let base = native_fragment_plan_properties(&proto).map_err(DataFusionError::Execution)?; + let props = match children.first() { + Some(child) => { + let n = child.properties().partitioning.partition_count(); + Arc::new(PlanProperties::new( + base.eq_properties.clone(), + Partitioning::UnknownPartitioning(n), + base.emission_type, + base.boundedness, + )) + } + None => base, + }; Ok(Self { proto, children, diff --git a/native/ballista/src/lib.rs b/native/ballista/src/lib.rs index 110f678de0..5c8d1a4730 100644 --- a/native/ballista/src/lib.rs +++ b/native/ballista/src/lib.rs @@ -39,7 +39,10 @@ pub mod table_provider; pub use codec::{ CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC, }; -pub use ffi_jni::{build_test_proto, execute_comet_proto, submit_and_export}; +pub use ffi_jni::{ + build_test_proto, execute_comet_proto, execute_two_stage, submit_and_export, + submit_and_export_distributed, +}; pub use fragment::CometFragmentExec; pub use scan::CometScanExec; pub use table_provider::CometTableProvider; diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 33f295eadf..00c3dd22d8 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -61,6 +61,40 @@ class NativeBallista { proto: Array[Byte], arrayAddrs: Array[Long], schemaAddrs: Array[Long]): Long + + /** + * EXPERIMENTAL (R2): run a distributed two-stage GROUP BY offload on an in-process standalone + * Ballista cluster (no Spark executors). + * + * `block1` is the serialized partial-aggregate block (self-contained `NativeScan` leaf); + * `block2` is the serialized final-aggregate block (whose input leaf is a `Scan` fed by the + * shuffle). 
The native side assembles `CometFragmentExec(block2, + * [Hash-Repartition(CometFragmentExec(block1))])`, which Ballista splits at the hash + * repartition into a partial-agg stage and a final-agg stage across a shuffle, then exports the + * concatenated result batch into the caller-allocated Arrow C Data structs. + * + * @param block1 + * serialized partial-aggregate `Operator` proto (with file partitions injected) + * @param block2 + * serialized final-aggregate `Operator` proto (leaf is a `Scan`, not a `ShuffleScan`) + * @param numGroupKeys + * number of grouping columns (the leading columns of block1's output to hash on) + * @param numPartitions + * number of shuffle partitions + * @param arrayAddrs + * memory addresses of one `ArrowArray` struct per output column of `block2` + * @param schemaAddrs + * memory addresses of one `ArrowSchema` struct per output column of `block2` + * @return + * the number of rows exported + */ + @native def executeQueryDistributed( + block1: Array[Byte], + block2: Array[Byte], + numGroupKeys: Int, + numPartitions: Int, + arrayAddrs: Array[Long], + schemaAddrs: Array[Long]): Long } object NativeBallista { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 7547e626db..6109bc2d48 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -403,67 +403,137 @@ object CometExec { } /** - * EXPERIMENTAL (R1): offload a single-stage Comet query to an in-process Apache DataFusion - * Ballista engine on the Spark driver and return the collected rows, launching NO Spark - * executor tasks. + * EXPERIMENTAL: offload a Comet query to an in-process Apache DataFusion Ballista engine on the + * Spark driver and return the collected rows, launching NO Spark executor tasks. * - * Enabled by `spark.comet.exec.ballista.enabled`. 
The whole-query native plan is already - * serialized on the boundary [[CometNativeExec]] (`serializedPlanOpt.plan`, produced by - * `convertBlock`). We hand those proto bytes to [[NativeBallista.executeQuery]], which runs - * them on Ballista and exports the (single, concatenated) result batch back over the Arrow C - * Data Interface; we import it via [[NativeUtil]] and materialize the rows on the driver. + * Enabled by `spark.comet.exec.ballista.enabled`. Two plan shapes are supported: * - * Only single-stage plans are supported: exactly one native block (no exchange). Anything else - * throws [[UnsupportedOperationException]]. + * - **R1 single-stage:** exactly one native block ([[CometNativeExec]] with a serialized + * plan) and no Comet exchange. The whole-query native plan is submitted as one native leaf. + * - **R2 two-stage GROUP BY:** exactly two native blocks with one + * [[CometShuffleExchangeExec]] between them (partial aggregate below the exchange, final + * aggregate above). The two blocks are submitted separately and Ballista distributes them + * across a hash shuffle. + * + * Anything else throws [[UnsupportedOperationException]]. */ def executeCollectViaBallista(root: SparkPlan): Array[InternalRow] = { - // Every boundary node (top of a native block) carries a serialized plan. More than one means - // the plan spans a shuffle boundary -> multiple stages, which R1 does not support. + // Every boundary node (top of a native block) carries a serialized plan. 
val boundaries = root.collect { case n: CometNativeExec if n.serializedPlanOpt.isDefined => n } - val boundary = boundaries match { - case Seq(single) => single + val exchanges = root.collect { case e: CometShuffleExchangeExec => e } + + (boundaries, exchanges) match { + case (Seq(single), Nil) => + executeSingleBlockViaBallista(root, single) + case (Seq(_, _), Seq(exchange)) => + executeTwoBlockViaBallista(root, boundaries, exchange) case _ => throw new UnsupportedOperationException( - "Comet Ballista offload (R1) supports single-stage plans only; found " + - s"${boundaries.size} serialized native plan blocks in:\n$root") + "Comet Ballista offload supports either a single-stage plan (one native block, no " + + "Comet exchange) or a two-stage GROUP BY (two native blocks + one hash exchange); " + + s"found ${boundaries.size} serialized native blocks and ${exchanges.size} Comet " + + s"exchanges in:\n$root") } - val planBytes = boundary.serializedPlanOpt.plan.getOrElse( + } + + /** + * R1: submit a single native block as one self-contained native leaf. Ballista concatenates the + * whole result into a single exported batch, so one import suffices. + */ + private def executeSingleBlockViaBallista( + root: SparkPlan, + boundary: CometNativeExec): Array[InternalRow] = { + val injectedPlanBytes = injectScanFiles(root, boundary) + val numCols = boundary.output.length + val nativeUtil = new NativeUtil() + try { + val nativeBallista = new NativeBallista + nativeUtil.getNextBatch( + numCols, + (arrayAddrs, schemaAddrs) => + nativeBallista.executeQuery(injectedPlanBytes, arrayAddrs, schemaAddrs)) match { + case Some(batch) => + try { + batch.rowIterator().asScala.map(_.copy()).toArray + } finally { + batch.close() + } + case None => + Array.empty[InternalRow] + } + } finally { + nativeUtil.close() + } + } + + /** + * R2: submit a two-stage GROUP BY. 
`block1` (below the exchange) is the partial aggregate over + * a `NativeScan`; `block2` (above the exchange) is the final aggregate whose input leaf is a + * plain `Scan` fed by the Ballista shuffle. The exchange's [[HashPartitioning]] gives the + * number of grouping columns and shuffle partitions. The native side assembles + * `CometFragmentExec(block2, [Hash-Repartition(CometFragmentExec(block1))])`, which Ballista + * splits at the hash repartition into the two shuffle stages. + */ + private def executeTwoBlockViaBallista( + root: SparkPlan, + boundaries: Seq[CometNativeExec], + exchange: CometShuffleExchangeExec): Array[InternalRow] = { + // The final-aggregate block's input leaf must serialize as a plain `Scan` (#100), which the + // native fragment feeds from the Ballista shuffle reader — NOT a native `ShuffleScan` (#116), + // which expects to read Comet shuffle blocks directly. That requires direct read disabled. + if (CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.get()) { throw new UnsupportedOperationException( - "Comet Ballista offload (R1) supports single-stage plans only; " + - s"the native plan block carries no serialized plan:\n$root")) + "Comet Ballista two-stage (R2) offload requires " + + s"${CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key}=false so the final-aggregate " + + "block reads a plain Scan leaf (fed by the Ballista shuffle) rather than a native " + + s"ShuffleScan:\n$root") + } - // The serialized template plan carries each NativeScan's `common` metadata but NOT its file - // list: Comet normally injects file partitions per-partition at task launch (see - // NativeScanPlanDataInjector). Since the offload runs the whole plan as a single native leaf, - // inject all partitions' files into one scan so Ballista reads the complete table. 
- val nativeScans = boundary.collect { case s: CometNativeScanExec => s } - val injectedPlanBytes = if (nativeScans.isEmpty) { - planBytes - } else { - val commonByKey = nativeScans.map { scan => - scan.ensureSubqueriesResolved() - scan.sourceKey -> scan.commonData - }.toMap - val partitionByKey = nativeScans.map { scan => - scan.sourceKey -> mergeFilePartitions(scan.perPartitionData) - }.toMap - val template = Operator.parseFrom(planBytes) - val injected = PlanDataInjector.injectPlanData(template, commonByKey, partitionByKey) - PlanDataInjector.serializeOperator(injected) + val (numGroupKeys, numPartitions) = exchange.outputPartitioning match { + case HashPartitioning(expressions, n) => (expressions.length, n) + case other => + throw new UnsupportedOperationException( + "Comet Ballista two-stage (R2) offload requires a HashPartitioning exchange; found " + + s"$other in:\n$root") } - val numCols = boundary.output.length + // block1 = the serialized native boundary within the exchange's subtree (partial aggregate); + // block2 = the other boundary (final aggregate, an ancestor of the exchange). 
+ val block1 = exchange + .collectFirst { case n: CometNativeExec if n.serializedPlanOpt.isDefined => n } + .getOrElse( + throw new UnsupportedOperationException( + s"Comet Ballista two-stage (R2) offload: no serialized native block below the " + + s"exchange:\n$root")) + val block2 = boundaries + .find(_ ne block1) + .getOrElse(throw new UnsupportedOperationException( + s"Comet Ballista two-stage (R2) offload: could not identify the final-aggregate block " + + s"above the exchange:\n$root")) + + val block1Bytes = injectScanFiles(root, block1) + val block2Bytes = block2.serializedPlanOpt.plan.getOrElse( + throw new UnsupportedOperationException( + s"Comet Ballista two-stage (R2) offload: the final-aggregate block carries no " + + s"serialized plan:\n$root")) + + val numCols = block2.output.length val nativeUtil = new NativeUtil() try { val nativeBallista = new NativeBallista - // Ballista concatenates the whole result into a single exported batch, so one import is - // sufficient for R1's single-stage plans. + // The native side concatenates all shuffle-partition outputs into a single exported batch. nativeUtil.getNextBatch( numCols, (arrayAddrs, schemaAddrs) => - nativeBallista.executeQuery(injectedPlanBytes, arrayAddrs, schemaAddrs)) match { + nativeBallista.executeQueryDistributed( + block1Bytes, + block2Bytes, + numGroupKeys, + numPartitions, + arrayAddrs, + schemaAddrs)) match { case Some(batch) => try { batch.rowIterator().asScala.map(_.copy()).toArray @@ -478,6 +548,35 @@ object CometExec { } } + /** + * Inject file partitions into a native block's serialized plan. The serialized template carries + * each `NativeScan`'s `common` metadata but NOT its file list (Comet normally injects files + * per-partition at task launch, see `NativeScanPlanDataInjector`). Since the offload runs a + * block as one native leaf, merge all partitions' files into each scan so Ballista reads the + * complete table. Blocks with no `NativeScan` (e.g. 
an R2 final-aggregate reading a shuffle) + * are returned unchanged. + */ + private def injectScanFiles(root: SparkPlan, boundary: CometNativeExec): Array[Byte] = { + val planBytes = boundary.serializedPlanOpt.plan.getOrElse( + throw new UnsupportedOperationException( + s"Comet Ballista offload: the native plan block carries no serialized plan:\n$root")) + val nativeScans = boundary.collect { case s: CometNativeScanExec => s } + if (nativeScans.isEmpty) { + planBytes + } else { + val commonByKey = nativeScans.map { scan => + scan.ensureSubqueriesResolved() + scan.sourceKey -> scan.commonData + }.toMap + val partitionByKey = nativeScans.map { scan => + scan.sourceKey -> mergeFilePartitions(scan.perPartitionData) + }.toMap + val template = Operator.parseFrom(planBytes) + val injected = PlanDataInjector.injectPlanData(template, commonByKey, partitionByKey) + PlanDataInjector.serializeOperator(injected) + } + } + /** * Merge the per-partition file lists of a native scan into a single `NativeScan` carrying every * partition's files, serialized as the `partitionBytes` expected by diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaDistributedSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaDistributedSuite.scala new file mode 100644 index 0000000000..ef6baf519e --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaDistributedSuite.scala @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.ballista + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.CometListenerBusUtils +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.comet.CometNativeExec +import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.internal.SQLConf + +import org.apache.comet.CometConf + +/** + * The R2 milestone: a Spark `GROUP BY` runs DISTRIBUTED on an in-process Apache DataFusion + * Ballista engine on the Spark driver, across a hash shuffle, with Comet native fragments on both + * sides (partial aggregate -> Ballista hash shuffle -> final aggregate). The collected rows must + * be identical to the flag-off (Spark/Comet-on-executors) baseline, launching ZERO Spark executor + * tasks. + * + * Plan shape offloaded (with `spark.comet.exec.shuffle.directRead.enabled=false` so the final + * aggregate's input leaf serializes as a plain `Scan` fed by the Ballista shuffle): + * {{{ + * CometNativeExec[block2] (final HashAggregate over a Scan leaf) + * CometShuffleExchangeExec (HashPartitioning(k), N) + * CometNativeExec[block1] (partial HashAggregate over a NativeScan) + * CometNativeScanExec + * }}} + * + * No `ORDER BY` in the offloaded query: a global sort would add a range-partition exchange (a + * third stage), which is out of the 2-block scope. The test sorts the collected rows itself. 
+ */ +class CometBallistaDistributedSuite extends CometTestBase with AdaptiveSparkPlanHelper { + + /** + * Runs `f`, counting Spark executor task starts during it. Drains the listener bus before + * attaching and after running so asynchronous task-start events are flushed. (Same apparatus as + * `CometBallistaOffloadSuite`.) + */ + private def countTaskStarts(f: => Unit): Int = { + val taskStarts = new AtomicInteger(0) + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + taskStarts.incrementAndGet() + } + } + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + spark.sparkContext.addSparkListener(listener) + try { + f + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + taskStarts.get() + } + + test("two-stage GROUP BY is offloaded to distributed Ballista with no Spark executor tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + withTempPath { dir => + import testImplicits._ + + // A few `k` values with distinct row counts (k=1 x3, k=2 x2, k=3 x4), spread across several + // input files so a GROUP BY needs a shuffle to aggregate across partitions. + Seq((1, 10), (1, 11), (1, 12), (2, 20), (2, 21), (3, 30), (3, 31), (3, 32), (3, 33)) + .toDF("k", "v") + .repartition(4) + .write + .parquet(dir.getCanonicalPath) + + // AQE off so the collect root is the Comet columnar-to-row node carrying our executeCollect + // override (not an AdaptiveSparkPlanExec wrapper), and the shuffle boundary is deterministic. + // Small shuffle-partition count keeps the in-process distributed run fast. + // directRead off so block2's input leaf serializes as a plain Scan (#100) fed by the Ballista + // shuffle, not a native ShuffleScan (#116). 
+ withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false") { + spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t") + val query = "SELECT k, count(*) AS c FROM t GROUP BY k" + + // Confirm the plan is the offloadable R2 shape BEFORE running it: exactly one Comet hash + // exchange and exactly two serialized CometNativeExec blocks (partial + final aggregate). + val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(query).queryExecution.executedPlan + } + val exchanges = executed.collect { case e: CometShuffleExchangeExec => e } + assert( + exchanges.size == 1, + s"expected exactly one Comet hash exchange (two stages), found ${exchanges.size}:\n" + + s"$executed") + val nativeBlocks = executed.collect { + case n: CometNativeExec if n.serializedPlanOpt.isDefined => n + } + assert( + nativeBlocks.size == 2, + s"expected exactly two serialized CometNativeExec blocks, found ${nativeBlocks.size}:\n" + + s"$executed") + + // Baseline: normal Comet execution (offload off) through the same listener apparatus. This + // positive control proves the listener observes executor task starts, so the `== 0` + // assertion for the offloaded run is meaningful. + var baseline: Seq[Seq[Any]] = null + val baselineTaskStarts = countTaskStarts { + baseline = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(query).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + assert( + baselineTaskStarts > 0, + "expected the flag-off baseline collect to launch at least one Spark executor task " + + s"(sanity check for the listener apparatus); got $baselineTaskStarts") + + // Ballista offload: run the same query with the flag on, counting executor task starts. 
+ var offloaded: Seq[Seq[Any]] = null + val offloadedTaskStarts = countTaskStarts { + offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + spark.sql(query).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + + def sortKey(r: Seq[Any]): String = r.map(v => s"$v").mkString(",") + val baselineSorted = baseline.sortBy(sortKey) + val offloadedSorted = offloaded.sortBy(sortKey) + + // The distributed aggregate must compose across the shuffle: partial counts written to + // Ballista's IPC shuffle, read back, and merged by the final aggregate into correct totals. + assert( + offloadedSorted == baselineSorted, + "offloaded (distributed) rows do not match baseline\n" + + s" baseline: $baselineSorted\n offloaded: $offloadedSorted") + assert( + baselineSorted == Seq(Seq(1, 3L), Seq(2, 2L), Seq(3, 4L)), + s"unexpected group counts: $baselineSorted") + + // Crucially, NO Spark executor tasks ran for the offloaded (driver-side, distributed) + // collect. + assert( + offloadedTaskStarts == 0, + s"expected 0 Spark executor tasks for the Ballista-offloaded distributed collect, " + + s"but $offloadedTaskStarts started") + } + } + } +} diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala index ec45f63567..5ce5eca1dc 100644 --- a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala @@ -121,7 +121,7 @@ class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelp } } - test("multi-stage collect (exchange present) throws under Ballista offload") { + test("two-stage GROUP BY with shuffle direct read on throws under Ballista offload") { assume( NativeBallista.isAvailable, s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") @@ -130,7 +130,10 @@ class 
CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelp import testImplicits._ // Several partition files with repeated keys, so a `GROUP BY` requires a shuffle - // (exchange) to aggregate across partitions -> more than one CometNativeExec boundary. + // (exchange) to aggregate across partitions -> two CometNativeExec boundaries. This is the + // R2 two-stage shape, but with shuffle direct read left ON (the default) the final block's + // input leaf serializes as a native ShuffleScan, which the Ballista shuffle-fed fragment + // cannot consume, so the offload is rejected with a clear error. Seq((1, 10), (1, 20), (2, 30), (2, 40), (3, 50), (3, 60), (4, 70), (4, 80)) .toDF("k", "v") .repartition(4) @@ -138,15 +141,15 @@ class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelp .parquet(dir.getCanonicalPath) // Disable AQE so the shuffle boundary/plan shape is deterministic (no runtime coalescing - // of the exchange back down to a single stage). + // of the exchange back down to a single stage). Leave directRead at its default (true). withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("t2") val query = "SELECT k, count(*) FROM t2 GROUP BY k" withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { val df = spark.sql(query) - // Sanity check: the plan does contain an exchange, i.e. the single-stage guard is - // actually exercised and not vacuously satisfied. + // Sanity check: the plan does contain an exchange, i.e. the two-stage path is actually + // exercised and not vacuously satisfied. 
val hasExchange = df.queryExecution.executedPlan.collect { case e: org.apache.spark.sql.execution.exchange.Exchange => e }.nonEmpty @@ -158,7 +161,7 @@ class CometBallistaOffloadSuite extends CometTestBase with AdaptiveSparkPlanHelp df.collect() } assert( - ex.getMessage.contains("single-stage plans only"), + ex.getMessage.contains("directRead"), s"unexpected exception message: ${ex.getMessage}") } } From 3583544b9be675031879a3a6e9c6e8bff0f589d9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 18:08:40 -0600 Subject: [PATCH 17/42] test(ballista): distribute full TPC-H Q1 aggregate across a Ballista shuffle Add the R2 milestone test: TPC-H Q1's full aggregate (sum x4, avg x3, count over decimals, grouped by the two keys l_returnflag/l_linestatus, no ORDER BY) offloaded from the driver and distributed across the Ballista hash shuffle as the 2-block shape (Comet partial-agg -> CometShuffleExchangeExec -> Comet final-agg), with 0 Spark-executor tasks. Reuses the T4 two-block driver path and R1's synthetic lineitem generator. This verifies the composition risks the count(*) test left unchecked: avg's partial (sum + count) state and decimal partial sums round-trip through Ballista's Arrow IPC shuffle and compose in the Comet final aggregate. Results match Spark's own Q1 row-for-row, including decimal scale (sum scale 4/6, avg scale 6) and avg values. The reference oracle runs with Comet fully disabled (pure Spark): a Comet-native baseline uses the native tokio engine which, once an in-process Ballista offload has run in the JVM, resolves with_env to a second, uninitialized JAVA_VM OnceCell in libdatafusion_comet_ballista and panics. Pure Spark is immune and is the correct oracle. Documented as an infrastructure limitation of the offload spike. Relates to #4796. 
--- .../contributor-guide/ballista_execution.md | 6 +- .../comet/ballista/CometBallistaQ1Suite.scala | 143 ++++++++++++++++++ 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index 12ededdb0c..b7b08126d1 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -110,8 +110,10 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - ✅ R1-T2 — config flag + driver `executeCollect` override. - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2. - ⬜ R1-T4 (R1b) — submit to an external Ballista scheduler + executor cluster. -- ⬜ **R2 — multi-stage distribution.** Map Comet's per-stage native fragments onto Ballista stages - with shuffle, so plans with exchanges (aggregations, joins) distribute across executors. +- 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results. + - ✅ R2-T4 — 2-block `count(*)` single-key distributes across the shuffle; results match Spark, 0 executor tasks. + - ✅ R2-T5 — **full TPC-H Q1 aggregate distributed** (no `ORDER BY`): `sum`×4, `avg`×3, `count` over decimals grouped by two keys (`l_returnflag`, `l_linestatus`). `avg`'s partial (sum + count) state and decimal partial sums round-trip through Ballista's Arrow IPC shuffle and compose in the Comet final aggregate; results match Spark's own Q1 row-for-row (incl. decimal scale), 0 executor tasks. + - ⬜ N-block generalization (a trailing `ORDER BY` / range exchange is a third stage — still out of scope). 
- ⬜ **JVM-free executor.** Feature-gate the JNI bridge so the native execution crates build without `libjvm`, enabling a standalone Ballista executor binary. - ⬜ **Multi-partition scans.** Map a scan's file groups to multiple partitions (currently a diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala index fdc010f10a..32716cc6e0 100644 --- a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala @@ -27,6 +27,7 @@ import org.apache.spark.CometListenerBusUtils import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} import org.apache.spark.sql.{CometTestBase, Row} import org.apache.spark.sql.comet.CometNativeExec +import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.internal.SQLConf @@ -62,6 +63,17 @@ import org.apache.comet.CometConf * against the Q1 cutoff, and Q1's decimal multiplications — as ONE exchange-free native block. * The test asserts the plan really is single-block before offloading, and compares full result * rows flag-on vs flag-off using the exact decimal types Spark produces. + * + * Ordering caveat (dual-library global state): both tests in this suite run in one JVM, and the + * in-process Ballista offload statically links a SECOND copy of Comet core into + * `libdatafusion_comet_ballista`. Its `JAVA_VM` `OnceCell` is distinct from `libcomet`'s, and + * once an offload has run, Comet-on-Spark-executor native execution (`Native.executePlan` on a + * `tokio-rt-worker`) can resolve `with_env` to that second, uninitialized copy and panic with + * `JAVA_VM not initialized`. 
Reference oracles here therefore run with Comet fully DISABLED (pure + * Spark), which never touches Comet native code and is immune to this interaction. The offload + * itself is unaffected (it initializes/uses the JVM through its own path). This is an + * infrastructure limitation of the in-process offload spike, not a correctness issue in the + * distributed aggregate. */ class CometBallistaQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper { @@ -296,4 +308,135 @@ class CometBallistaQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper { } } } + + /** + * TPC-H Q1's full aggregate (NO `ORDER BY`): `sum`/`avg`/`count` over decimals, grouped by the + * two keys `(l_returnflag, l_linestatus)`. This is the R2 milestone — it distributes the + * aggregate across a Ballista hash shuffle as the two-block shape (Comet partial-agg -> + * `CometShuffleExchangeExec` -> Comet final-agg) and asserts the collected rows are identical + * to Spark's own Q1, launching ZERO Spark executor tasks. + * + * It exercises the composition risks the single-block R1 slice and the `count(*)` R2 test left + * unverified: `avg`'s partial state (sum + count) and decimal partial sums round-tripping + * through Ballista's Arrow IPC shuffle and composing in the Comet final aggregate, across two + * group keys. + * + * No `ORDER BY` in the offloaded query (a global sort would add a third, range-partition stage, + * out of the 2-block scope); both sides are sorted on the driver by `(returnflag, linestatus)` + * before comparison. 
+ */ + private val q1FullAggregate = + """ + |SELECT l_returnflag, l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM lineitem + |WHERE l_shipdate <= date '1998-12-01' - interval '90' day + |GROUP BY l_returnflag, l_linestatus + |""".stripMargin + + test( + "TPC-H Q1 full aggregate (sum/avg/count over decimals, two group keys) distributes across a " + + "Ballista shuffle with identical results and no executor tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + withTempPath { dir => + // Spread the rows across several input files so rows of the same (returnflag, linestatus) + // group land in different partitions — the hash shuffle must then actually combine partial + // aggregate states across partitions for the final totals to be correct. + spark + .createDataFrame(spark.sparkContext.parallelize(lineitemRows), lineitemSchema) + .repartition(3) + .write + .parquet(dir.getCanonicalPath) + + // AQE off so the collect root carries our executeCollect override; direct-read off so + // block2's input leaf serializes as a plain Scan (#100) fed by the Ballista shuffle; small + // shuffle-partition count keeps the in-process distributed run fast. 
+ withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false") { + spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("lineitem") + + // Confirm the offloadable R2 shape BEFORE running: exactly one Comet hash exchange (two + // stages) and exactly two serialized CometNativeExec blocks (partial + final aggregate). + val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(q1FullAggregate).queryExecution.executedPlan + } + val exchanges = executed.collect { case e: CometShuffleExchangeExec => e } + assert( + exchanges.size == 1, + s"expected exactly one Comet hash exchange (two stages), found ${exchanges.size}:\n" + + s"$executed") + val nativeBlocks = executed.collect { + case n: CometNativeExec if n.serializedPlanOpt.isDefined => n + } + assert( + nativeBlocks.size == 2, + s"expected exactly two serialized CometNativeExec blocks, found ${nativeBlocks.size}:\n" + + s"$executed") + + // Baseline oracle: Spark's OWN Q1 answer, with Comet fully disabled. This is the truest + // reference for "does the distributed offload match Spark's own Q1" (the brief's goal), and + // it also runs through the same listener apparatus as a positive control proving the + // listener observes executor task starts, so the `== 0` assertion for the offloaded run is + // meaningful. Comet is disabled here deliberately: a Comet-native baseline uses the native + // execution engine (tokio) which, once an in-process Ballista offload has run in this JVM, + // panics with `JAVA_VM not initialized` (dual-library global-state issue — see the class + // doc). Spark-only execution is immune and is the correct oracle regardless. 
+ var baseline: Seq[Seq[Any]] = null + val baselineTaskStarts = countTaskStarts { + baseline = withSQLConf( + CometConf.COMET_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + assert( + baselineTaskStarts > 0, + "expected the Spark baseline collect to launch at least one Spark executor task " + + s"(sanity check for the listener apparatus); got $baselineTaskStarts") + + // Ballista offload: run the same query with the flag on, counting executor task starts. + var offloaded: Seq[Seq[Any]] = null + val offloadedTaskStarts = countTaskStarts { + offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + + // Sort both sides by (returnflag, linestatus) on the driver (Q1's trailing ORDER BY is not + // offloaded). Compare full rows using the exact values Spark produced — decimals keep their + // computed scale, so a wrong decimal scale from avg/sum composition fails the assertion. + def sortKey(r: Seq[Any]): (String, String) = (s"${r.head}", s"${r(1)}") + val baselineSorted = baseline.sortBy(sortKey) + val offloadedSorted = offloaded.sortBy(sortKey) + assert( + offloadedSorted == baselineSorted, + "offloaded (distributed) Q1 aggregate rows do not match Spark's own Q1\n" + + s" spark: $baselineSorted\n offloaded: $offloadedSorted") + + // Sanity: the synthetic lineitem forms three surviving groups after the Q1 date filter. + assert( + baselineSorted.map(r => (s"${r.head}", s"${r(1)}")) == + Seq(("A", "F"), ("N", "O"), ("R", "F")), + s"unexpected Q1 groups: ${baselineSorted.map(r => (r.head, r(1)))}") + + // Crucially, NO Spark executor tasks ran for the offloaded (distributed) collect. 
+ assert( + offloadedTaskStarts == 0, + s"expected 0 Spark executor tasks for the Ballista-offloaded distributed collect, " + + s"but $offloadedTaskStarts started") + } + } + } } From 300ddb625dc73ae108e7613d9276041c05342c78 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 18:10:53 -0600 Subject: [PATCH 18/42] docs(contributor-guide): R2 foundations + full-Q1 milestone + coexistence limitation Relates to #4796 --- docs/source/contributor-guide/ballista_execution.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index b7b08126d1..dd90aa66c4 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -110,7 +110,10 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - ✅ R1-T2 — config flag + driver `executeCollect` override. - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2. - ⬜ R1-T4 (R1b) — submit to an external Ballista scheduler + executor cluster. -- 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results. +- 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results — **full TPC-H Q1's aggregate now runs distributed on Ballista and matches Spark.** + - ✅ R2-T1 (Ballista) — accept a pre-built physical plan for distribution (a `physical_plan` submission variant; its own Ballista branch/PR). + - ✅ R2-T2 (Comet native) — feed a `ScanExec` leaf from a native `RecordBatchStream` (not only a JVM input). 
+ - ✅ R2-T3 (`comet-ballista`) — `CometFragmentExec`: a Comet fragment whose `Scan` leaf is fed by DataFusion child streams. - ✅ R2-T4 — 2-block `count(*)` single-key distributes across the shuffle; results match Spark, 0 executor tasks. - ✅ R2-T5 — **full TPC-H Q1 aggregate distributed** (no `ORDER BY`): `sum`×4, `avg`×3, `count` over decimals grouped by two keys (`l_returnflag`, `l_linestatus`). `avg`'s partial (sum + count) state and decimal partial sums round-trip through Ballista's Arrow IPC shuffle and compose in the Comet final aggregate; results match Spark's own Q1 row-for-row (incl. decimal scale), 0 executor tasks. - ⬜ N-block generalization (a trailing `ORDER BY` / range exchange is a third stage — still out of scope). @@ -131,6 +134,11 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - The FFI boundary requires Comet and Ballista to be built against the same DataFusion **major** version. - Comet core links the JNI bridge, so `libjvm` must be present at runtime even where JNI is unused. +- The `comet-ballista` cdylib statically links a second copy of Comet core, so a Comet-on-executor query + and an in-process Ballista offload cannot currently coexist in the same JVM (the second core's `JAVA_VM` + is uninitialized). Unifying that core is a planned follow-up; until then, use a single mode per JVM. +- The single-stage `ORDER BY`/range exchange makes Q1's final sort a third stage — out of the current + 2-block scope; sort on the driver, or wait for N-block generalization. ## References From 04242f770cc5a7ad38de8ec55c7a9edf80d291d8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 18:16:47 -0600 Subject: [PATCH 19/42] build(ballista): pin Ballista to the physical-plan-submission fork branch Replace the local path deps with a git-rev pin on the experimental Ballista branch (apache/datafusion-ballista#1924) so the branch builds without a local checkout. 
Relates to #4796 --- native/Cargo.lock | 4 ++++ native/Cargo.toml | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 56c5432c52..1d5c931e9c 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1139,6 +1139,7 @@ dependencies = [ [[package]] name = "ballista" version = "53.0.0" +source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" dependencies = [ "async-trait", "ballista-core", @@ -1153,6 +1154,7 @@ dependencies = [ [[package]] name = "ballista-core" version = "53.0.0" +source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" dependencies = [ "arrow-flight", "async-trait", @@ -1187,6 +1189,7 @@ dependencies = [ [[package]] name = "ballista-executor" version = "53.0.0" +source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" dependencies = [ "arrow", "arrow-flight", @@ -1219,6 +1222,7 @@ dependencies = [ [[package]] name = "ballista-scheduler" version = "53.0.0" +source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" dependencies = [ "arrow-flight", "async-trait", diff --git a/native/Cargo.toml b/native/Cargo.toml index 7685b1b617..377bb41d08 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -61,13 +61,13 @@ aws-credential-types = "1.2.13" iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3" } iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3", features = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs", "opendal-oss", "opendal-azdls"] } reqsign-core = "3" -# Path deps to the local Ballista worktree (branch 
feat/physical-plan-submission) -# so T1's execute_physical_plan + PhysicalPlan(bytes) submission variant are -# available. Revert to a fork git-rev pin once T1 lands upstream. -ballista = { path = "/Users/andy/git/ballista-ffi-poc/ballista/client", package = "ballista" } -ballista-core = { path = "/Users/andy/git/ballista-ffi-poc/ballista/core", package = "ballista-core" } -ballista-scheduler = { path = "/Users/andy/git/ballista-ffi-poc/ballista/scheduler", package = "ballista-scheduler" } -ballista-executor = { path = "/Users/andy/git/ballista-ffi-poc/ballista/executor", package = "ballista-executor" } +# Ballista pinned to the experimental physical-plan-submission branch +# (apache/datafusion-ballista#1924) which adds execute_physical_plan + the +# PhysicalPlan(bytes) submission variant that the distributed offload needs. +ballista = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista" } +ballista-core = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista-core" } +ballista-scheduler = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista-scheduler" } +ballista-executor = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista-executor" } [profile.release] debug = true From 1d747280398f29bbd46ad315aa879be7922404cc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 19:00:50 -0600 Subject: [PATCH 20/42] refactor: fold Ballista offload into libcomet behind default-off feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the `datafusion-comet-ballista` crate into `datafusion-comet` core as a gated `execution::ballista` module and delete the separate crate, so the offload is compiled into the single 
`libcomet` cdylib only when built with the new default-off `ballista` Cargo feature. This removes the second, statically-linked copy of Comet core (and its distinct `JAVA_VM` static) that made a Comet-on-executor query and an in-process Ballista offload panic with "JAVA_VM not initialized" when run in the same JVM. - Make the ballista deps optional and add the `ballista` feature (default off); fix module paths to `crate::execution::…` / `super::…` and reuse core's existing `execution::ffi` and `execution::fragment` helpers. - Re-gate the moved integration tests with `#![cfg(feature = "ballista")]`. - Load the offload JNI entries from `libcomet` (built with the feature) instead of a separate `libdatafusion_comet_ballista`; `NativeBallista.isAvailable` probes symbol presence so offload suites skip on a feature-less build. - Add a `make core-ballista` convenience target; the default build stays lean. - Flip the Q1 suite's second-test oracle to Comet-on-executor (enabled) to assert coexistence with a prior offload in one JVM. Relates to #4796. 
--- Makefile | 6 + .../contributor-guide/ballista_execution.md | 18 ++- native/Cargo.lock | 27 +---- native/Cargo.toml | 2 +- native/ballista/Cargo.toml | 57 ---------- native/ballista/README.md | 15 --- native/core/Cargo.toml | 25 +++++ .../src/execution/ballista}/codec.rs | 6 +- .../src/execution/ballista}/ffi_jni.rs | 28 +++-- .../src/execution/ballista}/fragment.rs | 2 +- .../src/execution/ballista/mod.rs} | 9 +- .../src/execution/ballista}/scan.rs | 2 +- .../src/execution/ballista}/table_provider.rs | 2 +- native/core/src/execution/fragment.rs | 4 +- native/core/src/execution/mod.rs | 2 + native/core/src/execution/operators/scan.rs | 6 +- .../tests/ballista_codec_roundtrip.rs} | 4 +- .../tests/ballista_distributed.rs} | 6 +- .../tests/ballista_ffi_roundtrip.rs} | 6 +- .../tests/ballista_fragment_child_input.rs} | 21 +++- .../comet/ballista/NativeBallista.scala | 106 ++++++++---------- .../ballista/CometBallistaFfiSpikeSuite.scala | 5 +- .../comet/ballista/CometBallistaQ1Suite.scala | 38 +++---- 23 files changed, 180 insertions(+), 217 deletions(-) delete mode 100644 native/ballista/Cargo.toml delete mode 100644 native/ballista/README.md rename native/{ballista/src => core/src/execution/ballista}/codec.rs (97%) rename native/{ballista/src => core/src/execution/ballista}/ffi_jni.rs (96%) rename native/{ballista/src => core/src/execution/ballista}/fragment.rs (98%) rename native/{ballista/src/lib.rs => core/src/execution/ballista/mod.rs} (83%) rename native/{ballista/src => core/src/execution/ballista}/scan.rs (98%) rename native/{ballista/src => core/src/execution/ballista}/table_provider.rs (98%) rename native/{ballista/tests/codec_roundtrip.rs => core/tests/ballista_codec_roundtrip.rs} (98%) rename native/{ballista/tests/distributed.rs => core/tests/ballista_distributed.rs} (95%) rename native/{ballista/tests/ffi_roundtrip.rs => core/tests/ballista_ffi_roundtrip.rs} (96%) rename native/{ballista/tests/fragment_child_input.rs => 
core/tests/ballista_fragment_child_input.rs} (95%) diff --git a/Makefile b/Makefile index 8685d171be..69b9d5c7a0 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,12 @@ all: core jvm core: cd native && cargo build $(FEATURES_ARG) +# Build the single libcomet cdylib WITH the default-off `ballista` feature, so the +# in-process Ballista offload (and its NativeBallista JNI entries) is folded into +# libcomet. Required before running the org.apache.comet.ballista offload suites. +# The default `core` target stays Ballista-free. +core-ballista: + cd native && cargo build --features ballista $(FEATURES_ARG) test-rust: # We need to compile CometException so that the cargo test can pass ./mvnw compile -pl common -DskipTests $(PROFILES) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index dd90aa66c4..6b8e82cb6c 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -81,11 +81,14 @@ Two design choices make this mostly integration rather than new invention: **Rust** (`native/`): - `comet_ffi_plan_from_proto` (`datafusion-comet` core) — decodes a Comet `Operator` proto, builds the plan with the existing `PhysicalPlanner`, returns an `FFI_ExecutionPlan`. -- `datafusion-comet-ballista` crate: +- `execution::ballista` module (`datafusion-comet` core, gated behind the default-off `ballista` + Cargo feature — built with `cargo build --features ballista` / `make core-ballista`, so it is + folded into the single `libcomet` cdylib rather than a separate library): - `CometScanExec` — a serializable DataFusion leaf that rebuilds the plan over FFI at execute time. - `CometPhysicalCodec` / `CometLogicalCodec` — extension codecs that compose with Ballista's own (delegating non-Comet nodes) so Comet plans can be shipped to Ballista executors. - `CometTableProvider` — exposes a Comet plan to Ballista as a table. 
+ - the `Java_org_apache_comet_ballista_NativeBallista_*` JNI entries (driver-side submission). **JVM** (`spark/`): - Driver-side offload hook and configuration (see below). @@ -101,8 +104,9 @@ Two design choices make this mostly integration rather than new invention: Legend: ✅ done · 🔨 in progress · ⬜ planned -- ✅ **Rust core** — FFI plan export + `datafusion-comet-ballista` crate (`CometScanExec`, composed - codecs, `CometTableProvider`) with codec round-trip and standalone distributed tests. +- ✅ **Rust core** — FFI plan export + gated `execution::ballista` module in `datafusion-comet` + (`CometScanExec`, composed codecs, `CometTableProvider`) folded into `libcomet` behind the + default-off `ballista` feature, with codec round-trip and standalone distributed tests. - 🔨 **R1 — driver-side offload (single-stage).** A Spark app runs a query with `spark.comet.exec.ballista.enabled=true`; the driver submits the whole Comet plan to Ballista and returns results, with zero Spark-executor tasks. First target query: TPC-H Q1. @@ -134,9 +138,11 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - The FFI boundary requires Comet and Ballista to be built against the same DataFusion **major** version. - Comet core links the JNI bridge, so `libjvm` must be present at runtime even where JNI is unused. -- The `comet-ballista` cdylib statically links a second copy of Comet core, so a Comet-on-executor query - and an in-process Ballista offload cannot currently coexist in the same JVM (the second core's `JAVA_VM` - is uninitialized). Unifying that core is a planned follow-up; until then, use a single mode per JVM. +- The offload is folded into the single `libcomet` cdylib behind the default-off `ballista` Cargo + feature (there is no separate `comet-ballista` cdylib / second copy of Comet core), so a + Comet-on-executor query and an in-process Ballista offload share one `JAVA_VM` and coexist in the + same JVM without the "JAVA_VM not initialized" panic. 
Building with the feature is required for + the offload entries (`make core-ballista`); the default build stays Ballista-free. - The single-stage `ORDER BY`/range exchange makes Q1's final sort a third stage — out of the current 2-block scope; sort on the driver, or wait for N-block generalization. diff --git a/native/Cargo.lock b/native/Cargo.lock index 1d5c931e9c..636c74bfec 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -2216,11 +2216,16 @@ dependencies = [ name = "datafusion-comet" version = "1.0.0" dependencies = [ + "anyhow", "arrow", "assertables", "async-trait", "aws-config", "aws-credential-types", + "ballista", + "ballista-core", + "ballista-executor", + "ballista-scheduler", "criterion", "datafusion", "datafusion-comet-common", @@ -2233,6 +2238,7 @@ dependencies = [ "datafusion-ffi", "datafusion-functions-nested", "datafusion-physical-expr-adapter", + "datafusion-proto", "datafusion-spark", "futures", "hdfs-sys", @@ -2269,27 +2275,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "datafusion-comet-ballista" -version = "1.0.0" -dependencies = [ - "anyhow", - "async-trait", - "ballista", - "ballista-core", - "ballista-executor", - "ballista-scheduler", - "datafusion", - "datafusion-comet", - "datafusion-comet-proto", - "datafusion-ffi", - "datafusion-proto", - "futures", - "jni 0.22.4", - "prost", - "tokio", -] - [[package]] name = "datafusion-comet-common" version = "1.0.0" diff --git a/native/Cargo.toml b/native/Cargo.toml index 377bb41d08..d3f0c29d0f 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -17,7 +17,7 @@ [workspace] default-members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle"] -members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "hdfs", "fs-hdfs", "ballista"] +members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "hdfs", "fs-hdfs"] resolver = "2" [workspace.package] diff --git a/native/ballista/Cargo.toml b/native/ballista/Cargo.toml deleted file mode 
100644 index c02d82f129..0000000000 --- a/native/ballista/Cargo.toml +++ /dev/null @@ -1,57 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "datafusion-comet-ballista" -version = { workspace = true } -homepage = "https://datafusion.apache.org/comet" -repository = "https://github.com/apache/datafusion-comet" -authors = ["Apache DataFusion "] -description = "Runs Apache DataFusion Comet native plans as leaves in Apache DataFusion Ballista" -readme = "README.md" -license = "Apache-2.0" -edition = "2021" - -# this crate does not contain public Rust APIs so we do not publish it -publish = false - -# `cdylib` builds libcomet_ballista.{dylib,so} carrying the JNI submission entry -# (loaded by the JVM alongside libcomet); `rlib` keeps the crate usable from -# Rust integration tests. 
-[lib] -crate-type = ["cdylib", "rlib"] - -[dependencies] -jni = "0.22.4" -datafusion-comet = { path = "../core" } -datafusion-comet-proto = { workspace = true } -datafusion = { workspace = true, features = ["parquet"] } -datafusion-ffi = "54.0.0" -datafusion-proto = "54.0.0" -async-trait = { workspace = true } -tokio = { version = "1", features = ["rt-multi-thread"] } -prost = "0.14.3" -futures = { workspace = true } - -ballista = { workspace = true } -ballista-core = { workspace = true } -ballista-scheduler = { workspace = true } -ballista-executor = { workspace = true } - -[dev-dependencies] -tokio = { version = "1", features = ["rt-multi-thread", "macros"] } -anyhow = "1" diff --git a/native/ballista/README.md b/native/ballista/README.md deleted file mode 100644 index f2fbd59db3..0000000000 --- a/native/ballista/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# datafusion-comet-ballista - -Lets Apache DataFusion Ballista execute Apache DataFusion Comet native plans -that are handed across a `datafusion-ffi` boundary, so a Ballista executor can -run Comet's native scans without linking Comet's Rust crates directly. - -- [`scan::CometScanExec`] — a serializable DataFusion leaf that carries a - Comet plan's proto bytes and rebuilds the FFI plan at `execute()` time. This - is what Ballista ships to executors and reconstructs there. -- [`codec::CometPhysicalCodec`] / [`codec::CometLogicalCodec`] — extension - codecs that (de)serialize Comet nodes as their proto bytes and delegate - everything else to Ballista's own codecs. -- [`table_provider::CometTableProvider`] — a `TableProvider` that produces a - `CometScanExec`, so a Comet scan can participate in a DataFusion logical - plan and be distributed by Ballista like any other table. 
diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index f9876037ae..ea1727e71e 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -54,6 +54,9 @@ itertools = "0.15.0" paste = "1.0.14" datafusion = { workspace = true, features = ["parquet_encryption", "sql"] } datafusion-ffi = "54.0.0" +# Only used by the optional `ballista` offload module (extension codecs + +# physical-plan (de)serialization); activated by the `ballista` feature. +datafusion-proto = { version = "54.0.0", optional = true } datafusion-physical-expr-adapter = { workspace = true } datafusion-datasource = { workspace = true } datafusion-spark = { workspace = true } @@ -79,6 +82,13 @@ reqsign-core = { workspace = true } serde_json = "1.0" uuid = "1.23.3" +# Ballista offload deps — optional, activated by the `ballista` feature so the +# default `libcomet` build stays Ballista-free (no ballista/tonic/second core). +ballista = { workspace = true, optional = true } +ballista-core = { workspace = true, optional = true } +ballista-scheduler = { workspace = true, optional = true } +ballista-executor = { workspace = true, optional = true } + [target.'cfg(target_os = "linux")'.dependencies] procfs = "0.18.0" @@ -93,6 +103,10 @@ lazy_static = "1.4" assertables = "10" hex = "0.4.3" datafusion-functions-nested = { version = "54.0.0" } +# `#[tokio::test]` macro + `anyhow` are used by the `ballista`-feature-gated +# integration tests under `tests/ballista_*.rs`. +tokio = { version = "1", features = ["rt-multi-thread", "macros"] } +anyhow = "1" [features] backtrace = ["datafusion/backtrace"] @@ -100,6 +114,17 @@ default = ["hdfs-opendal"] hdfs = ["datafusion-comet-objectstore-hdfs"] hdfs-opendal = ["opendal", "object_store_opendal", "hdfs-sys"] jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"] +# Folds the in-process Ballista offload (`execution::ballista`, incl. the +# `Java_org_apache_comet_ballista_NativeBallista_*` JNI entries) into `libcomet`. 
+# Default-OFF so the standard build pulls no ballista/tonic and links only one +# copy of Comet core. +ballista = [ + "dep:ballista", + "dep:ballista-core", + "dep:ballista-scheduler", + "dep:ballista-executor", + "dep:datafusion-proto", +] # exclude optional packages from cargo machete verifications [package.metadata.cargo-machete] diff --git a/native/ballista/src/codec.rs b/native/core/src/execution/ballista/codec.rs similarity index 97% rename from native/ballista/src/codec.rs rename to native/core/src/execution/ballista/codec.rs index dde2f18396..322c5a45f1 100644 --- a/native/ballista/src/codec.rs +++ b/native/core/src/execution/ballista/codec.rs @@ -29,9 +29,9 @@ use datafusion_proto::physical_plan::PhysicalExtensionCodec; use ballista_core::serde::{BallistaLogicalExtensionCodec, BallistaPhysicalExtensionCodec}; -use crate::fragment::CometFragmentExec; -use crate::scan::CometScanExec; -use crate::table_provider::CometTableProvider; +use super::fragment::CometFragmentExec; +use super::scan::CometScanExec; +use super::table_provider::CometTableProvider; /// Marks a payload as a Comet node so the codec can tell it apart from a /// Ballista/DataFusion node it should delegate. 
diff --git a/native/ballista/src/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs similarity index 96% rename from native/ballista/src/ffi_jni.rs rename to native/core/src/execution/ballista/ffi_jni.rs index e2de4b17db..67b15cc111 100644 --- a/native/ballista/src/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -117,8 +117,8 @@ pub fn build_test_proto() -> Result, String> { Ok(op.encode_to_vec()) } -use crate::scan::CometScanExec; -use crate::{CometFragmentExec, CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; +use super::scan::CometScanExec; +use super::{CometFragmentExec, CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; /// Run a Comet `Operator` proto on an in-process standalone Ballista engine and /// return the collected Arrow batches plus the result schema. @@ -223,17 +223,21 @@ fn build_two_stage_plan( num_group_keys: usize, num_partitions: usize, ) -> Result, String> { - let block1: Arc = - Arc::new(CometFragmentExec::try_new(block1_proto.to_vec(), vec![]).map_err(|e| { - format!("failed to build block1 (partial-agg) fragment: {e}") - })?); + let block1: Arc = Arc::new( + CometFragmentExec::try_new(block1_proto.to_vec(), vec![]) + .map_err(|e| format!("failed to build block1 (partial-agg) fragment: {e}"))?, + ); let schema1 = block1.schema(); if num_group_keys == 0 || num_group_keys > schema1.fields().len() { return Err(format!( "invalid num_group_keys {num_group_keys}: block1 output has {} columns ({:?})", schema1.fields().len(), - schema1.fields().iter().map(|f| f.name()).collect::>() + schema1 + .fields() + .iter() + .map(|f| f.name()) + .collect::>() )); } @@ -257,10 +261,10 @@ fn build_two_stage_plan( .map_err(|e| format!("failed to build hash RepartitionExec: {e}"))?, ); - let block2: Arc = - Arc::new(CometFragmentExec::try_new(block2_proto.to_vec(), vec![repart]).map_err( - |e| format!("failed to build block2 (final-agg) fragment: {e}"), - )?); + let block2: Arc = Arc::new( + 
CometFragmentExec::try_new(block2_proto.to_vec(), vec![repart]) + .map_err(|e| format!("failed to build block2 (final-agg) fragment: {e}"))?, + ); eprintln!( "[comet-ballista R2] block2 (final-agg) output schema = {:?}", @@ -446,7 +450,7 @@ pub unsafe fn submit_and_export( mod jni_entry { use super::{build_test_proto, submit_and_export, submit_and_export_distributed}; - use comet::errors::{try_unwrap_or_throw, CometError}; + use crate::errors::{try_unwrap_or_throw, CometError}; use jni::objects::{JByteArray, JClass, JLongArray, ReleaseMode}; use jni::sys::{jbyteArray, jint, jlong}; use jni::EnvUnowned; diff --git a/native/ballista/src/fragment.rs b/native/core/src/execution/ballista/fragment.rs similarity index 98% rename from native/ballista/src/fragment.rs rename to native/core/src/execution/ballista/fragment.rs index 8e16da320a..e624737ddd 100644 --- a/native/ballista/src/fragment.rs +++ b/native/core/src/execution/ballista/fragment.rs @@ -25,7 +25,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, }; -use comet::execution::fragment::{build_native_fragment, native_fragment_plan_properties}; +use crate::execution::fragment::{build_native_fragment, native_fragment_plan_properties}; /// A DataFusion node that runs a Comet plan fragment (carried as `Operator` /// proto bytes) whose input-leaf `Scan` operators are fed by this node's diff --git a/native/ballista/src/lib.rs b/native/core/src/execution/ballista/mod.rs similarity index 83% rename from native/ballista/src/lib.rs rename to native/core/src/execution/ballista/mod.rs index 5c8d1a4730..af2f69a8a5 100644 --- a/native/ballista/src/lib.rs +++ b/native/core/src/execution/ballista/mod.rs @@ -18,6 +18,11 @@ //! Runs Apache DataFusion Comet native plans as leaves inside Apache //! DataFusion Ballista. //! +//! This module is compiled into the single `libcomet` cdylib only when the +//! default-off `ballista` Cargo feature is enabled, so the offload entry lives +//! 
in the same library (and shares the same Comet core state, e.g. `JAVA_VM`) +//! as the rest of Comet — there is no separate `comet-ballista` cdylib. +//! //! - [`scan::CometScanExec`]: a serializable DataFusion leaf that carries the //! Comet proto bytes (the "recipe") and builds the FFI plan at execute() //! time. This is what Ballista ships to executors and reconstructs there. @@ -36,9 +41,7 @@ pub mod fragment; pub mod scan; pub mod table_provider; -pub use codec::{ - CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC, -}; +pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC}; pub use ffi_jni::{ build_test_proto, execute_comet_proto, execute_two_stage, submit_and_export, submit_and_export_distributed, diff --git a/native/ballista/src/scan.rs b/native/core/src/execution/ballista/scan.rs similarity index 98% rename from native/ballista/src/scan.rs rename to native/core/src/execution/ballista/scan.rs index fe417c9844..8c3db67530 100644 --- a/native/ballista/src/scan.rs +++ b/native/core/src/execution/ballista/scan.rs @@ -26,7 +26,7 @@ use datafusion::physical_plan::{ use datafusion_ffi::execution_plan::ForeignExecutionPlan; use tokio::runtime::Handle; -use comet::execution::ffi::comet_ffi_plan_from_proto; +use crate::execution::ffi::comet_ffi_plan_from_proto; /// A DataFusion leaf that carries a Comet plan protobuf and executes it via the /// `datafusion-ffi` boundary. 
Serializable through `CometPhysicalCodec` by its diff --git a/native/ballista/src/table_provider.rs b/native/core/src/execution/ballista/table_provider.rs similarity index 98% rename from native/ballista/src/table_provider.rs rename to native/core/src/execution/ballista/table_provider.rs index 6e70b6ba20..4009af3b02 100644 --- a/native/ballista/src/table_provider.rs +++ b/native/core/src/execution/ballista/table_provider.rs @@ -26,7 +26,7 @@ use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::ExecutionPlan; -use crate::scan::CometScanExec; +use super::scan::CometScanExec; /// A DataFusion `TableProvider` that produces a `CometScanExec`. Carries the /// Comet proto so the table can be reconstructed on the scheduler side via the diff --git a/native/core/src/execution/fragment.rs b/native/core/src/execution/fragment.rs index 9142320a0b..78d5951b7b 100644 --- a/native/core/src/execution/fragment.rs +++ b/native/core/src/execution/fragment.rs @@ -60,8 +60,8 @@ const NATIVE_FRAGMENT_EXEC_ID: i64 = 0; /// builds without consuming a JVM input; native inputs are injected afterwards /// via [`ScanExec::set_native_input`]. fn plan_from_proto(proto_bytes: &[u8]) -> Result<(Vec, Arc), String> { - let op = - Operator::decode(proto_bytes).map_err(|e| format!("failed to decode Operator proto: {e}"))?; + let op = Operator::decode(proto_bytes) + .map_err(|e| format!("failed to decode Operator proto: {e}"))?; // A fresh `SessionContext` means configuration comes only from the proto, // not from any ambient session (see `super::ffi`). diff --git a/native/core/src/execution/mod.rs b/native/core/src/execution/mod.rs index d67d74118c..6eb3076998 100644 --- a/native/core/src/execution/mod.rs +++ b/native/core/src/execution/mod.rs @@ -16,6 +16,8 @@ // under the License. //! PoC of vectorization execution through JNI to Rust. 
+#[cfg(feature = "ballista")] +pub mod ballista; pub mod columnar_to_row; pub mod expressions; pub mod ffi; diff --git a/native/core/src/execution/operators/scan.rs b/native/core/src/execution/operators/scan.rs index da239973ed..a002345a70 100644 --- a/native/core/src/execution/operators/scan.rs +++ b/native/core/src/execution/operators/scan.rs @@ -191,11 +191,7 @@ impl ScanExec { /// `set_input_batch`). Only the handle that `get_next_batch` is driven on needs /// this — the executable leaf shares this handle's `batch` slot (an `Arc`), so /// batches pulled here become visible to the plan node without touching it. - pub fn set_native_input( - &mut self, - exec_context_id: i64, - stream: SendableRecordBatchStream, - ) { + pub fn set_native_input(&mut self, exec_context_id: i64, stream: SendableRecordBatchStream) { self.exec_context_id = exec_context_id; self.input_source = Some(Arc::new(Mutex::new(NativeBatchStream::new(stream)))); } diff --git a/native/ballista/tests/codec_roundtrip.rs b/native/core/tests/ballista_codec_roundtrip.rs similarity index 98% rename from native/ballista/tests/codec_roundtrip.rs rename to native/core/tests/ballista_codec_roundtrip.rs index b00518bc75..8068c89854 100644 --- a/native/ballista/tests/codec_roundtrip.rs +++ b/native/core/tests/ballista_codec_roundtrip.rs @@ -7,6 +7,8 @@ // executor) and execute it. The Comet leaf travels as proto bytes and is rebuilt // on the far side by re-running Comet's planner over FFI. 
+#![cfg(feature = "ballista")] + use std::sync::Arc; use datafusion::arrow::array::{Int32Array, RecordBatch}; @@ -24,7 +26,7 @@ use datafusion_proto::protobuf::PhysicalPlanNode; use futures::StreamExt; use prost::Message; -use datafusion_comet_ballista::{CometPhysicalCodec, CometScanExec}; +use comet::execution::ballista::{CometPhysicalCodec, CometScanExec}; use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; use datafusion_comet_proto::spark_operator::{ operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition, diff --git a/native/ballista/tests/distributed.rs b/native/core/tests/ballista_distributed.rs similarity index 95% rename from native/ballista/tests/distributed.rs rename to native/core/tests/ballista_distributed.rs index 1523f0c9ad..cea8e56526 100644 --- a/native/ballista/tests/distributed.rs +++ b/native/core/tests/ballista_distributed.rs @@ -9,7 +9,9 @@ // // Starts an in-process Ballista scheduler + executors, so it is heavier and // slower than a unit test. 
Run explicitly: -// cargo test -p datafusion-comet-ballista --test distributed -- --ignored +// cargo test -p datafusion-comet --features ballista --test ballista_distributed -- --ignored + +#![cfg(feature = "ballista")] use std::sync::Arc; @@ -21,7 +23,7 @@ use datafusion::execution::SessionStateBuilder; use datafusion::parquet::arrow::ArrowWriter; use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_comet_ballista::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; +use comet::execution::ballista::{CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; use datafusion_comet_proto::spark_operator::{ operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition, diff --git a/native/ballista/tests/ffi_roundtrip.rs b/native/core/tests/ballista_ffi_roundtrip.rs similarity index 96% rename from native/ballista/tests/ffi_roundtrip.rs rename to native/core/tests/ballista_ffi_roundtrip.rs index 082aa814f2..c87e70536b 100644 --- a/native/ballista/tests/ffi_roundtrip.rs +++ b/native/core/tests/ballista_ffi_roundtrip.rs @@ -6,7 +6,9 @@ // the JVM: it allocates the FFI structs, calls `submit_and_export`, then // re-imports via `from_ffi` and asserts 5 rows come back. 
// -// cargo test -p datafusion-comet-ballista --test ffi_roundtrip -- --ignored --nocapture +// cargo test -p datafusion-comet --features ballista --test ballista_ffi_roundtrip -- --ignored --nocapture + +#![cfg(feature = "ballista")] use std::sync::Arc; @@ -15,7 +17,7 @@ use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; use datafusion::arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; use datafusion::parquet::arrow::ArrowWriter; -use datafusion_comet_ballista::submit_and_export; +use comet::execution::ballista::submit_and_export; use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; use datafusion_comet_proto::spark_operator::{ operator::OpStruct, NativeScan, NativeScanCommon, Operator, SparkFilePartition, diff --git a/native/ballista/tests/fragment_child_input.rs b/native/core/tests/ballista_fragment_child_input.rs similarity index 95% rename from native/ballista/tests/fragment_child_input.rs rename to native/core/tests/ballista_fragment_child_input.rs index 3d9eec4eda..6fede20023 100644 --- a/native/ballista/tests/fragment_child_input.rs +++ b/native/core/tests/ballista_fragment_child_input.rs @@ -3,6 +3,8 @@ // stood in for here by an in-memory child and a `CometScanExec` child), and // that such a fragment survives Ballista's physical-plan (de)serialization. 
+#![cfg(feature = "ballista")] + use std::sync::Arc; use datafusion::arrow::array::{Int32Array, RecordBatch}; @@ -22,7 +24,7 @@ use datafusion_proto::protobuf::PhysicalPlanNode; use futures::StreamExt; use prost::Message; -use datafusion_comet_ballista::{CometFragmentExec, CometPhysicalCodec, CometScanExec}; +use comet::execution::ballista::{CometFragmentExec, CometPhysicalCodec, CometScanExec}; use datafusion_comet_proto::spark_expression::{ data_type::DataTypeId, expr::ExprStruct, literal, BinaryExpr, BoundReference, DataType, Expr, Literal, @@ -147,7 +149,11 @@ fn build_filter_over_scan_proto() -> Vec { } fn int32_schema() -> SchemaRef { - Arc::new(Schema::new(vec![Field::new("a", ArrowDataType::Int32, true)])) + Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])) } /// A `CometFragmentExec` whose `Scan` leaf is fed by an in-memory DataFusion @@ -185,7 +191,11 @@ async fn fragment_scan_leaf_fed_by_child() -> anyhow::Result<()> { } // Child produced 1..=5; the fragment's Filter keeps col0 > 2. - assert_eq!(values, vec![3, 4, 5], "child rows must flow through and be filtered"); + assert_eq!( + values, + vec![3, 4, 5], + "child rows must flow through and be filtered" + ); Ok(()) } @@ -251,8 +261,9 @@ async fn fragment_codec_roundtrip() -> anyhow::Result<()> { // Child = CometScanExec over the parquet (round-trips via COMET_MAGIC); // parent fragment = Filter(col0 > 2) over a Scan input leaf. 
- let child: Arc = - Arc::new(CometScanExec::try_new(build_native_scan_proto(&parquet_path)?)?); + let child: Arc = Arc::new(CometScanExec::try_new(build_native_scan_proto( + &parquet_path, + )?)?); let fragment_proto = build_filter_over_scan_proto(); let plan: Arc = Arc::new(CometFragmentExec::try_new(fragment_proto, vec![child])?); diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 00c3dd22d8..46a1dbc696 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -19,14 +19,18 @@ package org.apache.comet.ballista -import java.io.File -import java.nio.file.{Files, Paths} - import org.apache.comet.NativeBase /** - * JNI binding to the native driver-side Ballista submission entry, implemented in the - * `datafusion-comet-ballista` crate (`libdatafusion_comet_ballista`). + * JNI binding to the native driver-side Ballista submission entry. + * + * The offload code (the `execution::ballista` module and its + * `Java_org_apache_comet_ballista_NativeBallista_*` JNI entries) is compiled into the single + * `libcomet` cdylib when Comet's native crate is built with the default-off `ballista` Cargo + * feature (`cd native && cargo build --features ballista`). There is no separate + * `libdatafusion_comet_ballista` library anymore: folding the offload into core means it shares + * Comet core's single `JAVA_VM` static, so a Comet-on-executor query and an in-process offload + * can coexist in one JVM without the "JAVA_VM not initialized" panic. * * EXPERIMENTAL (R1): used by [[org.apache.spark.sql.comet.CometExec.executeCollectViaBallista]] * to offload a single-stage Comet query to an in-process Ballista engine on the Spark driver. 
@@ -99,77 +103,63 @@ class NativeBallista { object NativeBallista { - @volatile private var loaded = false + @volatile private var probed = false + @volatile private var available = false @volatile private var loadError: Option[Throwable] = None /** - * Load `libdatafusion_comet_ballista`. - * - * Symbol ownership: the ballista cdylib statically links Comet core and therefore re-exports - * core's `Java_org_apache_comet_Native_*` JNI symbols in addition to its own distinct - * `Java_org_apache_comet_ballista_NativeBallista_*` entries. We force `libcomet` (via - * [[NativeBase]]) to load FIRST so the JVM binds every core native method to `libcomet`; - * loading the ballista library afterwards contributes only the distinct `NativeBallista_*` - * symbols. This keeps all Comet core state in a single library and avoids two divergent copies. - * - * The library is not on `java.library.path`, so we resolve it by absolute path: the - * `COMET_BALLISTA_LIB` env var first, then the debug/release build outputs relative to the - * module working directory. + * Ensure the single `libcomet` cdylib is loaded. [[NativeBase]] already loads it for every + * Comet native method; because the offload JNI entries are now compiled into `libcomet` (behind + * the `ballista` feature), loading libcomet also binds the `NativeBallista_*` entries. There is + * no separate library to `System.load`. */ - private def load(): Unit = synchronized { - if (loaded || loadError.isDefined) return - - // Load libcomet first so core JNI symbols bind to it, not to the ballista cdylib's re-exports. 
+ private def ensureCometLoaded(): Unit = synchronized { + if (loadError.isDefined) return try { NativeBase.isLoaded() } catch { - case t: Throwable => - loadError = Some(t) - return - } - - val libName = System.mapLibraryName("datafusion_comet_ballista") - val moduleDir = new File(System.getProperty("user.dir")) - val candidates = Seq( - sys.env.get("COMET_BALLISTA_LIB"), - Some(new File(moduleDir, s"../native/target/debug/$libName").getPath), - Some(new File(moduleDir, s"../native/target/release/$libName").getPath), - Some(new File(moduleDir, s"native/target/debug/$libName").getPath), - Some(new File(moduleDir, s"native/target/release/$libName").getPath)).flatten - candidates.find(p => Files.exists(Paths.get(p))) match { - case Some(path) => - try { - System.load(new File(path).getAbsolutePath) - loaded = true - } catch { - case t: Throwable => loadError = Some(t) - } - case None => - loadError = Some( - new UnsatisfiedLinkError( - s"could not find $libName in any of: ${candidates.mkString(", ")}")) + case t: Throwable => loadError = Some(t) } } - /** Load the library, throwing if it cannot be loaded. */ + /** Load libcomet, throwing if it cannot be loaded. */ def ensureLoaded(): Unit = { - load() + ensureCometLoaded() loadError.foreach { t => - throw new IllegalStateException( - s"failed to load native ballista library: ${t.getMessage}", - t) + throw new IllegalStateException(s"failed to load native comet library: ${t.getMessage}", t) } } - /** True if the native ballista library is available (loads it on first call). */ - def isAvailable: Boolean = { - load() - loaded + /** + * True if the offload native entries are present - i.e. `libcomet` loaded AND was built with + * the `ballista` Cargo feature. Detected once by resolving a `NativeBallista_*` JNI symbol; a + * feature-less `libcomet` has no such symbol and yields `false`, so the offload suites + * `assume`-skip instead of hard-failing with an `UnsatisfiedLinkError`. 
+ */ + def isAvailable: Boolean = synchronized { + if (probed) return available + probed = true + ensureCometLoaded() + if (loadError.isDefined) { + available = false + } else { + available = + try { + // Resolve a NativeBallista JNI entry; only a `--features ballista` libcomet has it. + new NativeBallista().buildTestProto() + true + } catch { + case t: Throwable => + loadError = Some(t) + false + } + } + available } - /** The load failure, if any (loads the library on first call). */ + /** The load/availability failure, if any (probes on first call). */ def loadFailure: Option[Throwable] = { - load() + if (!probed) isAvailable loadError } } diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala index 834230e385..d68b443e10 100644 --- a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala @@ -35,8 +35,9 @@ import org.apache.arrow.vector.IntVector * executors) and exports the result batch back to the JVM over the Arrow C Data Interface. The * JVM imports the result and asserts 5 rows come back. * - * The native entry points live in `libdatafusion_comet_ballista` (the `datafusion-comet-ballista` - * crate built as a cdylib), loaded here alongside — but independently of — `libcomet`. + * The native entry points live in the single `libcomet` cdylib, compiled in when Comet's native + * crate is built with the default-off `ballista` Cargo feature (`cd native && cargo build + * --features ballista`, or `make core-ballista`). There is no separate offload library. 
*/ class CometBallistaFfiSpikeSuite extends AnyFunSuite { diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala index 32716cc6e0..1b30f42889 100644 --- a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala @@ -64,16 +64,15 @@ import org.apache.comet.CometConf * The test asserts the plan really is single-block before offloading, and compares full result * rows flag-on vs flag-off using the exact decimal types Spark produces. * - * Ordering caveat (dual-library global state): both tests in this suite run in one JVM, and the - * in-process Ballista offload statically links a SECOND copy of Comet core into - * `libdatafusion_comet_ballista`. Its `JAVA_VM` `OnceCell` is distinct from `libcomet`'s, and - * once an offload has run, Comet-on-Spark-executor native execution (`Native.executePlan` on a - * `tokio-rt-worker`) can resolve `with_env` to that second, uninitialized copy and panic with - * `JAVA_VM not initialized`. Reference oracles here therefore run with Comet fully DISABLED (pure - * Spark), which never touches Comet native code and is immune to this interaction. The offload - * itself is unaffected (it initializes/uses the JVM through its own path). This is an - * infrastructure limitation of the in-process offload spike, not a correctness issue in the - * distributed aggregate. + * Coexistence (single-core global state): both tests in this suite run in one JVM. Since the + * offload was folded into `libcomet` behind the `ballista` Cargo feature, there is now exactly + * ONE copy of Comet core — and one `JAVA_VM` static — shared by both the Comet-on-executor path + * (`Native.executePlan` on a `tokio-rt-worker`) and the in-process offload. 
The old dual-library + * hazard (a second, uninitialized `JAVA_VM` in `libdatafusion_comet_ballista` causing a `JAVA_VM + * not initialized` panic once an offload had run) is therefore gone. The second test's reference + * oracle deliberately runs with Comet ENABLED (the executor native path) AFTER the first test's + * offload has already run in this JVM: it is the coexistence acceptance check — a + * Comet-on-executor native query and an in-process offload sharing one JVM without panicking. */ class CometBallistaQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper { @@ -385,18 +384,19 @@ class CometBallistaQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper { s"expected exactly two serialized CometNativeExec blocks, found ${nativeBlocks.size}:\n" + s"$executed") - // Baseline oracle: Spark's OWN Q1 answer, with Comet fully disabled. This is the truest - // reference for "does the distributed offload match Spark's own Q1" (the brief's goal), and - // it also runs through the same listener apparatus as a positive control proving the - // listener observes executor task starts, so the `== 0` assertion for the offloaded run is - // meaningful. Comet is disabled here deliberately: a Comet-native baseline uses the native - // execution engine (tokio) which, once an in-process Ballista offload has run in this JVM, - // panics with `JAVA_VM not initialized` (dual-library global-state issue — see the class - // doc). Spark-only execution is immune and is the correct oracle regardless. + // Baseline oracle: Q1's answer computed via the Comet-on-executor native path + // (COMET_ENABLED=true, offload off). This is ALSO the coexistence acceptance check: the + // first test in this suite already ran an in-process Ballista offload in this same JVM, so + // this Comet-native collect drives `Native.executePlan` (tokio `with_env`) AFTER an offload + // — the exact scenario that used to panic with `JAVA_VM not initialized` under the old + // dual-library layout. 
With the offload folded into the single `libcomet` (one shared + // `JAVA_VM`), it must now run cleanly and still match Spark's Q1. It also runs through the + // same listener apparatus as a positive control proving the listener observes executor task + // starts, so the `== 0` assertion for the offloaded run is meaningful. var baseline: Seq[Seq[Any]] = null val baselineTaskStarts = countTaskStarts { baseline = withSQLConf( - CometConf.COMET_ENABLED.key -> "false", + CometConf.COMET_ENABLED.key -> "true", CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq } From 71835fc5a0580e8fa80a615f6fe59c3bd67e3e3a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 19:20:02 -0600 Subject: [PATCH 21/42] feat(ballista): run the Comet offload on a real external Ballista cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Comet-flavored scheduler/executor binaries and a remote submission path so a distributed Comet plan can run on a genuinely external Ballista cluster (separate scheduler + executor processes), not just the in-process standalone engine. - Add `comet-ballista-scheduler` / `comet-ballista-executor` binaries (`required-features = ["ballista"]`) that construct the Ballista config in Rust and inject Comet's `CometLogicalCodec` / `CometPhysicalCodec` via the config `override_*_codec` fields — which the stock Ballista CLIs hardcode to None — so shipped Comet plan nodes decode on the scheduler and executor. - Generalize `execute_two_stage` with a `scheduler_url`: empty keeps the in-process standalone path; non-empty submits the built 2-stage plan to that external scheduler. Thread the URL through `submit_and_export_distributed` and the `executeQueryDistributed` JNI entry. 
- Wire the JVM side: `NativeBallista.executeQueryDistributed` gains a `schedulerUrl`; new `spark.comet.exec.ballista.scheduler.url` config; read and passed through in `operators.scala`. - Add an ignored integration test that spawns the two binaries as child processes and submits `CometFragment(NativeScan) -> hash-shuffle -> CometFragment(Filter)` to the external scheduler, asserting correct results. Verifies the Comet fragments execute in the separate, JVM-less executor process (only libjvm present, JAVA_VM uninitialized) with no "JAVA_VM not initialized" panic. Relates to #4796. --- native/core/Cargo.toml | 15 + .../core/src/bin/comet-ballista-executor.rs | 100 ++++++ .../core/src/bin/comet-ballista-scheduler.rs | 74 ++++ native/core/src/execution/ballista/ffi_jni.rs | 45 ++- .../core/tests/ballista_external_cluster.rs | 317 ++++++++++++++++++ .../scala/org/apache/comet/CometConf.scala | 12 + .../comet/ballista/NativeBallista.scala | 4 + .../apache/spark/sql/comet/operators.scala | 4 + 8 files changed, 564 insertions(+), 7 deletions(-) create mode 100644 native/core/src/bin/comet-ballista-executor.rs create mode 100644 native/core/src/bin/comet-ballista-scheduler.rs create mode 100644 native/core/tests/ballista_external_cluster.rs diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index ea1727e71e..2ff08cd163 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -135,6 +135,21 @@ name = "comet" # "rlib" is for benchmarking with criterion. crate-type = ["cdylib", "rlib"] +# Comet-flavored Ballista scheduler/executor binaries. They register Comet's +# extension codecs (which the stock Ballista CLIs hardcode to None) so a +# distributed Comet plan survives (de)serialization across a real external +# cluster. Only built with `--features ballista`, so the default build stays +# Ballista-free. 
+[[bin]] +name = "comet-ballista-scheduler" +path = "src/bin/comet-ballista-scheduler.rs" +required-features = ["ballista"] + +[[bin]] +name = "comet-ballista-executor" +path = "src/bin/comet-ballista-executor.rs" +required-features = ["ballista"] + [[bench]] name = "array_element_append" harness = false diff --git a/native/core/src/bin/comet-ballista-executor.rs b/native/core/src/bin/comet-ballista-executor.rs new file mode 100644 index 0000000000..db386e0afd --- /dev/null +++ b/native/core/src/bin/comet-ballista-executor.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A Comet-flavored Ballista **executor** process. +//! +//! Identical to the stock `ballista-executor` binary except that it registers +//! Comet's extension codecs ([`CometLogicalCodec`] / [`CometPhysicalCodec`]) on +//! the [`ExecutorProcessConfig`], so a `CometFragmentExec` shipped from the +//! scheduler is reconstructed here (via Comet's planner over the proto) and run. +//! +//! This is the process that actually **executes** Comet fragments. It links +//! `libcomet` (as an rlib) but is a plain Rust process with **no running JVM** — +//! only `libjvm` is on the loader path. Comet fragments whose leaf is a +//! 
self-contained `NativeScan` read Parquet directly and never touch `JAVA_VM`, +//! so they run here JVM-less; this binary is the first place that is proven in a +//! *separate* process rather than an in-process test. +//! +//! Only built with `--features ballista` (see `required-features` in +//! `core/Cargo.toml`). +//! +//! Configuration (all optional, env-driven so a harness can place it): +//! - `COMET_BALLISTA_EXECUTOR_BIND_HOST` (default `127.0.0.1`) +//! - `COMET_BALLISTA_EXECUTOR_PORT` (flight port, default `50051`) +//! - `COMET_BALLISTA_EXECUTOR_GRPC_PORT` (default `50052`) +//! - `COMET_BALLISTA_SCHEDULER_HOST` (default `localhost`) +//! - `COMET_BALLISTA_SCHEDULER_PORT` (default `50050`) +//! - `COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS` (default: available parallelism) + +use std::sync::Arc; + +use ballista_executor::executor_process::{start_executor_process, ExecutorProcessConfig}; + +use comet::execution::ballista::{CometLogicalCodec, CometPhysicalCodec}; + +fn env_u16(key: &str, default: u16) -> u16 { + std::env::var(key) + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(default) +} + +fn main() -> Result<(), Box> { + let bind_host = std::env::var("COMET_BALLISTA_EXECUTOR_BIND_HOST") + .unwrap_or_else(|_| "127.0.0.1".to_string()); + let port = env_u16("COMET_BALLISTA_EXECUTOR_PORT", 50051); + let grpc_port = env_u16("COMET_BALLISTA_EXECUTOR_GRPC_PORT", 50052); + let scheduler_host = + std::env::var("COMET_BALLISTA_SCHEDULER_HOST").unwrap_or_else(|_| "localhost".to_string()); + let scheduler_port = env_u16("COMET_BALLISTA_SCHEDULER_PORT", 50050); + let concurrent_tasks = std::env::var("COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or_else(|| { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) + }); + + // Manual runtime so the default (feature-less) build needs no tokio `macros`. 
+ let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + let config = ExecutorProcessConfig { + bind_host, + port, + grpc_port, + scheduler_host, + scheduler_port, + concurrent_tasks, + // The seam: Comet codecs so the executor can rebuild Comet fragments. + override_logical_codec: Some(Arc::new(CometLogicalCodec::default())), + override_physical_codec: Some(Arc::new(CometPhysicalCodec::default())), + ..Default::default() + }; + + eprintln!( + "[comet-ballista-executor] flight :{port} grpc :{grpc_port} -> scheduler {}:{}", + config.scheduler_host, config.scheduler_port + ); + + start_executor_process(Arc::new(config)).await?; + Ok::<(), Box>(()) + }) +} diff --git a/native/core/src/bin/comet-ballista-scheduler.rs b/native/core/src/bin/comet-ballista-scheduler.rs new file mode 100644 index 0000000000..a2e3daf8d1 --- /dev/null +++ b/native/core/src/bin/comet-ballista-scheduler.rs @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A Comet-flavored Ballista **scheduler** process. +//! +//! Identical to the stock `ballista-scheduler` binary except that it registers +//! 
Comet's extension codecs ([`CometLogicalCodec`] / [`CometPhysicalCodec`]) on +//! the [`SchedulerConfig`], so a submitted plan containing Comet nodes +//! (`CometFragmentExec` / `CometScanExec`) survives (de)serialization on the +//! scheduler. The stock CLI hardcodes those overrides to `None`, which is why +//! this bespoke binary exists. +//! +//! Only built with `--features ballista` (see `required-features` in +//! `core/Cargo.toml`). Runs a real, externally reachable gRPC scheduler that a +//! separate `comet-ballista-executor` process connects to. +//! +//! Configuration (all optional, env-driven so a harness can place it): +//! - `COMET_BALLISTA_SCHEDULER_BIND_HOST` (default `127.0.0.1`) +//! - `COMET_BALLISTA_SCHEDULER_BIND_PORT` (default `50050`) + +use std::net::SocketAddr; +use std::sync::Arc; + +use ballista_scheduler::cluster::BallistaCluster; +use ballista_scheduler::config::SchedulerConfig; +use ballista_scheduler::scheduler_process::start_server; + +use comet::execution::ballista::{CometLogicalCodec, CometPhysicalCodec}; + +fn main() -> Result<(), Box> { + let bind_host = std::env::var("COMET_BALLISTA_SCHEDULER_BIND_HOST") + .unwrap_or_else(|_| "127.0.0.1".to_string()); + let bind_port: u16 = std::env::var("COMET_BALLISTA_SCHEDULER_BIND_PORT") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(50050); + + // Manual runtime so the default (feature-less) build needs no tokio `macros`. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + let config = SchedulerConfig { + bind_host: bind_host.clone(), + bind_port, + // The seam: Comet codecs so the scheduler can decode Comet plan nodes. 
+ override_logical_codec: Some(Arc::new(CometLogicalCodec::default())), + override_physical_codec: Some(Arc::new(CometPhysicalCodec::default())), + ..Default::default() + }; + + let addr: SocketAddr = format!("{bind_host}:{bind_port}").parse()?; + eprintln!("[comet-ballista-scheduler] starting on {addr}"); + + let cluster = BallistaCluster::new_from_config(&config).await?; + start_server(cluster, addr, Arc::new(config)).await?; + Ok::<(), Box>(()) + }) +} diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index 67b15cc111..519858d32d 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -305,12 +305,22 @@ async fn start_standalone_from_state(state: &SessionState) -> Result Result<(SchemaRef, Vec), String> { let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -341,7 +352,16 @@ pub fn execute_two_stage( let plan = build_two_stage_plan(block1_proto, block2_proto, num_group_keys, n)?; let schema = plan.schema(); - let scheduler_url = start_standalone_from_state(&state).await?; + // Empty URL => in-process standalone; non-empty => external cluster. For + // the external path the scheduler creates the session from the submitted + // settings + its own (Comet) codecs, so we do not start a local cluster. + let scheduler_url = if scheduler_url.is_empty() { + eprintln!("[comet-ballista R2] submitting to in-process standalone cluster"); + start_standalone_from_state(&state).await? 
+ } else { + eprintln!("[comet-ballista R2] submitting to external cluster at {scheduler_url}"); + scheduler_url.to_string() + }; let session_config = state.config().clone(); let codec = CometPhysicalCodec::default(); @@ -377,11 +397,17 @@ pub unsafe fn submit_and_export_distributed( block2_proto: &[u8], num_group_keys: usize, num_partitions: usize, + scheduler_url: &str, array_addrs: &[i64], schema_addrs: &[i64], ) -> Result { - let (schema, batches) = - execute_two_stage(block1_proto, block2_proto, num_group_keys, num_partitions)?; + let (schema, batches) = execute_two_stage( + block1_proto, + block2_proto, + num_group_keys, + num_partitions, + scheduler_url, + )?; // The final stage's partitions are concatenated into one batch so the JVM // imports exactly one set of column structs (same contract as R1). let batch = concat_batches(&schema, &batches) @@ -451,7 +477,7 @@ pub unsafe fn submit_and_export( mod jni_entry { use super::{build_test_proto, submit_and_export, submit_and_export_distributed}; use crate::errors::{try_unwrap_or_throw, CometError}; - use jni::objects::{JByteArray, JClass, JLongArray, ReleaseMode}; + use jni::objects::{JByteArray, JClass, JLongArray, JString, ReleaseMode}; use jni::sys::{jbyteArray, jint, jlong}; use jni::EnvUnowned; @@ -524,12 +550,16 @@ mod jni_entry { block2: JByteArray, num_group_keys: jint, num_partitions: jint, + scheduler_url: JString, array_addrs: JLongArray, schema_addrs: JLongArray, ) -> jlong { try_unwrap_or_throw(&e, |env| { let block1_bytes = env.convert_byte_array(block1)?; let block2_bytes = env.convert_byte_array(block2)?; + // Empty => in-process standalone (as before); non-empty (e.g. + // "http://host:50050") => submit to that external scheduler. + let scheduler_url: String = scheduler_url.try_to_string(env)?; let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? 
}; @@ -540,6 +570,7 @@ mod jni_entry { &block2_bytes, num_group_keys as usize, num_partitions as usize, + &scheduler_url, &arrays, &schemas, ) diff --git a/native/core/tests/ballista_external_cluster.rs b/native/core/tests/ballista_external_cluster.rs new file mode 100644 index 0000000000..1c8ec235c0 --- /dev/null +++ b/native/core/tests/ballista_external_cluster.rs @@ -0,0 +1,317 @@ +// Distributes a Comet plan across a REAL external Ballista cluster: a separate +// `comet-ballista-scheduler` process and a separate `comet-ballista-executor` +// process, each spawned as a child of this test. This is unlike +// `ballista_distributed.rs`, which runs an *in-process* standalone cluster +// (scheduler + executor threads inside the test). +// +// The point it proves: +// 1. The two Comet-flavored binaries (which inject Comet's extension codecs, +// unlike the stock Ballista CLIs) start, and the executor registers. +// 2. A two-stage Comet plan (`CometFragment(NativeScan) -> hash-shuffle -> +// CometFragment(Filter over a Scan)`) submitted to the external scheduler +// is split into two stages, shipped to the *separate* executor process, +// reconstructed there via the codecs, and executed — returning correct +// results across the process boundary. +// 3. Crucially, the executor is a plain Rust process with NO running JVM (only +// `libjvm` on the loader path). The Comet fragments must execute there +// without a "JAVA_VM not initialized" panic. A childless `NativeScan` +// fragment reads Parquet directly and never touches `JAVA_VM`, so this +// should hold — this test is the first proof of it in a *separate* process. +// +// Spawns child processes and binds ports, so it is `#[ignore]`. 
Run explicitly: +// export DYLD_LIBRARY_PATH="$JAVA_HOME/lib/server:$DYLD_LIBRARY_PATH" +// cargo test -p datafusion-comet --features ballista \ +// --test ballista_external_cluster -- --ignored --nocapture + +#![cfg(feature = "ballista")] + +use std::net::{SocketAddr, TcpStream}; +use std::path::PathBuf; +use std::process::{Child, Command}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use datafusion::arrow::array::{Int32Array, RecordBatch}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, SchemaRef}; +use datafusion::parquet::arrow::ArrowWriter; +use prost::Message; + +use comet::execution::ballista::execute_two_stage; +use datafusion_comet_proto::spark_expression::{ + data_type::DataTypeId, expr::ExprStruct, literal, BinaryExpr, BoundReference, DataType, Expr, + Literal, +}; +use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, Filter, NativeScan, NativeScanCommon, Operator, Scan, SparkFilePartition, + SparkPartitionedFile, SparkStructField, +}; + +// Non-default ports so this test does not collide with a real cluster on the +// usual 50050/50051/50052. +const SCHEDULER_PORT: u16 = 51050; +const EXECUTOR_FLIGHT_PORT: u16 = 51051; +const EXECUTOR_GRPC_PORT: u16 = 51052; + +fn int32_type() -> DataType { + DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + } +} + +fn int32_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])) +} + +/// Write a tiny Parquet file with a single int32 column `a` = [1..=5]. 
+fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> { + let schema = int32_schema(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let file = std::fs::File::create(path)?; + let mut writer = ArrowWriter::try_new(file, schema, None)?; + writer.write(&batch)?; + writer.close()?; + Ok(()) +} + +/// block1: a childless `NativeScan` fragment over `parquet_path` (int32 `a`). +/// This is the JVM-less leaf — it reads Parquet directly, no `JAVA_VM`. +fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result> { + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32_type()), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-external-cluster-native-scan".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path)?.len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + Ok(Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + } + .encode_to_vec()) +} + +/// block2: `Filter(col0 > 2)` over a `Scan` (#100) input leaf, fed by the shuffle +/// reader. Keeps a > 2, i.e. rows {3, 4, 5}. 
+fn build_filter_over_scan_proto() -> Vec { + let scan = Scan { + fields: vec![int32_type()], + source: "comet-external-cluster-shuffle-scan".to_string(), + }; + let scan_op = Operator { + children: vec![], + plan_id: 1, + op_struct: Some(OpStruct::Scan(scan)), + }; + + let col0 = Expr { + expr_struct: Some(ExprStruct::Bound(BoundReference { + index: 0, + datatype: Some(int32_type()), + })), + ..Default::default() + }; + let lit2 = Expr { + expr_struct: Some(ExprStruct::Literal(Literal { + value: Some(literal::Value::IntVal(2)), + datatype: Some(int32_type()), + is_null: false, + })), + ..Default::default() + }; + let predicate = Expr { + expr_struct: Some(ExprStruct::Gt(Box::new(BinaryExpr { + left: Some(Box::new(col0)), + right: Some(Box::new(lit2)), + }))), + ..Default::default() + }; + Operator { + children: vec![scan_op], + plan_id: 2, + op_struct: Some(OpStruct::Filter(Filter { + predicate: Some(predicate), + })), + } + .encode_to_vec() +} + +/// Kills the spawned child processes on drop, so a panicking assertion still +/// tears the external cluster down. +struct ClusterGuard { + children: Vec<(&'static str, Child)>, +} + +impl Drop for ClusterGuard { + fn drop(&mut self) { + for (name, child) in self.children.iter_mut() { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("[harness] stopped {name}"); + } + } +} + +/// The `libjvm` directory (`$JAVA_HOME/lib/server`) so the spawned binaries can +/// load `libjvm` (present, not a running JVM). Inherited env usually already has +/// it, but we set it explicitly to be robust to macOS DYLD stripping. 
+fn dyld_path() -> Option { + let java_home = std::env::var("JAVA_HOME").ok()?; + let lib = PathBuf::from(&java_home).join("lib").join("server"); + let existing = std::env::var("DYLD_LIBRARY_PATH").unwrap_or_default(); + Some(if existing.is_empty() { + lib.display().to_string() + } else { + format!("{}:{}", lib.display(), existing) + }) +} + +/// Poll a TCP port until it accepts a connection (the process is listening) or +/// the deadline passes. +fn wait_for_port(port: u16, what: &str, timeout: Duration) -> anyhow::Result<()> { + let addr: SocketAddr = format!("127.0.0.1:{port}").parse()?; + let deadline = Instant::now() + timeout; + while Instant::now() < deadline { + if TcpStream::connect_timeout(&addr, Duration::from_millis(200)).is_ok() { + eprintln!("[harness] {what} is listening on {port}"); + return Ok(()); + } + std::thread::sleep(Duration::from_millis(150)); + } + anyhow::bail!("timed out waiting for {what} on port {port}") +} + +#[ignore = "spawns external scheduler + executor processes and binds ports; run explicitly"] +#[test] +fn comet_plan_on_external_cluster() -> anyhow::Result<()> { + let scheduler_bin = env!("CARGO_BIN_EXE_comet-ballista-scheduler"); + let executor_bin = env!("CARGO_BIN_EXE_comet-ballista-executor"); + let dyld = dyld_path(); + + // --- 1. Spawn the external scheduler process --- + let mut scheduler_cmd = Command::new(scheduler_bin); + scheduler_cmd + .env("COMET_BALLISTA_SCHEDULER_BIND_HOST", "127.0.0.1") + .env( + "COMET_BALLISTA_SCHEDULER_BIND_PORT", + SCHEDULER_PORT.to_string(), + ); + if let Some(ref d) = dyld { + scheduler_cmd.env("DYLD_LIBRARY_PATH", d); + } + let scheduler = scheduler_cmd.spawn()?; + let mut guard = ClusterGuard { + children: vec![("comet-ballista-scheduler", scheduler)], + }; + wait_for_port(SCHEDULER_PORT, "scheduler", Duration::from_secs(30))?; + + // --- 2. 
Spawn the external executor process (separate, JVM-less) --- + let mut executor_cmd = Command::new(executor_bin); + executor_cmd + .env("COMET_BALLISTA_EXECUTOR_BIND_HOST", "127.0.0.1") + .env( + "COMET_BALLISTA_EXECUTOR_PORT", + EXECUTOR_FLIGHT_PORT.to_string(), + ) + .env( + "COMET_BALLISTA_EXECUTOR_GRPC_PORT", + EXECUTOR_GRPC_PORT.to_string(), + ) + .env("COMET_BALLISTA_SCHEDULER_HOST", "127.0.0.1") + .env("COMET_BALLISTA_SCHEDULER_PORT", SCHEDULER_PORT.to_string()) + .env("COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS", "4"); + if let Some(ref d) = dyld { + executor_cmd.env("DYLD_LIBRARY_PATH", d); + } + let executor = executor_cmd.spawn()?; + guard.children.push(("comet-ballista-executor", executor)); + wait_for_port( + EXECUTOR_FLIGHT_PORT, + "executor flight", + Duration::from_secs(30), + )?; + wait_for_port(EXECUTOR_GRPC_PORT, "executor grpc", Duration::from_secs(30))?; + // Grace for the executor to complete registration with the scheduler. + std::thread::sleep(Duration::from_secs(3)); + + // --- 3. Build the two-stage Comet plan protos --- + let parquet = std::env::temp_dir().join("comet_external_cluster.parquet"); + write_test_parquet(&parquet)?; + let block1 = build_native_scan_proto(&parquet)?; // NativeScan a=[1..5] + let block2 = build_filter_over_scan_proto(); // Filter(a > 2) + + // --- 4. Submit to the EXTERNAL scheduler (non-empty URL => remote path) --- + let scheduler_url = format!("http://127.0.0.1:{SCHEDULER_PORT}"); + eprintln!("[harness] submitting two-stage Comet plan to {scheduler_url}"); + let (schema, batches) = execute_two_stage( + &block1, + &block2, + /* num_group_keys */ 1, + /* num_partitions */ 4, + &scheduler_url, + ) + .map_err(|e| anyhow::anyhow!("external submission failed: {e}"))?; + + // --- 5. 
Verify correctness across the process boundary --- + let mut values: Vec = Vec::new(); + for batch in &batches { + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("int32 column"); + values.extend(col.values().iter().copied()); + } + values.sort_unstable(); + eprintln!( + "[harness] external cluster returned {} rows: {:?} (schema: {:?})", + values.len(), + values, + schema.fields().iter().map(|f| f.name()).collect::>() + ); + + assert_eq!( + values, + vec![3, 4, 5], + "distributed Comet plan on the external cluster must return {{3,4,5}} (a > 2)" + ); + + eprintln!( + "PASS: a distributed Comet plan ran on a SEPARATE scheduler+executor process pair \ + (JVM-less executor) and returned correct results" + ); + + // guard drops here, tearing down both child processes. + drop(guard); + let _ = schema; + Ok(()) +} diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index d9c511d3df..2406fbc861 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -294,6 +294,18 @@ object CometConf extends ShimCometConf { .booleanConf .createWithDefault(false) + val COMET_EXEC_BALLISTA_SCHEDULER_URL: ConfigEntry[String] = + conf(s"$COMET_EXEC_CONFIG_PREFIX.ballista.scheduler.url") + .category(CATEGORY_EXEC) + .doc("EXPERIMENTAL: When the Comet Ballista offload is enabled, the URL of an external " + + "Ballista scheduler (e.g. `http://host:50050`) to submit the distributed plan to. When " + + "empty (the default), the plan is submitted to an in-process standalone Ballista cluster " + + "on the driver instead. 
The external scheduler and its executors must be the " + + "Comet-flavored `comet-ballista-scheduler` / `comet-ballista-executor` binaries so the " + + "shipped Comet plan nodes can be decoded there.") + .stringConf + .createWithDefault("") + val COMET_NATIVE_COLUMNAR_TO_ROW_ENABLED: ConfigEntry[Boolean] = conf(s"$COMET_EXEC_CONFIG_PREFIX.columnarToRow.native.enabled") .category(CATEGORY_EXEC) diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 46a1dbc696..916dd716f5 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -85,6 +85,9 @@ class NativeBallista { * number of grouping columns (the leading columns of block1's output to hash on) * @param numPartitions * number of shuffle partitions + * @param schedulerUrl + * the external Ballista scheduler URL (e.g. `http://host:50050`) to submit the plan to; an + * empty string submits to an in-process standalone Ballista cluster instead * @param arrayAddrs * memory addresses of one `ArrowArray` struct per output column of `block2` * @param schemaAddrs @@ -97,6 +100,7 @@ class NativeBallista { block2: Array[Byte], numGroupKeys: Int, numPartitions: Int, + schedulerUrl: String, arrayAddrs: Array[Long], schemaAddrs: Array[Long]): Long } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 6109bc2d48..e99bc55cef 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -519,6 +519,9 @@ object CometExec { s"Comet Ballista two-stage (R2) offload: the final-aggregate block carries no " + s"serialized plan:\n$root")) + // Empty => in-process standalone Ballista; non-empty => submit to that external scheduler. 
+ val schedulerUrl = CometConf.COMET_EXEC_BALLISTA_SCHEDULER_URL.get() + val numCols = block2.output.length val nativeUtil = new NativeUtil() try { @@ -532,6 +535,7 @@ object CometExec { block2Bytes, numGroupKeys, numPartitions, + schedulerUrl, arrayAddrs, schemaAddrs)) match { case Some(batch) => From 29efd0743ebe85fc019c0364045e809baf679159 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 19:21:05 -0600 Subject: [PATCH 22/42] docs(contributor-guide): mark R1b external-cluster done Relates to #4796 --- docs/source/contributor-guide/ballista_execution.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index 6b8e82cb6c..aa2692029c 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -113,7 +113,10 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - ✅ R1-T1 — JVM → native → in-process Ballista → JVM Arrow round-trip (spike). - ✅ R1-T2 — config flag + driver `executeCollect` override. - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2. - - ⬜ R1-T4 (R1b) — submit to an external Ballista scheduler + executor cluster. + - ✅ R1-T4 (R1b) — **external cluster:** a distributed Comet plan submitted to a separate + `comet-ballista-scheduler` process runs on a separate, **JVM-less** `comet-ballista-executor` + process and returns correct results (verified for `NativeScan`-leaf fragments; no Ballista change + needed — the config `override_*_codec` fields already exist). Config: `spark.comet.exec.ballista.scheduler.url`. 
- 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results — **full TPC-H Q1's aggregate now runs distributed on Ballista and matches Spark.** - ✅ R2-T1 (Ballista) — accept a pre-built physical plan for distribution (a `physical_plan` submission variant; its own Ballista branch/PR). - ✅ R2-T2 (Comet native) — feed a `ScanExec` leaf from a native `RecordBatchStream` (not only a JVM input). From 094fae9d7cacf0775b1dedffbfb508fc90d601dd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 19:48:49 -0600 Subject: [PATCH 23/42] refactor(ballista): rename binaries to comet-scheduler / comet-executor Relates to #4796 --- docs/source/contributor-guide/ballista_execution.md | 2 +- native/core/Cargo.toml | 8 ++++---- .../{comet-ballista-executor.rs => comet-executor.rs} | 2 +- ...{comet-ballista-scheduler.rs => comet-scheduler.rs} | 4 ++-- native/core/src/execution/ballista/ffi_jni.rs | 2 +- native/core/tests/ballista_external_cluster.rs | 10 +++++----- spark/src/main/scala/org/apache/comet/CometConf.scala | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) rename native/core/src/bin/{comet-ballista-executor.rs => comet-executor.rs} (97%) rename native/core/src/bin/{comet-ballista-scheduler.rs => comet-scheduler.rs} (96%) diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index aa2692029c..3e1b990346 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -114,7 +114,7 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - ✅ R1-T2 — config flag + driver `executeCollect` override. - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2. 
- ✅ R1-T4 (R1b) — **external cluster:** a distributed Comet plan submitted to a separate - `comet-ballista-scheduler` process runs on a separate, **JVM-less** `comet-ballista-executor` + `comet-scheduler` process runs on a separate, **JVM-less** `comet-executor` process and returns correct results (verified for `NativeScan`-leaf fragments; no Ballista change needed — the config `override_*_codec` fields already exist). Config: `spark.comet.exec.ballista.scheduler.url`. - 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results — **full TPC-H Q1's aggregate now runs distributed on Ballista and matches Spark.** diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 2ff08cd163..dae4304808 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -141,13 +141,13 @@ crate-type = ["cdylib", "rlib"] # cluster. Only built with `--features ballista`, so the default build stays # Ballista-free. 
[[bin]] -name = "comet-ballista-scheduler" -path = "src/bin/comet-ballista-scheduler.rs" +name = "comet-scheduler" +path = "src/bin/comet-scheduler.rs" required-features = ["ballista"] [[bin]] -name = "comet-ballista-executor" -path = "src/bin/comet-ballista-executor.rs" +name = "comet-executor" +path = "src/bin/comet-executor.rs" required-features = ["ballista"] [[bench]] diff --git a/native/core/src/bin/comet-ballista-executor.rs b/native/core/src/bin/comet-executor.rs similarity index 97% rename from native/core/src/bin/comet-ballista-executor.rs rename to native/core/src/bin/comet-executor.rs index db386e0afd..b3c91a77ba 100644 --- a/native/core/src/bin/comet-ballista-executor.rs +++ b/native/core/src/bin/comet-executor.rs @@ -90,7 +90,7 @@ fn main() -> Result<(), Box> { }; eprintln!( - "[comet-ballista-executor] flight :{port} grpc :{grpc_port} -> scheduler {}:{}", + "[comet-executor] flight :{port} grpc :{grpc_port} -> scheduler {}:{}", config.scheduler_host, config.scheduler_port ); diff --git a/native/core/src/bin/comet-ballista-scheduler.rs b/native/core/src/bin/comet-scheduler.rs similarity index 96% rename from native/core/src/bin/comet-ballista-scheduler.rs rename to native/core/src/bin/comet-scheduler.rs index a2e3daf8d1..89b8ee0eb8 100644 --- a/native/core/src/bin/comet-ballista-scheduler.rs +++ b/native/core/src/bin/comet-scheduler.rs @@ -26,7 +26,7 @@ //! //! Only built with `--features ballista` (see `required-features` in //! `core/Cargo.toml`). Runs a real, externally reachable gRPC scheduler that a -//! separate `comet-ballista-executor` process connects to. +//! separate `comet-executor` process connects to. //! //! Configuration (all optional, env-driven so a harness can place it): //! 
- `COMET_BALLISTA_SCHEDULER_BIND_HOST` (default `127.0.0.1`) @@ -65,7 +65,7 @@ fn main() -> Result<(), Box> { }; let addr: SocketAddr = format!("{bind_host}:{bind_port}").parse()?; - eprintln!("[comet-ballista-scheduler] starting on {addr}"); + eprintln!("[comet-scheduler] starting on {addr}"); let cluster = BallistaCluster::new_from_config(&config).await?; start_server(cluster, addr, Arc::new(config)).await?; diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index 519858d32d..09c155688f 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -312,7 +312,7 @@ async fn start_standalone_from_state(state: &SessionState) -> Result anyhow::Result<()> #[ignore = "spawns external scheduler + executor processes and binds ports; run explicitly"] #[test] fn comet_plan_on_external_cluster() -> anyhow::Result<()> { - let scheduler_bin = env!("CARGO_BIN_EXE_comet-ballista-scheduler"); - let executor_bin = env!("CARGO_BIN_EXE_comet-ballista-executor"); + let scheduler_bin = env!("CARGO_BIN_EXE_comet-scheduler"); + let executor_bin = env!("CARGO_BIN_EXE_comet-executor"); let dyld = dyld_path(); // --- 1. 
Spawn the external scheduler process --- @@ -230,7 +230,7 @@ fn comet_plan_on_external_cluster() -> anyhow::Result<()> { } let scheduler = scheduler_cmd.spawn()?; let mut guard = ClusterGuard { - children: vec![("comet-ballista-scheduler", scheduler)], + children: vec![("comet-scheduler", scheduler)], }; wait_for_port(SCHEDULER_PORT, "scheduler", Duration::from_secs(30))?; @@ -253,7 +253,7 @@ fn comet_plan_on_external_cluster() -> anyhow::Result<()> { executor_cmd.env("DYLD_LIBRARY_PATH", d); } let executor = executor_cmd.spawn()?; - guard.children.push(("comet-ballista-executor", executor)); + guard.children.push(("comet-executor", executor)); wait_for_port( EXECUTOR_FLIGHT_PORT, "executor flight", diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index 2406fbc861..6b44c36c9d 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -301,7 +301,7 @@ object CometConf extends ShimCometConf { "Ballista scheduler (e.g. `http://host:50050`) to submit the distributed plan to. When " + "empty (the default), the plan is submitted to an in-process standalone Ballista cluster " + "on the driver instead. 
The external scheduler and its executors must be the " + - "Comet-flavored `comet-ballista-scheduler` / `comet-ballista-executor` binaries so the " + + "Comet-flavored `comet-scheduler` / `comet-executor` binaries so the " + "shipped Comet plan nodes can be decoded there.") .stringConf .createWithDefault("") From 495523846a00114bf3206d9629ce2c08a9e01f0d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 19:57:39 -0600 Subject: [PATCH 24/42] test(ballista): run full Q1 on a live external cluster from a Spark driver Add CometBallistaExternalClusterQ1Suite (-Pspark-4.0), which spawns the feature-built comet-scheduler and comet-executor as child processes on non-default ports (libjvm on the loader path, port readiness + registration grace, teardown that kills them even on failure), points spark.comet.exec.ballista.scheduler.url at the live scheduler, and offloads full TPC-H Q1's aggregate to it. Asserts the collected rows match a Spark oracle row-for-row (including decimal scale) and that 0 Spark-executor tasks run, closing the R1b gap where the scheduler.url path was validated only by compilation. This is the first time the full Q1 aggregate fragment (partial-agg NativeScan leaf -> hash shuffle -> final-agg over a Scan leaf) runs on a separate, JVM-less executor process; it runs cleanly with no JAVA_VM panic. Relates to #4796. No Rust changed; the default (no-feature) build is unaffected. 
--- .../contributor-guide/ballista_execution.md | 11 +- .../CometBallistaExternalClusterQ1Suite.scala | 450 ++++++++++++++++++ 2 files changed, 459 insertions(+), 2 deletions(-) create mode 100644 spark/src/test/scala/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala diff --git a/docs/source/contributor-guide/ballista_execution.md b/docs/source/contributor-guide/ballista_execution.md index 3e1b990346..fb07445658 100644 --- a/docs/source/contributor-guide/ballista_execution.md +++ b/docs/source/contributor-guide/ballista_execution.md @@ -115,8 +115,15 @@ Legend: ✅ done · 🔨 in progress · ⬜ planned - ◐ R1-T3 — offload proven end-to-end on Q1's single-stage subset (scan + date filter + decimal projections), results match Spark, 0 executor tasks. Full Q1 GROUP BY is structurally multi-block → R2. - ✅ R1-T4 (R1b) — **external cluster:** a distributed Comet plan submitted to a separate `comet-scheduler` process runs on a separate, **JVM-less** `comet-executor` - process and returns correct results (verified for `NativeScan`-leaf fragments; no Ballista change - needed — the config `override_*_codec` fields already exist). Config: `spark.comet.exec.ballista.scheduler.url`. + process and returns correct results (no Ballista change needed — the config `override_*_codec` + fields already exist). Config: `spark.comet.exec.ballista.scheduler.url`. Proven at two layers: + the Rust harness (`ballista_external_cluster.rs`, a `NativeScan`→shuffle→`Filter` plan), and — via + `CometBallistaExternalClusterQ1Suite` (`-Pspark-4.0`, feature-built `libcomet` + binaries) — a + **live Spark driver** offloading **full TPC-H Q1's aggregate** to spawned `comet-scheduler` + + `comet-executor` child processes, results matching Spark row-for-row (incl. decimal scale) with 0 + Spark-executor tasks. The full agg fragment (partial-agg `NativeScan` leaf → hash shuffle → + final-agg over a `Scan` leaf) runs on the separate JVM-less executor process without a `JAVA_VM` + panic. 
- 🔨 **R2 — multi-stage distribution.** A distributed 2-block `GROUP BY` (Comet partial-agg → Ballista hash shuffle → Comet final-agg) runs offloaded with 0 Spark-executor tasks and correct results — **full TPC-H Q1's aggregate now runs distributed on Ballista and matches Spark.** - ✅ R2-T1 (Ballista) — accept a pre-built physical plan for distribution (a `physical_plan` submission variant; its own Ballista branch/PR). - ✅ R2-T2 (Comet native) — feed a `ScanExec` leaf from a native `RecordBatchStream` (not only a JVM input). diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala new file mode 100644 index 0000000000..38952c1341 --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala @@ -0,0 +1,450 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.ballista + +import java.io.File +import java.math.{BigDecimal => JBigDecimal} +import java.net.{InetSocketAddress, Socket} +import java.nio.file.Files +import java.sql.Date +import java.util.concurrent.atomic.AtomicInteger + +import scala.io.Source + +import org.apache.spark.CometListenerBusUtils +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.sql.{CometTestBase, Row} +import org.apache.spark.sql.comet.CometNativeExec +import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.comet.CometConf + +/** + * The R1b/R2 external-cluster milestone from the FULL Spark-driver side: a live Spark driver + * offloads TPC-H Q1's aggregate to a GENUINELY external, separately-spawned Ballista cluster + * (`comet-scheduler` + `comet-executor` child processes) and gets results identical to Spark, + * with ZERO Spark-executor tasks. + * + * How this differs from [[CometBallistaQ1Suite]] / [[CometBallistaDistributedSuite]]: those run + * the same two-block Q1 aggregate against an IN-PROCESS standalone Ballista engine on the driver + * (`scheduler.url` empty). Here `spark.comet.exec.ballista.scheduler.url` points at a real + * external scheduler process, so the plan is shipped over gRPC/Flight to a SEPARATE, JVM-less + * executor process, reconstructed there via the injected Comet codecs, and run. This is the first + * time the full Q1 aggregate fragment (partial-agg NativeScan leaf -> hash shuffle -> final-agg + * over a Scan leaf) runs on a separate executor PROCESS rather than in the test's own process. 
+ * + * The `comet-scheduler` / `comet-executor` binaries and the `libcomet` this JVM loads must all be + * built with `--features ballista` (`make core-ballista`), so the offload + URL path and the + * Comet-flavored codecs exist. When the loaded `libcomet` lacks the feature, the suite + * `assume`-skips (same guard as the other offload suites). When the feature binaries are missing + * on disk, the suite fails with a build hint rather than silently passing. + * + * Spawns child processes and binds ports, so it mirrors the Rust harness + * (`native/core/tests/ballista_external_cluster.rs`): libjvm on the loader path, wait for ports, + * a short registration grace, and a teardown that kills the children even on failure. + */ +class CometBallistaExternalClusterQ1Suite extends CometTestBase with AdaptiveSparkPlanHelper { + + // Non-default ports so this suite does not collide with a real cluster on the usual + // 50050/50051/50052, nor with the Rust harness on 51050-51052. + private val schedulerPort = 51150 + private val executorFlightPort = 51151 + private val executorGrpcPort = 51152 + + private var scheduler: Process = _ + private var executor: Process = _ + private var logDir: File = _ + private var schedulerLog: File = _ + private var executorLog: File = _ + + /** + * TPC-H `lineitem`, restricted to the columns Q1 touches, with the correct Spark types + * (`decimal(12,2)` and a real `date`). Same fixture as [[CometBallistaQ1Suite]]. 
+ */ + private val lineitemSchema: StructType = StructType( + Seq( + StructField("l_quantity", DecimalType(12, 2), nullable = false), + StructField("l_extendedprice", DecimalType(12, 2), nullable = false), + StructField("l_discount", DecimalType(12, 2), nullable = false), + StructField("l_tax", DecimalType(12, 2), nullable = false), + StructField("l_returnflag", StringType, nullable = false), + StructField("l_linestatus", StringType, nullable = false), + StructField("l_shipdate", DateType, nullable = false))) + + private def dec(v: String): JBigDecimal = new JBigDecimal(v).setScale(2) + + /** Same synthetic `lineitem` rows as [[CometBallistaQ1Suite]] (three surviving Q1 groups). */ + private def lineitemRows: Seq[Row] = Seq( + Row( + dec("17.00"), + dec("21168.23"), + dec("0.04"), + dec("0.02"), + "A", + "F", + Date.valueOf("1998-08-01")), + Row( + dec("36.00"), + dec("45983.16"), + dec("0.09"), + dec("0.06"), + "A", + "F", + Date.valueOf("1998-07-15")), + Row( + dec("8.00"), + dec("13309.60"), + dec("0.10"), + dec("0.02"), + "A", + "F", + Date.valueOf("1998-09-01")), + Row( + dec("28.00"), + dec("28955.64"), + dec("0.05"), + dec("0.08"), + "N", + "O", + Date.valueOf("1998-06-10")), + Row( + dec("24.00"), + dec("32000.00"), + dec("0.00"), + dec("0.00"), + "N", + "O", + Date.valueOf("1998-08-20")), + Row( + dec("2.00"), + dec("2600.00"), + dec("0.06"), + dec("0.03"), + "N", + "O", + Date.valueOf("1998-09-02")), + Row( + dec("32.00"), + dec("41000.50"), + dec("0.07"), + dec("0.05"), + "R", + "F", + Date.valueOf("1998-05-05")), + Row( + dec("45.00"), + dec("60000.00"), + dec("0.02"), + dec("0.01"), + "R", + "F", + Date.valueOf("1998-08-31")), + // rows PAST the Q1 cutoff -- must be filtered out + Row( + dec("50.00"), + dec("70000.00"), + dec("0.03"), + dec("0.04"), + "N", + "F", + Date.valueOf("1998-09-03")), + Row( + dec("99.00"), + dec("99999.99"), + dec("0.05"), + dec("0.05"), + "N", + "F", + Date.valueOf("1998-12-01"))) + + /** + * TPC-H Q1's full aggregate (NO 
`ORDER BY`): `sum`x4, `avg`x3, `count`, grouped by the two keys + * `(l_returnflag, l_linestatus)`. Same query as [[CometBallistaQ1Suite]]'s R2 test. + */ + private val q1FullAggregate = + """ + |SELECT l_returnflag, l_linestatus, + | sum(l_quantity) AS sum_qty, + | sum(l_extendedprice) AS sum_base_price, + | sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + | avg(l_quantity) AS avg_qty, + | avg(l_extendedprice) AS avg_price, + | avg(l_discount) AS avg_disc, + | count(*) AS count_order + |FROM lineitem + |WHERE l_shipdate <= date '1998-12-01' - interval '90' day + |GROUP BY l_returnflag, l_linestatus + |""".stripMargin + + /** Runs `f`, counting Spark executor task starts during it (drains the bus around it). */ + private def countTaskStarts(f: => Unit): Int = { + val taskStarts = new AtomicInteger(0) + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + taskStarts.incrementAndGet() + } + } + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + spark.sparkContext.addSparkListener(listener) + try { + f + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + taskStarts.get() + } + + /** + * `$JAVA_HOME/lib/server` prepended to any inherited `DYLD_LIBRARY_PATH` (libjvm, not a JVM). + */ + private def dyldPath(): Option[String] = { + Option(System.getenv("JAVA_HOME")).map { javaHome => + val lib = new File(new File(javaHome, "lib"), "server").getAbsolutePath + val existing = Option(System.getenv("DYLD_LIBRARY_PATH")).getOrElse("") + if (existing.isEmpty) lib else s"$lib:$existing" + } + } + + /** + * Locate the directory holding the feature-built `comet-scheduler` / `comet-executor` binaries. 
+ * Honors `COMET_BALLISTA_BIN_DIR`, else tries the usual debug/release target dirs relative to + * the module (surefire's `user.dir` is the `spark/` module) and the repo root. + */ + private def findBinDir(): Option[File] = { + val candidates = + Option(System.getenv("COMET_BALLISTA_BIN_DIR")).map(new File(_)).toSeq ++ Seq( + "../native/target/debug", + "../native/target/release", + "native/target/debug", + "native/target/release").map(p => new File(System.getProperty("user.dir"), p)) + candidates.find { d => + new File(d, "comet-scheduler").canExecute && new File(d, "comet-executor").canExecute + } + } + + /** Poll a TCP port until it accepts a connection or the deadline passes. */ + private def waitForPort(port: Int, what: String, timeoutMillis: Long): Unit = { + val deadline = System.currentTimeMillis() + timeoutMillis + var connected = false + while (!connected && System.currentTimeMillis() < deadline) { + val socket = new Socket() + try { + socket.connect(new InetSocketAddress("127.0.0.1", port), 200) + connected = true + } catch { + case _: Throwable => Thread.sleep(150) + } finally { + try socket.close() + catch { case _: Throwable => } + } + } + if (!connected) { + // Surface the child's log to make a startup failure diagnosable. + throw new IllegalStateException( + s"timed out waiting for $what on port $port\n${tailLog(what)}") + } + // scalastyle:off println + println(s"[external-cluster] $what is listening on $port") + // scalastyle:on println + } + + private def tailLog(what: String): String = { + val f = if (what.startsWith("scheduler")) schedulerLog else executorLog + if (f != null && f.exists()) { + val src = Source.fromFile(f) + try s"--- $what log tail ---\n${src.getLines().toSeq.takeRight(40).mkString("\n")}" + finally src.close() + } else "" + } + + /** Spawn the external `comet-scheduler` + `comet-executor` child processes. 
*/ + private def startCluster(binDir: File): Unit = { + logDir = Files.createTempDirectory("comet-external-cluster-").toFile + schedulerLog = new File(logDir, "scheduler.log") + executorLog = new File(logDir, "executor.log") + val dyld = dyldPath() + + // --- scheduler --- + val schedulerPb = new ProcessBuilder(new File(binDir, "comet-scheduler").getAbsolutePath) + schedulerPb.redirectOutput(schedulerLog).redirectErrorStream(true) + val schedEnv = schedulerPb.environment() + schedEnv.put("COMET_BALLISTA_SCHEDULER_BIND_HOST", "127.0.0.1") + schedEnv.put("COMET_BALLISTA_SCHEDULER_BIND_PORT", schedulerPort.toString) + schedEnv.put("RUST_LOG", "info") + dyld.foreach(schedEnv.put("DYLD_LIBRARY_PATH", _)) + scheduler = schedulerPb.start() + waitForPort(schedulerPort, "scheduler", 30000) + + // --- executor (separate, JVM-less) --- + val executorPb = new ProcessBuilder(new File(binDir, "comet-executor").getAbsolutePath) + executorPb.redirectOutput(executorLog).redirectErrorStream(true) + val exEnv = executorPb.environment() + exEnv.put("COMET_BALLISTA_EXECUTOR_BIND_HOST", "127.0.0.1") + exEnv.put("COMET_BALLISTA_EXECUTOR_PORT", executorFlightPort.toString) + exEnv.put("COMET_BALLISTA_EXECUTOR_GRPC_PORT", executorGrpcPort.toString) + exEnv.put("COMET_BALLISTA_SCHEDULER_HOST", "127.0.0.1") + exEnv.put("COMET_BALLISTA_SCHEDULER_PORT", schedulerPort.toString) + exEnv.put("COMET_BALLISTA_EXECUTOR_CONCURRENT_TASKS", "4") + exEnv.put("RUST_LOG", "info") + dyld.foreach(exEnv.put("DYLD_LIBRARY_PATH", _)) + executor = executorPb.start() + waitForPort(executorFlightPort, "executor flight", 30000) + waitForPort(executorGrpcPort, "executor grpc", 30000) + // Grace for the executor to finish registering with the scheduler. 
+ Thread.sleep(3000) + } + + private def stopCluster(): Unit = { + Seq(("comet-executor", executor), ("comet-scheduler", scheduler)).foreach { + case (name, proc) => + if (proc != null) { + proc.destroyForcibly() + proc.waitFor() + // scalastyle:off println + println(s"[external-cluster] stopped $name") + // scalastyle:on println + } + } + } + + override def afterAll(): Unit = { + try stopCluster() + finally super.afterAll() + } + + test( + "TPC-H Q1 full aggregate offloads to a LIVE external comet-scheduler/comet-executor cluster " + + "with identical results and no executor tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available (build with `make core-ballista`): " + + s"${NativeBallista.loadFailure.map(_.getMessage)}") + + val binDir = findBinDir().getOrElse { + fail( + "could not find feature-built comet-scheduler/comet-executor binaries; " + + "build them with `make core-ballista` (they require --features ballista), or set " + + "COMET_BALLISTA_BIN_DIR") + } + // scalastyle:off println + println(s"[external-cluster] using binaries from ${binDir.getAbsolutePath}") + // scalastyle:on println + + startCluster(binDir) + + withTempPath { dir => + // Spread rows across several input files so same-group rows land in different partitions — + // the hash shuffle must combine partial-aggregate states across partitions. + spark + .createDataFrame(spark.sparkContext.parallelize(lineitemRows), lineitemSchema) + .repartition(3) + .write + .parquet(dir.getCanonicalPath) + + // AQE off (collect root carries the executeCollect override); direct-read off so block2's + // input leaf serializes as a plain Scan fed by the Ballista shuffle; small shuffle-partition + // count keeps the run fast. scheduler.url points at the LIVE external scheduler. 
+ withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_SCHEDULER_URL.key -> s"http://127.0.0.1:$schedulerPort") { + spark.read.parquet(dir.getCanonicalPath).createOrReplaceTempView("lineitem") + + // Confirm the offloadable R2 shape BEFORE running: exactly one Comet hash exchange (two + // stages) and exactly two serialized CometNativeExec blocks (partial + final aggregate). + val executed = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(q1FullAggregate).queryExecution.executedPlan + } + val exchanges = executed.collect { case e: CometShuffleExchangeExec => e } + assert( + exchanges.size == 1, + s"expected exactly one Comet hash exchange (two stages), found ${exchanges.size}:\n" + + s"$executed") + val nativeBlocks = executed.collect { + case n: CometNativeExec if n.serializedPlanOpt.isDefined => n + } + assert( + nativeBlocks.size == 2, + s"expected exactly two serialized CometNativeExec blocks, found ${nativeBlocks.size}:\n" + + s"$executed") + + // Baseline oracle: Q1 via the Comet-on-executor native path (offload off). Positive control + // for the listener (must launch executor tasks) and the row-for-row reference. + var baseline: Seq[Seq[Any]] = null + val baselineTaskStarts = countTaskStarts { + baseline = withSQLConf( + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + assert( + baselineTaskStarts > 0, + "expected the Spark baseline collect to launch at least one Spark executor task " + + s"(sanity check for the listener apparatus); got $baselineTaskStarts") + + // Ballista offload to the LIVE external cluster: same query, flag on, URL set. 
+ var offloaded: Seq[Seq[Any]] = null + val offloadedTaskStarts = countTaskStarts { + offloaded = withSQLConf(CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "true") { + spark.sql(q1FullAggregate).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + + // Sort both sides by (returnflag, linestatus) on the driver (Q1's ORDER BY is not + // offloaded). Compare full rows using the exact values Spark produced — decimals keep their + // computed scale, so a wrong decimal scale from avg/sum composition fails the assertion. + def sortKey(r: Seq[Any]): (String, String) = (s"${r.head}", s"${r(1)}") + val baselineSorted = baseline.sortBy(sortKey) + val offloadedSorted = offloaded.sortBy(sortKey) + assert( + offloadedSorted == baselineSorted, + "external-cluster-offloaded Q1 aggregate rows do not match Spark's own Q1\n" + + s" spark: $baselineSorted\n offloaded: $offloadedSorted\n" + + s"${tailLog("scheduler")}\n${tailLog("executor")}") + + // Sanity: three surviving groups after the Q1 date filter. + assert( + baselineSorted.map(r => (s"${r.head}", s"${r(1)}")) == + Seq(("A", "F"), ("N", "O"), ("R", "F")), + s"unexpected Q1 groups: ${baselineSorted.map(r => (r.head, r(1)))}") + + // Crucially, NO Spark executor tasks ran for the offloaded collect — the external Ballista + // cluster served it. + assert( + offloadedTaskStarts == 0, + s"expected 0 Spark executor tasks for the external-cluster-offloaded collect, " + + s"but $offloadedTaskStarts started") + + // scalastyle:off println + println(s"[external-cluster] PASS: live Spark driver ran full Q1 on the external cluster") + println(tailLog("scheduler")) + println(tailLog("executor")) + // scalastyle:on println + } + } + } +} From 8907feeb4188d42639cb4f10cd71bfd969b212d6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 22:23:13 -0600 Subject: [PATCH 25/42] build: add Apache license headers to ballista test files Fixes the RAT license check on the offload test files. 
Relates to #4796 --- native/core/tests/ballista_codec_roundtrip.rs | 18 +++++++++++++++++- native/core/tests/ballista_distributed.rs | 18 +++++++++++++++++- native/core/tests/ballista_external_cluster.rs | 18 +++++++++++++++++- native/core/tests/ballista_ffi_roundtrip.rs | 18 +++++++++++++++++- .../tests/ballista_fragment_child_input.rs | 18 +++++++++++++++++- 5 files changed, 85 insertions(+), 5 deletions(-) diff --git a/native/core/tests/ballista_codec_roundtrip.rs b/native/core/tests/ballista_codec_roundtrip.rs index 8068c89854..efc3ab2d84 100644 --- a/native/core/tests/ballista_codec_roundtrip.rs +++ b/native/core/tests/ballista_codec_roundtrip.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. // Proves a Comet-FFI leaf survives Ballista's physical-plan serialization. 
// // Ballista ships each stage's physical plan to executors as protobuf via a @@ -134,4 +150,4 @@ async fn comet_leaf_survives_ballista_codec() -> anyhow::Result<()> { println!("\nTOTAL ROWS AFTER CODEC ROUND-TRIP: {total_rows}"); assert_eq!(total_rows, 5, "expected 5 rows"); Ok(()) -} +} \ No newline at end of file diff --git a/native/core/tests/ballista_distributed.rs b/native/core/tests/ballista_distributed.rs index cea8e56526..7ff7a1c25d 100644 --- a/native/core/tests/ballista_distributed.rs +++ b/native/core/tests/ballista_distributed.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. // Distributes a Comet FFI scan across a real (in-process) Ballista cluster. // // A `CometTableProvider` exposes the Comet `NativeScan` as a SQL table. 
The @@ -130,4 +146,4 @@ async fn comet_scan_distributed_with_shuffle() -> anyhow::Result<()> { assert_eq!(groups, 5, "expected 5 groups (a = 1..=5)"); println!("PASS: Comet FFI scan distributed by Ballista (with shuffle) — correct results"); Ok(()) -} +} \ No newline at end of file diff --git a/native/core/tests/ballista_external_cluster.rs b/native/core/tests/ballista_external_cluster.rs index 7b4b325ed7..7b2f90a2b2 100644 --- a/native/core/tests/ballista_external_cluster.rs +++ b/native/core/tests/ballista_external_cluster.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. // Distributes a Comet plan across a REAL external Ballista cluster: a separate // `comet-scheduler` process and a separate `comet-executor` // process, each spawned as a child of this test. 
This is unlike @@ -314,4 +330,4 @@ fn comet_plan_on_external_cluster() -> anyhow::Result<()> { drop(guard); let _ = schema; Ok(()) -} +} \ No newline at end of file diff --git a/native/core/tests/ballista_ffi_roundtrip.rs b/native/core/tests/ballista_ffi_roundtrip.rs index c87e70536b..2f63444ff8 100644 --- a/native/core/tests/ballista_ffi_roundtrip.rs +++ b/native/core/tests/ballista_ffi_roundtrip.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// Proves the driver-side offload result boundary: a Comet `Operator` proto is // run on in-process standalone Ballista and its result is exported over the // Arrow C Data Interface into *caller-allocated* FFI structs, exactly as the @@ -131,4 +147,4 @@ fn offload_proto_and_import_over_c_data_interface() -> anyhow::Result<()> { Arrow C Data Interface (the JVM boundary mechanism)" ); Ok(()) -} +} \ No newline at end of file diff --git a/native/core/tests/ballista_fragment_child_input.rs b/native/core/tests/ballista_fragment_child_input.rs index 6fede20023..9543a035b7 100644 --- a/native/core/tests/ballista_fragment_child_input.rs +++ b/native/core/tests/ballista_fragment_child_input.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// Proves `CometFragmentExec` runs a Comet plan fragment whose input-leaf `Scan` // is fed by the node's DataFusion child stream (the R2 shuffle-reader shape, // stood in for here by an in-memory child and a `CometScanExec` child), and @@ -306,4 +322,4 @@ async fn fragment_codec_roundtrip() -> anyhow::Result<()> { "fragment result must be identical after codec round-trip" ); Ok(()) -} +} \ No newline at end of file From 0ba297a3a3c10a7a958913235dc83e73f7794b43 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 22:29:19 -0600 Subject: [PATCH 26/42] test(ballista): move offload suites to spark-4.x test dir They use Spark 4.0+ APIs (withSQLConf returning a value), so compiling them under Spark 3.4/3.5 fails. Gate them to the 4.x-only test source dir. Relates to #4796 --- .../org/apache/comet/ballista/CometBallistaDistributedSuite.scala | 0 .../comet/ballista/CometBallistaExternalClusterQ1Suite.scala | 0 .../org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala | 0 .../org/apache/comet/ballista/CometBallistaOffloadSuite.scala | 0 .../org/apache/comet/ballista/CometBallistaQ1Suite.scala | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename spark/src/test/{scala => spark-4.x}/org/apache/comet/ballista/CometBallistaDistributedSuite.scala (100%) rename spark/src/test/{scala => spark-4.x}/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala (100%) rename spark/src/test/{scala => spark-4.x}/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala (100%) rename spark/src/test/{scala => spark-4.x}/org/apache/comet/ballista/CometBallistaOffloadSuite.scala (100%) rename spark/src/test/{scala => spark-4.x}/org/apache/comet/ballista/CometBallistaQ1Suite.scala (100%) diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaDistributedSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaDistributedSuite.scala similarity index 100% rename from 
spark/src/test/scala/org/apache/comet/ballista/CometBallistaDistributedSuite.scala rename to spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaDistributedSuite.scala diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala similarity index 100% rename from spark/src/test/scala/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala rename to spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaExternalClusterQ1Suite.scala diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala similarity index 100% rename from spark/src/test/scala/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala rename to spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaOffloadSuite.scala similarity index 100% rename from spark/src/test/scala/org/apache/comet/ballista/CometBallistaOffloadSuite.scala rename to spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaOffloadSuite.scala diff --git a/spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaQ1Suite.scala similarity index 100% rename from spark/src/test/scala/org/apache/comet/ballista/CometBallistaQ1Suite.scala rename to spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaQ1Suite.scala From e95843e09d0563538266ea0e219bb9e1916ec6d2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 22:45:24 -0600 Subject: [PATCH 27/42] fix(ballista): gate offload deps, fix fragment busy-spin, tighten R2 guard Apply whole-branch landing-review fixes 
to the experimental Comet -> Ballista offload feature so the default build stays Ballista-free and the offload path is correct. - Keep the default `libcomet` build Ballista-free: `execution::ffi` and `execution::fragment` are used only by the feature-gated `ballista` module, so gate both `pub mod` declarations behind `#[cfg(feature = "ballista")]` and make `datafusion-ffi` an optional dependency activated by the `ballista` feature. The default `cargo build` now pulls zero offload deps (no datafusion-ffi / ballista / tonic). - Fix the busy-spin in `NativeFragmentStream::poll_next`: only re-poll the root after actually feeding new input into a leaf. When no leaf needed input (or a childless `NativeScan` fragment has no leaves) and the root returned `Pending`, return `Poll::Pending` and rely on the root's registered waker instead of hot-looping a worker thread on every async-I/O `Pending`. - Tighten the R2 two-block offload guard: only take the hash-shuffle path when block1 is a partial `HashAggregate`, block2 the matching final `HashAggregate`, and the grouping-key width matches the exchange. Other single-hash-exchange shapes (e.g. a window `PARTITION BY`) previously hashed the wrong columns and silently produced wrong results; they now fall through to a clear rejection. - Reject Iceberg native scans in `injectScanFiles`: `CometIcebergNativeScanExec` leaves carry their splits differently and would be shipped with no files to read (silently zero rows), so raise a clear error. - Minor: replace library `eprintln!` debug output with `log::debug!`, fix the `build_test_proto` doc sentence, stage exported Arrow columns into a Vec before writing them into the JVM structs (avoid a partial-write leak on mid-loop failure), and document that the offload flag requires AQE off. Relates to #4796. 
--- native/core/Cargo.toml | 6 ++- native/core/src/execution/ballista/ffi_jni.rs | 27 +++++++----- native/core/src/execution/fragment.rs | 42 ++++++++++++++++--- native/core/src/execution/mod.rs | 5 +++ .../scala/org/apache/comet/CometConf.scala | 5 ++- .../apache/spark/sql/comet/operators.scala | 28 +++++++++++++ 6 files changed, 95 insertions(+), 18 deletions(-) diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index dae4304808..252934849d 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -53,7 +53,10 @@ tempfile = "3.26.0" itertools = "0.15.0" paste = "1.0.14" datafusion = { workspace = true, features = ["parquet_encryption", "sql"] } -datafusion-ffi = "54.0.0" +# Only used by the optional `ballista` offload module (`execution::ffi` / +# `execution::fragment`); activated by the `ballista` feature so the default +# `libcomet` build links no `datafusion-ffi`. +datafusion-ffi = { version = "54.0.0", optional = true } # Only used by the optional `ballista` offload module (extension codecs + # physical-plan (de)serialization); activated by the `ballista` feature. datafusion-proto = { version = "54.0.0", optional = true } @@ -124,6 +127,7 @@ ballista = [ "dep:ballista-scheduler", "dep:ballista-executor", "dep:datafusion-proto", + "dep:datafusion-ffi", ] # exclude optional packages from cargo machete verifications diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index 09c155688f..174c63156e 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -42,8 +42,8 @@ use prost::Message; /// Build the fixed spike test proto Rust-side: a single `NativeScan` over a /// freshly written Parquet file with one int32 column `a` = [1..=5]. Returned to /// the JVM so the JVM test can hand it straight back to [`Java_org_apache_comet_ballista_NativeBallista_executeQuery`] -/// without needing the generated proto Java classes. 
This is the same proto the -/// Rust `tests/` build. +/// without needing the generated proto Java classes. This is the same proto that +/// the Rust `tests/` build constructs. pub fn build_test_proto() -> Result, String> { use datafusion::arrow::array::Int32Array; use datafusion::arrow::datatypes::DataType as ArrowDataType; @@ -244,10 +244,7 @@ fn build_two_stage_plan( // Investigation aid: the schema of the batches that cross Ballista's IPC // shuffle. block2's `Scan` (#100) leaf schema (derived from the exchange // output on the JVM side) must match this for the aggregate to compose. - eprintln!( - "[comet-ballista R2] block1 (partial-agg) output schema = {:?}", - schema1 - ); + log::debug!("[comet-ballista R2] block1 (partial-agg) output schema = {schema1:?}"); let hash_exprs: Vec> = (0..num_group_keys) .map(|i| Arc::new(Column::new(schema1.field(i).name(), i)) as Arc) @@ -266,7 +263,7 @@ fn build_two_stage_plan( .map_err(|e| format!("failed to build block2 (final-agg) fragment: {e}"))?, ); - eprintln!( + log::debug!( "[comet-ballista R2] block2 (final-agg) output schema = {:?}", block2.schema() ); @@ -356,10 +353,10 @@ pub fn execute_two_stage( // the external path the scheduler creates the session from the submitted // settings + its own (Comet) codecs, so we do not start a local cluster. let scheduler_url = if scheduler_url.is_empty() { - eprintln!("[comet-ballista R2] submitting to in-process standalone cluster"); + log::debug!("[comet-ballista R2] submitting to in-process standalone cluster"); start_standalone_from_state(&state).await? } else { - eprintln!("[comet-ballista R2] submitting to external cluster at {scheduler_url}"); + log::debug!("[comet-ballista R2] submitting to external cluster at {scheduler_url}"); scheduler_url.to_string() }; @@ -439,12 +436,22 @@ unsafe fn export_batch_to_addresses( schema_addrs.len() )); } + // Export every column first; only once *all* succeed do we write into the + // JVM-owned structs. 
Exporting can fail mid-loop (e.g. an unsupported data + // type); writing incrementally would then leave already-written structs that + // the JVM never imports (and thus never releases) — a leak. Staging into a + // local Vec makes the write phase below infallible, so it is all-or-nothing. + let mut exported = Vec::with_capacity(num_cols); for i in 0..num_cols { let data = batch.column(i).to_data(); let schema = FFI_ArrowSchema::try_from(data.data_type()) .map_err(|e| format!("failed to export schema for column {i}: {e}"))?; let array = FFI_ArrowArray::new(&data); - // The JVM allocated these structs; write the exported values into them. + exported.push((array, schema)); + } + // The JVM allocated these structs; write the exported values into them. This + // phase cannot fail, so no partial write is possible. + for (i, (array, schema)) in exported.into_iter().enumerate() { std::ptr::write(array_addrs[i] as *mut FFI_ArrowArray, array); std::ptr::write(schema_addrs[i] as *mut FFI_ArrowSchema, schema); } diff --git a/native/core/src/execution/fragment.rs b/native/core/src/execution/fragment.rs index 78d5951b7b..76378bea94 100644 --- a/native/core/src/execution/fragment.rs +++ b/native/core/src/execution/fragment.rs @@ -126,9 +126,17 @@ pub fn build_native_fragment( } /// Streams the fragment root while pumping its `Scan` leaves. When the root -/// yields `Pending` (a leaf's `batch` slot is empty), each leaf handle is asked -/// to pull its next batch, then the root is polled again — the same interleaving +/// yields `Pending` because a leaf's `batch` slot is empty, that leaf handle is +/// asked to pull its next batch and the root is re-polled — the same interleaving /// `jni_api` performs for JVM-fed scans, but with native child streams. +/// +/// Crucially, the root is only re-polled after new input is actually fed into a +/// leaf. If the root returns `Pending` while every leaf already has a batch +/// pending consumption (or there are no leaves at all — e.g. 
a childless +/// `NativeScan` fragment reading Parquet directly), the root is genuinely pending +/// on its own async work and has registered a waker on `cx`; we return +/// `Poll::Pending` and let that waker reschedule us, rather than hot-spinning the +/// worker thread on every async-I/O `Pending`. struct NativeFragmentStream { root: SendableRecordBatchStream, scans: Vec, @@ -144,13 +152,35 @@ impl Stream for NativeFragmentStream { match this.root.poll_next_unpin(cx) { Poll::Ready(item) => return Poll::Ready(item), Poll::Pending => { + // Feed only leaves whose `batch` slot is empty; `get_next_batch` + // blocks until it delivers a batch (or EOF), so a fed leaf then + // holds input and re-polling the root makes progress. Track + // whether we fed anything this iteration. + let mut fed_new_input = false; for scan in this.scans.iter_mut() { - if let Err(e) = scan.get_next_batch() { - return Poll::Ready(Some(Err(DataFusionError::Execution(format!( - "Comet fragment scan input error: {e}" - ))))); + // Peek the slot without holding the lock into `get_next_batch` + // (which takes it again). A slot that is already `Some` needs + // no feeding; a contended `try_lock` is treated as "not empty" + // (nothing to do this round). + let needs_input = scan + .batch + .try_lock() + .map(|slot| slot.is_none()) + .unwrap_or(false); + if needs_input { + if let Err(e) = scan.get_next_batch() { + return Poll::Ready(Some(Err(DataFusionError::Execution( + format!("Comet fragment scan input error: {e}"), + )))); + } + fed_new_input = true; } } + // Nothing new to feed: the root is pending on its own async work + // and its waker (registered on `cx` above) will reschedule us. 
+ if !fed_new_input { + return Poll::Pending; + } } } } diff --git a/native/core/src/execution/mod.rs b/native/core/src/execution/mod.rs index 6eb3076998..17d5fa9f40 100644 --- a/native/core/src/execution/mod.rs +++ b/native/core/src/execution/mod.rs @@ -20,7 +20,12 @@ pub mod ballista; pub mod columnar_to_row; pub mod expressions; +// `ffi` and `fragment` are used only by the feature-gated `ballista` offload +// module (and their own `#[cfg(test)]` code). Gating them keeps the default +// `libcomet` build from compiling the offload path or linking `datafusion-ffi`. +#[cfg(feature = "ballista")] pub mod ffi; +#[cfg(feature = "ballista")] pub mod fragment; pub mod jni_api; pub(crate) mod merge_as_partial; diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index 6b44c36c9d..4bfef40f97 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -290,7 +290,10 @@ object CometConf extends ShimCometConf { "single-stage plans (no exchange) are supported. R1 targets single-stage queries without " + "dynamic partition pruning or correlated scalar subqueries: resolving those inputs " + "(via `waitForSubqueries()`/`updateResult()` before the plan is handed to Ballista) can " + - "still transitively launch Spark executor tasks even with this flag enabled.") + "still transitively launch Spark executor tasks even with this flag enabled. 
Requires " + + "Adaptive Query Execution to be OFF: with AQE on, the collect root is an " + + "`AdaptiveSparkPlanExec` rather than the Comet columnar-to-row node that carries the " + + "offload override, so this flag silently has no effect.") .booleanConf .createWithDefault(false) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index e99bc55cef..b4b89414ee 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -513,6 +513,24 @@ object CometExec { s"Comet Ballista two-stage (R2) offload: could not identify the final-aggregate block " + s"above the exchange:\n$root")) + // The native side hashes block1's leading `numGroupKeys` output columns to repartition across + // the shuffle, which is correct ONLY when block1 is a partial `HashAggregate` (output layout + // `[groupKeys..., aggStates...]`) and block2 the matching final `HashAggregate`. A different + // single-hash-exchange shape (e.g. a `... OVER (PARTITION BY k)` window) would hash the wrong + // columns and silently return wrong results, so reject anything else rather than offload it. + (block1, block2) match { + case (b1: CometHashAggregateExec, b2: CometHashAggregateExec) + if b1.modes.contains(Partial) && b2.modes.contains(Final) && + b1.groupingExpressions.length == numGroupKeys => + // ok: partial -> hash-shuffle -> final aggregate, grouping-key width matches the exchange. + case _ => + throw new UnsupportedOperationException( + "Comet Ballista two-stage (R2) offload requires a partial HashAggregate below the hash " + + s"exchange and a final HashAggregate above it (grouping keys: $numGroupKeys); found " + + s"block1=${block1.nodeName}, block2=${block2.nodeName}. Other single-hash-exchange " + + s"shapes (e.g. 
a window PARTITION BY) are not supported:\n$root") + } + val block1Bytes = injectScanFiles(root, block1) val block2Bytes = block2.serializedPlanOpt.plan.getOrElse( throw new UnsupportedOperationException( @@ -564,6 +582,16 @@ object CometExec { val planBytes = boundary.serializedPlanOpt.plan.getOrElse( throw new UnsupportedOperationException( s"Comet Ballista offload: the native plan block carries no serialized plan:\n$root")) + // Only `CometNativeScanExec` leaves have their files injected below; an Iceberg native scan + // carries its splits differently (see `CometIcebergNativeScanExec.serializedPartitionData`) + // and would be shipped to Ballista with no files to read, silently returning zero rows. Reject. + val icebergScans = boundary.collect { case s: CometIcebergNativeScanExec => s } + if (icebergScans.nonEmpty) { + throw new UnsupportedOperationException( + "Comet Ballista offload does not support Iceberg native scans " + + s"(${icebergScans.size} CometIcebergNativeScanExec leaves found); only " + + s"CometNativeScanExec leaves can be offloaded:\n$root") + } val nativeScans = boundary.collect { case s: CometNativeScanExec => s } if (nativeScans.isEmpty) { planBytes From 5ce639f9a4e3c7bf8e470faa21bb59cc453d8f1c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Jul 2026 22:47:18 -0600 Subject: [PATCH 28/42] style: add trailing newlines to ballista test files (cargo fmt) Relates to #4796 --- native/core/tests/ballista_codec_roundtrip.rs | 2 +- native/core/tests/ballista_distributed.rs | 2 +- native/core/tests/ballista_external_cluster.rs | 2 +- native/core/tests/ballista_ffi_roundtrip.rs | 2 +- native/core/tests/ballista_fragment_child_input.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/native/core/tests/ballista_codec_roundtrip.rs b/native/core/tests/ballista_codec_roundtrip.rs index efc3ab2d84..223e8904d0 100644 --- a/native/core/tests/ballista_codec_roundtrip.rs +++ b/native/core/tests/ballista_codec_roundtrip.rs @@ -150,4 
+150,4 @@ async fn comet_leaf_survives_ballista_codec() -> anyhow::Result<()> { println!("\nTOTAL ROWS AFTER CODEC ROUND-TRIP: {total_rows}"); assert_eq!(total_rows, 5, "expected 5 rows"); Ok(()) -} \ No newline at end of file +} diff --git a/native/core/tests/ballista_distributed.rs b/native/core/tests/ballista_distributed.rs index 7ff7a1c25d..0230987f48 100644 --- a/native/core/tests/ballista_distributed.rs +++ b/native/core/tests/ballista_distributed.rs @@ -146,4 +146,4 @@ async fn comet_scan_distributed_with_shuffle() -> anyhow::Result<()> { assert_eq!(groups, 5, "expected 5 groups (a = 1..=5)"); println!("PASS: Comet FFI scan distributed by Ballista (with shuffle) — correct results"); Ok(()) -} \ No newline at end of file +} diff --git a/native/core/tests/ballista_external_cluster.rs b/native/core/tests/ballista_external_cluster.rs index 7b2f90a2b2..3ec9f0f25d 100644 --- a/native/core/tests/ballista_external_cluster.rs +++ b/native/core/tests/ballista_external_cluster.rs @@ -330,4 +330,4 @@ fn comet_plan_on_external_cluster() -> anyhow::Result<()> { drop(guard); let _ = schema; Ok(()) -} \ No newline at end of file +} diff --git a/native/core/tests/ballista_ffi_roundtrip.rs b/native/core/tests/ballista_ffi_roundtrip.rs index 2f63444ff8..da773f9b09 100644 --- a/native/core/tests/ballista_ffi_roundtrip.rs +++ b/native/core/tests/ballista_ffi_roundtrip.rs @@ -147,4 +147,4 @@ fn offload_proto_and_import_over_c_data_interface() -> anyhow::Result<()> { Arrow C Data Interface (the JVM boundary mechanism)" ); Ok(()) -} \ No newline at end of file +} diff --git a/native/core/tests/ballista_fragment_child_input.rs b/native/core/tests/ballista_fragment_child_input.rs index 9543a035b7..187575b125 100644 --- a/native/core/tests/ballista_fragment_child_input.rs +++ b/native/core/tests/ballista_fragment_child_input.rs @@ -322,4 +322,4 @@ async fn fragment_codec_roundtrip() -> anyhow::Result<()> { "fragment result must be identical after codec round-trip" ); Ok(()) -} \ No 
newline at end of file +} From 635692fe8cb8f883bddb4a210d97f9043e4b0d2e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 08:06:56 -0600 Subject: [PATCH 29/42] build(ballista): pin Ballista to apache main rev with #1924 merged PR apache/datafusion-ballista#1924 (execute_physical_plan + the PhysicalPlan submission variant the distributed offload needs) has merged, so pin to the apache/datafusion-ballista main rev 6472c7f2 instead of the personal fork branch. --- native/Cargo.lock | 8 ++++---- native/Cargo.toml | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 636c74bfec..136c9b8e8a 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1139,7 +1139,7 @@ dependencies = [ [[package]] name = "ballista" version = "53.0.0" -source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" +source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca" dependencies = [ "async-trait", "ballista-core", @@ -1154,7 +1154,7 @@ dependencies = [ [[package]] name = "ballista-core" version = "53.0.0" -source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" +source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca" dependencies = [ "arrow-flight", "async-trait", @@ -1189,7 +1189,7 @@ dependencies = [ [[package]] name = "ballista-executor" version = "53.0.0" -source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" +source = 
"git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca" dependencies = [ "arrow", "arrow-flight", @@ -1222,7 +1222,7 @@ dependencies = [ [[package]] name = "ballista-scheduler" version = "53.0.0" -source = "git+https://github.com/andygrove/datafusion-ballista?rev=ec0d92799896e608efa43e446bacdc4079e7b6a2#ec0d92799896e608efa43e446bacdc4079e7b6a2" +source = "git+https://github.com/apache/datafusion-ballista?rev=6472c7f21ad1a824b123037b2f18669bd1538bca#6472c7f21ad1a824b123037b2f18669bd1538bca" dependencies = [ "arrow-flight", "async-trait", diff --git a/native/Cargo.toml b/native/Cargo.toml index d3f0c29d0f..f159350197 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -61,13 +61,13 @@ aws-credential-types = "1.2.13" iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3" } iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", rev = "80a30d3", features = ["opendal-memory", "opendal-fs", "opendal-s3", "opendal-gcs", "opendal-oss", "opendal-azdls"] } reqsign-core = "3" -# Ballista pinned to the experimental physical-plan-submission branch -# (apache/datafusion-ballista#1924) which adds execute_physical_plan + the -# PhysicalPlan(bytes) submission variant that the distributed offload needs. 
-ballista = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista" } -ballista-core = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista-core" } -ballista-scheduler = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista-scheduler" } -ballista-executor = { git = "https://github.com/andygrove/datafusion-ballista", rev = "ec0d92799896e608efa43e446bacdc4079e7b6a2", package = "ballista-executor" } +# Ballista pinned to an apache/datafusion-ballista main rev that includes +# execute_physical_plan + the PhysicalPlan(bytes) submission variant the +# distributed offload needs (PR #1924, merged). +ballista = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista" } +ballista-core = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista-core" } +ballista-scheduler = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista-scheduler" } +ballista-executor = { git = "https://github.com/apache/datafusion-ballista", rev = "6472c7f21ad1a824b123037b2f18669bd1538bca", package = "ballista-executor" } [profile.release] debug = true From 7fb924f1ad47028e9004ef8eb86ac181ee25a9d3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 08:12:18 -0600 Subject: [PATCH 30/42] refactor(ballista): remove FFI spike helper and suite Drop the build_test_proto/buildTestProto spike helper (a hand-built NativeScan proto used only to probe symbol availability and by the spike suite) and the CometBallistaFfiSpikeSuite. Replace the availability probe with a dedicated no-op JNI entry (probeAvailable). 
The offload path already runs real Comet-serialized plans; the remaining suites cover it against real Spark SQL. --- native/core/src/execution/ballista/ffi_jni.rs | 114 +++--------------- native/core/src/execution/ballista/mod.rs | 3 +- .../comet/ballista/NativeBallista.scala | 10 +- .../ballista/CometBallistaFfiSpikeSuite.scala | 97 --------------- 4 files changed, 22 insertions(+), 202 deletions(-) delete mode 100644 spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index 174c63156e..4b14945de6 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -22,101 +22,21 @@ //! resulting Arrow batches back to the JVM over the Arrow C Data Interface — //! the same FFI mechanism Comet already uses in `jni_api::prepare_output` //! (`ArrayData` → caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema`). -//! -//! This is a SPIKE. It proves the round trip JVM → native → Ballista → JVM. use std::sync::Arc; use ballista::prelude::{SessionConfigExt, SessionContextExt}; use datafusion::arrow::array::RecordBatch; use datafusion::arrow::compute::concat_batches; -use datafusion::arrow::datatypes::{Field, Schema, SchemaRef}; +use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; use datafusion::execution::SessionStateBuilder; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_comet_proto::spark_operator::{operator::OpStruct, Operator}; +use datafusion_comet_proto::spark_operator::Operator; use prost::Message; -/// Build the fixed spike test proto Rust-side: a single `NativeScan` over a -/// freshly written Parquet file with one int32 column `a` = [1..=5]. 
Returned to -/// the JVM so the JVM test can hand it straight back to [`Java_org_apache_comet_ballista_NativeBallista_executeQuery`] -/// without needing the generated proto Java classes. This is the same proto that -/// the Rust `tests/` build constructs. -pub fn build_test_proto() -> Result, String> { - use datafusion::arrow::array::Int32Array; - use datafusion::arrow::datatypes::DataType as ArrowDataType; - use datafusion::parquet::arrow::ArrowWriter; - use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; - use datafusion_comet_proto::spark_operator::{ - NativeScan, NativeScanCommon, SparkFilePartition, SparkPartitionedFile, SparkStructField, - }; - - let parquet = std::env::temp_dir().join("comet_ffi_ballista_jvm_spike.parquet"); - let arrow_schema = Arc::new(Schema::new(vec![Field::new( - "a", - ArrowDataType::Int32, - true, - )])); - let batch = RecordBatch::try_new( - Arc::clone(&arrow_schema), - vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], - ) - .map_err(|e| format!("failed to build test batch: {e}"))?; - let file = - std::fs::File::create(&parquet).map_err(|e| format!("failed to create parquet: {e}"))?; - let mut writer = ArrowWriter::try_new(file, arrow_schema, None) - .map_err(|e| format!("failed to open parquet writer: {e}"))?; - writer - .write(&batch) - .map_err(|e| format!("failed to write parquet: {e}"))?; - writer - .close() - .map_err(|e| format!("failed to close parquet: {e}"))?; - - let int32 = DataType { - type_id: DataTypeId::Int32 as i32, - type_info: None, - }; - let field_a = SparkStructField { - name: "a".to_string(), - data_type: Some(int32), - nullable: true, - metadata: Default::default(), - }; - let common = NativeScanCommon { - required_schema: vec![field_a.clone()], - data_schema: vec![field_a], - projection_vector: vec![0], - session_timezone: "UTC".to_string(), - source: "comet-ffi-ballista-jvm-spike".to_string(), - ..Default::default() - }; - let file_size = std::fs::metadata(&parquet) - 
.map_err(|e| format!("failed to stat parquet: {e}"))? - .len() as i64; - let partitioned_file = SparkPartitionedFile { - file_path: format!("file://{}", parquet.display()), - start: 0, - length: file_size, - file_size, - partition_values: vec![], - }; - let native_scan = NativeScan { - common: Some(common), - file_partition: Some(SparkFilePartition { - partitioned_file: vec![partitioned_file], - }), - }; - let op = Operator { - children: vec![], - plan_id: 0, - op_struct: Some(OpStruct::NativeScan(native_scan)), - }; - Ok(op.encode_to_vec()) -} - use super::scan::CometScanExec; use super::{CometFragmentExec, CometLogicalCodec, CometPhysicalCodec, CometTableProvider}; @@ -143,8 +63,8 @@ pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec) runtime.block_on(async move { // Build the whole Comet plan once (inside the Tokio runtime, which // `CometScanExec::try_new` requires) so we can read its true output - // schema. This is the fix for the T1 spike's scan-schema shortcut: the - // result schema now comes from the plan, not the NativeScan proto. + // schema — the result schema comes from the built plan, not the + // NativeScan proto's `required_schema`. let built: Arc = Arc::new( CometScanExec::try_new(proto.to_vec()) .map_err(|e| format!("failed to build Comet plan: {e}"))?, @@ -469,8 +389,8 @@ pub unsafe fn submit_and_export( schema_addrs: &[i64], ) -> Result { let (schema, batches) = execute_comet_proto(proto)?; - // The spike offloads a single small scan; concatenate to one batch so the - // JVM imports exactly one set of column structs. + // Concatenate to one batch so the JVM imports exactly one set of column + // structs. 
let batch = concat_batches(&schema, &batches) .map_err(|e| format!("failed to concatenate result batches: {e}"))?; export_batch_to_addresses(&batch, array_addrs, schema_addrs)?; @@ -482,27 +402,25 @@ pub unsafe fn submit_and_export( // --------------------------------------------------------------------------- mod jni_entry { - use super::{build_test_proto, submit_and_export, submit_and_export_distributed}; + use super::{submit_and_export, submit_and_export_distributed}; use crate::errors::{try_unwrap_or_throw, CometError}; use jni::objects::{JByteArray, JClass, JLongArray, JString, ReleaseMode}; - use jni::sys::{jbyteArray, jint, jlong}; + use jni::sys::{jint, jlong}; use jni::EnvUnowned; - /// JVM entry: build the fixed spike test proto Rust-side and return its - /// bytes, so the JVM test does not need the generated proto Java classes. + /// JVM entry: a no-op whose only purpose is symbol resolution. It is compiled + /// only into a `--features ballista` `libcomet`, so the JVM side can detect + /// whether the offload is present by resolving this symbol (see + /// `NativeBallista.isAvailable`); a feature-less library lacks it and yields an + /// `UnsatisfiedLinkError`. /// /// # Safety /// Called from the JVM via JNI. 
#[no_mangle] - pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_buildTestProto( - e: EnvUnowned, + pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_probeAvailable( + _e: EnvUnowned, _class: JClass, - ) -> jbyteArray { - try_unwrap_or_throw(&e, |env| { - let bytes = build_test_proto().map_err(CometError::Internal)?; - let arr = env.byte_array_from_slice(&bytes)?; - Ok(arr.into_raw()) - }) + ) { } /// JVM entry: run a Comet `Operator` proto on in-process standalone Ballista diff --git a/native/core/src/execution/ballista/mod.rs b/native/core/src/execution/ballista/mod.rs index af2f69a8a5..30ca956ff3 100644 --- a/native/core/src/execution/ballista/mod.rs +++ b/native/core/src/execution/ballista/mod.rs @@ -43,8 +43,7 @@ pub mod table_provider; pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC}; pub use ffi_jni::{ - build_test_proto, execute_comet_proto, execute_two_stage, submit_and_export, - submit_and_export_distributed, + execute_comet_proto, execute_two_stage, submit_and_export, submit_and_export_distributed, }; pub use fragment::CometFragmentExec; pub use scan::CometScanExec; diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 916dd716f5..1584c66860 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -41,11 +41,11 @@ class NativeBallista { NativeBallista.ensureLoaded() /** - * Build the fixed spike test proto (a single `NativeScan` over a Parquet file with one int32 - * column `a` = [1..5]) native-side and return its serialized bytes. Lets tests exercise the - * proto boundary without depending on the generated proto Java classes. 
+ * No-op native entry used only to detect whether `libcomet` was built with the `ballista` + * feature: resolving this symbol succeeds only in a `--features ballista` build. See + * [[NativeBallista.isAvailable]]. */ - @native def buildTestProto(): Array[Byte] + @native def probeAvailable(): Unit /** * Run a serialized Comet `Operator` proto on an in-process standalone Ballista engine (no Spark @@ -150,7 +150,7 @@ object NativeBallista { available = try { // Resolve a NativeBallista JNI entry; only a `--features ballista` libcomet has it. - new NativeBallista().buildTestProto() + new NativeBallista().probeAvailable() true } catch { case t: Throwable => diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala deleted file mode 100644 index d68b443e10..0000000000 --- a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaFfiSpikeSuite.scala +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.comet.ballista - -import java.nio.file.{Files, Paths} - -import org.scalatest.funsuite.AnyFunSuite - -import org.apache.arrow.c.{ArrowArray, ArrowSchema, CDataDictionaryProvider, Data} -import org.apache.arrow.memory.RootAllocator -import org.apache.arrow.vector.IntVector - -/** - * SPIKE: proves the driver-side "offload to Ballista" round trip across the JVM boundary. - * - * The JVM asks native code to build a fixed Comet `Operator` proto, hands those proto bytes back - * to native code, which runs them on an in-process standalone Ballista engine (no Spark - * executors) and exports the result batch back to the JVM over the Arrow C Data Interface. The - * JVM imports the result and asserts 5 rows come back. - * - * The native entry points live in the single `libcomet` cdylib, compiled in when Comet's native - * crate is built with the default-off `ballista` Cargo feature (`cd native && cargo build - * --features ballista`, or `make core-ballista`). There is no separate offload library. - */ -class CometBallistaFfiSpikeSuite extends AnyFunSuite { - - test("JVM -> native -> in-process Ballista -> JVM returns 5 rows over Arrow FFI") { - CometBallistaFfiSpikeSuite.assumeLibraryLoaded() - // Arrow's C Data JNI helper (arrow_cdata_jni) extracts itself into java.io.tmpdir; the surefire - // config points that at target/tmp, which may not exist yet when this suite runs alone. - Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir"))) - val native = new NativeBallista - - // 1. Native builds the fixed test proto (single NativeScan over a = [1..5]) and returns bytes. - val proto: Array[Byte] = native.buildTestProto() - assert(proto.nonEmpty, "native buildTestProto returned no bytes") - - val allocator = new RootAllocator(Long.MaxValue) - val provider = new CDataDictionaryProvider() - // One output column (`a`): allocate the C Data structs the JVM owns. 
- val arrowArray = ArrowArray.allocateNew(allocator) - val arrowSchema = ArrowSchema.allocateNew(allocator) - try { - // 2. JVM hands proto + struct addresses back to native, which runs Ballista and exports. - val numRows = native.executeQuery( - proto, - Array(arrowArray.memoryAddress()), - Array(arrowSchema.memoryAddress())) - assert(numRows == 5, s"expected 5 rows from Ballista, got $numRows") - - // 3. JVM imports the exported column over the Arrow C Data Interface. - val vector = Data.importVector(allocator, arrowArray, arrowSchema, provider) - try { - assert( - vector.getValueCount == 5, - s"expected 5 imported values, got ${vector.getValueCount}") - val ints = vector.asInstanceOf[IntVector] - val values = (0 until ints.getValueCount).map(ints.get) - assert(values == Seq(1, 2, 3, 4, 5), s"unexpected values: $values") - } finally { - vector.close() - } - } finally { - arrowArray.close() - arrowSchema.close() - provider.close() - allocator.close() - } - } -} - -object CometBallistaFfiSpikeSuite { - - def assumeLibraryLoaded(): Unit = { - NativeBallista.loadFailure.foreach { t => - org.scalatest.Assertions - .cancel(s"native ballista library not available: ${t.getMessage}", t) - } - } -} From 9022e9bdb53476d91deae9cbd6722f2ce1ecc75c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 08:54:00 -0600 Subject: [PATCH 31/42] feat(ballista): add CometBallistaOffloadPlan proto for DAG offload --- native/proto/src/proto/operator.proto | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 2fcfe7f25b..a428d07bb4 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -58,6 +58,31 @@ message Operator { } } +// A distributed offload plan: a DAG of Comet native fragments connected by hash +// exchanges. 
`fragments` is topologically ordered; the last entry is the root +// (the final stage whose output is returned to the driver). +message CometBallistaOffloadPlan { + repeated OffloadFragment fragments = 1; + // Shuffle width applied to every hash exchange (one consistent partition count). + uint32 num_partitions = 2; +} + +message OffloadFragment { + // The serialized Comet `Operator` plan for this fragment (with file partitions + // already injected for NativeScan leaves). + bytes block_proto = 1; + // Inputs in the same DFS order the fragment's `Scan` (#100) leaves appear + // (left subtree before right). Empty for a leaf fragment (a NativeScan block). + repeated OffloadInput inputs = 2; +} + +message OffloadInput { + // Index into `CometBallistaOffloadPlan.fragments` of the producing fragment. + uint32 producer = 1; + // Hash-partition key ordinals into the PRODUCER fragment's output schema. + repeated uint32 hash_key_ordinals = 2; +} + message SparkPartitionedFile { string file_path = 1; int64 start = 2; From 557f9dd9ced8a9dabd069986bd215cf543909842 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 09:04:56 -0600 Subject: [PATCH 32/42] feat(ballista): add build_offload_plan DAG builder --- native/core/src/execution/ballista/ffi_jni.rs | 93 ++++++++++++- native/core/src/execution/ballista/mod.rs | 3 +- native/core/tests/ballista_offload_dag.rs | 107 +++++++++++++++ native/core/tests/common/mod.rs | 122 ++++++++++++++++++ 4 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 native/core/tests/ballista_offload_dag.rs create mode 100644 native/core/tests/common/mod.rs diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index 4b14945de6..b31a5aa96b 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -34,7 +34,7 @@ use datafusion::execution::SessionStateBuilder; use datafusion::physical_plan::ExecutionPlan; use 
datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_comet_proto::spark_operator::Operator; +use datafusion_comet_proto::spark_operator::{CometBallistaOffloadPlan, Operator}; use prost::Message; use super::scan::CometScanExec; @@ -191,6 +191,97 @@ fn build_two_stage_plan( Ok(block2) } +// --------------------------------------------------------------------------- +// R3: general DAG offload (`CometBallistaOffloadPlan`) +// --------------------------------------------------------------------------- + +/// Count the `Scan` (#100) input leaves in a serialized Comet `Operator` block — +/// the same leaves `build_native_fragment` (`native/core/src/execution/fragment.rs`) +/// expects one child stream per, in DFS order. Used as a build-time guard so a +/// mismatched `OffloadFragment.inputs` count fails fast in `build_offload_plan` +/// rather than lazily inside `CometFragmentExec::execute`. +fn comet_offload_scan_leaf_count(block_proto: &[u8]) -> Result { + use datafusion_comet_proto::spark_operator::{operator::OpStruct, Operator}; + fn count(op: &Operator) -> usize { + if matches!(op.op_struct, Some(OpStruct::Scan(_))) { + return 1; + } + op.children.iter().map(count).sum() + } + let op = Operator::decode(block_proto).map_err(|e| format!("decode block: {e}"))?; + Ok(count(&op)) +} + +/// Fold a serialized `CometBallistaOffloadPlan` into a Ballista physical plan: a DAG +/// of `CometFragmentExec` nodes whose inputs are `RepartitionExec(Hash)` over the +/// producer fragments. Fragments are processed in topological order; the last is the +/// root. Ballista's planner then splits at each hash repartition into a stage. 
+pub fn build_offload_plan(plan_bytes: &[u8]) -> Result, String> { + let plan = CometBallistaOffloadPlan::decode(plan_bytes) + .map_err(|e| format!("failed to decode CometBallistaOffloadPlan: {e}"))?; + if plan.fragments.is_empty() { + return Err("CometBallistaOffloadPlan has no fragments".to_string()); + } + let n = plan.num_partitions.max(1) as usize; + + let mut built: Vec> = Vec::with_capacity(plan.fragments.len()); + for (idx, frag) in plan.fragments.iter().enumerate() { + // Build-time guard: the block's actual `Scan`(#100) leaf count must match + // the descriptor's declared input count, or `CometFragmentExec::execute` + // would fail lazily (or silently under-drive leaves) later. + let leaf_count = comet_offload_scan_leaf_count(&frag.block_proto) + .map_err(|e| format!("fragment {idx}: {e}"))?; + if leaf_count != frag.inputs.len() { + return Err(format!( + "fragment {idx}: block has {leaf_count} Scan input leaves but the descriptor \ + declares {} inputs", + frag.inputs.len() + )); + } + + // Build each input edge as a hash repartition over an already-built producer. 
+ let mut children: Vec> = Vec::with_capacity(frag.inputs.len()); + for input in &frag.inputs { + let producer_idx = input.producer as usize; + if producer_idx >= idx { + return Err(format!( + "fragment {idx} references producer {producer_idx} that is not earlier in \ + topological order" + )); + } + let producer = Arc::clone(&built[producer_idx]); + let producer_schema = producer.schema(); + let hash_exprs: Vec> = input + .hash_key_ordinals + .iter() + .map(|&ord| { + let ord = ord as usize; + if ord >= producer_schema.fields().len() { + return Err(format!( + "fragment {idx} input hash key ordinal {ord} out of range for \ + producer {producer_idx} with {} columns", + producer_schema.fields().len() + )); + } + Ok( + Arc::new(Column::new(producer_schema.field(ord).name(), ord)) + as Arc, + ) + }) + .collect::>()?; + let repart = RepartitionExec::try_new(producer, Partitioning::Hash(hash_exprs, n)) + .map_err(|e| { + format!("fragment {idx}: failed to build hash RepartitionExec: {e}") + })?; + children.push(Arc::new(repart)); + } + let fragment = CometFragmentExec::try_new(frag.block_proto.clone(), children) + .map_err(|e| format!("fragment {idx}: failed to build CometFragmentExec: {e}"))?; + built.push(Arc::new(fragment)); + } + Ok(built.pop().expect("fragments non-empty")) +} + /// Start an in-process standalone Ballista cluster (scheduler + executor) from /// `state`, so the Comet extension codecs registered on the state's config reach /// both sides. 
Mirrors `ballista::extension`'s private `setup_standalone`, but diff --git a/native/core/src/execution/ballista/mod.rs b/native/core/src/execution/ballista/mod.rs index 30ca956ff3..effa91badd 100644 --- a/native/core/src/execution/ballista/mod.rs +++ b/native/core/src/execution/ballista/mod.rs @@ -43,7 +43,8 @@ pub mod table_provider; pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC}; pub use ffi_jni::{ - execute_comet_proto, execute_two_stage, submit_and_export, submit_and_export_distributed, + build_offload_plan, execute_comet_proto, execute_two_stage, submit_and_export, + submit_and_export_distributed, }; pub use fragment::CometFragmentExec; pub use scan::CometScanExec; diff --git a/native/core/tests/ballista_offload_dag.rs b/native/core/tests/ballista_offload_dag.rs new file mode 100644 index 0000000000..d75a0fc4fd --- /dev/null +++ b/native/core/tests/ballista_offload_dag.rs @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// Builder tests for the general DAG offload plan (`build_offload_plan`). 
No +// cluster is started; these only assert the shape of the built plan (and that +// the build-time leaf-count guard fires), mirroring `ballista_fragment_child_input.rs`. + +#![cfg(feature = "ballista")] + +use comet::execution::ballista::build_offload_plan; +use datafusion::physical_plan::displayable; +use datafusion_comet_proto::spark_operator::{ + CometBallistaOffloadPlan, OffloadFragment, OffloadInput, +}; +use prost::Message; + +mod common; +use common::{build_native_scan_proto, build_scan_leaf_block_proto, write_test_parquet}; + +/// A two-fragment DAG: fragment 0 is a `NativeScan` producer (no inputs) reading +/// Parquet column `a`; fragment 1 is a consumer whose block is a `Scan`(#100) leaf +/// fed by a hash `RepartitionExec` over fragment 0's output column `a` (ordinal 0). +/// `build_offload_plan` must fold this into +/// `CometFragmentExec(consumer, [RepartitionExec::Hash([a@0], 4)(CometFragmentExec(producer, []))])`. +#[test] +fn two_stage_aggregate_builds_hash_repartition_dag() { + let parquet = std::env::temp_dir().join("comet_ffi_ballista_offload_dag.parquet"); + write_test_parquet(&parquet).expect("write test parquet"); + let producer = build_native_scan_proto(&parquet).expect("build NativeScan producer block"); + let consumer = build_scan_leaf_block_proto(); + + let plan = CometBallistaOffloadPlan { + num_partitions: 4, + fragments: vec![ + OffloadFragment { + block_proto: producer, + inputs: vec![], + }, + OffloadFragment { + block_proto: consumer, + inputs: vec![OffloadInput { + producer: 0, + hash_key_ordinals: vec![0], + }], + }, + ], + }; + + let built = build_offload_plan(&plan.encode_to_vec()).expect("build_offload_plan"); + let rendered = format!("{}", displayable(built.as_ref()).indent(false)); + assert!(rendered.contains("CometFragmentExec"), "got:\n{rendered}"); + assert!( + rendered.contains("RepartitionExec: partitioning=Hash([a@0], 4)"), + "got:\n{rendered}" + ); +} + +/// A fragment's block must declare exactly as many 
`OffloadInput`s as it has +/// `Scan`(#100) leaves. Here fragment 1's block is a `NativeScan` (0 leaves), but +/// the descriptor declares 1 input — `build_offload_plan` must fail fast at BUILD +/// time (not lazily inside `CometFragmentExec::execute`). +#[test] +fn leaf_count_mismatch_fails_fast() { + let parquet = std::env::temp_dir().join("comet_ffi_ballista_offload_dag_mismatch.parquet"); + write_test_parquet(&parquet).expect("write test parquet"); + let producer = build_native_scan_proto(&parquet).expect("build NativeScan producer block"); + // A second NativeScan block: 0 `Scan` leaves, but we wire it up as a consumer + // with 1 declared input. + let mismatched = build_native_scan_proto(&parquet).expect("build NativeScan block"); + + let plan = CometBallistaOffloadPlan { + num_partitions: 2, + fragments: vec![ + OffloadFragment { + block_proto: producer, + inputs: vec![], + }, + OffloadFragment { + block_proto: mismatched, + inputs: vec![OffloadInput { + producer: 0, + hash_key_ordinals: vec![0], + }], + }, + ], + }; + + let err = + build_offload_plan(&plan.encode_to_vec()).expect_err("must fail fast on leaf mismatch"); + assert!( + err.contains("Scan input leaves"), + "expected leaf-count mismatch error, got: {err}" + ); +} diff --git a/native/core/tests/common/mod.rs b/native/core/tests/common/mod.rs new file mode 100644 index 0000000000..9c4a2c90dd --- /dev/null +++ b/native/core/tests/common/mod.rs @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// Shared helpers for Ballista offload tests: write a tiny Parquet file and build +// Comet `Operator` proto blocks (`NativeScan` leaf, `Scan` leaf) used to assemble +// offload descriptors without standing up a real cluster. +// +// `write_test_parquet` / `build_native_scan_proto` are copied verbatim from +// `ballista_distributed.rs` so multiple test binaries can share them (each +// `tests/*.rs` file is its own crate, so this lives in `tests/common/mod.rs` and +// is pulled in via `mod common;`). + +#![allow(dead_code)] + +use std::sync::Arc; + +use datafusion::arrow::array::{Int32Array, RecordBatch}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; +use datafusion::parquet::arrow::ArrowWriter; +use prost::Message; + +use datafusion_comet_proto::spark_expression::{data_type::DataTypeId, DataType}; +use datafusion_comet_proto::spark_operator::{ + operator::OpStruct, NativeScan, NativeScanCommon, Operator, Scan, SparkFilePartition, + SparkPartitionedFile, SparkStructField, +}; + +/// Write a tiny Parquet file with a single int32 column `a` = [1..=5]. 
+pub fn write_test_parquet(path: &std::path::Path) -> anyhow::Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let file = std::fs::File::create(path)?; + let mut writer = ArrowWriter::try_new(file, schema, None)?; + writer.write(&batch)?; + writer.close()?; + Ok(()) +} + +/// Build a Comet `Operator` proto: a single `NativeScan` over `parquet_path`. +pub fn build_native_scan_proto(parquet_path: &std::path::Path) -> anyhow::Result> { + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let field_a = SparkStructField { + name: "a".to_string(), + data_type: Some(int32), + nullable: true, + metadata: Default::default(), + }; + let common = NativeScanCommon { + required_schema: vec![field_a.clone()], + data_schema: vec![field_a], + projection_vector: vec![0], + session_timezone: "UTC".to_string(), + source: "comet-ffi-ballista-test".to_string(), + ..Default::default() + }; + let file_size = std::fs::metadata(parquet_path)?.len() as i64; + let partitioned_file = SparkPartitionedFile { + file_path: format!("file://{}", parquet_path.display()), + start: 0, + length: file_size, + file_size, + partition_values: vec![], + }; + let native_scan = NativeScan { + common: Some(common), + file_partition: Some(SparkFilePartition { + partitioned_file: vec![partitioned_file], + }), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::NativeScan(native_scan)), + }; + Ok(op.encode_to_vec()) +} + +/// Build a Comet `Operator` proto: a single `Scan` (#100) leaf over a one-column +/// Int32 schema named `a`. 
This is the shape a DAG **consumer** fragment's block +/// must have — a childless `Scan` leaf that `build_offload_plan`'s hash +/// `RepartitionExec` child feeds — as opposed to a `NativeScan` block (which reads +/// Parquet directly and has zero `Scan` leaves, i.e. is only valid as a producer +/// with no inputs). +pub fn build_scan_leaf_block_proto() -> Vec { + let int32 = DataType { + type_id: DataTypeId::Int32 as i32, + type_info: None, + }; + let scan = Scan { + fields: vec![int32], + source: "comet-offload-dag-test".to_string(), + }; + let op = Operator { + children: vec![], + plan_id: 0, + op_struct: Some(OpStruct::Scan(scan)), + }; + op.encode_to_vec() +} From e48be25c02c6d0057b473063472d6fc1ce8d7f30 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 09:15:36 -0600 Subject: [PATCH 33/42] feat(ballista): add execute_offload_plan + executeOffloadPlan JNI entry --- native/core/src/execution/ballista/ffi_jni.rs | 133 +++++++++++++++++- native/core/src/execution/ballista/mod.rs | 4 +- native/core/tests/ballista_offload_dag.rs | 47 ++++++- 3 files changed, 180 insertions(+), 4 deletions(-) diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index b31a5aa96b..94b0486373 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -282,6 +282,102 @@ pub fn build_offload_plan(plan_bytes: &[u8]) -> Result, S Ok(built.pop().expect("fragments non-empty")) } +/// Build and submit a general `CometBallistaOffloadPlan` DAG to a Ballista +/// cluster, returning the collected Arrow result batches plus the result schema. +/// +/// Mirrors [`execute_two_stage`], but the plan is an arbitrary DAG of +/// `CometFragmentExec` nodes (folded by [`build_offload_plan`]) rather than a +/// fixed two-stage GROUP BY shape. 
The shuffle width `n` is read directly from +/// the descriptor's `num_partitions` field — the authoritative parallelism for +/// every hash repartition `build_offload_plan` builds — so `build_offload_plan` +/// itself keeps its single-return signature. +/// +/// As with `execute_two_stage`, an empty `scheduler_url` starts an in-process +/// standalone cluster; a non-empty one submits to that external scheduler +/// instead. +pub fn execute_offload_plan( + plan_bytes: &[u8], + scheduler_url: &str, +) -> Result<(SchemaRef, Vec), String> { + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .map_err(|e| format!("failed to build tokio runtime: {e}"))?; + + runtime.block_on(async move { + // The descriptor carries the authoritative shuffle width; decode it for `n`. + let n = CometBallistaOffloadPlan::decode(plan_bytes) + .map_err(|e| format!("failed to decode CometBallistaOffloadPlan: {e}"))? + .num_partitions + .max(1) as usize; + + // Build the plan inside the runtime: the fragments' NativeScan leaves + // build via Comet's planner, which requires an active Tokio runtime. + let plan = build_offload_plan(plan_bytes)?; + let config = SessionConfig::new_with_ballista() + .with_target_partitions(n) + .with_ballista_standalone_parallelism(n) + .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default())) + .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default())); + let state = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .build(); + let schema = plan.schema(); + + // Empty URL => in-process standalone; non-empty => external cluster. + let scheduler_url = if scheduler_url.is_empty() { + log::debug!("[comet-ballista R3] submitting to in-process standalone cluster"); + start_standalone_from_state(&state).await? 
+ } else { + log::debug!("[comet-ballista R3] submitting to external cluster at {scheduler_url}"); + scheduler_url.to_string() + }; + + let session_config = state.config().clone(); + let codec = CometPhysicalCodec::default(); + let session_id = state.session_id().to_string(); + + let stream = execute_physical_plan::( + scheduler_url, + &BallistaConfig::default(), + plan, + &codec, + session_id, + session_config, + ) + .await + .map_err(|e| format!("failed to submit offload plan: {e}"))?; + + let batches = stream + .try_collect::>() + .await + .map_err(|e| format!("failed to collect distributed results: {e}"))?; + + Ok((schema, batches)) + }) +} + +/// Run the general DAG offload plan and export the (single, concatenated) +/// result batch into the JVM-allocated FFI structs. Returns the row count. +/// +/// # Safety +/// See [`export_batch_to_addresses`]. +pub unsafe fn submit_and_export_offload( + plan_bytes: &[u8], + scheduler_url: &str, + array_addrs: &[i64], + schema_addrs: &[i64], +) -> Result { + let (schema, batches) = execute_offload_plan(plan_bytes, scheduler_url)?; + // The final fragment's partitions are concatenated into one batch so the + // JVM imports exactly one set of column structs (same contract as R1/R2). + let batch = concat_batches(&schema, &batches) + .map_err(|e| format!("failed to concatenate result batches: {e}"))?; + export_batch_to_addresses(&batch, array_addrs, schema_addrs)?; + Ok(batch.num_rows() as i64) +} + /// Start an in-process standalone Ballista cluster (scheduler + executor) from /// `state`, so the Comet extension codecs registered on the state's config reach /// both sides. 
Mirrors `ballista::extension`'s private `setup_standalone`, but @@ -493,7 +589,7 @@ pub unsafe fn submit_and_export( // --------------------------------------------------------------------------- mod jni_entry { - use super::{submit_and_export, submit_and_export_distributed}; + use super::{submit_and_export, submit_and_export_distributed, submit_and_export_offload}; use crate::errors::{try_unwrap_or_throw, CometError}; use jni::objects::{JByteArray, JClass, JLongArray, JString, ReleaseMode}; use jni::sys::{jint, jlong}; @@ -595,4 +691,39 @@ mod jni_entry { Ok(num_rows as jlong) }) } + + /// JVM entry: run a general DAG offload (R3), a `CometBallistaOffloadPlan` + /// describing an arbitrary DAG of `CometFragmentExec` nodes joined by hash + /// shuffles (folded by `build_offload_plan`). Submits it to a Ballista + /// cluster — in-process standalone if `schedulerUrl` is empty, or the named + /// external scheduler otherwise — and exports the concatenated result batch + /// into the JVM-allocated Arrow C Data structs, returning the number of rows. + /// + /// # Safety + /// Called from the JVM via JNI; the address arrays must reference valid + /// caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema` structs (one per + /// output column of the plan's final fragment). + #[no_mangle] + pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_executeOffloadPlan( + e: EnvUnowned, + _class: JClass, + plan: JByteArray, + array_addrs: JLongArray, + schema_addrs: JLongArray, + scheduler_url: JString, + ) -> jlong { + try_unwrap_or_throw(&e, |env| { + let plan_bytes = env.convert_byte_array(plan)?; + let scheduler_url: String = scheduler_url.try_to_string(env)?; + + let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; + let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? 
}; + + let num_rows = unsafe { + submit_and_export_offload(&plan_bytes, &scheduler_url, &arrays, &schemas) + } + .map_err(CometError::Internal)?; + Ok(num_rows as jlong) + }) + } } diff --git a/native/core/src/execution/ballista/mod.rs b/native/core/src/execution/ballista/mod.rs index effa91badd..1da09ecb8c 100644 --- a/native/core/src/execution/ballista/mod.rs +++ b/native/core/src/execution/ballista/mod.rs @@ -43,8 +43,8 @@ pub mod table_provider; pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC}; pub use ffi_jni::{ - build_offload_plan, execute_comet_proto, execute_two_stage, submit_and_export, - submit_and_export_distributed, + build_offload_plan, execute_comet_proto, execute_offload_plan, execute_two_stage, + submit_and_export, submit_and_export_distributed, submit_and_export_offload, }; pub use fragment::CometFragmentExec; pub use scan::CometScanExec; diff --git a/native/core/tests/ballista_offload_dag.rs b/native/core/tests/ballista_offload_dag.rs index d75a0fc4fd..1630b7cbf7 100644 --- a/native/core/tests/ballista_offload_dag.rs +++ b/native/core/tests/ballista_offload_dag.rs @@ -20,7 +20,8 @@ #![cfg(feature = "ballista")] -use comet::execution::ballista::build_offload_plan; +use comet::execution::ballista::{build_offload_plan, execute_offload_plan}; +use datafusion::arrow::util::pretty::pretty_format_batches; use datafusion::physical_plan::displayable; use datafusion_comet_proto::spark_operator::{ CometBallistaOffloadPlan, OffloadFragment, OffloadInput, @@ -105,3 +106,47 @@ fn leaf_count_mismatch_fails_fast() { "expected leaf-count mismatch error, got: {err}" ); } + +/// Real submission smoke test for `execute_offload_plan`: a single-fragment +/// descriptor (one `NativeScan` block, no inputs, no shuffle edges) run on an +/// in-process standalone Ballista cluster. 
This proves the +/// descriptor -> `build_offload_plan` -> `execute_physical_plan` submission path +/// works end to end, without hand-building a partial+final aggregate pair (see +/// the comment below, which covers the multi-fragment hash-shuffle case and is +/// deferred to the Scala E2E in Task 8). +/// +/// A plain `#[test]` (not `#[tokio::test]`): `execute_offload_plan` builds and +/// drives its own Tokio runtime internally (it is called synchronously from +/// JNI, with no ambient runtime), so calling it from a thread that is already +/// driving one (e.g. inside `#[tokio::test]`) panics with "Cannot start a +/// runtime from within a runtime". +#[ignore = "starts an in-process Ballista cluster; run explicitly"] +#[test] +fn single_fragment_offload_plan_executes() { + let parquet = std::env::temp_dir().join("comet_ffi_ballista_offload_dag_smoke.parquet"); + write_test_parquet(&parquet).expect("write test parquet"); + let producer = build_native_scan_proto(&parquet).expect("build NativeScan producer block"); + + let plan = CometBallistaOffloadPlan { + num_partitions: 2, + fragments: vec![OffloadFragment { + block_proto: producer, + inputs: vec![], + }], + }; + + let (_schema, batches) = + execute_offload_plan(&plan.encode_to_vec(), "").expect("execute_offload_plan"); + println!("{}", pretty_format_batches(&batches).unwrap()); + let rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(rows, 5, "expected all 5 scanned rows (a = 1..=5)"); +} + +// Deferred to the Scala E2E in Task 8 (per the task-3 brief): hand-building a +// partial+final aggregate `CometBallistaOffloadPlan` (two `NativeScan`/`Scan` +// blocks with the right agg-state schema on each side of a hash shuffle) is +// intricate proto plumbing that the Scala path exercises for free via the real +// planner. 
`single_fragment_offload_plan_executes` above already proves the +// `execute_offload_plan` submission path (session setup, in-process standalone +// cluster, `execute_physical_plan` codecs) works end to end; multi-fragment DAG +// *shape* is covered by `two_stage_aggregate_builds_hash_repartition_dag` above. From ed82695497725024a5ca0ab708c97dac03ee7718 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 09:19:30 -0600 Subject: [PATCH 34/42] feat(ballista): add executeOffloadPlan JNI binding --- .../apache/comet/ballista/NativeBallista.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 1584c66860..59c322d0f7 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -103,6 +103,23 @@ class NativeBallista { schedulerUrl: String, arrayAddrs: Array[Long], schemaAddrs: Array[Long]): Long + + /** + * Run a serialized [[org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan]] + * (a DAG of Comet native fragments + hash exchanges) on Ballista and export the single + * (concatenated) result batch into the caller-allocated Arrow C Data structs. 
+ * + * @param plan serialized CometBallistaOffloadPlan + * @param arrayAddrs one ArrowArray struct address per output column + * @param schemaAddrs one ArrowSchema struct address per output column + * @param schedulerUrl external Ballista scheduler URL; "" = in-process standalone + * @return number of rows exported + */ + @native def executeOffloadPlan( + plan: Array[Byte], + arrayAddrs: Array[Long], + schemaAddrs: Array[Long], + schedulerUrl: String): Long } object NativeBallista { From a909d379d5b8dd783953b2f522de2f2b1a8cccff Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 09:40:15 -0600 Subject: [PATCH 35/42] feat(ballista): general DAG walker for single + N-block linear offload --- .../ballista/BallistaOffloadPlanner.scala | 170 ++++++++++++++++++ .../comet/ballista/NativeBallista.scala | 19 +- .../apache/spark/sql/comet/operators.scala | 68 ++++--- .../BallistaOffloadPlannerSuite.scala | 69 +++++++ 4 files changed, 292 insertions(+), 34 deletions(-) create mode 100644 spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala create mode 100644 spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala diff --git a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala new file mode 100644 index 0000000000..09c43c8810 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.ballista + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning +import org.apache.spark.sql.comet.{CometExec, CometNativeExec} +import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec +import org.apache.spark.sql.execution.SparkPlan + +import com.google.protobuf.ByteString + +import org.apache.comet.CometConf +import org.apache.comet.serde.OperatorOuterClass.{OffloadFragment, OffloadInput} +import org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan + +/** + * Driver-side DAG walker that decomposes a Comet physical plan into a fragment/hash-exchange DAG + * and serializes it as a `CometBallistaOffloadPlan` protobuf for submission to Ballista via + * `NativeBallista.executeOffloadPlan`. + * + * Currently supports: + * - a single native block (no Comet exchange): one fragment, no inputs. + * - an N-block LINEAR chain of native blocks connected by [[CometShuffleExchangeExec]] hash + * exchanges (the R2 two-stage GROUP BY shape generalized to N stages). + * + * General join / multi-input DAG shapes (a native block fed by more than one upstream fragment) + * are a future increment; the walker rejects anything it doesn't recognize with an + * [[UnsupportedOperationException]] rather than guessing. + */ +object BallistaOffloadPlanner { + + /** A native block plus the exchanges that directly feed it (its DAG inputs). 
*/ + private case class BlockNode(block: CometNativeExec, inputs: Seq[CometShuffleExchangeExec]) + + /** + * Decompose `root` into a topologically-ordered DAG of native blocks + hash exchanges and + * serialize it as a CometBallistaOffloadPlan. Producers precede consumers; the last fragment is + * the root. Throws [[UnsupportedOperationException]] for shapes not yet supported. + */ + def buildOffloadPlan(root: SparkPlan, numPartitions: Int): Array[Byte] = { + // Assign a fragment index to every native block, discovered in producer-first order. + val ordered = mutable.ArrayBuffer.empty[BlockNode] + val indexOf = mutable.LinkedHashMap.empty[CometNativeExec, Int] + + // `p` itself is usually the native block, but the very top of the collect root may be a + // thin wrapper with no serialized plan of its own (e.g. `CometNativeColumnarToRowExec`, the + // columnar-to-row conversion node that carries the `executeCollect` override which calls in + // here). `collectFirst` is a pre-order search, so it finds the nearest enclosing boundary + // (the outermost `CometNativeExec` with a serialized plan) without ever having to look past + // it into that block's own internals. + def blockOf(p: SparkPlan): CometNativeExec = + p.collectFirst { case n: CometNativeExec if n.serializedPlanOpt.isDefined => n } + .getOrElse( + throw new UnsupportedOperationException( + "Comet Ballista offload: expected a serialized native block reachable from " + + s"${p.nodeName}:\n$root")) + + // The direct native-block inputs of `block` are the Comet exchanges in its subtree that are + // not nested under a deeper native block. 
+ def directExchanges(block: CometNativeExec): Seq[CometShuffleExchangeExec] = { + val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] + def walk(p: SparkPlan): Unit = p match { + case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange + case other => other.children.foreach(walk) + } + block.children.foreach(walk) + found.toSeq + } + + def register(block: CometNativeExec): Int = indexOf.getOrElseUpdate( + block, { + val inputs = directExchanges(block) + // Linear-chain guard (Task 5 scope): a native block may have at most one upstream + // fragment. A block fed by more than one hash exchange is a join/multi-input DAG shape, + // out of scope until Task 6. + if (inputs.size > 1) { + throw new UnsupportedOperationException( + "Comet Ballista offload: block is fed by more than one Comet exchange " + + s"(${inputs.size}); multi-input DAG shapes (e.g. joins) are not yet supported:\n" + + s"$root") + } + // Recurse producers first so their indices are smaller (topological order). + inputs.foreach(ex => register(blockOf(ex.child))) + val idx = ordered.size + ordered += BlockNode(block, inputs) + indexOf.put(block, idx) + idx + }) + + register(blockOf(root)) + + // A multi-fragment plan feeds downstream fragments from the Ballista hash shuffle, which + // requires each consuming fragment's shuffle-input leaf to serialize as a plain `Scan` + // (#100), not a native `ShuffleScan` (#116) that expects to read Comet shuffle blocks + // directly. That requires direct read disabled (mirrors the old two-block R2 check). 
+ if (ordered.size > 1 && CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.get()) { + throw new UnsupportedOperationException( + "Comet Ballista multi-fragment offload requires " + + s"${CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key}=false so each downstream " + + "fragment reads a plain Scan leaf (fed by the Ballista shuffle) rather than a " + + s"native ShuffleScan:\n$root") + } + + val planBuilder = CometBallistaOffloadPlan.newBuilder().setNumPartitions(numPartitions) + ordered.foreach { node => + val fragBuilder = OffloadFragment.newBuilder() + // Inject file partitions into NativeScan leaves (reuse the existing helper). + fragBuilder.setBlockProto( + ByteString.copyFrom(CometExec.injectScanFilesFor(root, node.block))) + node.inputs.foreach { ex => + val producer = blockOf(ex.child) + val producerIdx = indexOf(producer) + val keyOrdinals = hashKeyOrdinals(ex, producer.output, root) + val inputBuilder = OffloadInput.newBuilder().setProducer(producerIdx) + keyOrdinals.foreach(o => inputBuilder.addHashKeyOrdinals(o)) + fragBuilder.addInputs(inputBuilder) + } + planBuilder.addFragments(fragBuilder) + } + planBuilder.build().toByteArray + } + + /** Map an exchange's HashPartitioning key expressions to ordinals in the producer's output. 
*/ + private def hashKeyOrdinals( + ex: CometShuffleExchangeExec, + producerOutput: Seq[Attribute], + root: SparkPlan): Seq[Int] = ex.outputPartitioning match { + case HashPartitioning(expressions, _) => + expressions.map { e => + val attr = e match { + case a: AttributeReference => a + case other => + throw new UnsupportedOperationException( + s"Comet Ballista offload: hash key is not a simple column ($other); not " + + s"supported:\n$root") + } + val ord = producerOutput.indexWhere(_.exprId == attr.exprId) + if (ord < 0) { + throw new UnsupportedOperationException( + s"Comet Ballista offload: hash key $attr not found in producer output " + + s"${producerOutput.map(_.name)}:\n$root") + } + ord + } + case other => + throw new UnsupportedOperationException( + s"Comet Ballista offload: only HashPartitioning exchanges are supported; found $other " + + s"(range/single-partition sort is a future increment):\n$root") + } +} diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 59c322d0f7..27e20039ed 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -105,15 +105,20 @@ class NativeBallista { schemaAddrs: Array[Long]): Long /** - * Run a serialized [[org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan]] - * (a DAG of Comet native fragments + hash exchanges) on Ballista and export the single + * Run a serialized [[org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan]] (a + * DAG of Comet native fragments + hash exchanges) on Ballista and export the single * (concatenated) result batch into the caller-allocated Arrow C Data structs. 
* - * @param plan serialized CometBallistaOffloadPlan - * @param arrayAddrs one ArrowArray struct address per output column - * @param schemaAddrs one ArrowSchema struct address per output column - * @param schedulerUrl external Ballista scheduler URL; "" = in-process standalone - * @return number of rows exported + * @param plan + * serialized CometBallistaOffloadPlan + * @param arrayAddrs + * one ArrowArray struct address per output column + * @param schemaAddrs + * one ArrowSchema struct address per output column + * @param schedulerUrl + * external Ballista scheduler URL; "" = in-process standalone + * @return + * number of rows exported */ @native def executeOffloadPlan( plan: Array[Byte], diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index b4b89414ee..bdfc28bf95 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -53,7 +53,7 @@ import com.google.protobuf.CodedOutputStream import org.apache.comet.{CometConf, CometExecIterator, CometRuntimeException, ConfigEntry} import org.apache.comet.CometSparkSessionExtensions.{isCometShuffleEnabled, withFallbackReason} -import org.apache.comet.ballista.NativeBallista +import org.apache.comet.ballista.{BallistaOffloadPlanner, NativeBallista} import org.apache.comet.parquet.CometParquetUtils import org.apache.comet.rules.CometExecRule import org.apache.comet.serde.{CometOperatorSerde, Compatible, Incompatible, OperatorOuterClass, SupportLevel, Unsupported} @@ -406,35 +406,38 @@ object CometExec { * EXPERIMENTAL: offload a Comet query to an in-process Apache DataFusion Ballista engine on the * Spark driver and return the collected rows, launching NO Spark executor tasks. * - * Enabled by `spark.comet.exec.ballista.enabled`. Two plan shapes are supported: + * Enabled by `spark.comet.exec.ballista.enabled`. 
The query is decomposed into a DAG of native + * fragments connected by hash exchanges by [[BallistaOffloadPlanner]] (currently: a single + * native block, or an N-block linear chain of hash exchanges -- general join/multi-input DAG + * shapes are a future increment), serialized as a `CometBallistaOffloadPlan`, and submitted to + * Ballista via the general native `executeOffloadPlan` entry point. * - * - **R1 single-stage:** exactly one native block ([[CometNativeExec]] with a serialized - * plan) and no Comet exchange. The whole-query native plan is submitted as one native leaf. - * - **R2 two-stage GROUP BY:** exactly two native blocks with one - * [[CometShuffleExchangeExec]] between them (partial aggregate below the exchange, final - * aggregate above). The two blocks are submitted separately and Ballista distributes them - * across a hash shuffle. - * - * Anything else throws [[UnsupportedOperationException]]. + * Anything not yet supported by the walker throws [[UnsupportedOperationException]]. */ def executeCollectViaBallista(root: SparkPlan): Array[InternalRow] = { - // Every boundary node (top of a native block) carries a serialized plan. 
- val boundaries = root.collect { - case n: CometNativeExec if n.serializedPlanOpt.isDefined => n - } - val exchanges = root.collect { case e: CometShuffleExchangeExec => e } - - (boundaries, exchanges) match { - case (Seq(single), Nil) => - executeSingleBlockViaBallista(root, single) - case (Seq(_, _), Seq(exchange)) => - executeTwoBlockViaBallista(root, boundaries, exchange) - case _ => - throw new UnsupportedOperationException( - "Comet Ballista offload supports either a single-stage plan (one native block, no " + - "Comet exchange) or a two-stage GROUP BY (two native blocks + one hash exchange); " + - s"found ${boundaries.size} serialized native blocks and ${exchanges.size} Comet " + - s"exchanges in:\n$root") + val numPartitions = root.conf.numShufflePartitions + val planBytes = BallistaOffloadPlanner.buildOffloadPlan(root, numPartitions) + val schedulerUrl = CometConf.COMET_EXEC_BALLISTA_SCHEDULER_URL.get() + val numCols = root.output.length + val nativeUtil = new NativeUtil() + try { + val nativeBallista = new NativeBallista + nativeUtil.getNextBatch( + numCols, + (arrayAddrs, schemaAddrs) => + nativeBallista + .executeOffloadPlan(planBytes, arrayAddrs, schemaAddrs, schedulerUrl)) match { + case Some(batch) => + try { + batch.rowIterator().asScala.map(_.copy()).toArray + } finally { + batch.close() + } + case None => + Array.empty[InternalRow] + } + } finally { + nativeUtil.close() } } @@ -609,6 +612,17 @@ object CometExec { } } + /** + * Thin public wrapper exposing [[injectScanFiles]] to `org.apache.comet.ballista + * .BallistaOffloadPlanner`, which needs to inject file partitions into every native block's + * `NativeScan` leaves (not just a single block/boundary) when walking the offload DAG. Public + * (not `private[comet]`) because the planner lives in a different package tree + * (`org.apache.comet.ballista`, not `org.apache.spark.sql.comet`) that a `comet`-qualified + * private would not reach. 
+ */ + def injectScanFilesFor(root: SparkPlan, boundary: CometNativeExec): Array[Byte] = + injectScanFiles(root, boundary) + /** * Merge the per-partition file lists of a native scan into a single `NativeScan` carrying every * partition's files, serialized as the `partitionBytes` expected by diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala new file mode 100644 index 0000000000..1d35bed239 --- /dev/null +++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.ballista + +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.internal.SQLConf + +import org.apache.comet.CometConf +import org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan + +/** + * Unit tests for [[BallistaOffloadPlanner]]: drives real Comet plans via SQL and asserts the + * `CometBallistaOffloadPlan` descriptor the walker emits, without requiring the native `ballista` + * feature to be built (no execution, only plan decomposition + serialization). 
+ */ +class BallistaOffloadPlannerSuite extends CometTestBase { + + test("two-stage GROUP BY builds a 2-fragment linear descriptor with a hash edge on the key") { + withParquetTable((0 until 100).map(i => (i % 5, i)), "t") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + val plan = sql("SELECT _1, count(*) FROM t GROUP BY _1").queryExecution.executedPlan + val bytes = BallistaOffloadPlanner.buildOffloadPlan(plan, numPartitions = 4) + val desc = CometBallistaOffloadPlan.parseFrom(bytes) + assert(desc.getFragmentsCount == 2) + assert(desc.getNumPartitions == 4) + // fragment 1 (root) has one input from fragment 0, hashed on ordinal 0 (the group key) + val rootInputs = desc.getFragments(1).getInputsList + assert(rootInputs.size == 1) + assert(rootInputs.get(0).getProducer == 0) + assert(rootInputs.get(0).getHashKeyOrdinalsList.contains(0)) + } + } + } + + test("single native block builds a 1-fragment descriptor with no inputs") { + withParquetTable((0 until 10).map(i => (i, i)), "t") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + val plan = sql("SELECT _1 + 1 FROM t").queryExecution.executedPlan + val desc = CometBallistaOffloadPlan.parseFrom( + BallistaOffloadPlanner.buildOffloadPlan(plan, numPartitions = 4)) + assert(desc.getFragmentsCount == 1) + assert(desc.getFragments(0).getInputsCount == 0) + } + } + } +} From 2753a3b791de3cb6832d82a0ef45eca89dfc8ed4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 09:43:58 -0600 Subject: [PATCH 36/42] fix(ballista): remove dead single/two-block Ballista offload paths executeSingleBlockViaBallista and executeTwoBlockViaBallista in CometExec are fully superseded by BallistaOffloadPlanner.buildOffloadPlan + NativeBallista.executeOffloadPlan but were 
left unused, which fails the build under -Xfatal-warnings (unused-private-method warnings). --- .../apache/spark/sql/comet/operators.scala | 132 ------------------ 1 file changed, 132 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index bdfc28bf95..6ecdedfff7 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -441,138 +441,6 @@ object CometExec { } } - /** - * R1: submit a single native block as one self-contained native leaf. Ballista concatenates the - * whole result into a single exported batch, so one import suffices. - */ - private def executeSingleBlockViaBallista( - root: SparkPlan, - boundary: CometNativeExec): Array[InternalRow] = { - val injectedPlanBytes = injectScanFiles(root, boundary) - val numCols = boundary.output.length - val nativeUtil = new NativeUtil() - try { - val nativeBallista = new NativeBallista - nativeUtil.getNextBatch( - numCols, - (arrayAddrs, schemaAddrs) => - nativeBallista.executeQuery(injectedPlanBytes, arrayAddrs, schemaAddrs)) match { - case Some(batch) => - try { - batch.rowIterator().asScala.map(_.copy()).toArray - } finally { - batch.close() - } - case None => - Array.empty[InternalRow] - } - } finally { - nativeUtil.close() - } - } - - /** - * R2: submit a two-stage GROUP BY. `block1` (below the exchange) is the partial aggregate over - * a `NativeScan`; `block2` (above the exchange) is the final aggregate whose input leaf is a - * plain `Scan` fed by the Ballista shuffle. The exchange's [[HashPartitioning]] gives the - * number of grouping columns and shuffle partitions. The native side assembles - * `CometFragmentExec(block2, [Hash-Repartition(CometFragmentExec(block1))])`, which Ballista - * splits at the hash repartition into the two shuffle stages. 
- */ - private def executeTwoBlockViaBallista( - root: SparkPlan, - boundaries: Seq[CometNativeExec], - exchange: CometShuffleExchangeExec): Array[InternalRow] = { - // The final-aggregate block's input leaf must serialize as a plain `Scan` (#100), which the - // native fragment feeds from the Ballista shuffle reader — NOT a native `ShuffleScan` (#116), - // which expects to read Comet shuffle blocks directly. That requires direct read disabled. - if (CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.get()) { - throw new UnsupportedOperationException( - "Comet Ballista two-stage (R2) offload requires " + - s"${CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key}=false so the final-aggregate " + - "block reads a plain Scan leaf (fed by the Ballista shuffle) rather than a native " + - s"ShuffleScan:\n$root") - } - - val (numGroupKeys, numPartitions) = exchange.outputPartitioning match { - case HashPartitioning(expressions, n) => (expressions.length, n) - case other => - throw new UnsupportedOperationException( - "Comet Ballista two-stage (R2) offload requires a HashPartitioning exchange; found " + - s"$other in:\n$root") - } - - // block1 = the serialized native boundary within the exchange's subtree (partial aggregate); - // block2 = the other boundary (final aggregate, an ancestor of the exchange). 
- val block1 = exchange - .collectFirst { case n: CometNativeExec if n.serializedPlanOpt.isDefined => n } - .getOrElse( - throw new UnsupportedOperationException( - s"Comet Ballista two-stage (R2) offload: no serialized native block below the " + - s"exchange:\n$root")) - val block2 = boundaries - .find(_ ne block1) - .getOrElse(throw new UnsupportedOperationException( - s"Comet Ballista two-stage (R2) offload: could not identify the final-aggregate block " + - s"above the exchange:\n$root")) - - // The native side hashes block1's leading `numGroupKeys` output columns to repartition across - // the shuffle, which is correct ONLY when block1 is a partial `HashAggregate` (output layout - // `[groupKeys..., aggStates...]`) and block2 the matching final `HashAggregate`. A different - // single-hash-exchange shape (e.g. a `... OVER (PARTITION BY k)` window) would hash the wrong - // columns and silently return wrong results, so reject anything else rather than offload it. - (block1, block2) match { - case (b1: CometHashAggregateExec, b2: CometHashAggregateExec) - if b1.modes.contains(Partial) && b2.modes.contains(Final) && - b1.groupingExpressions.length == numGroupKeys => - // ok: partial -> hash-shuffle -> final aggregate, grouping-key width matches the exchange. - case _ => - throw new UnsupportedOperationException( - "Comet Ballista two-stage (R2) offload requires a partial HashAggregate below the hash " + - s"exchange and a final HashAggregate above it (grouping keys: $numGroupKeys); found " + - s"block1=${block1.nodeName}, block2=${block2.nodeName}. Other single-hash-exchange " + - s"shapes (e.g. 
a window PARTITION BY) are not supported:\n$root") - } - - val block1Bytes = injectScanFiles(root, block1) - val block2Bytes = block2.serializedPlanOpt.plan.getOrElse( - throw new UnsupportedOperationException( - s"Comet Ballista two-stage (R2) offload: the final-aggregate block carries no " + - s"serialized plan:\n$root")) - - // Empty => in-process standalone Ballista; non-empty => submit to that external scheduler. - val schedulerUrl = CometConf.COMET_EXEC_BALLISTA_SCHEDULER_URL.get() - - val numCols = block2.output.length - val nativeUtil = new NativeUtil() - try { - val nativeBallista = new NativeBallista - // The native side concatenates all shuffle-partition outputs into a single exported batch. - nativeUtil.getNextBatch( - numCols, - (arrayAddrs, schemaAddrs) => - nativeBallista.executeQueryDistributed( - block1Bytes, - block2Bytes, - numGroupKeys, - numPartitions, - schedulerUrl, - arrayAddrs, - schemaAddrs)) match { - case Some(batch) => - try { - batch.rowIterator().asScala.map(_.copy()).toArray - } finally { - batch.close() - } - case None => - Array.empty[InternalRow] - } - } finally { - nativeUtil.close() - } - } - /** * Inject file partitions into a native block's serialized plan. 
The serialized template carries * each `NativeScan`'s `common` metadata but NOT its file list (Comet normally injects files From 3718fd1675df9696082a09fe38ec1f834b3f4321 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 09:56:44 -0600 Subject: [PATCH 37/42] feat(ballista): shuffle-hash joins in the DAG walker --- .../ballista/BallistaOffloadPlanner.scala | 77 +++++++++++++++---- .../BallistaOffloadPlannerSuite.scala | 38 +++++++++ 2 files changed, 98 insertions(+), 17 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala index 09c43c8810..98d4dddda8 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.comet.{CometExec, CometNativeExec} +import org.apache.spark.sql.comet.{CometExec, CometHashJoinExec, CometNativeExec} import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec import org.apache.spark.sql.execution.SparkPlan @@ -42,10 +42,13 @@ import org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan * - a single native block (no Comet exchange): one fragment, no inputs. * - an N-block LINEAR chain of native blocks connected by [[CometShuffleExchangeExec]] hash * exchanges (the R2 two-stage GROUP BY shape generalized to N stages). + * - a co-partitioned shuffle-hash join: a [[CometHashJoinExec]] block fed by exactly two Comet + * hash exchanges, one under its `left` input and one under its `right` input, emitted in + * left-then-right order to match the join proto's DFS leaf order. 
* - * General join / multi-input DAG shapes (a native block fed by more than one upstream fragment) - * are a future increment; the walker rejects anything it doesn't recognize with an - * [[UnsupportedOperationException]] rather than guessing. + * Other multi-input DAG shapes (e.g. a non-join native block fed by more than one upstream + * fragment, or broadcast joins) are a future increment; the walker rejects anything it doesn't + * recognize with an [[UnsupportedOperationException]] rather than guessing. */ object BallistaOffloadPlanner { @@ -77,27 +80,54 @@ object BallistaOffloadPlanner { // The direct native-block inputs of `block` are the Comet exchanges in its subtree that are // not nested under a deeper native block. - def directExchanges(block: CometNativeExec): Seq[CometShuffleExchangeExec] = { - val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] + def directExchanges(block: CometNativeExec): Seq[CometShuffleExchangeExec] = block match { + case j: CometHashJoinExec => + // Left input then right input — matches the join proto's DFS leaf order + // (`parse_join_parameters` appends left then right). + Seq(exchangeUnder(j.left, block), exchangeUnder(j.right, block)) + case _ => + val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] + def walk(p: SparkPlan): Unit = p match { + case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange + case other => other.children.foreach(walk) + } + block.children.foreach(walk) + found.toSeq + } + + /** The single Comet exchange directly under `side` (a join input); reject otherwise. 
*/ + def exchangeUnder(side: SparkPlan, block: CometNativeExec): CometShuffleExchangeExec = { + val exchanges = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] def walk(p: SparkPlan): Unit = p match { - case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange + case e: CometShuffleExchangeExec => exchanges += e case other => other.children.foreach(walk) } - block.children.foreach(walk) - found.toSeq + walk(side) + exchanges.toSeq match { + case Seq(one) => one + case other => + throw new UnsupportedOperationException( + s"Comet Ballista offload: a shuffle-hash join input must have exactly one Comet " + + s"hash exchange (found ${other.size}); broadcast joins are a future increment:\n" + + s"$block") + } } def register(block: CometNativeExec): Int = indexOf.getOrElseUpdate( block, { val inputs = directExchanges(block) - // Linear-chain guard (Task 5 scope): a native block may have at most one upstream - // fragment. A block fed by more than one hash exchange is a join/multi-input DAG shape, - // out of scope until Task 6. - if (inputs.size > 1) { - throw new UnsupportedOperationException( - "Comet Ballista offload: block is fed by more than one Comet exchange " + - s"(${inputs.size}); multi-input DAG shapes (e.g. joins) are not yet supported:\n" + - s"$root") + // Linear-chain guard: a non-join native block may have at most one upstream fragment. + // Joins are handled explicitly above and always resolve to exactly two inputs (left, + // right); any other block fed by more than one hash exchange is a multi-input DAG shape + // that's out of scope for now. + block match { + case _: CometHashJoinExec => // exactly two inputs, guaranteed by directExchanges above + case _ if inputs.size > 1 => + throw new UnsupportedOperationException( + "Comet Ballista offload: block is fed by more than one Comet exchange " + + s"(${inputs.size}); multi-input DAG shapes (e.g. 
joins) are not yet supported:\n" + + s"$root") + case _ => } // Recurse producers first so their indices are smaller (topological order). inputs.foreach(ex => register(blockOf(ex.child))) @@ -123,6 +153,19 @@ object BallistaOffloadPlanner { val planBuilder = CometBallistaOffloadPlan.newBuilder().setNumPartitions(numPartitions) ordered.foreach { node => + // Co-partition check: a join's two inputs must be hash-partitioned to the same width. + // Spark's EnsureRequirements guarantees this; assert to fail fast otherwise. + node.block match { + case _: CometHashJoinExec => + val ns = node.inputs.map(_.outputPartitioning).collect { case HashPartitioning(_, n) => + n + } + require( + ns.distinct.size == 1, + s"Comet Ballista offload: join inputs are not co-partitioned to the same width " + + s"($ns):\n$root") + case _ => + } val fragBuilder = OffloadFragment.newBuilder() // Inject file partitions into NativeScan leaves (reuse the existing helper). fragBuilder.setBlockProto( diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala index 1d35bed239..7d9f0b0967 100644 --- a/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala @@ -66,4 +66,42 @@ class BallistaOffloadPlannerSuite extends CometTestBase { } } } + + test("shuffle-hash join builds a join fragment with two hash inputs on the join keys") { + withParquetTable((0 until 50).map(i => (i, i * 10)), "l") { + withParquetTable((0 until 50).map(i => (i, i * 100)), "r") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + val plan = + sql("SELECT 
l._2, r._2 FROM l JOIN r ON l._1 = r._1").queryExecution.executedPlan + val desc = CometBallistaOffloadPlan.parseFrom( + BallistaOffloadPlanner.buildOffloadPlan(plan, numPartitions = 4)) + // root fragment = the join; two inputs (left, right) each on one key ordinal + val join = desc.getFragments(desc.getFragmentsCount - 1) + assert(join.getInputsCount == 2, s"expected 2 join inputs, got:\n$desc") + assert(join.getInputs(0).getHashKeyOrdinalsList.size == 1) + assert(join.getInputs(1).getHashKeyOrdinalsList.size == 1) + } + } + } + } + + test("range/ORDER BY exchange is rejected with a clear message") { + withParquetTable((0 until 20).map(i => (i % 3, i)), "t") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + val plan = + sql("SELECT _1, count(*) c FROM t GROUP BY _1 ORDER BY _1").queryExecution.executedPlan + val e = intercept[UnsupportedOperationException] { + BallistaOffloadPlanner.buildOffloadPlan(plan, numPartitions = 4) + } + assert(e.getMessage.contains("HashPartitioning") || e.getMessage.contains("range")) + } + } + } } From 0f47bb2cfde825acf2df138cda32792b5cfd23a9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 10:02:27 -0600 Subject: [PATCH 38/42] fix(ballista): handle 2-input join blocks generically (sort-merge + hash) --- .../ballista/BallistaOffloadPlanner.scala | 99 +++++++------------ 1 file changed, 38 insertions(+), 61 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala index 98d4dddda8..028d450edb 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import 
org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.comet.{CometExec, CometHashJoinExec, CometNativeExec} +import org.apache.spark.sql.comet.{CometExec, CometNativeExec} import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec import org.apache.spark.sql.execution.SparkPlan @@ -42,13 +42,15 @@ import org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan * - a single native block (no Comet exchange): one fragment, no inputs. * - an N-block LINEAR chain of native blocks connected by [[CometShuffleExchangeExec]] hash * exchanges (the R2 two-stage GROUP BY shape generalized to N stages). - * - a co-partitioned shuffle-hash join: a [[CometHashJoinExec]] block fed by exactly two Comet - * hash exchanges, one under its `left` input and one under its `right` input, emitted in - * left-then-right order to match the join proto's DFS leaf order. + * - a co-partitioned join block (shuffle-hash or sort-merge) fed by exactly two Comet hash + * exchanges, discovered via a generic DFS over the block that stops descent at each exchange. + * DFS pre-order naturally visits a binary join's left input before its right input, matching + * the join proto's `[left_leaf, right_leaf]` leaf order; no join-specific handling is + * required. * - * Other multi-input DAG shapes (e.g. a non-join native block fed by more than one upstream - * fragment, or broadcast joins) are a future increment; the walker rejects anything it doesn't - * recognize with an [[UnsupportedOperationException]] rather than guessing. + * Other multi-input DAG shapes (e.g. a native block fed by more than two upstream fragments, or + * broadcast joins) are a future increment; the walker rejects anything it doesn't recognize with + * an [[UnsupportedOperationException]] rather than guessing. 
*/ object BallistaOffloadPlanner { @@ -78,56 +80,33 @@ object BallistaOffloadPlanner { "Comet Ballista offload: expected a serialized native block reachable from " + s"${p.nodeName}:\n$root")) - // The direct native-block inputs of `block` are the Comet exchanges in its subtree that are - // not nested under a deeper native block. - def directExchanges(block: CometNativeExec): Seq[CometShuffleExchangeExec] = block match { - case j: CometHashJoinExec => - // Left input then right input — matches the join proto's DFS leaf order - // (`parse_join_parameters` appends left then right). - Seq(exchangeUnder(j.left, block), exchangeUnder(j.right, block)) - case _ => - val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] - def walk(p: SparkPlan): Unit = p match { - case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange - case other => other.children.foreach(walk) - } - block.children.foreach(walk) - found.toSeq - } - - /** The single Comet exchange directly under `side` (a join input); reject otherwise. */ - def exchangeUnder(side: SparkPlan, block: CometNativeExec): CometShuffleExchangeExec = { - val exchanges = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] + // The direct native-block inputs of `block` are the Comet hash exchanges in its subtree that + // are not nested under a deeper native block, discovered by a plain DFS that stops descent + // at each exchange. This is deliberately generic (no join-specific type matching): for a + // binary join block (shuffle-hash or sort-merge), DFS pre-order visits the left child's + // exchange before the right child's, which matches the join proto's `[left_leaf, right_leaf]` + // leaf order. 
+ def directExchanges(block: CometNativeExec): Seq[CometShuffleExchangeExec] = { + val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] def walk(p: SparkPlan): Unit = p match { - case e: CometShuffleExchangeExec => exchanges += e + case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange case other => other.children.foreach(walk) } - walk(side) - exchanges.toSeq match { - case Seq(one) => one - case other => - throw new UnsupportedOperationException( - s"Comet Ballista offload: a shuffle-hash join input must have exactly one Comet " + - s"hash exchange (found ${other.size}); broadcast joins are a future increment:\n" + - s"$block") - } + block.children.foreach(walk) + found.toSeq } def register(block: CometNativeExec): Int = indexOf.getOrElseUpdate( block, { val inputs = directExchanges(block) - // Linear-chain guard: a non-join native block may have at most one upstream fragment. - // Joins are handled explicitly above and always resolve to exactly two inputs (left, - // right); any other block fed by more than one hash exchange is a multi-input DAG shape - // that's out of scope for now. - block match { - case _: CometHashJoinExec => // exactly two inputs, guaranteed by directExchanges above - case _ if inputs.size > 1 => - throw new UnsupportedOperationException( - "Comet Ballista offload: block is fed by more than one Comet exchange " + - s"(${inputs.size}); multi-input DAG shapes (e.g. joins) are not yet supported:\n" + - s"$root") - case _ => + // A block may have 0 inputs (leaf fragment), 1 input (linear chain), or 2 inputs (a + // co-partitioned join, left then right). More than 2 is a multi-input DAG shape that's + // out of scope for now. 
+ if (inputs.size > 2) { + throw new UnsupportedOperationException( + "Comet Ballista offload: block is fed by more than two Comet exchanges " + + s"(${inputs.size}); multi-input DAG shapes beyond 2-way joins are not yet " + + s"supported:\n$root") } // Recurse producers first so their indices are smaller (topological order). inputs.foreach(ex => register(blockOf(ex.child))) @@ -153,18 +132,16 @@ object BallistaOffloadPlanner { val planBuilder = CometBallistaOffloadPlan.newBuilder().setNumPartitions(numPartitions) ordered.foreach { node => - // Co-partition check: a join's two inputs must be hash-partitioned to the same width. - // Spark's EnsureRequirements guarantees this; assert to fail fast otherwise. - node.block match { - case _: CometHashJoinExec => - val ns = node.inputs.map(_.outputPartitioning).collect { case HashPartitioning(_, n) => - n - } - require( - ns.distinct.size == 1, - s"Comet Ballista offload: join inputs are not co-partitioned to the same width " + - s"($ns):\n$root") - case _ => + // Co-partition check: a two-input (join) block's inputs must be hash-partitioned to the + // same width. Spark's EnsureRequirements guarantees this; assert to fail fast otherwise. + if (node.inputs.size == 2) { + val ns = node.inputs.map(_.outputPartitioning).collect { case HashPartitioning(_, n) => + n + } + require( + ns.distinct.size == 1, + s"Comet Ballista offload: join inputs are not co-partitioned to the same width " + + s"($ns):\n$root") } val fragBuilder = OffloadFragment.newBuilder() // Inject file partitions into NativeScan leaves (reuse the existing helper). 
From 9e9c3b6e1bf702f223cbf8bf1bc31c5768dbb6d1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 10:15:35 -0600 Subject: [PATCH 39/42] fix(ballista): resolve 2-input blocks via a single join, reject fused multi-join Comet fuses adjacent native operators into one serialized block, so the old directExchanges-only walk could flatten exchanges from two different joins in a fused multi-join block into one list. A block that happened to surface exactly two exchanges from different joins passed the size<=2 guard and was silently mis-paired as one join's left/right, producing wrong results instead of a clean rejection. Resolve a block's DAG inputs by validating that its exchanges are explained by exactly one binary Comet join (CometHashJoinExec or CometSortMergeJoinExec) whose left and right sides each contribute exactly one of the block's exchanges. Any other shape -- a fused multi-join block, a join with a broadcast (non-hash-exchange) side, or exchanges not cleanly split by a single join -- now throws UnsupportedOperationException instead of guessing. Also update the stale executeCollectViaBallista docstring in operators.scala to describe the current single-join support. 
--- .../ballista/BallistaOffloadPlanner.scala | 84 ++++++++++++++----- .../apache/spark/sql/comet/operators.scala | 7 +- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala index 028d450edb..302a1e32c0 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.comet.{CometExec, CometNativeExec} +import org.apache.spark.sql.comet.{CometExec, CometHashJoinExec, CometNativeExec, CometSortMergeJoinExec} import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec import org.apache.spark.sql.execution.SparkPlan @@ -80,34 +80,80 @@ object BallistaOffloadPlanner { "Comet Ballista offload: expected a serialized native block reachable from " + s"${p.nodeName}:\n$root")) - // The direct native-block inputs of `block` are the Comet hash exchanges in its subtree that - // are not nested under a deeper native block, discovered by a plain DFS that stops descent - // at each exchange. This is deliberately generic (no join-specific type matching): for a - // binary join block (shuffle-hash or sort-merge), DFS pre-order visits the left child's - // exchange before the right child's, which matches the join proto's `[left_leaf, right_leaf]` - // leaf order. - def directExchanges(block: CometNativeExec): Seq[CometShuffleExchangeExec] = { + // The direct native-block inputs of `p`'s subtree are the Comet hash exchanges reachable + // without crossing a deeper native block, discovered by a plain DFS that stops descent at + // each exchange. 
Used both at block level (to find a block's own inputs) and rooted at a + // single join's left/right child (to validate how a join's two sides split those inputs). + def directExchanges(p: SparkPlan): Seq[CometShuffleExchangeExec] = { val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] def walk(p: SparkPlan): Unit = p match { case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange case other => other.children.foreach(walk) } - block.children.foreach(walk) + walk(p) found.toSeq } + // The binary Comet join nodes directly inside `p`'s subtree, discovered the same way (DFS, + // stop descent at exchanges) but continuing past a join into its own children so a fused + // MULTI-join block (two joins with no exchange between them) is still detected. + def directJoins(p: SparkPlan): Seq[SparkPlan] = { + val found = mutable.ArrayBuffer.empty[SparkPlan] + def walk(p: SparkPlan): Unit = p match { + case _: CometShuffleExchangeExec => // do NOT descend past an exchange + case j @ (_: CometHashJoinExec | _: CometSortMergeJoinExec) => + found += j + j.children.foreach(walk) + case other => other.children.foreach(walk) + } + walk(p) + found.toSeq + } + + // Resolve a block's ordered DAG inputs (producer exchanges, in the proto's + // `[left_leaf, right_leaf]` order) from its direct exchanges and joins. A block may be a + // leaf (0 exchanges), a linear chain (1 exchange, no join -- e.g. partial->final + // aggregate), or a single co-partitioned join whose two sides each contribute exactly one + // of exactly two exchanges. Any other shape -- a fused multi-join block, a join with a + // broadcast (non-hash-exchange) side, or exchanges not cleanly split by a single join -- + // throws rather than silently mis-pairing exchanges from different joins. 
+ def resolveInputs(block: CometNativeExec): Seq[CometShuffleExchangeExec] = { + val exchanges = directExchanges(block) + val joins = directJoins(block) + (exchanges.size, joins.size) match { + case (0, _) => Seq.empty + case (1, 0) => exchanges + case (2, 1) => + val join = joins.head + val (leftPlan, rightPlan) = join match { + case j: CometHashJoinExec => (j.left, j.right) + case j: CometSortMergeJoinExec => (j.left, j.right) + } + val leftEx = directExchanges(leftPlan) + val rightEx = directExchanges(rightPlan) + if (leftEx.size == 1 && rightEx.size == 1 && + (leftEx.toSet ++ rightEx.toSet) == exchanges.toSet) { + Seq(leftEx.head, rightEx.head) + } else { + throw new UnsupportedOperationException( + "Comet Ballista offload: join block's two Comet exchanges are not cleanly split " + + s"one-each across the join's left (${leftEx.size} exchange(s)) and right " + + s"(${rightEx.size} exchange(s)) sides -- a broadcast join side is a future " + + s"increment:\n$root") + } + case (n, m) => + throw new UnsupportedOperationException( + s"Comet Ballista offload: block resolves to $n Comet exchange(s) and $m binary " + + "join(s); only a leaf block (0 exchanges), a linear chain (1 exchange, no " + + "join), or a single co-partitioned join (exactly 2 exchanges split one-each " + + "across exactly one join's inputs) are supported -- a fused multi-join block " + + s"is a future increment:\n$root") + } + } + def register(block: CometNativeExec): Int = indexOf.getOrElseUpdate( block, { - val inputs = directExchanges(block) - // A block may have 0 inputs (leaf fragment), 1 input (linear chain), or 2 inputs (a - // co-partitioned join, left then right). More than 2 is a multi-input DAG shape that's - // out of scope for now. 
- if (inputs.size > 2) { - throw new UnsupportedOperationException( - "Comet Ballista offload: block is fed by more than two Comet exchanges " + - s"(${inputs.size}); multi-input DAG shapes beyond 2-way joins are not yet " + - s"supported:\n$root") - } + val inputs = resolveInputs(block) // Recurse producers first so their indices are smaller (topological order). inputs.foreach(ex => register(blockOf(ex.child))) val idx = ordered.size diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 6ecdedfff7..f269675c76 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -408,9 +408,10 @@ object CometExec { * * Enabled by `spark.comet.exec.ballista.enabled`. The query is decomposed into a DAG of native * fragments connected by hash exchanges by [[BallistaOffloadPlanner]] (currently: a single - * native block, or an N-block linear chain of hash exchanges -- general join/multi-input DAG - * shapes are a future increment), serialized as a `CometBallistaOffloadPlan`, and submitted to - * Ballista via the general native `executeOffloadPlan` entry point. + * native block; an N-block linear chain of hash exchanges; or a single co-partitioned join fed + * by exactly two hash exchanges -- fused multi-join blocks and broadcast join sides are a + * future increment), serialized as a `CometBallistaOffloadPlan`, and submitted to Ballista via + * the general native `executeOffloadPlan` entry point. * * Anything not yet supported by the walker throws [[UnsupportedOperationException]]. 
*/ From 7dd641438da9cae1dd5c3b9f02ee33fcc833ff77 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 10:21:28 -0600 Subject: [PATCH 40/42] test(ballista): distributed shuffle-hash join + N-stage aggregate offload --- .../ballista/CometBallistaJoinSuite.scala | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala new file mode 100644 index 0000000000..66499c6c80 --- /dev/null +++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.ballista + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.CometListenerBusUtils +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.comet.CometNativeExec +import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.internal.SQLConf + +import org.apache.comet.CometConf + +/** + * Task 8 (proof milestone): a distributed shuffle-hash/sort-merge JOIN and an N-stage aggregate + * (>2 native blocks / 2 hash exchanges) both run DISTRIBUTED on the in-process Ballista engine, + * producing results identical to the flag-off (Spark/Comet-on-executors) baseline while launching + * ZERO Spark executor tasks. + * + * The join fragment shape (see [[BallistaOffloadPlanner]]): a single co-partitioned join block + * fed by exactly two Comet hash exchanges, one per side. + * `spark.sql.autoBroadcastJoinThreshold=-1` forces a shuffle-hash / sort-merge join instead of a + * broadcast join, since a broadcast join side is not yet a supported offload shape (the walker + * rejects it). + */ +class CometBallistaJoinSuite extends CometTestBase with AdaptiveSparkPlanHelper { + + /** + * Runs `f`, counting Spark executor task starts during it. Drains the listener bus before + * attaching and after running so asynchronous task-start events are flushed. (Same apparatus as + * `CometBallistaDistributedSuite` / `CometBallistaOffloadSuite`.) 
+ */ + private def countTaskStarts(f: => Unit): Int = { + val taskStarts = new AtomicInteger(0) + val listener = new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + taskStarts.incrementAndGet() + } + } + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + spark.sparkContext.addSparkListener(listener) + try { + f + CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + taskStarts.get() + } + + /** + * Runs `query` with the Ballista offload flag set to `ballista`, counting Spark executor task + * starts. AQE is off, shuffle partitions are pinned small, shuffle direct-read is off (so a + * downstream fragment's input leaf serializes as a plain `Scan` fed by the Ballista shuffle), + * and the broadcast-join threshold is disabled so joins plan as shuffle-hash / sort-merge + * rather than broadcast. + */ + private def runWith(ballista: Boolean, query: String): (Seq[Seq[Any]], Int) = { + var rows: Seq[Seq[Any]] = null + val taskStarts = countTaskStarts { + rows = withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> ballista.toString) { + spark.sql(query).collect().map(_.toSeq.toIndexedSeq).toIndexedSeq + } + } + (rows, taskStarts) + } + + private def sortKey(r: Seq[Any]): String = r.map(v => s"$v").mkString(",") + + test("distributed shuffle-hash join offloads with zero Spark tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + withParquetTable((0 until 200).map(i => (i, i * 10)), "l") { + withParquetTable((0 until 200).map(i => (i % 50, i * 100)), "r") { + val query = "SELECT l._1, l._2, r._2 FROM l JOIN r ON l._1 = r._1" + + // Baseline: normal 
Comet execution (offload off), a positive control proving the + // listener observes executor task starts. + val (baseline, baselineTaskStarts) = runWith(ballista = false, query) + assert( + baselineTaskStarts > 0, + "expected the flag-off baseline collect to launch at least one Spark executor task " + + s"(sanity check for the listener apparatus); got $baselineTaskStarts") + + // Ballista offload: same query, flag on. + val (offloaded, offloadedTaskStarts) = runWith(ballista = true, query) + + val baselineSorted = baseline.sortBy(sortKey) + val offloadedSorted = offloaded.sortBy(sortKey) + assert( + offloadedSorted == baselineSorted, + "offloaded (distributed) rows do not match baseline\n" + + s" baseline: $baselineSorted\n offloaded: $offloadedSorted") + + assert( + offloadedTaskStarts == 0, + "expected 0 Spark executor tasks for the Ballista-offloaded distributed join, " + + s"but $offloadedTaskStarts started") + } + } + } + + test("three-stage aggregate (two hash exchanges) offloads with zero Spark tasks") { + assume( + NativeBallista.isAvailable, + s"native ballista library not available: ${NativeBallista.loadFailure.map(_.getMessage)}") + + // _1 ranges over 200 distinct values (0..199), each appearing twice, so the inner + // `GROUP BY _1` needs a real shuffle to de-duplicate across partitions. k = _1 % 10 then + // buckets those 200 distinct values into 10 groups of 20, and the outer `GROUP BY k` needs a + // second shuffle to merge partial counts. Two shuffles => three native blocks (partial + // dedupe -> exchange -> final dedupe + partial count -> exchange -> final count). + withParquetTable((0 until 400).map(i => (i % 200, i)), "t") { + val query = "SELECT k, count(*) AS c FROM (SELECT _1 % 10 AS k FROM t GROUP BY _1) " + + "GROUP BY k" + + // Pre-flight: confirm the plan is the >=3-block / 2-exchange shape BEFORE running it. 
+ val executed = withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(query).queryExecution.executedPlan + } + val exchanges = executed.collect { case e: CometShuffleExchangeExec => e } + assert( + exchanges.size == 2, + s"expected exactly two Comet hash exchanges (three stages), found ${exchanges.size}:\n" + + s"$executed") + val nativeBlocks = executed.collect { + case n: CometNativeExec if n.serializedPlanOpt.isDefined => n + } + assert( + nativeBlocks.size == 3, + s"expected exactly three serialized CometNativeExec blocks, found " + + s"${nativeBlocks.size}:\n$executed") + + // Baseline: normal Comet execution (offload off), a positive control proving the listener + // observes executor task starts. + val (baseline, baselineTaskStarts) = runWith(ballista = false, query) + assert( + baselineTaskStarts > 0, + "expected the flag-off baseline collect to launch at least one Spark executor task " + + s"(sanity check for the listener apparatus); got $baselineTaskStarts") + assert( + baseline.sortBy(sortKey) == (0 until 10).map(k => Seq(k, 20L)), + s"unexpected group counts: ${baseline.sortBy(sortKey)}") + + // Ballista offload: same query, flag on. 
+ val (offloaded, offloadedTaskStarts) = runWith(ballista = true, query) + + val baselineSorted = baseline.sortBy(sortKey) + val offloadedSorted = offloaded.sortBy(sortKey) + assert( + offloadedSorted == baselineSorted, + "offloaded (distributed) rows do not match baseline\n" + + s" baseline: $baselineSorted\n offloaded: $offloadedSorted") + + assert( + offloadedTaskStarts == 0, + "expected 0 Spark executor tasks for the Ballista-offloaded distributed aggregate, " + + s"but $offloadedTaskStarts started") + } + } +} From 1c7146888e22078c77fc58b2f7ba26c4a40be256 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 10:36:50 -0600 Subject: [PATCH 41/42] refactor(ballista): remove single/two-stage path superseded by the DAG offload --- native/core/src/execution/ballista/ffi_jni.rs | 289 +----------------- native/core/src/execution/ballista/mod.rs | 4 +- .../core/tests/ballista_external_cluster.rs | 47 +-- .../comet/ballista/NativeBallista.scala | 63 +--- 4 files changed, 45 insertions(+), 358 deletions(-) diff --git a/native/core/src/execution/ballista/ffi_jni.rs b/native/core/src/execution/ballista/ffi_jni.rs index 94b0486373..52ec64f848 100644 --- a/native/core/src/execution/ballista/ffi_jni.rs +++ b/native/core/src/execution/ballista/ffi_jni.rs @@ -105,7 +105,7 @@ pub fn execute_comet_proto(proto: &[u8]) -> Result<(SchemaRef, Vec) } // --------------------------------------------------------------------------- -// R2: two-stage (distributed) GROUP BY offload +// R3: general DAG offload (`CometBallistaOffloadPlan`) // --------------------------------------------------------------------------- use std::time::Duration; @@ -121,80 +121,6 @@ use datafusion::physical_plan::Partitioning; use datafusion_proto::protobuf::PhysicalPlanNode; use futures::TryStreamExt; -/// Build the R2 two-stage physical plan for a distributed GROUP BY: -/// -/// ```text -/// CometFragmentExec(block2, children=[ -/// RepartitionExec::Hash( CometFragmentExec(block1, children=[]), 
keys=0..num_group_keys, N ) -/// ]) -/// ``` -/// -/// `block1` is the partial aggregate (self-contained `NativeScan` leaf); its -/// output is `[group_keys..., agg_states...]`, so the group keys are columns -/// `0..num_group_keys`. Hash-repartitioning on those columns co-locates every -/// row for a group key on one downstream task, which is what lets the final -/// aggregate in `block2` compose across the shuffle. Ballista's -/// `DistributedPlanner` splits this plan at the `RepartitionExec(Hash)` into two -/// stages (block1 -> ShuffleWriter; ShuffleReader -> block2), and at stage 2 the -/// ShuffleReader becomes `block2`'s `Scan` (#100) input leaf. -fn build_two_stage_plan( - block1_proto: &[u8], - block2_proto: &[u8], - num_group_keys: usize, - num_partitions: usize, -) -> Result, String> { - let block1: Arc = Arc::new( - CometFragmentExec::try_new(block1_proto.to_vec(), vec![]) - .map_err(|e| format!("failed to build block1 (partial-agg) fragment: {e}"))?, - ); - - let schema1 = block1.schema(); - if num_group_keys == 0 || num_group_keys > schema1.fields().len() { - return Err(format!( - "invalid num_group_keys {num_group_keys}: block1 output has {} columns ({:?})", - schema1.fields().len(), - schema1 - .fields() - .iter() - .map(|f| f.name()) - .collect::>() - )); - } - - // Investigation aid: the schema of the batches that cross Ballista's IPC - // shuffle. block2's `Scan` (#100) leaf schema (derived from the exchange - // output on the JVM side) must match this for the aggregate to compose. 
- log::debug!("[comet-ballista R2] block1 (partial-agg) output schema = {schema1:?}"); - - let hash_exprs: Vec> = (0..num_group_keys) - .map(|i| Arc::new(Column::new(schema1.field(i).name(), i)) as Arc) - .collect(); - - let repart: Arc = Arc::new( - RepartitionExec::try_new( - block1, - Partitioning::Hash(hash_exprs, num_partitions.max(1)), - ) - .map_err(|e| format!("failed to build hash RepartitionExec: {e}"))?, - ); - - let block2: Arc = Arc::new( - CometFragmentExec::try_new(block2_proto.to_vec(), vec![repart]) - .map_err(|e| format!("failed to build block2 (final-agg) fragment: {e}"))?, - ); - - log::debug!( - "[comet-ballista R2] block2 (final-agg) output schema = {:?}", - block2.schema() - ); - - Ok(block2) -} - -// --------------------------------------------------------------------------- -// R3: general DAG offload (`CometBallistaOffloadPlan`) -// --------------------------------------------------------------------------- - /// Count the `Scan` (#100) input leaves in a serialized Comet `Operator` block — /// the same leaves `build_native_fragment` (`native/core/src/execution/fragment.rs`) /// expects one child stream per, in DFS order. Used as a build-time guard so a @@ -285,16 +211,14 @@ pub fn build_offload_plan(plan_bytes: &[u8]) -> Result, S /// Build and submit a general `CometBallistaOffloadPlan` DAG to a Ballista /// cluster, returning the collected Arrow result batches plus the result schema. /// -/// Mirrors [`execute_two_stage`], but the plan is an arbitrary DAG of -/// `CometFragmentExec` nodes (folded by [`build_offload_plan`]) rather than a -/// fixed two-stage GROUP BY shape. The shuffle width `n` is read directly from -/// the descriptor's `num_partitions` field — the authoritative parallelism for -/// every hash repartition `build_offload_plan` builds — so `build_offload_plan` -/// itself keeps its single-return signature. 
+/// The plan is an arbitrary DAG of `CometFragmentExec` nodes (folded by +/// [`build_offload_plan`]), not a fixed two-stage shape. The shuffle width `n` +/// is read directly from the descriptor's `num_partitions` field — the +/// authoritative parallelism for every hash repartition `build_offload_plan` +/// builds — so `build_offload_plan` itself keeps its single-return signature. /// -/// As with `execute_two_stage`, an empty `scheduler_url` starts an in-process -/// standalone cluster; a non-empty one submits to that external scheduler -/// instead. +/// An empty `scheduler_url` starts an in-process standalone cluster; a +/// non-empty one submits to that external scheduler instead. pub fn execute_offload_plan( plan_bytes: &[u8], scheduler_url: &str, @@ -409,117 +333,6 @@ async fn start_standalone_from_state(state: &SessionState) -> Result Result<(SchemaRef, Vec), String> { - let runtime = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build() - .map_err(|e| format!("failed to build tokio runtime: {e}"))?; - - runtime.block_on(async move { - let n = num_partitions.max(1); - let config = SessionConfig::new_with_ballista() - .with_target_partitions(n) - .with_ballista_standalone_parallelism(n) - .with_ballista_physical_extension_codec(Arc::new(CometPhysicalCodec::default())) - .with_ballista_logical_extension_codec(Arc::new(CometLogicalCodec::default())); - let state = SessionStateBuilder::new() - .with_config(config) - .with_default_features() - .build(); - - // Build the plan inside the runtime: the fragments' NativeScan leaf builds - // via Comet's planner, which requires an active Tokio runtime. - let plan = build_two_stage_plan(block1_proto, block2_proto, num_group_keys, n)?; - let schema = plan.schema(); - - // Empty URL => in-process standalone; non-empty => external cluster. For - // the external path the scheduler creates the session from the submitted - // settings + its own (Comet) codecs, so we do not start a local cluster. 
- let scheduler_url = if scheduler_url.is_empty() { - log::debug!("[comet-ballista R2] submitting to in-process standalone cluster"); - start_standalone_from_state(&state).await? - } else { - log::debug!("[comet-ballista R2] submitting to external cluster at {scheduler_url}"); - scheduler_url.to_string() - }; - - let session_config = state.config().clone(); - let codec = CometPhysicalCodec::default(); - let session_id = state.session_id().to_string(); - - let stream = execute_physical_plan::( - scheduler_url, - &BallistaConfig::default(), - plan, - &codec, - session_id, - session_config, - ) - .await - .map_err(|e| format!("failed to submit two-stage physical plan: {e}"))?; - - let batches = stream - .try_collect::>() - .await - .map_err(|e| format!("failed to collect distributed results: {e}"))?; - - Ok((schema, batches)) - }) -} - -/// Run the R2 two-stage plan and export the (single, concatenated) result batch -/// into the JVM-allocated FFI structs. Returns the row count. -/// -/// # Safety -/// See [`export_batch_to_addresses`]. -pub unsafe fn submit_and_export_distributed( - block1_proto: &[u8], - block2_proto: &[u8], - num_group_keys: usize, - num_partitions: usize, - scheduler_url: &str, - array_addrs: &[i64], - schema_addrs: &[i64], -) -> Result { - let (schema, batches) = execute_two_stage( - block1_proto, - block2_proto, - num_group_keys, - num_partitions, - scheduler_url, - )?; - // The final stage's partitions are concatenated into one batch so the JVM - // imports exactly one set of column structs (same contract as R1). 
- let batch = concat_batches(&schema, &batches) - .map_err(|e| format!("failed to concatenate result batches: {e}"))?; - export_batch_to_addresses(&batch, array_addrs, schema_addrs)?; - Ok(batch.num_rows() as i64) -} - /// Export one Arrow batch into caller-allocated `FFI_ArrowArray` / /// `FFI_ArrowSchema` structs, one per column, whose addresses were allocated by /// the JVM (Arrow Java `ArrowArray.allocateNew` / `ArrowSchema.allocateNew`). @@ -589,10 +402,10 @@ pub unsafe fn submit_and_export( // --------------------------------------------------------------------------- mod jni_entry { - use super::{submit_and_export, submit_and_export_distributed, submit_and_export_offload}; + use super::submit_and_export_offload; use crate::errors::{try_unwrap_or_throw, CometError}; use jni::objects::{JByteArray, JClass, JLongArray, JString, ReleaseMode}; - use jni::sys::{jint, jlong}; + use jni::sys::jlong; use jni::EnvUnowned; /// JVM entry: a no-op whose only purpose is symbol resolution. It is compiled @@ -610,88 +423,6 @@ mod jni_entry { ) { } - /// JVM entry: run a Comet `Operator` proto on in-process standalone Ballista - /// and export the result batch into the JVM-allocated Arrow C Data structs - /// (`FFI_ArrowArray`/`FFI_ArrowSchema`), returning the number of rows. This - /// mirrors `Java_org_apache_comet_Native_executePlan`'s use of - /// `prepare_output` — the JVM allocates the structs and imports them after - /// this call returns. - /// - /// # Safety - /// Called from the JVM via JNI; the address arrays must reference valid - /// caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema` structs. 
- #[no_mangle] - pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_executeQuery( - e: EnvUnowned, - _class: JClass, - proto: JByteArray, - array_addrs: JLongArray, - schema_addrs: JLongArray, - ) -> jlong { - try_unwrap_or_throw(&e, |env| { - let proto_bytes = env.convert_byte_array(proto)?; - - let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; - let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; - - // SAFETY: the JVM allocated these FFI structs (Arrow Java - // ArrowArray/ArrowSchema.allocateNew); we write the exported values - // into them and the JVM imports them after this returns. - let num_rows = unsafe { submit_and_export(&proto_bytes, &arrays, &schemas) } - .map_err(CometError::Internal)?; - Ok(num_rows as jlong) - }) - } - - /// JVM entry: run a distributed (R2) two-stage GROUP BY offload. Builds - /// `CometFragmentExec(block2, [Hash-Repartition(CometFragmentExec(block1))])`, - /// submits it to an in-process standalone Ballista cluster (which splits it at - /// the hash-repartition into a partial-agg stage and a final-agg stage over a - /// shuffle), and exports the concatenated result batch into the JVM-allocated - /// Arrow C Data structs, returning the number of rows. - /// - /// # Safety - /// Called from the JVM via JNI; the address arrays must reference valid - /// caller-allocated `FFI_ArrowArray`/`FFI_ArrowSchema` structs (one per output - /// column of `block2`). 
- #[no_mangle] - pub unsafe extern "system" fn Java_org_apache_comet_ballista_NativeBallista_executeQueryDistributed( - e: EnvUnowned, - _class: JClass, - block1: JByteArray, - block2: JByteArray, - num_group_keys: jint, - num_partitions: jint, - scheduler_url: JString, - array_addrs: JLongArray, - schema_addrs: JLongArray, - ) -> jlong { - try_unwrap_or_throw(&e, |env| { - let block1_bytes = env.convert_byte_array(block1)?; - let block2_bytes = env.convert_byte_array(block2)?; - // Empty => in-process standalone (as before); non-empty (e.g. - // "http://host:50050") => submit to that external scheduler. - let scheduler_url: String = scheduler_url.try_to_string(env)?; - - let arrays = unsafe { array_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; - let schemas = unsafe { schema_addrs.get_elements(env, ReleaseMode::NoCopyBack)? }; - - let num_rows = unsafe { - submit_and_export_distributed( - &block1_bytes, - &block2_bytes, - num_group_keys as usize, - num_partitions as usize, - &scheduler_url, - &arrays, - &schemas, - ) - } - .map_err(CometError::Internal)?; - Ok(num_rows as jlong) - }) - } - /// JVM entry: run a general DAG offload (R3), a `CometBallistaOffloadPlan` /// describing an arbitrary DAG of `CometFragmentExec` nodes joined by hash /// shuffles (folded by `build_offload_plan`). 
Submits it to a Ballista diff --git a/native/core/src/execution/ballista/mod.rs b/native/core/src/execution/ballista/mod.rs index 1da09ecb8c..3d60e475c0 100644 --- a/native/core/src/execution/ballista/mod.rs +++ b/native/core/src/execution/ballista/mod.rs @@ -43,8 +43,8 @@ pub mod table_provider; pub use codec::{CometLogicalCodec, CometPhysicalCodec, COMET_FRAGMENT_MAGIC, COMET_MAGIC}; pub use ffi_jni::{ - build_offload_plan, execute_comet_proto, execute_offload_plan, execute_two_stage, - submit_and_export, submit_and_export_distributed, submit_and_export_offload, + build_offload_plan, execute_comet_proto, execute_offload_plan, submit_and_export, + submit_and_export_offload, }; pub use fragment::CometFragmentExec; pub use scan::CometScanExec; diff --git a/native/core/tests/ballista_external_cluster.rs b/native/core/tests/ballista_external_cluster.rs index 3ec9f0f25d..b3d58b6539 100644 --- a/native/core/tests/ballista_external_cluster.rs +++ b/native/core/tests/ballista_external_cluster.rs @@ -23,11 +23,11 @@ // The point it proves: // 1. The two Comet-flavored binaries (which inject Comet's extension codecs, // unlike the stock Ballista CLIs) start, and the executor registers. -// 2. A two-stage Comet plan (`CometFragment(NativeScan) -> hash-shuffle -> -// CometFragment(Filter over a Scan)`) submitted to the external scheduler -// is split into two stages, shipped to the *separate* executor process, -// reconstructed there via the codecs, and executed — returning correct -// results across the process boundary. +// 2. A two-fragment Comet DAG offload plan (`CometFragment(NativeScan) -> +// hash-shuffle -> CometFragment(Filter over a Scan)`) submitted to the +// external scheduler is split into two stages, shipped to the *separate* +// executor process, reconstructed there via the codecs, and executed — +// returning correct results across the process boundary. // 3. 
Crucially, the executor is a plain Rust process with NO running JVM (only // `libjvm` on the loader path). The Comet fragments must execute there // without a "JAVA_VM not initialized" panic. A childless `NativeScan` @@ -52,14 +52,15 @@ use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, Sch use datafusion::parquet::arrow::ArrowWriter; use prost::Message; -use comet::execution::ballista::execute_two_stage; +use comet::execution::ballista::execute_offload_plan; use datafusion_comet_proto::spark_expression::{ data_type::DataTypeId, expr::ExprStruct, literal, BinaryExpr, BoundReference, DataType, Expr, Literal, }; use datafusion_comet_proto::spark_operator::{ - operator::OpStruct, Filter, NativeScan, NativeScanCommon, Operator, Scan, SparkFilePartition, - SparkPartitionedFile, SparkStructField, + operator::OpStruct, CometBallistaOffloadPlan, Filter, NativeScan, NativeScanCommon, + OffloadFragment, OffloadInput, Operator, Scan, SparkFilePartition, SparkPartitionedFile, + SparkStructField, }; // Non-default ports so this test does not collide with a real cluster on the @@ -279,23 +280,33 @@ fn comet_plan_on_external_cluster() -> anyhow::Result<()> { // Grace for the executor to complete registration with the scheduler. std::thread::sleep(Duration::from_secs(3)); - // --- 3. Build the two-stage Comet plan protos --- + // --- 3. Build the two-fragment Comet DAG offload plan --- let parquet = std::env::temp_dir().join("comet_external_cluster.parquet"); write_test_parquet(&parquet)?; let block1 = build_native_scan_proto(&parquet)?; // NativeScan a=[1..5] let block2 = build_filter_over_scan_proto(); // Filter(a > 2) + let plan = CometBallistaOffloadPlan { + num_partitions: 4, + fragments: vec![ + OffloadFragment { + block_proto: block1, + inputs: vec![], + }, + OffloadFragment { + block_proto: block2, + inputs: vec![OffloadInput { + producer: 0, + hash_key_ordinals: vec![0], + }], + }, + ], + }; // --- 4. 
Submit to the EXTERNAL scheduler (non-empty URL => remote path) --- let scheduler_url = format!("http://127.0.0.1:{SCHEDULER_PORT}"); - eprintln!("[harness] submitting two-stage Comet plan to {scheduler_url}"); - let (schema, batches) = execute_two_stage( - &block1, - &block2, - /* num_group_keys */ 1, - /* num_partitions */ 4, - &scheduler_url, - ) - .map_err(|e| anyhow::anyhow!("external submission failed: {e}"))?; + eprintln!("[harness] submitting DAG offload plan to {scheduler_url}"); + let (schema, batches) = execute_offload_plan(&plan.encode_to_vec(), &scheduler_url) + .map_err(|e| anyhow::anyhow!("external submission failed: {e}"))?; // --- 5. Verify correctness across the process boundary --- let mut values: Vec = Vec::new(); diff --git a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala index 27e20039ed..73341afd7b 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/NativeBallista.scala @@ -32,8 +32,10 @@ import org.apache.comet.NativeBase * Comet core's single `JAVA_VM` static, so a Comet-on-executor query and an in-process offload * can coexist in one JVM without the "JAVA_VM not initialized" panic. * - * EXPERIMENTAL (R1): used by [[org.apache.spark.sql.comet.CometExec.executeCollectViaBallista]] - * to offload a single-stage Comet query to an in-process Ballista engine on the Spark driver. + * EXPERIMENTAL: used by [[org.apache.spark.sql.comet.CometExec.executeCollectViaBallista]] to + * offload a Comet query - a DAG of native fragments joined by hash exchanges, folded from the + * plan by `executeOffloadPlan` - to a Ballista engine (in-process standalone or an external + * cluster) on the Spark driver. 
*/ class NativeBallista { @@ -47,63 +49,6 @@ class NativeBallista { */ @native def probeAvailable(): Unit - /** - * Run a serialized Comet `Operator` proto on an in-process standalone Ballista engine (no Spark - * executors) and export the single (concatenated) result batch into the caller-allocated Arrow - * C Data structs. - * - * @param proto - * serialized Comet `Operator` proto - * @param arrayAddrs - * memory addresses of one `ArrowArray` struct per output column - * @param schemaAddrs - * memory addresses of one `ArrowSchema` struct per output column - * @return - * the number of rows exported - */ - @native def executeQuery( - proto: Array[Byte], - arrayAddrs: Array[Long], - schemaAddrs: Array[Long]): Long - - /** - * EXPERIMENTAL (R2): run a distributed two-stage GROUP BY offload on an in-process standalone - * Ballista cluster (no Spark executors). - * - * `block1` is the serialized partial-aggregate block (self-contained `NativeScan` leaf); - * `block2` is the serialized final-aggregate block (whose input leaf is a `Scan` fed by the - * shuffle). The native side assembles `CometFragmentExec(block2, - * [Hash-Repartition(CometFragmentExec(block1))])`, which Ballista splits at the hash - * repartition into a partial-agg stage and a final-agg stage across a shuffle, then exports the - * concatenated result batch into the caller-allocated Arrow C Data structs. - * - * @param block1 - * serialized partial-aggregate `Operator` proto (with file partitions injected) - * @param block2 - * serialized final-aggregate `Operator` proto (leaf is a `Scan`, not a `ShuffleScan`) - * @param numGroupKeys - * number of grouping columns (the leading columns of block1's output to hash on) - * @param numPartitions - * number of shuffle partitions - * @param schedulerUrl - * the external Ballista scheduler URL (e.g. 
`http://host:50050`) to submit the plan to; an - * empty string submits to an in-process standalone Ballista cluster instead - * @param arrayAddrs - * memory addresses of one `ArrowArray` struct per output column of `block2` - * @param schemaAddrs - * memory addresses of one `ArrowSchema` struct per output column of `block2` - * @return - * the number of rows exported - */ - @native def executeQueryDistributed( - block1: Array[Byte], - block2: Array[Byte], - numGroupKeys: Int, - numPartitions: Int, - schedulerUrl: String, - arrayAddrs: Array[Long], - schemaAddrs: Array[Long]): Long - /** * Run a serialized [[org.apache.comet.serde.OperatorOuterClass.CometBallistaOffloadPlan]] (a * DAG of Comet native fragments + hash exchanges) on Ballista and export the single From e9c38792617405d62e04101b070595b4fa37dfbc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 3 Jul 2026 10:55:14 -0600 Subject: [PATCH 42/42] fix(ballista): explicitly reject broadcast joins; assert join plan shape in E2E The walker previously classified a CometBroadcastHashJoinExec as a leaf block because directExchanges did not stop at CometBroadcastExchangeExec and directJoins only matched CometHashJoinExec/CometSortMergeJoinExec, so a broadcast join only failed by luck downstream instead of being explicitly rejected. Reject it in resolveInputs and stop/reject on CometBroadcastExchangeExec/CometBroadcastHashJoinExec during the DAG walk. Also add a pre-flight plan-shape assertion to the join E2E test, mirroring the existing aggregate test's assertion, so a future plan regression is caught before comparing offloaded vs baseline output. 
--- .../ballista/BallistaOffloadPlanner.scala | 21 +++++++++++++++- .../BallistaOffloadPlannerSuite.scala | 24 ++++++++++++++++++ .../ballista/CometBallistaJoinSuite.scala | 25 ++++++++++++++++++- 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala index 302a1e32c0..0a9ecd03c3 100644 --- a/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala +++ b/spark/src/main/scala/org/apache/comet/ballista/BallistaOffloadPlanner.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.comet.{CometExec, CometHashJoinExec, CometNativeExec, CometSortMergeJoinExec} +import org.apache.spark.sql.comet.{CometBroadcastExchangeExec, CometBroadcastHashJoinExec, CometExec, CometHashJoinExec, CometNativeExec, CometSortMergeJoinExec} import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec import org.apache.spark.sql.execution.SparkPlan @@ -88,6 +88,12 @@ object BallistaOffloadPlanner { val found = mutable.ArrayBuffer.empty[CometShuffleExchangeExec] def walk(p: SparkPlan): Unit = p match { case e: CometShuffleExchangeExec => found += e // do NOT descend past an exchange + case _: CometBroadcastExchangeExec => + // A broadcast build side is not a Comet hash exchange and must never be silently + // walked through and ignored -- reject rather than mis-classify the block. 
+ throw new UnsupportedOperationException( + "Comet Ballista offload: broadcast joins are not supported (found a " + + s"CometBroadcastExchangeExec build side); a future increment:\n$root") case other => other.children.foreach(walk) } walk(p) @@ -101,6 +107,10 @@ object BallistaOffloadPlanner { val found = mutable.ArrayBuffer.empty[SparkPlan] def walk(p: SparkPlan): Unit = p match { case _: CometShuffleExchangeExec => // do NOT descend past an exchange + case _: CometBroadcastHashJoinExec => + throw new UnsupportedOperationException( + "Comet Ballista offload: broadcast joins (CometBroadcastHashJoinExec) are not " + + s"supported; a future increment:\n$root") case j @ (_: CometHashJoinExec | _: CometSortMergeJoinExec) => found += j j.children.foreach(walk) @@ -118,6 +128,15 @@ object BallistaOffloadPlanner { // broadcast (non-hash-exchange) side, or exchanges not cleanly split by a single join -- // throws rather than silently mis-pairing exchanges from different joins. def resolveInputs(block: CometNativeExec): Seq[CometShuffleExchangeExec] = { + // A broadcast join is a supported-looking CometNativeExec (it IS a CometExec, so offload + // is attempted for it) but its build side is a CometBroadcastExchangeExec, not a Comet + // hash exchange -- reject explicitly here rather than letting it fall through and be + // mis-classified as a leaf/linear block by directExchanges/directJoins below. 
+ if (block.isInstanceOf[CometBroadcastHashJoinExec]) { + throw new UnsupportedOperationException( + "Comet Ballista offload: broadcast joins (CometBroadcastHashJoinExec) are not " + + s"supported; a future increment:\n$root") + } val exchanges = directExchanges(block) val joins = directJoins(block) (exchanges.size, joins.size) match { diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala index 7d9f0b0967..46d86c4997 100644 --- a/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/BallistaOffloadPlannerSuite.scala @@ -104,4 +104,28 @@ class BallistaOffloadPlannerSuite extends CometTestBase { } } } + + test("broadcast join is rejected with a clear message") { + // Do NOT disable auto-broadcast (no AUTO_BROADCASTJOIN_THRESHOLD=-1 override): `r` is tiny + // so Spark plans a broadcast join, giving a `CometBroadcastHashJoinExec` fed by a + // `CometBroadcastExchangeExec` build side -- the out-of-scope shape the walker must reject. 
+ withParquetTable((0 until 50).map(i => (i, i * 10)), "l") { + withParquetTable((0 until 5).map(i => (i, i * 100)), "r") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + val plan = + sql("SELECT l._2, r._2 FROM l JOIN r ON l._1 = r._1").queryExecution.executedPlan + val e = intercept[UnsupportedOperationException] { + BallistaOffloadPlanner.buildOffloadPlan(plan, numPartitions = 4) + } + assert( + e.getMessage.toLowerCase.contains("broadcast"), + s"expected a message mentioning broadcast, got: ${e.getMessage}") + } + } + } + } } diff --git a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala index 66499c6c80..e923620ae5 100644 --- a/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala +++ b/spark/src/test/spark-4.x/org/apache/comet/ballista/CometBallistaJoinSuite.scala @@ -24,7 +24,7 @@ import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.CometListenerBusUtils import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} import org.apache.spark.sql.CometTestBase -import org.apache.spark.sql.comet.CometNativeExec +import org.apache.spark.sql.comet.{CometHashJoinExec, CometNativeExec, CometSortMergeJoinExec} import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.internal.SQLConf @@ -101,6 +101,29 @@ class CometBallistaJoinSuite extends CometTestBase with AdaptiveSparkPlanHelper withParquetTable((0 until 200).map(i => (i % 50, i * 100)), "r") { val query = "SELECT l._1, l._2, r._2 FROM l JOIN r ON l._1 = r._1" + // Pre-flight: confirm the plan is the co-partitioned join shape (two Comet hash + // 
exchanges feeding a shuffle-hash/sort-merge join) BEFORE running it. + val executed = withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "4", + CometConf.COMET_SHUFFLE_DIRECT_READ_ENABLED.key -> "false", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + CometConf.COMET_EXEC_BALLISTA_ENABLED.key -> "false") { + spark.sql(query).queryExecution.executedPlan + } + val exchanges = executed.collect { case e: CometShuffleExchangeExec => e } + assert( + exchanges.size == 2, + s"expected exactly two Comet hash exchanges (one per join side), found " + + s"${exchanges.size}:\n$executed") + val joins = executed.collect { + case j: CometHashJoinExec => j + case j: CometSortMergeJoinExec => j + } + assert( + joins.size == 1, + s"expected exactly one shuffle-hash/sort-merge join, found ${joins.size}:\n$executed") + // Baseline: normal Comet execution (offload off), a positive control proving the // listener observes executor task starts. val (baseline, baselineTaskStarts) = runWith(ballista = false, query)