apache · andygrove · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/docs/source/contributor-guide/expression-audits/agg_funcs.md b/docs/source/contributor-guide/expression-audits/agg_funcs.md
@@ -27,6 +27,11 @@
 - Spark 3.5.8 (audited 2026-05-26): identical to 3.4.3.
 - Spark 4.0.1 (audited 2026-05-26): identical to 3.4.3.
 
+## approx_percentile
+
+- Spark 3.4.3, 3.5.8, 4.0.1, 4.1.1 (audited 2026-07-02): `ApproximatePercentile(child, percentageExpression, accuracyExpression)` is a `TypedImperativeAggregate` backed by a Greenwald-Khanna `PercentileDigest` quantile summary with relative error `1.0 / accuracy`. `child` accepts `NumericType`, `DateType`, `TimestampType`, `TimestampNTZType`, and interval types (all cast to `double` internally); `percentage` is a single literal or literal array in `[0.0, 1.0]`; `accuracy` is a positive literal (default 10000). NULL inputs are skipped; an empty or all-null group returns NULL. `approx_percentile` is a SQL alias for the primary function name `percentile_approx`.
+- `CometApproxPercentile` maps the byte, short, int, long, float, and double input forms to a native Greenwald-Khanna quantile summary port with the same insert/compress/merge/query algorithm and relative error, casting the result back to the input type. `percentage` and `accuracy` must be foldable literals, matching Spark. Date, timestamp, interval, and decimal inputs fall back to Spark.
+
 ## avg
 
 - Spark 3.4.3 (2026-05-26)

diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md
@@ -60,7 +60,7 @@ expressions. The following function families are **not currently planned** for n
 
 The file-metadata functions `input_file_name`, `input_file_block_start`, and `input_file_block_length` depend on scan-internal per-row file information rather than the expression layer; their support status is covered in the [scan compatibility guide](compatibility/scans.md).
 
-Note that `approx_count_distinct`, `median`, and `mode` are planned: they are mainstream (`median` and `mode` are exact aggregates). `approx_percentile` / `percentile_approx` are not currently planned because their approximate results cannot be made bit-identical to Spark.
+Note that `approx_count_distinct`, `median`, and `mode` are planned: they are mainstream (`median` and `mode` are exact aggregates).
 
 The tables below list every Spark built-in expression with its current status.
 
@@ -71,6 +71,7 @@ The tables below list every Spark built-in expression with its current status.
 | `any` | ✅ |  |
 | `any_value` | ✅ |  |
 | `approx_count_distinct` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) |
+| `approx_percentile` | ✅ | Byte, short, int, long, float, and double input; other input types fall back to Spark |
 | `array_agg` | 🔜 | Array aggregate (related to `collect_list`, [#2524](https://github.com/apache/datafusion-comet/issues/2524)) |
 | `avg` | ✅ | Interval types fall back |
 | `bit_and` | ✅ |  |

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -128,8 +128,8 @@ use datafusion_comet_proto::{
     spark_partitioning::{partitioning::PartitioningStruct, Partitioning as SparkPartitioning},
 };
 use datafusion_comet_spark_expr::{
-    jvm_udf::JvmScalarUdfExpr, ArrayInsert, Avg, AvgDecimal, Cast, CheckOverflow, Correlation,
-    Covariance, CreateNamedStruct, DecimalRescaleCheckOverflow, GetArrayStructFields,
+    jvm_udf::JvmScalarUdfExpr, ApproxPercentile, ArrayInsert, Avg, AvgDecimal, Cast, CheckOverflow,
+    Correlation, Covariance, CreateNamedStruct, DecimalRescaleCheckOverflow, GetArrayStructFields,
     GetStructField, IfExpr, ListExtract, NormalizeNaNAndZero, SparkCastOptions, Stddev, SumDecimal,
     ToJson, UnboundColumn, Variance, WideDecimalBinaryExpr, WideDecimalOp,
 };
@@ -2627,6 +2627,17 @@ impl PhysicalPlanner {
                     .build()
                     .map_err(|e| e.into())
             }
+            AggExprStruct::ApproxPercentile(expr) => {
+                let child = self.create_expr(expr.child.as_ref().unwrap(), Arc::clone(&schema))?;
+                let input_type = to_arrow_datatype(expr.input_type.as_ref().unwrap());
+                let func = AggregateUDF::new_from_impl(ApproxPercentile::new(
+                    expr.percentiles.clone(),
+                    expr.accuracy,
+                    input_type,
+                    expr.return_array,
+                ));
+                Self::create_aggr_func_expr("approx_percentile", schema, vec![child], func)
+            }
             AggExprStruct::BloomFilterAgg(expr) => {
                 let child = self.create_expr(expr.child.as_ref().unwrap(), Arc::clone(&schema))?;
                 let num_items =

diff --git a/native/proto/src/proto/expr.proto b/native/proto/src/proto/expr.proto
@@ -145,6 +145,7 @@ message AggExpr {
     BloomFilterAgg bloomFilterAgg = 16;
     CollectSet collectSet = 17;
     Percentile percentile = 18;
+    ApproxPercentile approxPercentile = 19;
   }
 
   // Optional filter expression for SQL FILTER (WHERE ...) clause.
@@ -253,6 +254,22 @@ message Percentile {
   DataType datatype = 3;
 }
 
+message ApproxPercentile {
+  // Child value expression, already cast to Float64 by the serde.
+  Expr child = 1;
+  // The percentiles and accuracy are carried as resolved scalars rather than
+  // child Exprs (unlike Percentile/BloomFilterAgg) because they are needed at
+  // UDAF construction time to drive return_type and accumulator shape.
+  // One or more percentiles in [0.0, 1.0].
+  repeated double percentiles = 2;
+  // Spark's accuracy argument; relative_error = 1.0 / accuracy.
+  int64 accuracy = 3;
+  // True when the percentile argument was an array (output is a list).
+  bool return_array = 4;
+  // Spark's input/output type, used to cast results back from Float64.
+  DataType input_type = 5;
+}
+
 message BloomFilterAgg {
   Expr child = 1;
   Expr numItems = 2;