From b89a01a6c46c812912b5f26e7620180149c1032a Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 22 Jun 2026 12:54:22 +0200 Subject: [PATCH 1/4] Do not assume non-selective join when the binary operator for a cyclic graph is limited to the length of 1 In a cyclic graph in theory all nodes might be reachable, but not for the simple case of a direct edge. The special (pessimistic) handling of cyclic graphs should be restricted to indirect edge searches. --- CHANGELOG.md | 5 +++++ graphannis/src/annis/db/aql/operators/edge_op.rs | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7e16399d..206c8d981 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Estimation of generic edge operators for cyclic graphs should not assume all + nodes can be reached when the operator itself is limited to have the length 1. + ## [4.1.4] - 2026-04-18 ### Fixed diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index 0f1b07a81..49a48519b 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -350,16 +350,16 @@ impl BinaryOperatorBase for BaseEdgeOp { if let Some(stats) = g.get_statistics() { let stats: &GraphStatistic = stats; - if stats.cyclic { - // can get all other nodes - return Ok(EstimationType::Selectivity(1.0)); - } // get number of nodes reachable from min to max distance let max_dist = match self.spec.dist.max_dist() { std::ops::Bound::Unbounded => usize::MAX, std::ops::Bound::Included(max_dist) => max_dist, std::ops::Bound::Excluded(max_dist) => max_dist - 1, }; + if stats.cyclic && max_dist > 1 { + // can get all other nodes + return Ok(EstimationType::Selectivity(1.0)); + } let max_path_length = std::cmp::min(max_dist, stats.max_depth) as i32; let min_path_length = 
std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32; From b72e6835798d8fee6fc70dcc67610e71a33a7b3b Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 23 Jun 2026 17:59:59 +0200 Subject: [PATCH 2/4] Generalize to use the operator bound as maximal path length for estimation. This avoids using the max_depth stat which is invalid for cyclic graphs but still handles the case of unbounded operators. Also, add a test for this kind of estimation. --- CHANGELOG.md | 2 +- .../src/annis/db/aql/operators/edge_op.rs | 65 ++++++------ .../annis/db/aql/operators/edge_op/tests.rs | 100 +++++++++++++++++- 3 files changed, 133 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 206c8d981..fa1494d5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Estimation of generic edge operators for cyclic graphs should not assume all - nodes can be reached when the operator itself is limited to have the length 1. + nodes can be reached when the operator itself is limited in length. ## [4.1.4] - 2026-04-18 diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index 49a48519b..c57eddde2 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -1,6 +1,7 @@ use crate::annis::db::aql::{model::AnnotationComponentType, operators::RangeSpec}; use crate::annis::db::exec::CostEstimate; use crate::annis::errors::GraphAnnisError; +use crate::annis::operator::EstimationType::Selectivity; use crate::annis::operator::{ BinaryOperator, BinaryOperatorBase, BinaryOperatorIndex, BinaryOperatorSpec, EdgeAnnoSearchSpec, EstimationType, @@ -239,6 +240,27 @@ fn check_edge_annotation( } } +/// Guess how many nodes are reachable for a given path length. 
+fn reachable_by_path_length(stats: &GraphStatistic, path_length: i32) -> f64 { + if stats.avg_fan_out > 1.0 { + // Assume two complete k-ary trees (with the average fan-out + // as k) as defined in "Thomas Cormen: Introduction to + // algorithms (2009), page 1179) with the maximum and + // minimum height. Calculate the number of nodes for both + // complete trees and subtract them to get an estimation of + // the number of nodes that fulfill the path length + // criteria. + let k = stats.avg_fan_out; + ((k.powi(path_length) - 1.0) / (k - 1.0)).ceil() + } else { + // We can't use the formula for complete k-ary trees because + // we can't divide by zero and don't want negative numbers. + // Use the simplified estimation with multiplication + // instead. + (stats.avg_fan_out * f64::from(path_length)).ceil() + } +} + impl BaseEdgeOp {} impl std::fmt::Display for BaseEdgeOp { @@ -356,39 +378,20 @@ impl BinaryOperatorBase for BaseEdgeOp { std::ops::Bound::Included(max_dist) => max_dist, std::ops::Bound::Excluded(max_dist) => max_dist - 1, }; - if stats.cyclic && max_dist > 1 { - // can get all other nodes - return Ok(EstimationType::Selectivity(1.0)); - } - let max_path_length = std::cmp::min(max_dist, stats.max_depth) as i32; - let min_path_length = std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32; - - if stats.avg_fan_out > 1.0 { - // Assume two complete k-ary trees (with the average fan-out - // as k) as defined in "Thomas Cormen: Introduction to - // algorithms (2009), page 1179) with the maximum and - // minimum height. Calculate the number of nodes for both - // complete trees and subtract them to get an estimation of - // the number of nodes that fullfull the path length - // criteria. 
- let k = stats.avg_fan_out; - - let reachable_max: f64 = ((k.powi(max_path_length) - 1.0) / (k - 1.0)).ceil(); - let reachable_min: f64 = ((k.powi(min_path_length) - 1.0) / (k - 1.0)).ceil(); - - let reachable = reachable_max - reachable_min; - - gs_selectivity = reachable / max_nodes; + if stats.cyclic && max_dist == usize::MAX { + // can reach all other nodes without any restriction + return Ok(Selectivity(1.0)); } else { - // We can't use the formula for complete k-ary trees because - // we can't divide by zero and don't want negative numbers. - // Use the simplified estimation with multiplication - // instead. - let reachable_max: f64 = - (stats.avg_fan_out * f64::from(max_path_length)).ceil(); - let reachable_min: f64 = - (stats.avg_fan_out * f64::from(min_path_length)).ceil(); + // stats.max_depth is only valid if the graph is not cyclic + let max_path_length = if stats.cyclic { + max_dist as i32 + } else { + std::cmp::min(max_dist, stats.max_depth) as i32 + }; + let min_path_length = std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32; + let reachable_max = reachable_by_path_length(stats, max_path_length); + let reachable_min = reachable_by_path_length(stats, min_path_length); gs_selectivity = (reachable_max - reachable_min) / max_nodes; } } diff --git a/graphannis/src/annis/db/aql/operators/edge_op/tests.rs b/graphannis/src/annis/db/aql/operators/edge_op/tests.rs index 4697df744..c776f0ffc 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op/tests.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op/tests.rs @@ -2,15 +2,20 @@ use graphannis_core::graph::{ ANNIS_NS, update::{GraphUpdate, UpdateEvent}, }; +use std::assert_matches; use crate::{ AnnotationGraph, annis::{ db::{ - aql::{ast::RangeSpec, operators::PartOfSubCorpusSpec}, + aql::{ + ast::RangeSpec, + operators::{PartOfSubCorpusSpec, PointingSpec}, + }, + example_generator, exec::CostEstimate, }, - operator::{BinaryOperatorBase, BinaryOperatorSpec}, + operator::{BinaryOperatorBase, 
BinaryOperatorSpec, EstimationType}, }, }; @@ -86,3 +91,94 @@ fn inverted_partof_has_same_estimate() { assert_eq!(orig_estimate, inverted_estimate); } + +/// Test the execution plan of a graph component was cycles +#[test] +fn cycle_component_estimation() { + let mut update = GraphUpdate::new(); + + example_generator::create_corpus_structure_simple(&mut update); + example_generator::create_tokens(&mut update, Some("root/doc1"), Some("root/doc1")); + + for t in 0..10 { + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("root/doc1#tok{t}"), + target_node: format!("root/doc1#tok{}", t + 1), + layer: "default_ns".to_string(), + component_type: "Pointing".to_string(), + component_name: "dep".to_string(), + }) + .unwrap(); + } + // Add a dependency edge from the last to the first token, completing the cycle + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("root/doc1#tok10"), + target_node: format!("root/doc1#tok0"), + layer: "default_ns".to_string(), + component_type: "Pointing".to_string(), + component_name: "dep".to_string(), + }) + .unwrap(); + + let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + g.apply_update(&mut update, |_| {}).unwrap(); + + // Define an operator that operates on the generated dep component and a realistic cost estimate for LHS and RHS + let unbound_spec = PointingSpec { + name: "dep".to_string(), + edge_anno: None, + dist: RangeSpec::Unbound, + }; + let direct_spec1 = PointingSpec { + name: "dep".to_string(), + edge_anno: None, + dist: RangeSpec::Bound { + min_dist: 1, + max_dist: 1, + }, + }; + let direct_spec2 = PointingSpec { + name: "dep".to_string(), + edge_anno: None, + dist: RangeSpec::Bound { + min_dist: 2, + max_dist: 2, + }, + }; + let cost_estimate_lhs = CostEstimate { + output: 10, + intermediate_sum: 0, + processed_in_step: 0, + }; + let cost_estimate_rhs = CostEstimate { + output: 5, + intermediate_sum: 0, + processed_in_step: 0, + }; + + let unbound_op = unbound_spec + 
.create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs))) + .unwrap(); + assert_eq!( + unbound_op.estimation_type().unwrap(), + EstimationType::Selectivity(1.0), + ); + + let direct_op1 = direct_spec1 + .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs))) + .unwrap(); + assert_matches!( + direct_op1.estimation_type().unwrap(), + EstimationType::Selectivity(v) if v > 0.0 && v < 1.0 + ); + + let direct_op2 = direct_spec2 + .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs))) + .unwrap(); + assert_eq!( + direct_op1.estimation_type().unwrap(), + direct_op2.estimation_type().unwrap() + ); +} From 4ad58838abdbf4d7e6cc1ae706e792b37dcf2f04 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 23 Jun 2026 19:04:53 +0200 Subject: [PATCH 3/4] Only components of a graph are marked as cyclic, not the graph as a whole --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa1494d5a..ba0b1bccf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- Estimation of generic edge operators for cyclic graphs should not assume all - nodes can be reached when the operator itself is limited in length. +- Estimation of generic edge operators for cyclic graph components should not + assume all nodes can be reached when the operator itself is limited in length. 
## [4.1.4] - 2026-04-18 From b38724ec37f7b537fbb6ba32b30967b2bb8e8793 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 23 Jun 2026 19:08:17 +0200 Subject: [PATCH 4/4] Wording --- graphannis/src/annis/db/aql/operators/edge_op/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphannis/src/annis/db/aql/operators/edge_op/tests.rs b/graphannis/src/annis/db/aql/operators/edge_op/tests.rs index c776f0ffc..58533feeb 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op/tests.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op/tests.rs @@ -92,7 +92,7 @@ fn inverted_partof_has_same_estimate() { assert_eq!(orig_estimate, inverted_estimate); } -/// Test the execution plan of a graph component was cycles +/// Test the execution plan of a graph component with cycles #[test] fn cycle_component_estimation() { let mut update = GraphUpdate::new();