diff --git a/CHANGELOG.md b/CHANGELOG.md index f7e16399d..ba0b1bccf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Estimation of generic edge operators for cyclic graph components should not + assume all nodes can be reached when the operator itself is limited in length. + ## [4.1.4] - 2026-04-18 ### Fixed diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index 0f1b07a81..c57eddde2 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -1,6 +1,7 @@ use crate::annis::db::aql::{model::AnnotationComponentType, operators::RangeSpec}; use crate::annis::db::exec::CostEstimate; use crate::annis::errors::GraphAnnisError; +use crate::annis::operator::EstimationType::Selectivity; use crate::annis::operator::{ BinaryOperator, BinaryOperatorBase, BinaryOperatorIndex, BinaryOperatorSpec, EdgeAnnoSearchSpec, EstimationType, @@ -239,6 +240,27 @@ fn check_edge_annotation( } } +/// Guess how many nodes are reachable for a given path length, based on `stats.avg_fan_out`. +fn reachable_by_path_length(stats: &GraphStatistic, path_length: i32) -> f64 { + if stats.avg_fan_out > 1.0 { + // Assume a complete k-ary tree (with the average fan-out + // as k) as defined in "Thomas Cormen: Introduction to + // algorithms (2009), page 1179" and use the path length + // as its height: (k^h-1)/(k-1), rounded up. Callers + // subtract the value for the minimum path length from + // the value for the maximum path length to estimate the + // number of nodes that fulfill the path length criteria. + let k = stats.avg_fan_out; + ((k.powi(path_length) - 1.0) / (k - 1.0)).ceil() + } else { + // We can't use the formula for complete k-ary trees because + // we can't divide by zero and don't want negative numbers. + // Use the simplified estimation with multiplication + // instead. 
+ (stats.avg_fan_out * f64::from(path_length)).ceil() + } +} + impl BaseEdgeOp {} impl std::fmt::Display for BaseEdgeOp { @@ -350,45 +372,26 @@ impl BinaryOperatorBase for BaseEdgeOp { if let Some(stats) = g.get_statistics() { let stats: &GraphStatistic = stats; - if stats.cyclic { - // can get all other nodes - return Ok(EstimationType::Selectivity(1.0)); - } // get number of nodes reachable from min to max distance let max_dist = match self.spec.dist.max_dist() { std::ops::Bound::Unbounded => usize::MAX, std::ops::Bound::Included(max_dist) => max_dist, std::ops::Bound::Excluded(max_dist) => max_dist - 1, }; - let max_path_length = std::cmp::min(max_dist, stats.max_depth) as i32; - let min_path_length = std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32; - - if stats.avg_fan_out > 1.0 { - // Assume two complete k-ary trees (with the average fan-out - // as k) as defined in "Thomas Cormen: Introduction to - // algorithms (2009), page 1179) with the maximum and - // minimum height. Calculate the number of nodes for both - // complete trees and subtract them to get an estimation of - // the number of nodes that fullfull the path length - // criteria. - let k = stats.avg_fan_out; - - let reachable_max: f64 = ((k.powi(max_path_length) - 1.0) / (k - 1.0)).ceil(); - let reachable_min: f64 = ((k.powi(min_path_length) - 1.0) / (k - 1.0)).ceil(); - - let reachable = reachable_max - reachable_min; - - gs_selectivity = reachable / max_nodes; + if stats.cyclic && max_dist == usize::MAX { + // cyclic component and no upper distance limit: assume all other nodes can be reached + return Ok(Selectivity(1.0)); } else { - // We can't use the formula for complete k-ary trees because - // we can't divide by zero and don't want negative numbers. - // Use the simplified estimation with multiplication - // instead. 
- let reachable_max: f64 = - (stats.avg_fan_out * f64::from(max_path_length)).ceil(); - let reachable_min: f64 = - (stats.avg_fan_out * f64::from(min_path_length)).ceil(); + // stats.max_depth is only valid if the graph is not cyclic, so a cyclic component is limited only by the operator's maximum distance. NOTE(review): `max_dist as i32` truncates values above i32::MAX; confirm such distances cannot occur here + let max_path_length = if stats.cyclic { + max_dist as i32 + } else { + std::cmp::min(max_dist, stats.max_depth) as i32 + }; + let min_path_length = std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32; + let reachable_max = reachable_by_path_length(stats, max_path_length); + let reachable_min = reachable_by_path_length(stats, min_path_length); gs_selectivity = (reachable_max - reachable_min) / max_nodes; } } diff --git a/graphannis/src/annis/db/aql/operators/edge_op/tests.rs b/graphannis/src/annis/db/aql/operators/edge_op/tests.rs index 4697df744..58533feeb 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op/tests.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op/tests.rs @@ -2,15 +2,20 @@ use graphannis_core::graph::{ ANNIS_NS, update::{GraphUpdate, UpdateEvent}, }; +use std::assert_matches; use crate::{ AnnotationGraph, annis::{ db::{ - aql::{ast::RangeSpec, operators::PartOfSubCorpusSpec}, + aql::{ + ast::RangeSpec, + operators::{PartOfSubCorpusSpec, PointingSpec}, + }, + example_generator, exec::CostEstimate, }, - operator::{BinaryOperatorBase, BinaryOperatorSpec}, + operator::{BinaryOperatorBase, BinaryOperatorSpec, EstimationType}, }, }; @@ -86,3 +91,94 @@ fn inverted_partof_has_same_estimate() { assert_eq!(orig_estimate, inverted_estimate); } + +/// Test the selectivity estimation of edge operators on a graph component with cycles +#[test] +fn cycle_component_estimation() { + let mut update = GraphUpdate::new(); + + example_generator::create_corpus_structure_simple(&mut update); + example_generator::create_tokens(&mut update, Some("root/doc1"), Some("root/doc1")); + + for t in 0..10 { + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("root/doc1#tok{t}"), + target_node: format!("root/doc1#tok{}", t + 1), + layer: 
"default_ns".to_string(), + component_type: "Pointing".to_string(), + component_name: "dep".to_string(), + }) + .unwrap(); + } + // Add a dependency edge from the last to the first token, completing the cycle + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("root/doc1#tok10"), + target_node: format!("root/doc1#tok0"), + layer: "default_ns".to_string(), + component_type: "Pointing".to_string(), + component_name: "dep".to_string(), + }) + .unwrap(); + + let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + g.apply_update(&mut update, |_| {}).unwrap(); + + // Define operators on the generated dep component (no distance limit, distance 1 and distance 2) and cost estimates for LHS and RHS + let unbound_spec = PointingSpec { + name: "dep".to_string(), + edge_anno: None, + dist: RangeSpec::Unbound, + }; + let direct_spec1 = PointingSpec { + name: "dep".to_string(), + edge_anno: None, + dist: RangeSpec::Bound { + min_dist: 1, + max_dist: 1, + }, + }; + let direct_spec2 = PointingSpec { + name: "dep".to_string(), + edge_anno: None, + dist: RangeSpec::Bound { + min_dist: 2, + max_dist: 2, + }, + }; + let cost_estimate_lhs = CostEstimate { + output: 10, + intermediate_sum: 0, + processed_in_step: 0, + }; + let cost_estimate_rhs = CostEstimate { + output: 5, + intermediate_sum: 0, + processed_in_step: 0, + }; + + let unbound_op = unbound_spec + .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs))) + .unwrap(); + assert_eq!( + unbound_op.estimation_type().unwrap(), + EstimationType::Selectivity(1.0), + ); + + let direct_op1 = direct_spec1 + .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs))) + .unwrap(); + assert_matches!( + direct_op1.estimation_type().unwrap(), + EstimationType::Selectivity(v) if v > 0.0 && v < 1.0 + ); + + let direct_op2 = direct_spec2 + .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs))) + .unwrap(); + assert_eq!( + direct_op1.estimation_type().unwrap(), + 
direct_op2.estimation_type().unwrap() + ); +}