Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Estimation of generic edge operators for cyclic graph components should not
assume all nodes can be reached when the operator itself is limited in length.

## [4.1.4] - 2026-04-18

### Fixed
Expand Down
65 changes: 34 additions & 31 deletions graphannis/src/annis/db/aql/operators/edge_op.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::annis::db::aql::{model::AnnotationComponentType, operators::RangeSpec};
use crate::annis::db::exec::CostEstimate;
use crate::annis::errors::GraphAnnisError;
use crate::annis::operator::EstimationType::Selectivity;
use crate::annis::operator::{
BinaryOperator, BinaryOperatorBase, BinaryOperatorIndex, BinaryOperatorSpec,
EdgeAnnoSearchSpec, EstimationType,
Expand Down Expand Up @@ -239,6 +240,27 @@ fn check_edge_annotation(
}
}

/// Estimate the number of nodes that can be reached with paths of at most
/// the given length, based on the average fan-out of the component.
///
/// The result is rounded up to a whole number of nodes.
fn reachable_by_path_length(stats: &GraphStatistic, path_length: i32) -> f64 {
    let fan_out = stats.avg_fan_out;
    let estimate = if fan_out > 1.0 {
        // Model the reachable nodes as a complete k-ary tree of height
        // `path_length` with the average fan-out as k (see "Thomas Cormen:
        // Introduction to algorithms (2009), page 1179") and use the
        // closed formula for the node count of such a tree.
        let numerator = fan_out.powi(path_length) - 1.0;
        numerator / (fan_out - 1.0)
    } else {
        // The closed formula would divide by zero for a fan-out of exactly
        // one and produce negative numbers below that, so fall back to a
        // simple linear estimation.
        fan_out * f64::from(path_length)
    };
    estimate.ceil()
}

impl BaseEdgeOp {}

impl std::fmt::Display for BaseEdgeOp {
Expand Down Expand Up @@ -350,45 +372,26 @@ impl BinaryOperatorBase for BaseEdgeOp {

if let Some(stats) = g.get_statistics() {
let stats: &GraphStatistic = stats;
if stats.cyclic {
// can get all other nodes
return Ok(EstimationType::Selectivity(1.0));
}
// get number of nodes reachable from min to max distance
let max_dist = match self.spec.dist.max_dist() {
std::ops::Bound::Unbounded => usize::MAX,
std::ops::Bound::Included(max_dist) => max_dist,
std::ops::Bound::Excluded(max_dist) => max_dist - 1,
};
let max_path_length = std::cmp::min(max_dist, stats.max_depth) as i32;
let min_path_length = std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32;

if stats.avg_fan_out > 1.0 {
// Assume two complete k-ary trees (with the average fan-out
// as k) as defined in "Thomas Cormen: Introduction to
// algorithms (2009), page 1179) with the maximum and
// minimum height. Calculate the number of nodes for both
// complete trees and subtract them to get an estimation of
// the number of nodes that fullfull the path length
// criteria.
let k = stats.avg_fan_out;

let reachable_max: f64 = ((k.powi(max_path_length) - 1.0) / (k - 1.0)).ceil();
let reachable_min: f64 = ((k.powi(min_path_length) - 1.0) / (k - 1.0)).ceil();

let reachable = reachable_max - reachable_min;

gs_selectivity = reachable / max_nodes;
if stats.cyclic && max_dist == usize::MAX {
// can reach all other nodes without any restriction
return Ok(Selectivity(1.0));
} else {
// We can't use the formula for complete k-ary trees because
// we can't divide by zero and don't want negative numbers.
// Use the simplified estimation with multiplication
// instead.
let reachable_max: f64 =
(stats.avg_fan_out * f64::from(max_path_length)).ceil();
let reachable_min: f64 =
(stats.avg_fan_out * f64::from(min_path_length)).ceil();
// stats.max_depth is only valid if the graph is not cyclic
let max_path_length = if stats.cyclic {
max_dist as i32
} else {
std::cmp::min(max_dist, stats.max_depth) as i32
};
let min_path_length = std::cmp::max(0, self.spec.dist.min_dist() - 1) as i32;

let reachable_max = reachable_by_path_length(stats, max_path_length);
let reachable_min = reachable_by_path_length(stats, min_path_length);
gs_selectivity = (reachable_max - reachable_min) / max_nodes;
}
}
Expand Down
100 changes: 98 additions & 2 deletions graphannis/src/annis/db/aql/operators/edge_op/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,20 @@ use graphannis_core::graph::{
ANNIS_NS,
update::{GraphUpdate, UpdateEvent},
};
use std::assert_matches;

use crate::{
AnnotationGraph,
annis::{
db::{
aql::{ast::RangeSpec, operators::PartOfSubCorpusSpec},
aql::{
ast::RangeSpec,
operators::{PartOfSubCorpusSpec, PointingSpec},
},
example_generator,
exec::CostEstimate,
},
operator::{BinaryOperatorBase, BinaryOperatorSpec},
operator::{BinaryOperatorBase, BinaryOperatorSpec, EstimationType},
},
};

Expand Down Expand Up @@ -86,3 +91,94 @@ fn inverted_partof_has_same_estimate() {

assert_eq!(orig_estimate, inverted_estimate);
}

/// Test the selectivity estimation of edge operators on a graph component
/// that contains a cycle.
///
/// An operator without a distance restriction is expected to have a
/// selectivity of exactly 1.0, while operators with a fixed distance must
/// get an estimate strictly between 0.0 and 1.0.
#[test]
fn cycle_component_estimation() {
    let mut update = GraphUpdate::new();

    example_generator::create_corpus_structure_simple(&mut update);
    example_generator::create_tokens(&mut update, Some("root/doc1"), Some("root/doc1"));

    // Connect tok0 -> tok1 -> ... -> tok10 with dependency edges. The edge
    // leaving the last token points back to the first one, completing the
    // cycle.
    let last_token = 10;
    for t in 0..=last_token {
        let next = if t == last_token { 0 } else { t + 1 };
        update
            .add_event(UpdateEvent::AddEdge {
                source_node: format!("root/doc1#tok{t}"),
                target_node: format!("root/doc1#tok{next}"),
                layer: "default_ns".to_string(),
                component_type: "Pointing".to_string(),
                component_name: "dep".to_string(),
            })
            .unwrap();
    }

    let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
    g.apply_update(&mut update, |_| {}).unwrap();

    // Define operators that work on the generated dep component: one without
    // a distance restriction and two that require an exact distance.
    let unbound_spec = PointingSpec {
        name: "dep".to_string(),
        edge_anno: None,
        dist: RangeSpec::Unbound,
    };
    let distance1_spec = PointingSpec {
        name: "dep".to_string(),
        edge_anno: None,
        dist: RangeSpec::Bound {
            min_dist: 1,
            max_dist: 1,
        },
    };
    let distance2_spec = PointingSpec {
        name: "dep".to_string(),
        edge_anno: None,
        dist: RangeSpec::Bound {
            min_dist: 2,
            max_dist: 2,
        },
    };

    // Cost estimates for the LHS and RHS operands of the operator.
    let cost_estimate_lhs = CostEstimate {
        output: 10,
        intermediate_sum: 0,
        processed_in_step: 0,
    };
    let cost_estimate_rhs = CostEstimate {
        output: 5,
        intermediate_sum: 0,
        processed_in_step: 0,
    };

    // Without a length restriction, every node of the cycle can be reached.
    let unbound_op = unbound_spec
        .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs)))
        .unwrap();
    assert_eq!(
        unbound_op.estimation_type().unwrap(),
        EstimationType::Selectivity(1.0),
    );

    // A length-restricted operator must not be estimated as reaching all nodes.
    let distance1_op = distance1_spec
        .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs)))
        .unwrap();
    assert_matches!(
        distance1_op.estimation_type().unwrap(),
        EstimationType::Selectivity(v) if v > 0.0 && v < 1.0
    );

    // Requiring exactly two steps must yield the same estimate as requiring
    // exactly one step.
    let distance2_op = distance2_spec
        .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs)))
        .unwrap();
    assert_eq!(
        distance1_op.estimation_type().unwrap(),
        distance2_op.estimation_type().unwrap()
    );
}
Loading