From 6b2ee0b45ba85a223a4bcbfcf14e31f9ec64175d Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 8 Apr 2026 15:32:15 -0600 Subject: [PATCH 01/39] Fix warnings, bug in visual... --- Cargo.toml | 1 - src/balanced_tree.rs | 4 ++-- src/segments.rs | 2 +- src/substitution_matrix.rs | 1 + src/util.rs | 2 +- src/viz/mod.rs | 4 ++-- src/windowed_scores.rs | 1 + 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fbf7952..24d0d77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,4 +28,3 @@ opt-level = 3 lto = "thin" codegen-units = 1 debug = false - diff --git a/src/balanced_tree.rs b/src/balanced_tree.rs index 603b68d..80629cf 100644 --- a/src/balanced_tree.rs +++ b/src/balanced_tree.rs @@ -181,7 +181,7 @@ impl + TryFrom + Copy + Debug> AVLIndexSet { } /// Iterate over the elements in the tree, in sorted order. Returns the element index and the depth of the element in the tree. - pub fn iter(&self) -> AVLInOrderSetIterator { + pub fn iter(&self) -> AVLInOrderSetIterator<'_, T> { let mut stack = Vec::with_capacity(self.depth() + 1); if let Some(root) = self.root { stack.push((root, 0_u8)); @@ -190,7 +190,7 @@ impl + TryFrom + Copy + Debug> AVLIndexSet { AVLInOrderSetIterator { tree: self, stack } } - pub fn bfs(&self) -> AVLBFSSetIterator { + pub fn bfs(&self) -> AVLBFSSetIterator<'_, T> { let mut queue = VecDeque::new(); if let Some(root) = self.root { diff --git a/src/segments.rs b/src/segments.rs index f0914f1..6d7ddd2 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -213,7 +213,7 @@ pub struct SegmentView<'a> { #[allow(dead_code)] impl InitialSegments { - pub fn iter_segments(&self) -> impl Iterator { + pub fn iter_segments(&self) -> impl Iterator> { self.segments.iter().map(|v| SegmentView { start_col: v.start_col, end_col: v.end_col, diff --git a/src/substitution_matrix.rs b/src/substitution_matrix.rs index 6364857..40127f5 100644 --- a/src/substitution_matrix.rs +++ b/src/substitution_matrix.rs @@ -9,6 +9,7 @@ use crate::alphabet::{ }; pub trait AlignmentScore { + #[allow(dead_code)] fn score(&self, target_char: u8, query_char: u8) -> f64; fn score_with_background(&self, target_char: u8, query_char: u8, frequencies: &[f64; 4]) -> f64; diff --git a/src/util.rs b/src/util.rs index 62cabe0..1c7fe5c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -15,7 +15,7 @@ impl VecMap { Self { values } } - pub fn values(&self) -> std::slice::Iter { + pub fn values(&self) -> std::slice::Iter<'_, T> { self.values.iter() } diff --git a/src/viz/mod.rs b/src/viz/mod.rs index ad03f7f..410b86c 100644 --- a/src/viz/mod.rs +++ b/src/viz/mod.rs @@ -58,7 +58,7 @@ pub fn write_index_file( proximity_groups.iter().enumerate().for_each(|(idx, g)| { index_links.push_str(&format!( - "

region {idx} | {name} {start}:{end}

\n", + "
\n", name = alignment_data.target_name_map.get(g.target_id), start = g.target_start, end = g.target_end, @@ -746,7 +746,7 @@ impl<'a> AdjudicationSodaData<'a> { let columns = self .active_columns .iter() - .flat_map(|&(start, end)| (start..=end)) + .flat_map(|&(start, end)| start..=end) .collect_vec(); vec![columns] } diff --git a/src/windowed_scores.rs b/src/windowed_scores.rs index 6044e30..39456b4 100644 --- a/src/windowed_scores.rs +++ b/src/windowed_scores.rs @@ -45,6 +45,7 @@ impl BackgroundFrequencies for Background<'_> { } } +#[allow(dead_code)] pub struct DummyBackground {} impl BackgroundFrequencies for DummyBackground { From 67a11004848a47ce1d423906756ee2143ecfd620 Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 8 Apr 2026 17:27:33 -0600 Subject: [PATCH 02/39] Fix index page html. --- src/viz/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/viz/mod.rs b/src/viz/mod.rs index 410b86c..cf535cf 100644 --- a/src/viz/mod.rs +++ b/src/viz/mod.rs @@ -48,7 +48,7 @@ pub fn write_index_file( .enumerate() .for_each(|(idx, c)| { index_links.push_str(&format!( - "
\n", + "\n", name = c.target_name, start = c.target_start, end = c.target_end, @@ -58,7 +58,7 @@ pub fn write_index_file( proximity_groups.iter().enumerate().for_each(|(idx, g)| { index_links.push_str(&format!( - "
\n", + "\n", name = alignment_data.target_name_map.get(g.target_id), start = g.target_start, end = g.target_end, From c2bf76405d08d62fdd30862c05a425290b5dbd67 Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 8 Apr 2026 17:51:00 -0600 Subject: [PATCH 03/39] Further index page fixes. --- src/viz/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/viz/mod.rs b/src/viz/mod.rs index cf535cf..06e12c3 100644 --- a/src/viz/mod.rs +++ b/src/viz/mod.rs @@ -48,7 +48,7 @@ pub fn write_index_file( .enumerate() .for_each(|(idx, c)| { index_links.push_str(&format!( - "\n", + "\n", name = c.target_name, start = c.target_start, end = c.target_end, From 32e8d23b1e2f3af84aea7417f939785298cc390a Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 9 Apr 2026 16:01:14 -0600 Subject: [PATCH 04/39] Gather basic global trace statistics. --- src/main.rs | 31 ++++++++----- src/pipeline.rs | 2 + src/segments.rs | 4 ++ src/trace_statistics.rs | 99 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 11 deletions(-) create mode 100644 src/trace_statistics.rs diff --git a/src/main.rs b/src/main.rs index 9d6b56b..02c71f9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,6 +18,7 @@ mod support; #[allow(dead_code)] mod union_find; +mod trace_statistics; mod util; mod viterbi; mod viz; @@ -43,6 +44,7 @@ use crate::{ annotation::AmbiguousAnnotation, chunks::validate_groups, pipeline::{run_history_trace, run_naive_trace, NaiveTraceResults}, + trace_statistics::{trace_statistics, OccuranceCountingMode}, viz::{ stats::{write_family_statistics, write_inversion_statistics}, write_index_file, ICON_SVG, @@ -448,22 +450,29 @@ fn main() -> Result<()> { .par_iter() .panic_fuse() .enumerate() - .map(|(region_idx, group)| { - ( - region_idx, - run_naive_trace(group, &alignment_data, region_idx, &args), - ) - }) - .collect::>(); - naive_results.sort_by_key(|v| v.0); + .map(|(region_idx, group)| run_naive_trace(group, &alignment_data, region_idx, &args)) + .collect::>(); + naive_results.sort_by_key(|v| v.region_index); + + let trace_stats = trace_statistics( + &naive_results, + &alignment_data, + OccuranceCountingMode::Segments, + ); let mut results: Vec<(usize, Vec)> = proximity_groups .par_iter() .zip(naive_results) - .map(|(group, (region_idx, mut naive_trace))| { + .map(|(group, mut naive_trace)| { ( - region_idx, - run_history_trace(group, &alignment_data, &mut naive_trace, &args), + naive_trace.region_index, + run_history_trace( + group, + &alignment_data, + &trace_stats, + &mut naive_trace, + &args, + ), ) }) .collect(); diff --git a/src/pipeline.rs b/src/pipeline.rs index 64e8d11..7f7c0fd 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -14,6 +14,7 @@ use crate::{ score_params::{approximate_ideal_skip_state_score, ScoreParams}, segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments}, support::windowed_confidence, + trace_statistics::TraceStatistics, viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment}, viz::{ debug::{dump_debug_history_info, dump_final_trace_statistics}, @@ -260,6 +261,7 @@ pub fn run_naive_trace( pub fn run_history_trace( proximity_group: &ProximityGroup, alignment_data: &AlignmentData, + trace_statistics: &TraceStatistics, naive_trace: &mut NaiveTraceResults, args: &AuroraArgs, ) -> Vec { diff --git a/src/segments.rs b/src/segments.rs index 6d7ddd2..a2d8e20 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -220,6 +220,10 @@ impl InitialSegments { blocks: &v.blocks, }) } + + pub fn len(&self) -> usize { + self.segments.len() + } } impl, F: Fn(&I::Item, &I::Item) -> Ordering> Iterator diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs new file mode 100644 index 0000000..7a73daf --- /dev/null +++ b/src/trace_statistics.rs @@ -0,0 +1,99 @@ +use crate::{alignment::AlignmentData, pipeline::NaiveTraceResults, segments::SegmentView}; + +pub struct RegionStatistics { + pub total_bases: usize, + pub unexplained_bases: Vec, +} + +#[derive(Debug, Clone)] +pub struct QueryStatistics { + pub occurances: usize, + pub coverage: usize, +} + +pub struct TraceStatistics { + pub total_bases: usize, + pub query_statistics: Vec, + pub region_statistics: Vec, +} + +pub enum OccuranceCountingMode { + Segments, + Trace, +} + +pub fn trace_statistics( + naive_traces: &[NaiveTraceResults], + alignment_data: &AlignmentData, + count_mode: OccuranceCountingMode, +) -> TraceStatistics { + // Asumption... All regions are sorted, no gaps. At least 1 region expected... + debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0)); + debug_assert!(naive_traces + .iter() + .zip(naive_traces.iter().skip(1)) + .all(|(v1, v2)| v1.region_index + 1 == v2.region_index)); + + let mut query_stats = vec![ + QueryStatistics { + occurances: 0, + coverage: 0 + }; + alignment_data.query_name_map.size() + ]; + + let mut all_region_stats: Vec = Vec::with_capacity(naive_traces.len()); + + for trace_results in naive_traces.iter() { + match count_mode { + OccuranceCountingMode::Segments => { + for seg in trace_results.segments.iter_segments() { + for blk in seg.blocks.iter() { + if let Some(query_id) = blk.query_id { + query_stats[query_id].occurances += 1; + query_stats[query_id].coverage += blk.col_end - blk.col_start + 1; + } + } + } + } + OccuranceCountingMode::Trace => { + for trace_blk in trace_results.trace_segments.iter() { + query_stats[trace_blk.query_id].occurances += 1; + query_stats[trace_blk.query_id].coverage += + trace_blk.col_end - trace_blk.col_start + 1; + } + } + } + + let mut region_stat = RegionStatistics { + total_bases: 0, + unexplained_bases: Vec::with_capacity(trace_results.segments.len()), + }; + + let mut unexplained_bases_up_to: usize = 0; + let mut prior_segment: Option = None; + + for seg in trace_results.segments.iter_segments() { + if let Some(prior_segment) = prior_segment { + // If a skip block was the prior block, add it's bases as unexplained. + if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 { + unexplained_bases_up_to += seg.end_col - seg.start_col + 1; + } + unexplained_bases_up_to += prior_segment.end_col - seg.start_col - 1; + region_stat.total_bases += prior_segment.end_col - seg.start_col - 1; + } + region_stat.total_bases += seg.end_col - seg.start_col + 1; + region_stat.unexplained_bases.push(unexplained_bases_up_to); + + prior_segment = Some(seg); + } + + all_region_stats.push(region_stat); + } + + TraceStatistics { + total_bases: all_region_stats.iter().map(|v| v.total_bases).sum(), + query_statistics: query_stats, + region_statistics: all_region_stats, + } +} From e62c2801293654e91099c6b726f0e1b71127eff6 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 16 Apr 2026 17:56:31 -0600 Subject: [PATCH 05/39] New statistics module, incorperating into linking... --- scripts/plot_distance_distribution.py | 233 ++++++++++++++++++++++++++ src/assembly.rs | 31 ++-- src/main.rs | 1 + src/pipeline.rs | 5 +- src/statistics.rs | 75 +++++++++ src/trace_statistics.rs | 46 ++++- 6 files changed, 372 insertions(+), 19 deletions(-) create mode 100755 scripts/plot_distance_distribution.py create mode 100644 src/statistics.rs diff --git a/scripts/plot_distance_distribution.py b/scripts/plot_distance_distribution.py new file mode 100755 index 0000000..3e64a73 --- /dev/null +++ b/scripts/plot_distance_distribution.py @@ -0,0 +1,233 @@ +# Plots distributions of distances between sequences of the same family in the Genome +# Takes a single bed file as an argument. +# Results vary between exponential and power-law depending on the family. Basically all look exponential with fatter tails for a good chunk of families. +# +# Most likely, based on results on understanding the genome (and shape!), the actual distributions are very likely Hyper-Exponential Distributions. +# Basically, it means each family is sampled from one of a weighted set of exponential distributions with different rates. +# This would make sense as TE's have different rates depending on what part of the genome your in (basically, there are functional domains or regions) +# +# Testing this though, would require an implementation of Prony's method to determine the fit. I wasn't able to find any implementations so this +# and it seems implementing such a method would take quite some time. It may be worth eventually adding if better fit's are needed. +# +# For now, an exponential distribution seems to provide a good enough approximation for use in aurora. It also is easy to fit well. +# Here's an interesting paper on the topic that seems to have landed in the same space I've been in: https://www.columbia.edu/~ww2040/FittingMixturesPerfEval98.pdf + +import sys +from inspect import signature + +import matplotlib.pyplot as plt +import numpy as np +from scipy.optimize import curve_fit +from scipy.stats import ( + betaprime, + burr12, + ecdf, + expon, + fisk, + genextreme, + genpareto, + invweibull, + linregress, + lognorm, + lomax, + weibull_min, +) + +bed_file = sys.argv[1] + +seq_info = {} +cs = np.inf +ce = -np.inf + +with open(bed_file, "r") as f: + for line in f: + tokens = line.strip().split() + start = int(tokens[1]) + end = int(tokens[2]) + join_id = int(tokens[12]) + name = str(tokens[3]) + + if name.endswith("Simple_repeat"): + continue + + name = name.upper() + + if name not in seq_info: + seq_info[name] = [] + seq_info[name].append((start, end, join_id)) + + cs = min(cs, start) + ce = max(ce, end) + + +class Distribution: + def __init__(self, dist, defaults, exclude_location=True): + self._dist = dist + self.DEFAULTS = list(defaults) + self._excl_loc = exclude_location + self.NAME = getattr( + dist, "__name__", getattr(type(dist), "__qualname__", repr(dist)) + ) + + def pdf(self, x, *args): + # print(*args) + if self._excl_loc: + return self._dist.pdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.pdf(x, *args) + + def cdf(self, x, *args): + if self._excl_loc: + return self._dist.cdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.cdf(x, *args) + + def logcdf(self, x, *args): + print(*args) + if self._excl_loc: + return self._dist.logcdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.logcdf(x, *args) + + +distributions = [ + Distribution(expon, (1000.0,)), + Distribution(weibull_min, (2.0, 1000.0)), + # Distribution(invweibull, (1.0, 20000.0)), + Distribution(lomax, (1.0, 1000.0)), + # Distribution(burr12, (1.0, 1.0, 1000.0)), + # Distribution(lognorm, (1.0, 1000.0)), + # Distribution(fisk, (1.0, 1000.0)), + Distribution(genpareto, (0.0, 1000.0)), + Distribution(betaprime, (1.0, 2.0, 1000.0)), +] + + +for name, info in sorted(seq_info.items(), key=lambda k: -len(k[1])): + # if not name.startswith("CHARLIE1#"): + # continue + + info = np.array(info) + print(info.shape) + info = info[np.argsort(info[:, 0])] + starts = info[:, 0] + ends = info[:, 1] + + # Get distances between consecutive sequences... + _, indexes = np.unique(info[:, 2], return_index=True) + indexes = np.sort(indexes) + + dists = np.diff(starts[indexes]) + + counts, bins = np.histogram(dists, "auto", density=True) + x = (bins[1:] + bins[:-1]) / 2 + + # Get fit for exponential dist... + beta = np.mean(dists) # np.median(dists) / np.log(2) + + # Get fit for weibull dist... (and CDF)... + cdf = ecdf(dists).cdf + + y_transform = lambda y: np.log(-np.log(1 - y)) + + with np.errstate(divide="ignore"): + wcdfx = np.log(cdf.quantiles) + wcdfy = y_transform(cdf.probabilities) + # Estimate the error that the transform adds to the line. This makes linear fit better fit a CDF... + wcdfy_p1 = y_transform( + cdf.probabilities - np.sign(cdf.probabilities - 0.5) * 0.01 + ) + wcdfy_err = np.maximum(np.abs((wcdfy_p1 - wcdfy) / 0.01), 1e-8) + + valid_values = np.isfinite(wcdfx) & np.isfinite(wcdfy) + fit_line = curve_fit( + lambda x, m, b: m * x + b, + wcdfx[valid_values], + wcdfy[valid_values], + sigma=wcdfy_err[valid_values], + )[0] + + wk = fit_line[0] + w_beta = np.exp(-fit_line[1] / wk) + + cdf_fits = [ + curve_fit( + distrib.cdf, + cdf.quantiles, + cdf.probabilities, + distrib.DEFAULTS, + full_output=True, + )[0] + for distrib in distributions + ] + # print(cdf_fits) + # raise ValueError + + # Printout results... + print(f"Count: {starts.shape[0]}") + print(f"Genome Size: {ce - cs}") + est_scale = (ce - cs) / starts.shape[0] + print(f"Avg Occurance Rate (in nucleotides): {est_scale}") + print(f"Exponential Fit: Scale (beta): {beta}, Rate (lambda): {1 / beta}") + print( + f"Weibull Line Fit: Shape (k): {wk}, Scale (beta): {w_beta}, Rate (lambda): {1 / w_beta}" + ) + with np.printoptions(suppress=True): + for distrib, fit in zip(distributions, cdf_fits): + print(f"{distrib.NAME} CDF Fit: {fit}") + + fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2) + fig.suptitle(name) + + ax1.set_title("Histogram and Predicted PDFs") + ax1.set_ylabel("Density") + ax1.set_xlabel("Distance (nucleotides)") + ax1.stairs(counts, bins, fill=True, color="tan", label="Histogram") + ax1.plot( + x, weibull_min.pdf(x, wk, 0.0, w_beta), color="orange", label="Weibull Line Fit" + ) + ax1.plot(x, expon.pdf(x, 0.0, est_scale), color="green", label="Naive Fit Exp") + for distrib, fit in zip(distributions, cdf_fits): + ax1.plot(x, distrib.pdf(x, *fit), label=f"{distrib.NAME} CDF Fit") + ax1.legend() + + ax2.set_title("Weibull-Space of EDF") + ax2.set_ylabel(r"$ \ln(x) $") + ax2.set_xlabel(r"$ \ln(-\ln(1 - F(x))) $") + ax2.scatter(wcdfx, wcdfy, label="Empirical Distribution Function") + ax2.plot( + wcdfx, + fit_line[1] + fit_line[0] * wcdfx, + label=f"Line: {fit_line[0]:.02f}x + {fit_line[1]:.02f}", + ) + ax2.legend() + + ax3.set_title("Cumulative Distribution Functions") + ax3.set_ylabel(r"$ P(X <= x) $") + ax3.set_xlabel("Distance (nucleotides)") + ax3.scatter(cdf.quantiles, cdf.probabilities, color="tan", label="EDF") + ax3.plot( + cdf.quantiles, + weibull_min.cdf(cdf.quantiles, wk, 0.0, w_beta), + color="orange", + label="Weibull Line Fit", + ) + ax3.plot( + cdf.quantiles, + expon.cdf(cdf.quantiles, 0.0, est_scale), + color="green", + label="Naive Fit Exp", + ) + for distrib, fit in zip(distributions, cdf_fits): + ax3.plot( + cdf.quantiles, + distrib.cdf(cdf.quantiles, *fit), + label=f"{distrib.NAME} CDF Fit", + ) + ax3.legend() + + ax4.plot(cdf.quantiles, cdf.probabilities) + + fig.set_size_inches(12, 12) + fig.tight_layout() + plt.show() diff --git a/src/assembly.rs b/src/assembly.rs index 527e08f..ff92f33 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -6,6 +6,8 @@ use crate::{ alignment::{Alignment, Strand}, score_params::ScoreParams, segments::SegmentedMatrix, + statistics::Distribution, + trace_statistics::{self, TraceStatistics}, AnnotationArgs, }; @@ -164,10 +166,10 @@ fn link_assemblies( // return; //} - let target_distance = b_block.col_start as isize - a_block.col_end as isize; + let target_distance = b_block.col_start as isize - a_block.col_end as isize - 1; - let a_length = a_block.query_end.abs_diff(a_block.query_start); - let b_length = b_block.query_end.abs_diff(b_block.query_start); + let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1; + let b_length = b_block.query_end.abs_diff(b_block.query_start) + 1; let min_length = a_length.min(b_length); let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| { @@ -266,11 +268,13 @@ pub struct SegmentAssemblyGraph { } impl SegmentAssemblyGraph { - pub fn new( + pub fn new( alignments: &[Alignment], segments: &SegmentedMatrix, + trace_statistics: &TraceStatistics, score_params: &ScoreParams, annotation_args: &AnnotationArgs, + region_idx: usize, ) -> Self { let mut alignment_block_map = vec![Vec::::new(); alignments.len()]; @@ -291,19 +295,24 @@ impl SegmentAssemblyGraph { query_ids .iter() // grab the alignments for this ID - .map(|id| { - alignments - .iter() - .enumerate() - .filter(|&(_, a)| a.query_id == *id) - .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()) + .map(|&id| { + ( + id, + alignments + .iter() + .enumerate() + .filter(|&(_, a)| a.query_id == id) + .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()), + ) }) - .for_each(|compat_blocks| { + .for_each(|(id, compat_blocks)| { link_assemblies( &mut link_graph, compat_blocks, alignments, segments, + trace_statistics.query_statistics[id], + trace_statistics.region_statistics[region_idx], score_params, annotation_args, ); diff --git a/src/main.rs b/src/main.rs index 02c71f9..dd411ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,6 +11,7 @@ mod pipeline; mod score_params; mod segment_groups; mod segments; +mod statistics; mod substitution_matrix; mod support; diff --git a/src/pipeline.rs b/src/pipeline.rs index 7f7c0fd..358c617 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -13,6 +13,7 @@ use crate::{ matrix::{Matrix, MatrixDef}, score_params::{approximate_ideal_skip_state_score, ScoreParams}, segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments}, + statistics::Distribution, support::windowed_confidence, trace_statistics::TraceStatistics, viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment}, @@ -258,10 +259,10 @@ pub fn run_naive_trace( } } -pub fn run_history_trace( +pub fn run_history_trace( proximity_group: &ProximityGroup, alignment_data: &AlignmentData, - trace_statistics: &TraceStatistics, + trace_statistics: &TraceStatistics, naive_trace: &mut NaiveTraceResults, args: &AuroraArgs, ) -> Vec { diff --git a/src/statistics.rs b/src/statistics.rs new file mode 100644 index 0000000..fa3b4ef --- /dev/null +++ b/src/statistics.rs @@ -0,0 +1,75 @@ +#[allow(dead_code)] +pub trait Distribution: Clone { + fn unit() -> Self; + fn pdf(&self, x: f64) -> f64; + fn cdf(&self, x: f64) -> f64; + fn ppf(&self, p: f64) -> f64; + fn support(&self) -> (f64, f64); + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logpdf(&self, x: f64) -> f64 { + self.pdf(x).ln() + } + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + fn logccdf(&self, x: f64) -> f64 { + (1.0 - self.ccdf(x)).ln() + } +} + +#[derive(Clone)] +pub struct Exponential { + lambda: f64, +} + +impl Exponential { + pub fn new(lambda: f64) -> Self { + Self { lambda } + } + + pub fn from_scale(beta: f64) -> Self { + Self::new(1.0 / beta) + } +} + +impl Distribution for Exponential { + fn unit() -> Self { + Self::new(1.0) + } + + fn pdf(&self, x: f64) -> f64 { + self.lambda * (-self.lambda * x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + 1.0 - (-self.lambda * x).exp() + } + + fn ppf(&self, p: f64) -> f64 { + -(1.0 - p).ln() / self.lambda + } + + fn ccdf(&self, x: f64) -> f64 { + (-self.lambda * x).exp() + } + + fn logpdf(&self, x: f64) -> f64 { + self.lambda.ln() - self.lambda * x + } + + fn logcdf(&self, x: f64) -> f64 { + (-(-self.lambda * x).exp()).ln_1p() + } + + fn logccdf(&self, x: f64) -> f64 { + -self.lambda * x + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index 7a73daf..39db395 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -1,4 +1,9 @@ -use crate::{alignment::AlignmentData, pipeline::NaiveTraceResults, segments::SegmentView}; +use crate::{ + alignment::AlignmentData, + pipeline::NaiveTraceResults, + segments::SegmentView, + statistics::{Distribution, Exponential}, +}; pub struct RegionStatistics { pub total_bases: usize, @@ -6,14 +11,16 @@ pub struct RegionStatistics { } #[derive(Debug, Clone)] -pub struct QueryStatistics { +pub struct QueryStatistics { pub occurances: usize, pub coverage: usize, + pub target_span: usize, + pub distribution: T, } -pub struct TraceStatistics { +pub struct TraceStatistics { pub total_bases: usize, - pub query_statistics: Vec, + pub query_statistics: Vec>, pub region_statistics: Vec, } @@ -26,7 +33,7 @@ pub fn trace_statistics( naive_traces: &[NaiveTraceResults], alignment_data: &AlignmentData, count_mode: OccuranceCountingMode, -) -> TraceStatistics { +) -> TraceStatistics { // Asumption... All regions are sorted, no gaps. At least 1 region expected... debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0)); debug_assert!(naive_traces @@ -37,11 +44,16 @@ pub fn trace_statistics( let mut query_stats = vec![ QueryStatistics { occurances: 0, - coverage: 0 + coverage: 0, + target_span: 0, + distribution: Exponential::unit(), }; alignment_data.query_name_map.size() ]; + let mut query_span: Vec> = + vec![None; alignment_data.query_name_map.size()]; + let mut all_region_stats: Vec = Vec::with_capacity(naive_traces.len()); for trace_results in naive_traces.iter() { @@ -52,6 +64,12 @@ pub fn trace_statistics( if let Some(query_id) = blk.query_id { query_stats[query_id].occurances += 1; query_stats[query_id].coverage += blk.col_end - blk.col_start + 1; + query_span[query_id] = match query_span[query_id] { + None => Some((blk.col_start, blk.col_end)), + Some((start, end)) => { + Some((start.min(blk.col_start), end.min(blk.col_end))) + } + } } } } @@ -61,6 +79,13 @@ pub fn trace_statistics( query_stats[trace_blk.query_id].occurances += 1; query_stats[trace_blk.query_id].coverage += trace_blk.col_end - trace_blk.col_start + 1; + + query_span[trace_blk.query_id] = match query_span[trace_blk.query_id] { + None => Some((trace_blk.col_start, trace_blk.col_end)), + Some((start, end)) => { + Some((start.min(trace_blk.col_start), end.min(trace_blk.col_end))) + } + } } } } @@ -91,6 +116,15 @@ pub fn trace_statistics( all_region_stats.push(region_stat); } + for (query_info, query_span) in query_stats.iter_mut().zip(query_span.iter()) { + if let Some((start, end)) = query_span { + query_info.target_span = end - start + 1; + query_info.distribution = Exponential::from_scale( + query_info.occurances as f64 / query_info.target_span as f64, + ); + } + } + TraceStatistics { total_bases: all_region_stats.iter().map(|v| v.total_bases).sum(), query_statistics: query_stats, From decf2329140f5350f2822669a9a0a3437a910c55 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 17 Apr 2026 01:14:12 -0600 Subject: [PATCH 06/39] Fix errors. --- src/assembly.rs | 32 +++++++++++++++++--------------- src/pipeline.rs | 6 ++++++ src/segments.rs | 16 +++++++++++++--- src/statistics.rs | 6 ++++-- src/trace_statistics.rs | 24 ++++++++++++++++-------- 5 files changed, 56 insertions(+), 28 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index ff92f33..449ec68 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -7,7 +7,7 @@ use crate::{ score_params::ScoreParams, segments::SegmentedMatrix, statistics::Distribution, - trace_statistics::{self, TraceStatistics}, + trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics}, AnnotationArgs, }; @@ -138,11 +138,13 @@ fn get_link_cost( + lambda * target_gap } -fn link_assemblies( +fn link_assemblies( graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>, compatable_blocks: impl Iterator, alignments: &[Alignment], segments: &SegmentedMatrix, + query_statistics: &QueryStatistics, + region_statistics: &RegionStatistics, score_params: &ScoreParams, args: &AnnotationArgs, ) { @@ -187,30 +189,30 @@ fn link_assemblies( alignments[b_block.row_idx - 1].strand, ) { (Strand::Forward, Strand::Forward) => ( - b_block.query_start as isize - a_block.query_end as isize, + b_block.query_start as isize - a_block.query_end as isize - 1, LinkType::Forward, ), (Strand::Reverse, Strand::Reverse) => ( - a_block.query_end as isize - b_block.query_start as isize, + a_block.query_end as isize - b_block.query_start as isize - 1, LinkType::Reverse, ), (Strand::Forward, Strand::Reverse) => select_closest( ( - a_block.query_start as isize - b_block.query_start as isize, + a_block.query_start as isize - b_block.query_start as isize - 1, LinkType::FRInversion1, ), ( - b_block.query_end as isize - a_block.query_end as isize, + b_block.query_end as isize - a_block.query_end as isize - 1, LinkType::FRInversion2, ), ), (Strand::Reverse, Strand::Forward) => select_closest( ( - b_block.query_start as isize - a_block.query_start as isize, + b_block.query_start as isize - a_block.query_start as isize - 1, LinkType::RFInversion1, ), ( - a_block.query_end as isize - b_block.query_end as isize, + a_block.query_end as isize - b_block.query_end as isize - 1, LinkType::RFInversion2, ), ), @@ -271,10 +273,10 @@ impl SegmentAssemblyGraph { pub fn new( alignments: &[Alignment], segments: &SegmentedMatrix, - trace_statistics: &TraceStatistics, + region_statistics: &RegionStatistics, + query_statistics: &[QueryStatistics], score_params: &ScoreParams, annotation_args: &AnnotationArgs, - region_idx: usize, ) -> Self { let mut alignment_block_map = vec![Vec::::new(); alignments.len()]; @@ -295,13 +297,13 @@ impl SegmentAssemblyGraph { query_ids .iter() // grab the alignments for this ID - .map(|&id| { + .map(|id| { ( - id, + *id, alignments .iter() .enumerate() - .filter(|&(_, a)| a.query_id == id) + .filter(|&(_, a)| a.query_id == *id) .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()), ) }) @@ -311,8 +313,8 @@ impl SegmentAssemblyGraph { compat_blocks, alignments, segments, - trace_statistics.query_statistics[id], - trace_statistics.region_statistics[region_idx], + &query_statistics[id], + region_statistics, score_params, annotation_args, ); diff --git a/src/pipeline.rs b/src/pipeline.rs index 358c617..42851a4 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -142,6 +142,8 @@ fn get_active_columns(matrix: &Matrix) -> Vec<(u } pub struct NaiveTraceResults { + pub target_start: usize, + pub target_end: usize, pub trace_segments: Vec, pub segments: InitialSegments, pub score_params: ScoreParams, @@ -249,6 +251,8 @@ pub fn run_naive_trace( } NaiveTraceResults { + target_start: proximity_group.target_start, + target_end: proximity_group.target_end, trace_segments: simple_trace, segments, score_params, @@ -272,6 +276,8 @@ pub fn run_history_trace( proximity_group, &mut naive_trace.segments, &naive_trace.trace_segments, + &trace_statistics.region_statistics[naive_trace.region_index], + &trace_statistics.query_statistics, &naive_trace.score_params, &args.annotation_args, ); diff --git a/src/segments.rs b/src/segments.rs index a2d8e20..c990b25 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -2,8 +2,14 @@ use core::f64; use std::{cmp::Ordering, fmt::Debug, iter::Fuse}; use crate::{ - assembly::SegmentAssemblyGraph, chunks::ProximityGroup, matrix::Matrix, - score_params::ScoreParams, viterbi::TraceSegment, AnnotationArgs, + assembly::SegmentAssemblyGraph, + chunks::ProximityGroup, + matrix::Matrix, + score_params::ScoreParams, + statistics::Distribution, + trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics}, + viterbi::TraceSegment, + AnnotationArgs, }; use itertools::Itertools; @@ -556,16 +562,20 @@ pub fn segments_from_matrix_trace( } } -pub fn assemble_and_link_segments<'a>( +pub fn assemble_and_link_segments<'a, T: Distribution>( proximity_group: &ProximityGroup, initial_segments: &'a mut InitialSegments, trace_segments: &[TraceSegment], + region_statistics: &RegionStatistics, + query_statistics: &[QueryStatistics], score_params: &ScoreParams, annotation_args: &AnnotationArgs, ) -> (&'a SegmentedMatrix, SegmentAssemblyGraph) { let assembly_graph = SegmentAssemblyGraph::new( proximity_group.alignments, &initial_segments.segments, + region_statistics, + query_statistics, score_params, annotation_args, ); diff --git a/src/statistics.rs b/src/statistics.rs index fa3b4ef..5db05b7 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -1,5 +1,7 @@ +use std::fmt::Debug; + #[allow(dead_code)] -pub trait Distribution: Clone { +pub trait Distribution: Clone + Debug { fn unit() -> Self; fn pdf(&self, x: f64) -> f64; fn cdf(&self, x: f64) -> f64; @@ -21,7 +23,7 @@ pub trait Distribution: Clone { } } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct Exponential { lambda: f64, } diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index 39db395..7d7f9c6 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -65,10 +65,14 @@ pub fn trace_statistics( query_stats[query_id].occurances += 1; query_stats[query_id].coverage += blk.col_end - blk.col_start + 1; query_span[query_id] = match query_span[query_id] { - None => Some((blk.col_start, blk.col_end)), - Some((start, end)) => { - Some((start.min(blk.col_start), end.min(blk.col_end))) - } + None => Some(( + trace_results.target_start + blk.col_start, + trace_results.target_start + blk.col_end, + )), + Some((start, end)) => Some(( + start.min(trace_results.target_start + blk.col_start), + end.max(trace_results.target_start + blk.col_end), + )), } } } @@ -81,10 +85,14 @@ pub fn trace_statistics( trace_blk.col_end - trace_blk.col_start + 1; query_span[trace_blk.query_id] = match query_span[trace_blk.query_id] { - None => Some((trace_blk.col_start, trace_blk.col_end)), - Some((start, end)) => { - Some((start.min(trace_blk.col_start), end.min(trace_blk.col_end))) - } + None => Some(( + trace_results.target_start + trace_blk.col_start, + trace_results.target_start + trace_blk.col_end, + )), + Some((start, end)) => Some(( + start.min(trace_results.target_start + trace_blk.col_start), + end.max(trace_results.target_start + trace_blk.col_end), + )), } } } From 2a39a859b9993af0df7ec20dae45407ad55e4456 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 17 Apr 2026 22:27:35 -0600 Subject: [PATCH 07/39] remove unused imports --- src/assembly.rs | 2 +- src/segments.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 449ec68..b35948c 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -7,7 +7,7 @@ use crate::{ score_params::ScoreParams, segments::SegmentedMatrix, statistics::Distribution, - trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics}, + trace_statistics::{QueryStatistics, RegionStatistics}, AnnotationArgs, }; diff --git a/src/segments.rs b/src/segments.rs index c990b25..b9a7349 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -7,7 +7,7 @@ use crate::{ matrix::Matrix, score_params::ScoreParams, statistics::Distribution, - trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics}, + trace_statistics::{QueryStatistics, RegionStatistics}, viterbi::TraceSegment, AnnotationArgs, }; From 6ce0e30278a5b52ce282c95111d6f89e1c24735c Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 21 Apr 2026 00:15:19 -0600 Subject: [PATCH 08/39] New target scoring scheme is done, also added unexplained base removal. --- .gitignore | 4 ++- src/assembly.rs | 52 ++++++++++++++++++++---------- src/main.rs | 19 +++++++---- src/statistics.rs | 71 +++++++++++++++++++++++++++++++++++++++-- src/trace_statistics.rs | 25 +++++++++++---- 5 files changed, 137 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index a744cfb..801aca5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,6 @@ Cargo.lock # Visuals generated by aurora... /viz/ -out.txt + +# Temporary output files... +/out*.txt diff --git a/src/assembly.rs b/src/assembly.rs index b35948c..1061361 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -100,6 +100,7 @@ fn piecewise_linear_cost( fn get_link_cost( annotation_args: &AnnotationArgs, score_params: &ScoreParams, + target_gap_distribution: &impl Distribution, consensus_gap: f64, target_gap: f64, ) -> f64 { @@ -118,14 +119,16 @@ fn get_link_cost( .max(1.0); // Compute slopes.... - let lambda = -value_range - * (annotation_args.join_target_gap_penalty - / annotation_args.target_join_distance.max(1) as f64) - .abs(); let alpha = -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs(); let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs(); + // Compute target gap penalty. + // Doing this as the expected value over the transition scores... + let target_random_prob = target_gap_distribution.cdf(target_gap); + let target_expected_score = target_random_prob * score_params.query_jump_score + + (1.0 - target_random_prob) * score_params.query_loop_score; + // Cost = linear consensus cost + linear target gap cost... min_value + piecewise_linear_cost( @@ -135,7 +138,7 @@ fn get_link_cost( beta, consensus_gap, ) - + lambda * target_gap + + target_expected_score } fn link_assemblies( @@ -149,7 +152,6 @@ fn link_assemblies( args: &AnnotationArgs, ) { // this relies on the alignments being sorted by target start - // note: this assertion iter will only run in debug mode let compatable_blocks = compatable_blocks.sorted().collect_vec(); compatable_blocks.iter().enumerate().for_each(|(idx, a)| { @@ -162,12 +164,6 @@ fn link_assemblies( let a_block = &segments[a.0].blocks[a.1]; let b_block = &segments[b.0].blocks[b.1]; - // We allow this now, otherwise inversions might not properly join... - // If same alignment, and neighboring segments, don't join... - //if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { - // return; - //} - let target_distance = b_block.col_start as isize - a_block.col_end as isize - 1; let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1; @@ -219,14 +215,23 @@ fn link_assemblies( _ => panic!("Invalid strand types!"), }; - let within_target_distance_threshold = - target_distance < args.target_join_distance as isize; + // Incorperate unexplained bases into query distance... + let unexplained_bases = + region_statistics.unexplained_bases[b.0] - region_statistics.unexplained_bases[a.0]; + let corrected_consensus_distance = + (consensus_distance - unexplained_bases as isize).max(consensus_distance.min(0)); + + // Within target distance??? + let within_target_distance_threshold = (target_distance + < args.target_join_distance as isize) + && (query_statistics.distribution.ccdf(target_distance as f64) + >= args.target_distance_likelihood_threshold); let consensus_is_colinear = if link_type.is_inversion() { - consensus_distance.abs() < args.inversion_distance + corrected_consensus_distance.abs() < args.inversion_distance } else { - consensus_distance > -args.consensus_join_overlap - && consensus_distance < args.consensus_join_distance + corrected_consensus_distance > -args.consensus_join_overlap + && corrected_consensus_distance < args.consensus_join_distance }; // TODO: Hardcoded, change later... @@ -239,12 +244,25 @@ fn link_assemblies( get_link_cost( args, score_params, + &query_statistics.distribution, consensus_distance as f64, target_distance as f64, ) }; if within_target_distance_threshold && consensus_is_colinear && is_significant { + println!("{:?}->{:?}", a, b); + println!( + "CD: {}, UEB: {} => Corrected: {}", + consensus_distance, unexplained_bases, corrected_consensus_distance + ); + + println!( + "Target distance = {} => Prob not seeing at random = {}", + target_distance, + query_statistics.distribution.ccdf(target_distance as f64) + ); + graph.insert( ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), Edge { diff --git a/src/main.rs b/src/main.rs index dd411ef..3767ccc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -125,7 +125,7 @@ pub struct AnnotationArgs { )] pub num_skip_loops_eq_to_jump: usize, - /// The max distance across unaligned positions + /// The max distance across positions /// in the target (genome) at which a join is /// considered between compatible alignments #[arg( @@ -136,6 +136,17 @@ pub struct AnnotationArgs { )] pub target_join_distance: usize, + /// Removes joins across positions + /// in the target (genome) at which a join is + /// less than this likely to not be generated + /// at random. + #[arg( + long = "target-join-likelihood-threshold", + default_value = "0.5", + value_name = "f" + )] + pub target_distance_likelihood_threshold: f64, + /// The maximum overlap in the consensus at which /// a join is considered between compatible alignments. #[arg( @@ -241,12 +252,6 @@ pub struct AnnotationArgs { value_name = "f" )] pub join_consensus_gap_penalty: f64, - - /// The amount of penalty to apply to a join at the maximum allowed target gap - /// A value of 1 means to apply a penalty equal to a query jump. - /// The cost grows linearly to this value as the gap between the sequences in the target space increases. - #[arg(long = "target-gap-penalty", default_value = "0.4", value_name = "f")] - pub join_target_gap_penalty: f64, } #[derive(Args, Debug, Clone, Default)] diff --git a/src/statistics.rs b/src/statistics.rs index 5db05b7..309c7ff 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -11,7 +11,6 @@ pub trait Distribution: Clone + Debug { fn ccdf(&self, x: f64) -> f64 { 1.0 - self.cdf(x) } - fn logpdf(&self, x: f64) -> f64 { self.pdf(x).ln() } @@ -19,7 +18,7 @@ pub trait Distribution: Clone + Debug { self.cdf(x).ln() } fn logccdf(&self, x: f64) -> f64 { - (1.0 - self.ccdf(x)).ln() + self.ccdf(x).ln() } } @@ -75,3 +74,71 @@ impl Distribution for Exponential { (0.0, f64::INFINITY) } } + +#[derive(Debug, Clone)] +pub struct ExponentialEstimator { + sample_mean: f64, + degrees_of_freedom: usize, +} + +impl ExponentialEstimator { + pub fn new(sample_mean: f64, sample_size: usize) -> Self { + Self { + sample_mean: sample_mean, + degrees_of_freedom: sample_size, + } + } +} + +impl From for Exponential { + fn from(value: ExponentialEstimator) -> Self { + Self::from_scale(value.sample_mean) + } +} + +impl Distribution for ExponentialEstimator { + fn unit() -> Self { + Self { + sample_mean: 1.0, + degrees_of_freedom: 1, + } + } + + fn logpdf(&self, x: f64) -> f64 { + let n = self.degrees_of_freedom as f64; + let sm = self.sample_mean; + ((n + 1.0) * n.ln() + n * sm.ln()) - ((n + 1.0) * (n * sm + x).ln()) + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn logccdf(&self, x: f64) -> f64 { + let n = self.degrees_of_freedom as f64; + let sm = self.sample_mean; + n * ((n * sm).ln() - (n * sm + x).ln()) + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn cdf(&self, x: f64) -> f64 { + -(self.logccdf(x).exp_m1()) + } + + fn ccdf(&self, x: f64) -> f64 { + self.logccdf(x).exp() + } + + fn ppf(&self, p: f64) -> f64 { + let n = self.degrees_of_freedom as f64; + let sm = self.sample_mean; + (n * sm) * ((1.0 - p).powf(-1.0 / n) - 1.0) + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index 7d7f9c6..f8183fd 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -2,9 +2,10 @@ use crate::{ alignment::AlignmentData, pipeline::NaiveTraceResults, segments::SegmentView, - statistics::{Distribution, Exponential}, + statistics::{Distribution, ExponentialEstimator}, }; +#[derive(Debug)] pub struct RegionStatistics { pub total_bases: usize, pub unexplained_bases: Vec, @@ -18,7 +19,9 @@ pub struct QueryStatistics { pub distribution: T, } +#[derive(Debug)] pub struct TraceStatistics { + #[allow(dead_code)] pub total_bases: usize, pub query_statistics: Vec>, pub region_statistics: Vec, @@ -26,6 +29,7 @@ pub struct TraceStatistics { pub enum OccuranceCountingMode { Segments, + #[allow(dead_code)] Trace, } @@ -33,7 +37,7 @@ pub fn trace_statistics( naive_traces: &[NaiveTraceResults], alignment_data: &AlignmentData, count_mode: OccuranceCountingMode, -) -> TraceStatistics { +) -> TraceStatistics { // Asumption... All regions are sorted, no gaps. At least 1 region expected... debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0)); debug_assert!(naive_traces @@ -41,12 +45,17 @@ pub fn trace_statistics( .zip(naive_traces.iter().skip(1)) .all(|(v1, v2)| v1.region_index + 1 == v2.region_index)); + assert!(naive_traces + .iter() + .zip(naive_traces.iter().skip(1)) + .all(|(v1, v2)| v1.region_index + 1 == v2.region_index && v1.target_end < v2.target_start)); + let mut query_stats = vec![ QueryStatistics { occurances: 0, coverage: 0, target_span: 0, - distribution: Exponential::unit(), + distribution: ExponentialEstimator::unit(), }; alignment_data.query_name_map.size() ]; @@ -112,8 +121,8 @@ pub fn trace_statistics( if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 { unexplained_bases_up_to += seg.end_col - seg.start_col + 1; } - unexplained_bases_up_to += prior_segment.end_col - seg.start_col - 1; - region_stat.total_bases += prior_segment.end_col - seg.start_col - 1; + unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1; + region_stat.total_bases += seg.start_col - prior_segment.end_col - 1; } region_stat.total_bases += seg.end_col - seg.start_col + 1; region_stat.unexplained_bases.push(unexplained_bases_up_to); @@ -127,8 +136,10 @@ pub fn trace_statistics( for (query_info, query_span) in query_stats.iter_mut().zip(query_span.iter()) { if let Some((start, end)) = query_span { query_info.target_span = end - start + 1; - query_info.distribution = Exponential::from_scale( - query_info.occurances as f64 / query_info.target_span as f64, + // We subtract 1 because were looking at distances between each occurance as a sample value. + query_info.distribution = ExponentialEstimator::new( + query_info.target_span as f64 / query_info.occurances.saturating_sub(1) as f64, + query_info.occurances.saturating_sub(1), ); } } From cf319e384f3d38e702766109407385d923ba8609 Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 21 Apr 2026 16:48:53 -0600 Subject: [PATCH 09/39] Remove unexplained gaps. --- src/assembly.rs | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 1061361..f8604d0 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -215,12 +215,6 @@ fn link_assemblies( _ => panic!("Invalid strand types!"), }; - // Incorperate unexplained bases into query distance... - let unexplained_bases = - region_statistics.unexplained_bases[b.0] - region_statistics.unexplained_bases[a.0]; - let corrected_consensus_distance = - (consensus_distance - unexplained_bases as isize).max(consensus_distance.min(0)); - // Within target distance??? let within_target_distance_threshold = (target_distance < args.target_join_distance as isize) @@ -228,10 +222,10 @@ fn link_assemblies( >= args.target_distance_likelihood_threshold); let consensus_is_colinear = if link_type.is_inversion() { - corrected_consensus_distance.abs() < args.inversion_distance + consensus_distance.abs() < args.inversion_distance } else { - corrected_consensus_distance > -args.consensus_join_overlap - && corrected_consensus_distance < args.consensus_join_distance + consensus_distance > -args.consensus_join_overlap + && consensus_distance < args.consensus_join_distance }; // TODO: Hardcoded, change later... @@ -251,18 +245,6 @@ fn link_assemblies( }; if within_target_distance_threshold && consensus_is_colinear && is_significant { - println!("{:?}->{:?}", a, b); - println!( - "CD: {}, UEB: {} => Corrected: {}", - consensus_distance, unexplained_bases, corrected_consensus_distance - ); - - println!( - "Target distance = {} => Prob not seeing at random = {}", - target_distance, - query_statistics.distribution.ccdf(target_distance as f64) - ); - graph.insert( ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), Edge { From 4aab08d9bce1d5a90512f6a45f8e9679470eeeef Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 30 Apr 2026 11:07:49 -0600 Subject: [PATCH 10/39] Fix reading of bed files. --- scripts/plot_consensus_distance_caf.py | 44 ++++++++++++ .../plot_consensus_distance_repeatmasker.py | 71 +++++++++++++++++++ src/viz/block.rs | 2 +- 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 scripts/plot_consensus_distance_caf.py create mode 100644 scripts/plot_consensus_distance_repeatmasker.py diff --git a/scripts/plot_consensus_distance_caf.py b/scripts/plot_consensus_distance_caf.py new file mode 100644 index 0000000..18d4937 --- /dev/null +++ b/scripts/plot_consensus_distance_caf.py @@ -0,0 +1,44 @@ +import sys + +import matplotlib.pyplot as plt +import numpy as np + +caf_file = sys.argv[1] + +gap_info = {} +prior = {} + +with open(caf_file, "r") as f: + for line in f: + tokens = line.strip().split(",") + name = tokens[8].strip() + qstart = int(tokens[10]) + qend = int(tokens[11]) + tstart = int(tokens[5]) + is_neg_strand = int(tokens[13]) + + if name in prior: + pstart, pend, p_is_neg, p_tstart = prior[name] + + if tstart - p_tstart < 10000: + if name not in gap_info: + gap_info[name] = [] + + if not is_neg_strand and not p_is_neg: + gap = qstart - pend + elif is_neg_strand and p_is_neg: + gap = qend - pstart + else: + continue + + gap_info[name].append(gap) + + prior[name] = (qstart, qend, is_neg_strand, tstart) + + +for gap_name, gap_vals in sorted( + gap_info.items(), key=lambda k: len(k[1]), reverse=True +): + plt.title(gap_name) + plt.hist(gap_vals, 500) + plt.show() diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py new file mode 100644 index 0000000..ebef9cd --- /dev/null +++ b/scripts/plot_consensus_distance_repeatmasker.py @@ -0,0 +1,71 @@ +import sys + +import matplotlib.pyplot as plt +import numpy as np + +caf_file = sys.argv[1] + +gap_info = {} +prior = {} + +with open(caf_file, "r") as f: + for line in f: + shared_name = line.strip().split("\t")[3].split("#")[-1] + seqs = line.strip().split("\t")[-1].split(",") + + if len(seqs) <= 1: + continue + + prior = None + + for seq in seqs: + tokens = seq.split(" ") + name = f"{tokens[9]}#{tokens[10]}" + is_pos = tokens[8] == "+" + + if is_pos: + qstart = int(tokens[11]) + qend = int(tokens[12]) + else: + qstart = int(tokens[13]) + qend = int(tokens[12]) + + assert qend >= qstart + + if prior is not None: + pname, ppos, pstart, pend = prior + + try: + if is_pos and ppos: + assert pstart <= qstart <= qend and pstart <= pend <= qend + gap = qstart - pend + elif not is_pos and not ppos: + assert qstart <= qend <= pstart and qstart <= pstart <= pend + gap = pstart - qend + else: + gap = None + except AssertionError: + if is_pos and ppos: + print(f"Violation: {pstart} => {pend} => {qstart} => {qend}") + else: + print(f"Violation: {pstart} <= {pend} <= {qstart} <= {qend}") + + print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}") + print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}") + gap = None + + if gap is not None: + if shared_name not in gap_info: + gap_info[shared_name] = [] + + gap_info[shared_name].append(gap) + + prior = (name, is_pos, qstart, qend) + + +for gap_name, gap_vals in sorted( + gap_info.items(), key=lambda k: len(k[1]), reverse=True +): + plt.title(gap_name) + plt.hist(gap_vals, 100) + plt.show() diff --git a/src/viz/block.rs b/src/viz/block.rs index 6005b35..4065e6d 100644 --- a/src/viz/block.rs +++ b/src/viz/block.rs @@ -189,7 +189,7 @@ impl BlockGroup { None } else { match elems[8] { - "C" => Some(bed.strand), + "C" => Some(Strand::Reverse), _ => Some(Strand::from_str(elems[8])), } } From 03c24b4226dde25450f9ac7a72465efe9012e260 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 30 Apr 2026 17:39:59 -0600 Subject: [PATCH 11/39] Add divergence calculation for segments, to use in linking. Adjust max consensus distance to match what is used in repeat masker --- .../plot_consensus_distance_repeatmasker.py | 89 ++++++++++++++----- src/main.rs | 2 +- src/segments.rs | 16 +++- 3 files changed, 80 insertions(+), 27 deletions(-) diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py index ebef9cd..755dc3e 100644 --- a/scripts/plot_consensus_distance_repeatmasker.py +++ b/scripts/plot_consensus_distance_repeatmasker.py @@ -1,3 +1,5 @@ +# This file plots histograms of join distance per tandem repeat family given a repeatmasker formatted bed file as an argument. + import sys import matplotlib.pyplot as plt @@ -6,6 +8,7 @@ caf_file = sys.argv[1] gap_info = {} +length_estimate = {} prior = {} with open(caf_file, "r") as f: @@ -16,7 +19,11 @@ if len(seqs) <= 1: continue + # Sort by location on the target... + seqs = sorted(seqs, key=lambda k: int(k.split(" ")[5])) + prior = None + length = 0 for seq in seqs: tokens = seq.split(" ") @@ -31,41 +38,75 @@ qend = int(tokens[12]) assert qend >= qstart + length += qend - qstart if prior is not None: pname, ppos, pstart, pend = prior - - try: - if is_pos and ppos: - assert pstart <= qstart <= qend and pstart <= pend <= qend - gap = qstart - pend - elif not is_pos and not ppos: - assert qstart <= qend <= pstart and qstart <= pstart <= pend - gap = pstart - qend - else: - gap = None - except AssertionError: - if is_pos and ppos: - print(f"Violation: {pstart} => {pend} => {qstart} => {qend}") - else: - print(f"Violation: {pstart} <= {pend} <= {qstart} <= {qend}") - - print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}") - print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}") + if pname != name: gap = None + else: + try: + if is_pos and ppos: + assert pstart <= qstart <= qend and pstart <= pend <= qend + gap = qstart - pend + elif not is_pos and not ppos: + assert qstart <= qend <= pend and qstart <= pstart <= pend + gap = pstart - qend + else: + gap = None + except AssertionError: + if is_pos and ppos: + print( + f"Violation: {pstart} => {pend} => {qstart} => {qend}" + ) + else: + print( + f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}" + ) + + print( + f"Offending annotation: {' '.join(line.split('\t')[:-1])}" + ) + print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}") + gap = None - if gap is not None: - if shared_name not in gap_info: - gap_info[shared_name] = [] + if gap is not None: + if shared_name not in gap_info: + gap_info[shared_name] = [] - gap_info[shared_name].append(gap) + gap_info[shared_name].append(gap) prior = (name, is_pos, qstart, qend) + old_length_est = length_estimate.get(shared_name, (0, 0, 0)) + length_estimate[shared_name] = ( + old_length_est[0] + length, + old_length_est[1] + 1, + max(old_length_est[2], length), + ) + for gap_name, gap_vals in sorted( gap_info.items(), key=lambda k: len(k[1]), reverse=True ): - plt.title(gap_name) - plt.hist(gap_vals, 100) + gap_vals = np.array(gap_vals) + # gap_vals = gap_vals[np.abs(gap_vals) > 1] + + avg_len = length_estimate[gap_name][0] / length_estimate[gap_name][1] + max_len = length_estimate[gap_name][2] + + plt.title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})") + + avg_pos_d = np.mean(gap_vals[gap_vals >= 0]) + avg_neg_d = np.mean(gap_vals[gap_vals <= 0]) + + info = f"Positive Avg: {avg_pos_d:.02f}, Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})" + + range = (0, np.max(gap_vals)) + x = np.arange(*range) + lamb = 1 / avg_pos_d + + plt.hist(gap_vals, 100, label=info) + # plt.plot(x, lamb * np.exp(-lamb * x)) + plt.legend() plt.show() diff --git a/src/main.rs b/src/main.rs index 3767ccc..2f1e11a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -162,7 +162,7 @@ pub struct AnnotationArgs { #[arg( short = 'C', long = "consensus-join-distance", - default_value = "2000", + default_value = "3750", value_name = "n" )] pub consensus_join_distance: isize, diff --git a/src/segments.rs b/src/segments.rs index b9a7349..fff8da8 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -66,6 +66,7 @@ pub struct Block { pub query_end: usize, pub avg_confidence: f64, pub alignment_score: f64, + pub kimura80: f64, pub can_join_up_to: usize, } @@ -534,17 +535,28 @@ pub fn segments_from_matrix_trace( _ => None, }; + let query_start = confidence_matrix.consensus_position(row_idx, start); + let query_end = confidence_matrix.consensus_position(row_idx, end); + + let kimura80 = match block_type { + BlockType::Alignment => { + group.alignments[row_idx - 1].kimura80(query_start, query_end) + } + _ => 0.0, + }; + Block { row_idx, block_type, query_id, col_start: start, col_end: end, - query_start: confidence_matrix.consensus_position(row_idx, start), - query_end: confidence_matrix.consensus_position(row_idx, end), + query_start, + query_end, avg_confidence: row_conf_sum[row_idx] / (row_valid_cell_count[row_idx].max(1) as f64), alignment_score: row_scores[row_idx], + kimura80, can_join_up_to: s_idx, } }) From 8ba72ebf271d379f634bf336127d67473038b555 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 1 May 2026 01:19:30 -0600 Subject: [PATCH 12/39] move consensus distance out into it's own function. --- src/assembly.rs | 95 +++++++++++++++++++++++++------------------------ src/segments.rs | 8 +++++ 2 files changed, 57 insertions(+), 46 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index f8604d0..97e49a7 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -5,7 +5,7 @@ use itertools::Itertools; use crate::{ alignment::{Alignment, Strand}, score_params::ScoreParams, - segments::SegmentedMatrix, + segments::{Block, SegmentedMatrix}, statistics::Distribution, trace_statistics::{QueryStatistics, RegionStatistics}, AnnotationArgs, @@ -141,10 +141,55 @@ fn get_link_cost( + target_expected_score } +pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize { + second_block.col_start as isize - first_block.col_end as isize - 1 +} + +pub fn block_consensus_distance(first_block: &Block, second_block: &Block) -> (isize, LinkType) { + let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| { + if prop1.0.abs() < prop2.0.abs() { + prop1 + } else { + prop2 + } + }; + + match (first_block.strand, second_block.strand) { + (Strand::Forward, Strand::Forward) => ( + second_block.query_start as isize - first_block.query_end as isize - 1, + LinkType::Forward, + ), + (Strand::Reverse, Strand::Reverse) => ( + first_block.query_end as isize - second_block.query_start as isize - 1, + LinkType::Reverse, + ), + (Strand::Forward, Strand::Reverse) => select_closest( + ( + first_block.query_start as isize - second_block.query_start as isize - 1, + LinkType::FRInversion1, + ), + ( + second_block.query_end as isize - first_block.query_end as isize - 1, + LinkType::FRInversion2, + ), + ), + (Strand::Reverse, Strand::Forward) => select_closest( + ( + second_block.query_start as isize - first_block.query_start as isize - 1, + LinkType::RFInversion1, + ), + ( + first_block.query_end as isize - second_block.query_end as isize - 1, + LinkType::RFInversion2, + ), + ), + _ => panic!("Invalid strand types!"), + } +} + fn link_assemblies( graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>, compatable_blocks: impl Iterator, - alignments: &[Alignment], segments: &SegmentedMatrix, query_statistics: &QueryStatistics, region_statistics: &RegionStatistics, @@ -164,56 +209,15 @@ fn link_assemblies( let a_block = &segments[a.0].blocks[a.1]; let b_block = &segments[b.0].blocks[b.1]; - let target_distance = b_block.col_start as isize - a_block.col_end as isize - 1; + let target_distance = block_target_distance(a_block, b_block); let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1; let b_length = b_block.query_end.abs_diff(b_block.query_start) + 1; let min_length = a_length.min(b_length); - let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| { - if prop1.0.abs() < prop2.0.abs() { - prop1 - } else { - prop2 - } - }; - // Query bounds are reversed for reverse sequences, so the start is actually greater than the end (Ex. start: 1510 -> end: 105) - let (consensus_distance, link_type) = match ( - alignments[a_block.row_idx - 1].strand, - alignments[b_block.row_idx - 1].strand, - ) { - (Strand::Forward, Strand::Forward) => ( - b_block.query_start as isize - a_block.query_end as isize - 1, - LinkType::Forward, - ), - (Strand::Reverse, Strand::Reverse) => ( - a_block.query_end as isize - b_block.query_start as isize - 1, - LinkType::Reverse, - ), - (Strand::Forward, Strand::Reverse) => select_closest( - ( - a_block.query_start as isize - b_block.query_start as isize - 1, - LinkType::FRInversion1, - ), - ( - b_block.query_end as isize - a_block.query_end as isize - 1, - LinkType::FRInversion2, - ), - ), - (Strand::Reverse, Strand::Forward) => select_closest( - ( - b_block.query_start as isize - a_block.query_start as isize - 1, - LinkType::RFInversion1, - ), - ( - a_block.query_end as isize - b_block.query_end as isize - 1, - LinkType::RFInversion2, - ), - ), - _ => panic!("Invalid strand types!"), - }; + let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block); // Within target distance??? let within_target_distance_threshold = (target_distance @@ -311,7 +315,6 @@ impl SegmentAssemblyGraph { link_assemblies( &mut link_graph, compat_blocks, - alignments, segments, &query_statistics[id], region_statistics, diff --git a/src/segments.rs b/src/segments.rs index fff8da8..a9aa282 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -2,6 +2,7 @@ use core::f64; use std::{cmp::Ordering, fmt::Debug, iter::Fuse}; use crate::{ + alignment::Strand, assembly::SegmentAssemblyGraph, chunks::ProximityGroup, matrix::Matrix, @@ -59,6 +60,7 @@ pub enum BlockType { pub struct Block { pub row_idx: usize, pub block_type: BlockType, + pub strand: Strand, pub query_id: Option, pub col_start: usize, pub col_end: usize, @@ -545,9 +547,15 @@ pub fn segments_from_matrix_trace( _ => 0.0, }; + let strand = match block_type { + BlockType::Alignment => group.alignments[row_idx - 1].strand, + _ => Strand::Forward, + }; + Block { row_idx, block_type, + strand, query_id, col_start: start, col_end: end, From 75a319ab19532c75586ed079580da062ce068ca1 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 1 May 2026 17:16:48 -0600 Subject: [PATCH 13/39] Work on stat extraction from aurora. --- scripts/plot_consensus_distance_repeatmasker.py | 2 +- src/annotation.rs | 10 ++++++++-- src/pipeline.rs | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py index 755dc3e..5390df2 100644 --- a/scripts/plot_consensus_distance_repeatmasker.py +++ b/scripts/plot_consensus_distance_repeatmasker.py @@ -13,7 +13,7 @@ with open(caf_file, "r") as f: for line in f: - shared_name = line.strip().split("\t")[3].split("#")[-1] + shared_name = line.strip().split("\t")[3] # .split("#")[-1] seqs = line.strip().split("\t")[-1].split(",") if len(seqs) <= 1: diff --git a/src/annotation.rs b/src/annotation.rs index c904a57..0c97aba 100644 --- a/src/annotation.rs +++ b/src/annotation.rs @@ -108,7 +108,12 @@ fn get_strings(simple_annotations: &[SimpleAnnotation], simplify: bool) -> [Stri [ get_mutli_option_string(simple_annotations, |v| v.target_start, simplify), get_mutli_option_string(simple_annotations, |v| v.target_end, simplify), - get_mutli_option_string(simple_annotations, |v| v.query_name.clone(), simplify), + get_mutli_option_string( + simple_annotations, + // Remove spaces as it breaks the format... + |v| v.query_name.replace(" ", "-"), + simplify, + ), get_mutli_option_string(simple_annotations, |v| v.query_start, simplify), get_mutli_option_string(simple_annotations, |v| v.query_end, simplify), get_mutli_option_string(simple_annotations, |v| v.strand, simplify), @@ -127,7 +132,8 @@ impl AmbiguousAnnotation { format!( "{:w0$} {:w1$} {:w2$} {:w3$} {:w4$} {:w5$} {:w6$} {:w7$} {:4.3} {:w8$} {:w9$} {}", self.annotations.len(), - self.target_name, + // Remove spaces as it breaks the format... + self.target_name.replace(" ", "-"), ts, te, qn, diff --git a/src/pipeline.rs b/src/pipeline.rs index 42851a4..377266f 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -61,7 +61,7 @@ pub fn to_annotations( a.row_idx - proximity_group.alignments.len() - 1; let repeat = &proximity_group.tandem_repeats[tandem_repeat_idx]; format!( - "({}:{})#tandem repeat", + "({}:{})#tandem-repeat", repeat.period, repeat.consensus_pattern, ) } From 4ce5845e0c460b5c72b33b01dd938293e158476a Mon Sep 17 00:00:00 2001 From: isaacr Date: Mon, 4 May 2026 18:36:41 -0600 Subject: [PATCH 14/39] Work on parameter analysis for joins. --- scripts/plot_distributions_aurora.py | 266 ++++++++++++++++++ ...y => plot_target_distance_distribution.py} | 0 2 files changed, 266 insertions(+) create mode 100644 scripts/plot_distributions_aurora.py rename scripts/{plot_distance_distribution.py => plot_target_distance_distribution.py} (100%) diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py new file mode 100644 index 0000000..b0f368d --- /dev/null +++ b/scripts/plot_distributions_aurora.py @@ -0,0 +1,266 @@ +import sys +import typing +from dataclasses import dataclass, fields + +import matplotlib.pyplot as plt +import numpy as np +from scipy.optimize import curve_fit +from scipy.stats import ecdf, expon, genpareto, gumbel_r, invweibull, norm, weibull_min + + +@dataclass +class AuroraEntry: + annotation_count: int + target: str + target_start: list[int] + target_end: list[int] + query: list[str] + query_start: list[int] + query_end: list[int] + strand: list[str] + score: float + kimera80: list[float] + join_id: int + region: int + + @classmethod + def from_line(cls, line: str) -> typing.Self: + parts = line.strip().split() + + parsed = [] + max_list_len = 0 + + for field, part in zip(fields(cls), parts): + if isinstance(field.type, str): + raise ValueError("Can't parse strings!") + + origin = typing.get_origin(field.type) + if origin is not None and issubclass(origin, list): + tp = typing.get_args(field.type)[0] + parsed.append([tp(v) for v in part.split(",")]) + max_list_len = max(max_list_len, len(parsed[-1])) + else: + parsed.append(field.type(part)) + + if max_list_len > 0: + parsed_new = [] + + for v in parsed: + if isinstance(v, list): + if len(v) == max_list_len: + parsed_new.append(v) + elif len(v) == 1: + parsed_new.append(v * max_list_len) + else: + raise ValueError( + f"One of the elements has and invalid length: {v}" + ) + else: + parsed_new.append(v) + + parsed = parsed_new + + return cls(*parsed) + + +aurora_file = sys.argv[1] + +joined_annots: dict[tuple[int, int], list[AuroraEntry]] = {} + +with open(aurora_file, "r") as f: + for line in f: + entry = AuroraEntry.from_line(line) + key = (entry.region, entry.join_id) + if key not in joined_annots: + joined_annots[key] = [] + joined_annots[key].append(entry) + +for key, annots in joined_annots.items(): + annots.sort(key=lambda k: min(k.target_start)) + + +random_stats = {} +join_stats = {} +prior_vals = {} +seq_size = {} + + +def consensus_dist(a: AuroraEntry, b: AuroraEntry, a_idx: int, b_idx: int): + def best(*a): + return min(a, key=lambda v: abs(v)) + + match (a.strand[a_idx], b.strand[b_idx]): + case ("+", "+"): + return b.query_start[b_idx] - a.query_end[a_idx] - 1 + case ("-", "-"): + return a.query_end[a_idx] - b.query_start[b_idx] - 1 + case ("-", "+"): + return best( + a.query_start[a_idx] - b.query_start[b_idx] - 1, + b.query_end[b_idx] - a.query_end[a_idx] - 1, + ) + case ("+", "-"): + return best( + b.query_start[b_idx] - a.query_start[a_idx] - 1, + a.query_end[a_idx] - b.query_end[b_idx] - 1, + ) + case _: + raise ValueError("Unknown stand configuration.") + + +def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int): + assert b.target_start[bi] - a.target_end[ai] - 1 >= 0 + return b.target_start[bi] - a.target_end[ai] - 1 + + +stats_to_compute = { + "Consensus Distance": consensus_dist, + "Target Distance": _target_distance, + "Divergence Change": lambda a, b, ai, bi: b.kimera80[bi] - a.kimera80[ai], +} + + +class Distribution: + def __init__(self, dist, defaults, exclude_location=True): + self._dist = dist + self.DEFAULTS = list(defaults) + self._excl_loc = exclude_location + self.NAME = getattr( + dist, "__name__", getattr(type(dist), "__qualname__", repr(dist)) + ) + + def pdf(self, x, *args): + # print(*args) + if self._excl_loc: + return self._dist.pdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.pdf(x, *args) + + def cdf(self, x, *args): + if self._excl_loc: + return self._dist.cdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.cdf(x, *args) + + def logcdf(self, x, *args): + print(*args) + if self._excl_loc: + return self._dist.logcdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.logcdf(x, *args) + + +estimator = { + "Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False), + "Target Distance": Distribution( + weibull_min, (1.0, 10000.0) + ), # Distribution(genpareto, (0.0, 1.0)), + "Divergence Change": Distribution(norm, (0.0, 1.0), False), +} + + +def fit_dist(data, dist): + emp_cdf = ecdf(data).cdf + + return curve_fit( + dist.cdf, + emp_cdf.quantiles, + emp_cdf.probabilities, + dist.DEFAULTS, + full_output=True, + )[0] + + +for name in stats_to_compute: + join_stats[name] = {} + random_stats[name] = {} + +for k, annots in sorted( + joined_annots.items(), key=lambda k: min(min(v.target_start) for v in k[1]) +): + for ann in annots[:1]: + for i in range(len(ann.query)): + name = ann.query[i] + (pann, j) = prior_vals.get(name, (None, None)) + if pann is not None: + for stat_name, values_per_query in random_stats.items(): + (ann1, idx1), (ann2, idx2) = sorted( + [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]] + ) + + if name not in values_per_query: + values_per_query[name] = [] + values_per_query[name].append( + stats_to_compute[stat_name](ann1, ann2, idx1, idx2) + ) + + seq_size[name] = max( + seq_size.get(name, 1), + ann.query_start[i], + ann.query_end[i], + ) + prior_vals[name] = (ann, i) + + if len(annots) <= 1: + continue + + for a, b in zip(annots[:-1], annots[1:]): + for i in range(len(a.query)): + name = a.query[i] + for stat_name, values_per_query in join_stats.items(): + if name not in values_per_query: + values_per_query[name] = [] + values_per_query[name].append(stats_to_compute[stat_name](a, b, i, i)) + + +for query_name, _ in sorted( + join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1]) +): + fig, axs = plt.subplots(3, len(stats_to_compute)) + axs = axs.T + + fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})") + + for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs): + est = estimator[name] + + sx = np.sort(join_stats[name][query_name]) + fit = fit_dist(sx, est) + ax1.set_title(f"Join {name}") + ax1.hist( + join_stats[name][query_name], + 50, + density=True, + label=f"Mean: {np.mean(join_stats[name][query_name]):.02f}\nSTD: {np.std(join_stats[name][query_name]):.02f}", + ) + ax1.plot( + sx, est.pdf(sx, *fit), label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}" + ) + ax1.legend() + + sx2 = np.sort(random_stats[name][query_name]) + fit2 = fit_dist(sx2, est) + ax2.set_title(f"All {name}") + ax2.hist( + random_stats[name][query_name], + 50, + density=True, + label=f"Mean: {np.mean(random_stats[name][query_name]):.02f}\nSTD: {np.std(random_stats[name][query_name]):.02f}", + ) + ax2.plot( + sx2, + est.pdf(sx2, *fit2), + label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}", + ) + ax2.legend() + + ax3.set_title("CDFs") + ax3.ecdf(join_stats[name][query_name], label="Joins CDF") + ax3.ecdf(random_stats[name][query_name], label="All CDF") + ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF") + ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF") + ax3.legend() + + fig.set_size_inches(12, 8) + fig.tight_layout() + plt.show() diff --git a/scripts/plot_distance_distribution.py b/scripts/plot_target_distance_distribution.py similarity index 100% rename from scripts/plot_distance_distribution.py rename to scripts/plot_target_distance_distribution.py From b3fc5e06bfe8b4840244436cf6a117533d631be5 Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 5 May 2026 17:31:05 -0600 Subject: [PATCH 15/39] Further exploration of join distributions. --- .../plot_consensus_distance_repeatmasker.py | 141 +++++++++---- scripts/plot_distributions_aurora.py | 190 +++++++++++++++--- 2 files changed, 257 insertions(+), 74 deletions(-) diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py index 5390df2..2bd07bd 100644 --- a/scripts/plot_consensus_distance_repeatmasker.py +++ b/scripts/plot_consensus_distance_repeatmasker.py @@ -9,12 +9,39 @@ gap_info = {} length_estimate = {} -prior = {} +sequences = [] + + +def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True): + try: + if is_pos and ppos: + if check_valid: + assert pstart <= qstart <= qend and pstart <= pend <= qend + gap = qstart - pend + elif not is_pos and not ppos: + if check_valid: + assert qstart <= qend <= pend and qstart <= pstart <= pend + gap = pstart - qend + else: + gap = None + except AssertionError: + if is_pos and ppos: + print(f"Violation: {pstart} => {pend} => {qstart} => {qend}") + else: + print(f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}") + + print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}") + print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}") + gap = None + + return gap + with open(caf_file, "r") as f: for line in f: shared_name = line.strip().split("\t")[3] # .split("#")[-1] seqs = line.strip().split("\t")[-1].split(",") + sequences.extend(seqs) if len(seqs) <= 1: continue @@ -29,6 +56,7 @@ tokens = seq.split(" ") name = f"{tokens[9]}#{tokens[10]}" is_pos = tokens[8] == "+" + join_id = int(tokens[14]) if is_pos: qstart = int(tokens[11]) @@ -45,36 +73,13 @@ if pname != name: gap = None else: - try: - if is_pos and ppos: - assert pstart <= qstart <= qend and pstart <= pend <= qend - gap = qstart - pend - elif not is_pos and not ppos: - assert qstart <= qend <= pend and qstart <= pstart <= pend - gap = pstart - qend - else: - gap = None - except AssertionError: - if is_pos and ppos: - print( - f"Violation: {pstart} => {pend} => {qstart} => {qend}" - ) - else: - print( - f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}" - ) - - print( - f"Offending annotation: {' '.join(line.split('\t')[:-1])}" - ) - print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}") - gap = None - - if gap is not None: - if shared_name not in gap_info: - gap_info[shared_name] = [] - - gap_info[shared_name].append(gap) + gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos) + + if gap is not None: + if shared_name not in gap_info: + gap_info[shared_name] = [] + + gap_info[shared_name].append(gap) prior = (name, is_pos, qstart, qend) @@ -86,27 +91,81 @@ ) -for gap_name, gap_vals in sorted( +sequences.sort(key=lambda seq: int(seq.split(" ")[5])) +gap_nojoin_info = {} +other_priors = {} +target_nojoin_info = {} + +for seq in sequences: + tokens = seq.split(" ") + name = f"{tokens[9]}#{tokens[10]}" + is_pos = tokens[8] == "+" + join_id = int(tokens[14]) + tstart = int(tokens[5]) + + if is_pos: + qstart = int(tokens[11]) + qend = int(tokens[12]) + else: + qstart = int(tokens[13]) + qend = int(tokens[12]) + + random_prior = other_priors.get(name, None) + if random_prior is not None: + ppos, pstart, pend, pjoin_id, p_tstart = random_prior + if pjoin_id == join_id: + gap = None + else: + gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos, False) + + target_gap = tstart - p_tstart + + if gap is not None: + if name not in target_nojoin_info: + target_nojoin_info[name] = [] + target_nojoin_info[name].append(target_gap) + + if name not in gap_nojoin_info: + gap_nojoin_info[name] = [] + gap_nojoin_info[name].append(gap) + + other_priors[name] = (is_pos, qstart, qend, join_id, tstart) + + +for gap_name, gap_join_vals in sorted( gap_info.items(), key=lambda k: len(k[1]), reverse=True ): - gap_vals = np.array(gap_vals) + gap_join_vals = np.array(gap_join_vals) # gap_vals = gap_vals[np.abs(gap_vals) > 1] avg_len = length_estimate[gap_name][0] / length_estimate[gap_name][1] max_len = length_estimate[gap_name][2] - plt.title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})") - - avg_pos_d = np.mean(gap_vals[gap_vals >= 0]) - avg_neg_d = np.mean(gap_vals[gap_vals <= 0]) + avg_pos_d = np.mean(gap_join_vals[gap_join_vals >= 0]) + avg_neg_d = np.mean(gap_join_vals[gap_join_vals <= 0]) - info = f"Positive Avg: {avg_pos_d:.02f}, Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})" + info = f"Positive Avg: {avg_pos_d:.02f},\n Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})" - range = (0, np.max(gap_vals)) + range = (0, np.max(gap_join_vals)) x = np.arange(*range) lamb = 1 / avg_pos_d - plt.hist(gap_vals, 100, label=info) + fig, (ax1, ax2, ax3) = plt.subplots(1, 3, squeeze=True) + ax1.set_title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})") + + ax1.hist(gap_join_vals, 50, label=info) # plt.plot(x, lamb * np.exp(-lamb * x)) - plt.legend() + ax1.legend() + + ax2.set_title(f"{gap_name} Non-Join Consensus Gaps") + ax2.hist(gap_nojoin_info[gap_name], 50) + + ax3.set_title(f"{gap_name} Scatterplot") + ax3.scatter(target_nojoin_info[gap_name], gap_nojoin_info[gap_name]) + ax3.set_xlabel("Target Distance") + ax3.set_ylabel("Consensus Distance") + + w, h = fig.get_size_inches() + fig.set_size_inches(w * 3, h) + fig.tight_layout() plt.show() diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py index b0f368d..4d12d43 100644 --- a/scripts/plot_distributions_aurora.py +++ b/scripts/plot_distributions_aurora.py @@ -62,6 +62,25 @@ def from_line(cls, line: str) -> typing.Self: return cls(*parsed) + def get_key(self) -> tuple[int, int]: + return (self.region, self.join_id) + + def select(self, idx: int) -> typing.Self: + return type(self)( + self.annotation_count, + self.target, + [self.target_start[idx]], + [self.target_end[idx]], + [self.query[idx]], + [self.query_start[idx]], + [self.query_end[idx]], + [self.strand[idx]], + self.score, + [self.kimera80[idx]], + self.join_id, + self.region, + ) + aurora_file = sys.argv[1] @@ -70,7 +89,7 @@ def from_line(cls, line: str) -> typing.Self: with open(aurora_file, "r") as f: for line in f: entry = AuroraEntry.from_line(line) - key = (entry.region, entry.join_id) + key = entry.get_key() if key not in joined_annots: joined_annots[key] = [] joined_annots[key].append(entry) @@ -95,11 +114,13 @@ def best(*a): case ("-", "-"): return a.query_end[a_idx] - b.query_start[b_idx] - 1 case ("-", "+"): + return None return best( a.query_start[a_idx] - b.query_start[b_idx] - 1, b.query_end[b_idx] - a.query_end[a_idx] - 1, ) case ("+", "-"): + return None return best( b.query_start[b_idx] - a.query_start[a_idx] - 1, a.query_end[a_idx] - b.query_end[b_idx] - 1, @@ -113,10 +134,34 @@ def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int): return b.target_start[bi] - a.target_end[ai] - 1 +def _kimura_dist(a, b, ai, bi): + return b.kimera80[bi] - a.kimera80[ai] + + +def _relative_consensus_dist(a, b, ai, bi): + d = consensus_dist(a, b, ai, bi) + min_seq_len = min( + [ + abs(v) + for v in ( + a.query_end[ai] - a.query_start[ai], + b.query_end[bi] - b.query_start[bi], + ) + ] + ) + if d is None: + return None + try: + return d / min_seq_len + except ZeroDivisionError: + return None + + stats_to_compute = { "Consensus Distance": consensus_dist, "Target Distance": _target_distance, - "Divergence Change": lambda a, b, ai, bi: b.kimera80[bi] - a.kimera80[ai], + "Divergence Change": _kimura_dist, + "Relative Consensus Distance": _relative_consensus_dist, } @@ -151,10 +196,11 @@ def logcdf(self, x, *args): estimator = { + "Relative Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False), "Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False), "Target Distance": Distribution( - weibull_min, (1.0, 10000.0) - ), # Distribution(genpareto, (0.0, 1.0)), + genpareto, (0.0, 1.0) + ), # Distribution(expon, (1.0,)), # Distribution(genpareto, (0.0, 1.0)), "Divergence Change": Distribution(norm, (0.0, 1.0), False), } @@ -171,51 +217,85 @@ def fit_dist(data, dist): )[0] +random_idx_reference = {} +random_is_join = {} + for name in stats_to_compute: join_stats[name] = {} random_stats[name] = {} -for k, annots in sorted( - joined_annots.items(), key=lambda k: min(min(v.target_start) for v in k[1]) -): - for ann in annots[:1]: - for i in range(len(ann.query)): - name = ann.query[i] - (pann, j) = prior_vals.get(name, (None, None)) - if pann is not None: - for stat_name, values_per_query in random_stats.items(): - (ann1, idx1), (ann2, idx2) = sorted( - [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]] - ) +all_anots_flat = [ann for annots in joined_annots.values() for ann in annots] +all_anots_flat.sort(key=lambda v: min(v.target_start)) + + +for ann_i, ann in enumerate(all_anots_flat): + for i in range(len(ann.query)): + name = ann.query[i] + (pann, j, pann_i) = prior_vals.get(name, (None, None, None)) + if pann is not None: + # if pann.region == ann.region and pann.join_id == ann.join_id: + # continue + + (ann1, idx1), (ann2, idx2) = sorted( + [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]] + ) + + stats = { + stat_name: stats_to_compute[stat_name](ann1, ann2, idx1, idx2) + for stat_name in stats_to_compute + } + + if all([v is not None for v in stats.values()]): + for stat_name, value in stats.items(): + values_per_query = random_stats[stat_name] if name not in values_per_query: values_per_query[name] = [] - values_per_query[name].append( - stats_to_compute[stat_name](ann1, ann2, idx1, idx2) - ) - - seq_size[name] = max( - seq_size.get(name, 1), - ann.query_start[i], - ann.query_end[i], - ) - prior_vals[name] = (ann, i) + values_per_query[name].append(value) + + if name not in random_idx_reference: + random_idx_reference[name] = [] + random_is_join[name] = [] + + random_idx_reference[name].append((ann_i, i, pann_i, j)) + random_is_join[name].append( + ann in joined_annots.get(pann.get_key(), []) + ) + + seq_size[name] = max( + seq_size.get(name, 1), + ann.query_start[i], + ann.query_end[i], + ) + prior_vals[name] = (ann, i, ann_i) +for k, annots in joined_annots.items(): if len(annots) <= 1: continue for a, b in zip(annots[:-1], annots[1:]): for i in range(len(a.query)): name = a.query[i] - for stat_name, values_per_query in join_stats.items(): - if name not in values_per_query: - values_per_query[name] = [] - values_per_query[name].append(stats_to_compute[stat_name](a, b, i, i)) + + stats = { + stat_name: stats_to_compute[stat_name](a, b, i, i) + for stat_name in stats_to_compute + } + + if all([v is not None for v in stats.values()]): + for stat_name, value in stats.items(): + values_per_query = join_stats[stat_name] + + if name not in values_per_query: + values_per_query[name] = [] + values_per_query[name].append(value) for query_name, _ in sorted( join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1]) ): + # if not query_name.startswith("alu"): + # continue fig, axs = plt.subplots(3, len(stats_to_compute)) axs = axs.T @@ -229,7 +309,7 @@ def fit_dist(data, dist): ax1.set_title(f"Join {name}") ax1.hist( join_stats[name][query_name], - 50, + 100, density=True, label=f"Mean: {np.mean(join_stats[name][query_name]):.02f}\nSTD: {np.std(join_stats[name][query_name]):.02f}", ) @@ -243,7 +323,7 @@ def fit_dist(data, dist): ax2.set_title(f"All {name}") ax2.hist( random_stats[name][query_name], - 50, + 100, density=True, label=f"Mean: {np.mean(random_stats[name][query_name]):.02f}\nSTD: {np.std(random_stats[name][query_name]):.02f}", ) @@ -261,6 +341,50 @@ def fit_dist(data, dist): ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF") ax3.legend() - fig.set_size_inches(12, 8) + fig.set_size_inches(16, 8) fig.tight_layout() plt.show() + + plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})") + + join_indexes = np.flatnonzero(random_is_join[query_name]) + not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name])) + + join_art = plt.plot( + np.array(random_stats["Target Distance"][query_name])[join_indexes], + np.array(random_stats["Consensus Distance"][query_name])[join_indexes], + "ro", + picker=5, + label="Joins", + ) + no_join_art = plt.plot( + np.array(random_stats["Target Distance"][query_name])[not_join_indexes], + np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes], + "bo", + picker=5, + label="Not Joins", + ) + plt.xlabel("Target Distance") + plt.ylabel("Consensus Distance") + plt.legend() + fig = plt.gcf() + + def on_pick(evt): + mask = join_indexes if evt.artist == join_art else not_join_indexes + + for idx in evt.ind: + idx = mask[idx] + annot_idx, sub_i, pann_idx, p_sub_i = random_idx_reference[query_name][idx] + print(f"Index: {annot_idx}, Sub-Index: {sub_i}") + print( + f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}" + ) + print( + f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}" + ) + print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}") + print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}") + print(f"\tIs Joined: {random_is_join[query_name][idx]}") + + fig.canvas.mpl_connect("pick_event", on_pick) + plt.show() From 0d2bba685cafa9493e0578e0bffcb1bcf9ec64f2 Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 6 May 2026 17:49:50 -0600 Subject: [PATCH 16/39] Join estimation WIP --- Cargo.toml | 1 + scripts/plot_distributions_aurora.py | 205 +++++++++++-------- scripts/plot_target_distance_distribution.py | 33 --- src/join_estimation.rs | 34 +++ src/main.rs | 1 + src/statistics.rs | 34 +++ 6 files changed, 188 insertions(+), 120 deletions(-) create mode 100644 src/join_estimation.rs diff --git a/Cargo.toml b/Cargo.toml index 24d0d77..c5b43b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ serde_json = "1.0.93" itertools = "0.11.0" rayon = "1.8.0" base64 = "0.22.1" +puruspe = "0.4.4" [target.'cfg(not(target_env = "msvc"))'.dependencies] tikv-jemallocator = "0.5" diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py index 4d12d43..31609d0 100644 --- a/scripts/plot_distributions_aurora.py +++ b/scripts/plot_distributions_aurora.py @@ -1,6 +1,7 @@ import sys import typing from dataclasses import dataclass, fields +from pathlib import Path import matplotlib.pyplot as plt import numpy as np @@ -82,7 +83,22 @@ def select(self, idx: int) -> typing.Self: ) +if len(sys.argv) not in [2, 3]: + print("Usage:") + print(f"\t{Path(sys.argv[0]).name} AURORA_OUTPUT_FILE [dist|scatter]") + sys.exit(1) + aurora_file = sys.argv[1] +mode = sys.argv[2] if len(sys.argv) > 2 else "dist" + +if mode not in ["scatter", "dist"]: + print("Second argument (the mode) must be 'scatter' or 'dist'!") + sys.exit(1) + +if mode == "dist": + print("Generating distributions plots...") +else: + print("Generating scatter plots...") joined_annots: dict[tuple[int, int], list[AuroraEntry]] = {} @@ -140,7 +156,7 @@ def _kimura_dist(a, b, ai, bi): def _relative_consensus_dist(a, b, ai, bi): d = consensus_dist(a, b, ai, bi) - min_seq_len = min( + sum_seq_len = sum( [ abs(v) for v in ( @@ -152,7 +168,7 @@ def _relative_consensus_dist(a, b, ai, bi): if d is None: return None try: - return d / min_seq_len + return d / sum_seq_len except ZeroDivisionError: return None @@ -296,95 +312,110 @@ def fit_dist(data, dist): ): # if not query_name.startswith("alu"): # continue - fig, axs = plt.subplots(3, len(stats_to_compute)) - axs = axs.T - - fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})") - - for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs): - est = estimator[name] - - sx = np.sort(join_stats[name][query_name]) - fit = fit_dist(sx, est) - ax1.set_title(f"Join {name}") - ax1.hist( - join_stats[name][query_name], - 100, - density=True, - label=f"Mean: {np.mean(join_stats[name][query_name]):.02f}\nSTD: {np.std(join_stats[name][query_name]):.02f}", - ) - ax1.plot( - sx, est.pdf(sx, *fit), label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}" - ) - ax1.legend() - - sx2 = np.sort(random_stats[name][query_name]) - fit2 = fit_dist(sx2, est) - ax2.set_title(f"All {name}") - ax2.hist( - random_stats[name][query_name], - 100, - density=True, - label=f"Mean: {np.mean(random_stats[name][query_name]):.02f}\nSTD: {np.std(random_stats[name][query_name]):.02f}", - ) - ax2.plot( - sx2, - est.pdf(sx2, *fit2), - label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}", - ) - ax2.legend() + join_indexes = np.flatnonzero(random_is_join[query_name]) + not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name])) - ax3.set_title("CDFs") - ax3.ecdf(join_stats[name][query_name], label="Joins CDF") - ax3.ecdf(random_stats[name][query_name], label="All CDF") - ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF") - ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF") - ax3.legend() + if mode == "dist": + fig, axs = plt.subplots(3, len(stats_to_compute)) + axs = axs.T - fig.set_size_inches(16, 8) - fig.tight_layout() - plt.show() + fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})") - plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})") + for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs): + est = estimator[name] - join_indexes = np.flatnonzero(random_is_join[query_name]) - not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name])) - - join_art = plt.plot( - np.array(random_stats["Target Distance"][query_name])[join_indexes], - np.array(random_stats["Consensus Distance"][query_name])[join_indexes], - "ro", - picker=5, - label="Joins", - ) - no_join_art = plt.plot( - np.array(random_stats["Target Distance"][query_name])[not_join_indexes], - np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes], - "bo", - picker=5, - label="Not Joins", - ) - plt.xlabel("Target Distance") - plt.ylabel("Consensus Distance") - plt.legend() - fig = plt.gcf() - - def on_pick(evt): - mask = join_indexes if evt.artist == join_art else not_join_indexes - - for idx in evt.ind: - idx = mask[idx] - annot_idx, sub_i, pann_idx, p_sub_i = random_idx_reference[query_name][idx] - print(f"Index: {annot_idx}, Sub-Index: {sub_i}") - print( - f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}" + join_samples = np.array(join_stats[name][query_name]) + sx = np.linspace(join_samples.min(), join_samples.max(), 1000) + fit = fit_dist(join_samples, est) + ax1.set_title(f"Join {name}") + ax1.hist( + join_samples, + 100, + density=True, + label=f"Mean: {np.mean(join_samples):.02f}\nSTD: {np.std(join_samples):.02f}", + ) + ax1.plot( + sx, + est.pdf(sx, *fit), + label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}", + ) + ax1.legend() + + random_samples = np.array(random_stats[name][query_name]) + sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000) + fit2 = fit_dist(random_samples, est) + ax2.set_title(f"All {name}") + ax2.plot( + [0], + [0], + color="black", + visible=False, + label=f"Mean: {np.mean(random_samples):.02f}\nSTD: {np.std(random_samples):.02f}", ) - print( - f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}" + ax2.hist( + [random_samples[not_join_indexes], random_samples[join_indexes]], + 100, + label=["Not Joined", "Joined"], + density=True, + stacked=True, ) - print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}") - print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}") - print(f"\tIs Joined: {random_is_join[query_name][idx]}") + ax2.plot( + sx2, + est.pdf(sx2, *fit2), + label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}", + ) + ax2.legend() + + ax3.set_title("CDFs") + ax3.ecdf(join_stats[name][query_name], label="Joins CDF") + ax3.ecdf(random_stats[name][query_name], label="All CDF") + ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF") + ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF") + ax3.legend() + + fig.set_size_inches(16, 8) + fig.tight_layout() + plt.show() + else: + plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})") + + join_art = plt.plot( + np.array(random_stats["Target Distance"][query_name])[join_indexes], + np.array(random_stats["Consensus Distance"][query_name])[join_indexes], + "ro", + picker=5, + label="Joins", + ) + no_join_art = plt.plot( + np.array(random_stats["Target Distance"][query_name])[not_join_indexes], + np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes], + "bo", + picker=5, + label="Not Joins", + ) + plt.xlabel("Target Distance") + plt.ylabel("Consensus Distance") + plt.legend() + fig = plt.gcf() + + def on_pick(evt): + mask = join_indexes if evt.artist == join_art else not_join_indexes + + for idx in evt.ind: + idx = mask[idx] + annot_idx, sub_i, pann_idx, p_sub_i = random_idx_reference[query_name][ + idx + ] + print(f"Index: {annot_idx}, Sub-Index: {sub_i}") + print( + f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}" + ) + print( + f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}" + ) + print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}") + print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}") + print(f"\tIs Joined: {random_is_join[query_name][idx]}") - fig.canvas.mpl_connect("pick_event", on_pick) - plt.show() + fig.canvas.mpl_connect("pick_event", on_pick) + plt.show() diff --git a/scripts/plot_target_distance_distribution.py b/scripts/plot_target_distance_distribution.py index 3e64a73..c107e36 100755 --- a/scripts/plot_target_distance_distribution.py +++ b/scripts/plot_target_distance_distribution.py @@ -12,27 +12,6 @@ # For now, an exponential distribution seems to provide a good enough approximation for use in aurora. It also is easy to fit well. # Here's an interesting paper on the topic that seems to have landed in the same space I've been in: https://www.columbia.edu/~ww2040/FittingMixturesPerfEval98.pdf -import sys -from inspect import signature - -import matplotlib.pyplot as plt -import numpy as np -from scipy.optimize import curve_fit -from scipy.stats import ( - betaprime, - burr12, - ecdf, - expon, - fisk, - genextreme, - genpareto, - invweibull, - linregress, - lognorm, - lomax, - weibull_min, -) - bed_file = sys.argv[1] seq_info = {} @@ -126,18 +105,6 @@ def logcdf(self, x, *args): beta = np.mean(dists) # np.median(dists) / np.log(2) # Get fit for weibull dist... (and CDF)... - cdf = ecdf(dists).cdf - - y_transform = lambda y: np.log(-np.log(1 - y)) - - with np.errstate(divide="ignore"): - wcdfx = np.log(cdf.quantiles) - wcdfy = y_transform(cdf.probabilities) - # Estimate the error that the transform adds to the line. This makes linear fit better fit a CDF... - wcdfy_p1 = y_transform( - cdf.probabilities - np.sign(cdf.probabilities - 0.5) * 0.01 - ) - wcdfy_err = np.maximum(np.abs((wcdfy_p1 - wcdfy) / 0.01), 1e-8) valid_values = np.isfinite(wcdfx) & np.isfinite(wcdfy) fit_line = curve_fit( diff --git a/src/join_estimation.rs b/src/join_estimation.rs new file mode 100644 index 0000000..abef24f --- /dev/null +++ b/src/join_estimation.rs @@ -0,0 +1,34 @@ +use crate::{ + segments::Block, + statistics::{ExponentialEstimator, StudentsT}, +}; + +trait JoinEstimator { + fn predict(&self, first_block: &Block, second_block: &Block) -> f64; +} + +trait JoinStatistics { + fn new() -> Self; + fn combine(&self, other: &Self) -> Self; + fn add(&self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); + fn to_estimator(&self) -> T; +} + +struct BayesianJoinEstimator { + target_distance_join: ExponentialEstimator, + target_distance_background: ExponentialEstimator, + divergence_join: StudentsT, + divergence_background: StudentsT, +} + +struct BayesianJoinStatistics { + joinable_target_distance_sum: usize, + all_target_distance_sum: usize, + divergence_sum: f64, + divergence_square_sum: f64, + join_divergence_sum: f64, + join_divergence_square_sum: f64, + divergence_offset: f64, + joinable_count: usize, + all_count: usize, +} diff --git a/src/main.rs b/src/main.rs index 2f1e11a..927fbf9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,7 @@ mod balanced_tree; mod chunks; mod confidence; mod history_tracing; +mod join_estimation; mod matrix; mod pipeline; mod score_params; diff --git a/src/statistics.rs b/src/statistics.rs index 309c7ff..0b48788 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -1,3 +1,4 @@ +use puruspe::{beta, betai}; use std::fmt::Debug; #[allow(dead_code)] @@ -142,3 +143,36 @@ impl Distribution for ExponentialEstimator { (0.0, f64::INFINITY) } } + +#[derive(Debug, Clone)] +pub struct StudentsT { + mean: f64, + standard_deviation: f64, + degrees_of_freedom: usize, +} + +impl Distribution for StudentsT { + fn unit() -> Self { + Self { + mean: 0.0, + standard_deviation: 1.0, + degrees_of_freedom: 1, + } + } + + fn pdf(&self, x: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let z = (x - self.mean) / self.standard_deviation; + 1.0 / (v.sqrt() * beta(0.5, 0.5 * v)) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) + } + + fn cdf(&self, x: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let z = (x - self.mean) / self.standard_deviation; + if z >= 0.0 { + 1.0 - 0.5 * betai(0.5 * v, 0.5, v / (z * z + v)) + } else { + 0.5 * betai(0.5 * v, 0.5, v / (z * z + v)) + } + } +} From 0601c8bccc302da7e627dd3cd38833942802fb09 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 8 May 2026 00:03:21 -0600 Subject: [PATCH 17/39] WIP Half t distribution. --- scripts/plot_distributions_aurora.py | 42 ++++++---- src/join_estimation.rs | 110 ++++++++++++++++++++++++--- src/statistics.rs | 99 ++++++++++++++++++++++-- 3 files changed, 221 insertions(+), 30 deletions(-) diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py index 31609d0..d76eb97 100644 --- a/scripts/plot_distributions_aurora.py +++ b/scripts/plot_distributions_aurora.py @@ -6,7 +6,17 @@ import matplotlib.pyplot as plt import numpy as np from scipy.optimize import curve_fit -from scipy.stats import ecdf, expon, genpareto, gumbel_r, invweibull, norm, weibull_min +from scipy.stats import ( + ecdf, + expon, + genpareto, + gumbel_r, + halfnorm, + invweibull, + laplace_asymmetric, + norm, + weibull_min, +) @dataclass @@ -151,12 +161,12 @@ def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int): def _kimura_dist(a, b, ai, bi): - return b.kimera80[bi] - a.kimera80[ai] + return abs(b.kimera80[bi] - a.kimera80[ai]) def _relative_consensus_dist(a, b, ai, bi): d = consensus_dist(a, b, ai, bi) - sum_seq_len = sum( + sum_seq_len = max( [ abs(v) for v in ( @@ -212,12 +222,16 @@ def logcdf(self, x, *args): estimator = { - "Relative Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False), - "Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False), + "Relative Consensus Distance": Distribution( + laplace_asymmetric, (1.0, 0.0, 1.0), False + ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), + "Consensus Distance": Distribution( + laplace_asymmetric, (1.0, 0.0, 1.0), False + ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), "Target Distance": Distribution( - genpareto, (0.0, 1.0) + weibull_min, (1.0, 10000) ), # Distribution(expon, (1.0,)), # Distribution(genpareto, (0.0, 1.0)), - "Divergence Change": Distribution(norm, (0.0, 1.0), False), + "Divergence Change": Distribution(halfnorm, (1.0,)), } @@ -310,8 +324,8 @@ def fit_dist(data, dist): for query_name, _ in sorted( join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1]) ): - # if not query_name.startswith("alu"): - # continue + # if not query_name.startswith("sin"): + # continue join_indexes = np.flatnonzero(random_is_join[query_name]) not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name])) @@ -330,7 +344,7 @@ def fit_dist(data, dist): ax1.set_title(f"Join {name}") ax1.hist( join_samples, - 100, + 200, density=True, label=f"Mean: {np.mean(join_samples):.02f}\nSTD: {np.std(join_samples):.02f}", ) @@ -339,7 +353,7 @@ def fit_dist(data, dist): est.pdf(sx, *fit), label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}", ) - ax1.legend() + ax1.legend(fontsize="xx-small") random_samples = np.array(random_stats[name][query_name]) sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000) @@ -354,7 +368,7 @@ def fit_dist(data, dist): ) ax2.hist( [random_samples[not_join_indexes], random_samples[join_indexes]], - 100, + 200, label=["Not Joined", "Joined"], density=True, stacked=True, @@ -364,14 +378,14 @@ def fit_dist(data, dist): est.pdf(sx2, *fit2), label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}", ) - ax2.legend() + ax2.legend(fontsize="xx-small") ax3.set_title("CDFs") ax3.ecdf(join_stats[name][query_name], label="Joins CDF") ax3.ecdf(random_stats[name][query_name], label="All CDF") ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF") ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF") - ax3.legend() + ax3.legend(fontsize="xx-small") fig.set_size_inches(16, 8) fig.tight_layout() diff --git a/src/join_estimation.rs b/src/join_estimation.rs index abef24f..44a7295 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,34 +1,126 @@ use crate::{ + assembly::block_target_distance, segments::Block, - statistics::{ExponentialEstimator, StudentsT}, + statistics::{Distribution, ExponentialEstimator, StudentsT}, }; -trait JoinEstimator { - fn predict(&self, first_block: &Block, second_block: &Block) -> f64; +pub trait JoinEstimator { + fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64; + fn from_statistics(statistics: T) -> Self; } -trait JoinStatistics { +pub trait JoinStatistics { fn new() -> Self; fn combine(&self, other: &Self) -> Self; - fn add(&self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); - fn to_estimator(&self) -> T; + fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); } -struct BayesianJoinEstimator { +pub struct BayesianJoinEstimator { target_distance_join: ExponentialEstimator, target_distance_background: ExponentialEstimator, divergence_join: StudentsT, divergence_background: StudentsT, } -struct BayesianJoinStatistics { +impl JoinEstimator for BayesianJoinEstimator { + fn from_statistics(statistics: BayesianJoinStatistics) -> Self { + let join_td_mean = + statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64; + let all_td_mean = statistics.all_target_distance_sum as f64 / statistics.all_count as f64; + + // Divergence distributions should have a mean of 0, so we assume that... + let join_div_std = statistics.join_divergence_square_sum / statistics.joinable_count as f64; + let all_div_std = statistics.divergence_square_sum / statistics.all_count as f64; + + Self { + target_distance_join: ExponentialEstimator::new( + join_td_mean, + statistics.joinable_count, + ), + target_distance_background: ExponentialEstimator::new( + all_td_mean, + statistics.all_count, + ), + divergence_join: StudentsT::new(0.0, join_div_std, statistics.joinable_count), + divergence_background: StudentsT::new(0.0, all_div_std, statistics.all_count), + } + } + + fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64 { + let prior_acc: f64 = 0.95; // Accuracy of the prior estimator of joins... + let target_dist = block_target_distance(first_block, second_block) as f64; + // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0... + let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); + + let target_likelihood = self.target_distance_join.logccdf(target_dist) + - self.target_distance_background.logccdf(target_dist); + let diverg_likelihood = self.divergence_join.logccdf(divergence_diff) + - self.divergence_background.logccdf(divergence_diff); + + let score = target_likelihood + diverg_likelihood + prior_acc.ln(); + + if log_space { + score + } else { + score.exp() + } + } +} + +pub struct BayesianJoinStatistics { joinable_target_distance_sum: usize, all_target_distance_sum: usize, divergence_sum: f64, divergence_square_sum: f64, join_divergence_sum: f64, join_divergence_square_sum: f64, - divergence_offset: f64, joinable_count: usize, all_count: usize, } + +impl JoinStatistics for BayesianJoinStatistics { + fn new() -> Self { + Self { + joinable_target_distance_sum: 0, + all_target_distance_sum: 0, + divergence_sum: 0.0, + divergence_square_sum: 0.0, + join_divergence_sum: 0.0, + join_divergence_square_sum: 0.0, + joinable_count: 0, + all_count: 0, + } + } + + fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool) { + let target_dist = block_target_distance(first_block, second_block).abs() as usize; + let divergence_diff = second_block.kimura80 - first_block.kimura80; + + if joinable { + self.joinable_target_distance_sum += target_dist; + self.join_divergence_sum += divergence_diff; + self.join_divergence_square_sum += divergence_diff * divergence_diff; + self.joinable_count += 1; + } + + self.all_target_distance_sum += target_dist; + self.divergence_sum += divergence_diff; + self.divergence_square_sum += divergence_diff * divergence_diff; + self.all_count += 1; + } + + fn combine(&self, other: &Self) -> Self { + Self { + joinable_target_distance_sum: self.joinable_target_distance_sum + + other.joinable_target_distance_sum, + all_target_distance_sum: self.all_target_distance_sum + other.all_target_distance_sum, + divergence_sum: self.divergence_sum + other.divergence_sum, + divergence_square_sum: self.divergence_square_sum + other.divergence_square_sum, + join_divergence_sum: self.join_divergence_sum + other.join_divergence_sum, + join_divergence_square_sum: self.join_divergence_square_sum + + other.join_divergence_square_sum, + joinable_count: self.joinable_count + other.joinable_count, + all_count: self.all_count + other.all_count, + } + } +} diff --git a/src/statistics.rs b/src/statistics.rs index 0b48788..875a75a 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -1,5 +1,5 @@ -use puruspe::{beta, betai}; -use std::fmt::Debug; +use puruspe::{beta, betai, invbetai}; +use std::{f64, fmt::Debug}; #[allow(dead_code)] pub trait Distribution: Clone + Debug { @@ -151,6 +151,16 @@ pub struct StudentsT { degrees_of_freedom: usize, } +impl StudentsT { + pub fn new(mean: f64, standard_deviation: f64, degrees_of_freedom: usize) -> Self { + Self { + mean, + standard_deviation, + degrees_of_freedom, + } + } +} + impl Distribution for StudentsT { fn unit() -> Self { Self { @@ -162,17 +172,92 @@ impl Distribution for StudentsT { fn pdf(&self, x: f64) -> f64 { let v = self.degrees_of_freedom as f64; - let z = (x - self.mean) / self.standard_deviation; - 1.0 / (v.sqrt() * beta(0.5, 0.5 * v)) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) + let s = self.standard_deviation; + let z = (x - self.mean) / s; + 1.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) } fn cdf(&self, x: f64) -> f64 { let v = self.degrees_of_freedom as f64; let z = (x - self.mean) / self.standard_deviation; - if z >= 0.0 { - 1.0 - 0.5 * betai(0.5 * v, 0.5, v / (z * z + v)) + let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v)); + if z > 0.0 { + 1.0 - 0.5 * beta_comp } else { - 0.5 * betai(0.5 * v, 0.5, v / (z * z + v)) + 0.5 * beta_comp + } + } + + fn ppf(&self, p: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) }; + let inv_out = invbetai(p_in, 0.5 * v, 0.5); + let x_unit = (v / inv_out - v).sqrt(); + x_unit * self.standard_deviation + self.mean + } + + fn support(&self) -> (f64, f64) { + (f64::NEG_INFINITY, f64::INFINITY) + } +} + +#[derive(Debug, Clone)] +pub struct HalfT { + standard_deviation: f64, + degrees_of_freedom: usize, +} + +impl HalfT { + pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self { + Self { + standard_deviation, + degrees_of_freedom, + } + } + + pub fn from_sample_mean(mean: f64, degrees_of_freedom: usize) -> Self { + Self { + standard_deviation: mean * (2.0 / f64::consts::PI).sqrt(), + degrees_of_freedom, } } } + +impl Distribution for HalfT { + fn unit() -> Self { + Self { + standard_deviation: 1.0, + degrees_of_freedom: 1, + } + } + + fn pdf(&self, x: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let s = self.standard_deviation; + let z = x / s; + 2.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) + } + + fn cdf(&self, x: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let z = x / self.standard_deviation; + let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v)); + if z > 0.0 { + 1.0 - 0.5 * beta_comp + } else { + 0.5 * beta_comp + } + } + + fn ppf(&self, p: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) }; + let inv_out = invbetai(p_in, 0.5 * v, 0.5); + let x_unit = (v / inv_out - v).sqrt(); + x_unit * self.standard_deviation + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} From 597a2606c3331c13936e8f3988fe8f3060593342 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 8 May 2026 01:23:35 -0600 Subject: [PATCH 18/39] Init impl of HalfT, needs testing... --- src/statistics.rs | 108 +++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 73 deletions(-) diff --git a/src/statistics.rs b/src/statistics.rs index 875a75a..db2497b 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -1,5 +1,6 @@ -use puruspe::{beta, betai, invbetai}; -use std::{f64, fmt::Debug}; +use core::f64; +use puruspe::{beta, betai, erf, invbetai}; +use std::fmt::Debug; #[allow(dead_code)] pub trait Distribution: Clone + Debug { @@ -8,19 +9,10 @@ pub trait Distribution: Clone + Debug { fn cdf(&self, x: f64) -> f64; fn ppf(&self, p: f64) -> f64; fn support(&self) -> (f64, f64); - - fn ccdf(&self, x: f64) -> f64 { - 1.0 - self.cdf(x) - } - fn logpdf(&self, x: f64) -> f64 { - self.pdf(x).ln() - } - fn logcdf(&self, x: f64) -> f64 { - self.cdf(x).ln() - } - fn logccdf(&self, x: f64) -> f64 { - self.ccdf(x).ln() - } + fn ccdf(&self, x: f64) -> f64; + fn logpdf(&self, x: f64) -> f64; + fn logcdf(&self, x: f64) -> f64; + fn logccdf(&self, x: f64) -> f64; } #[derive(Clone, Debug)] @@ -151,56 +143,6 @@ pub struct StudentsT { degrees_of_freedom: usize, } -impl StudentsT { - pub fn new(mean: f64, standard_deviation: f64, degrees_of_freedom: usize) -> Self { - Self { - mean, - standard_deviation, - degrees_of_freedom, - } - } -} - -impl Distribution for StudentsT { - fn unit() -> Self { - Self { - mean: 0.0, - standard_deviation: 1.0, - degrees_of_freedom: 1, - } - } - - fn pdf(&self, x: f64) -> f64 { - let v = self.degrees_of_freedom as f64; - let s = self.standard_deviation; - let z = (x - self.mean) / s; - 1.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) - } - - fn cdf(&self, x: f64) -> f64 { - let v = self.degrees_of_freedom as f64; - let z = (x - self.mean) / self.standard_deviation; - let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v)); - if z > 0.0 { - 1.0 - 0.5 * beta_comp - } else { - 0.5 * beta_comp - } - } - - fn ppf(&self, p: f64) -> f64 { - let v = self.degrees_of_freedom as f64; - let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) }; - let inv_out = invbetai(p_in, 0.5 * v, 0.5); - let x_unit = (v / inv_out - v).sqrt(); - x_unit * self.standard_deviation + self.mean - } - - fn support(&self) -> (f64, f64) { - (f64::NEG_INFINITY, f64::INFINITY) - } -} - #[derive(Debug, Clone)] pub struct HalfT { standard_deviation: f64, @@ -231,33 +173,53 @@ impl Distribution for HalfT { } } - fn pdf(&self, x: f64) -> f64 { + fn logpdf(&self, x: f64) -> f64 { let v = self.degrees_of_freedom as f64; let s = self.standard_deviation; let z = x / s; - 2.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) + if z >= 0.0 { + let norm = (2.0_f64).ln() - (0.5 * v.ln() + beta(0.5, 0.5 * v).ln() + s.ln()); + norm - 0.5 * (v + 1.0) * ((z * z) / v).ln_1p() + } else { + 0.0 + } + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() } fn cdf(&self, x: f64) -> f64 { let v = self.degrees_of_freedom as f64; let z = x / self.standard_deviation; - let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v)); - if z > 0.0 { - 1.0 - 0.5 * beta_comp + if z >= 0.0 { + 1.0 - betai(0.5 * v, 0.5, v / (z * z + v)) } else { - 0.5 * beta_comp + 0.0 } } + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + fn ppf(&self, p: f64) -> f64 { let v = self.degrees_of_freedom as f64; - let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) }; - let inv_out = invbetai(p_in, 0.5 * v, 0.5); + let inv_out = invbetai(1.0 - p, 0.5 * v, 0.5); let x_unit = (v / inv_out - v).sqrt(); x_unit * self.standard_deviation } + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + self.ccdf(x).ln() + } + fn support(&self) -> (f64, f64) { (0.0, f64::INFINITY) } } + From daa195359e15e0c7480899a1b3f8715aab143db6 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 8 May 2026 16:18:36 -0600 Subject: [PATCH 19/39] Start adding tests for distributions. --- .zed/settings.json | 16 ++++++ src/join_estimation.rs | 10 ++-- src/statistics.rs | 107 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 120 insertions(+), 13 deletions(-) create mode 100644 .zed/settings.json diff --git a/.zed/settings.json b/.zed/settings.json new file mode 100644 index 0000000..d229d6c --- /dev/null +++ b/.zed/settings.json @@ -0,0 +1,16 @@ +// Folder-specific settings +// +// For a full list of overridable settings, and general information on folder-specific settings, +// see the documentation: https://zed.dev/docs/configuring-zed#settings-files +{ + "terminal": { + "detect_venv": { + "on": { + "directories": [ + ".venv" + ], + "activate_script": "default" + } + } + } +} diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 44a7295..7dd9d56 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,7 +1,7 @@ use crate::{ assembly::block_target_distance, segments::Block, - statistics::{Distribution, ExponentialEstimator, StudentsT}, + statistics::{Distribution, ExponentialEstimator, HalfT}, }; pub trait JoinEstimator { @@ -18,8 +18,8 @@ pub trait JoinStatistics { pub struct BayesianJoinEstimator { target_distance_join: ExponentialEstimator, target_distance_background: ExponentialEstimator, - divergence_join: StudentsT, - divergence_background: StudentsT, + divergence_join: HalfT, + divergence_background: HalfT, } impl JoinEstimator for BayesianJoinEstimator { @@ -41,8 +41,8 @@ impl JoinEstimator for BayesianJoinEstimator { all_td_mean, statistics.all_count, ), - divergence_join: StudentsT::new(0.0, join_div_std, statistics.joinable_count), - divergence_background: StudentsT::new(0.0, all_div_std, statistics.all_count), + divergence_join: HalfT::new(join_div_std, statistics.joinable_count), + divergence_background: HalfT::new(all_div_std, statistics.all_count), } } diff --git a/src/statistics.rs b/src/statistics.rs index db2497b..3a02ce8 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -1,5 +1,5 @@ use core::f64; -use puruspe::{beta, betai, erf, invbetai}; +use puruspe::{beta, betai, invbetai}; use std::fmt::Debug; #[allow(dead_code)] @@ -136,13 +136,6 @@ impl Distribution for ExponentialEstimator { } } -#[derive(Debug, Clone)] -pub struct StudentsT { - mean: f64, - standard_deviation: f64, - degrees_of_freedom: usize, -} - #[derive(Debug, Clone)] pub struct HalfT { standard_deviation: f64, @@ -223,3 +216,101 @@ impl Distribution for HalfT { } } +#[cfg(test)] +mod test { + use crate::statistics::{ExponentialEstimator, HalfT}; + use std::fmt::Debug; + + pub trait TestDistribution: Debug { + fn tpdf(&self, x: f64) -> f64; + fn tcdf(&self, x: f64) -> f64; + fn tppf(&self, p: f64) -> f64; + fn tsupport(&self) -> (f64, f64); + fn tccdf(&self, x: f64) -> f64; + fn tlogpdf(&self, x: f64) -> f64; + fn tlogcdf(&self, x: f64) -> f64; + fn tlogccdf(&self, x: f64) -> f64; + } + + impl TestDistribution for T { + fn tpdf(&self, x: f64) -> f64 { + self.pdf(x) + } + fn tcdf(&self, x: f64) -> f64 { + self.cdf(x) + } + fn tppf(&self, p: f64) -> f64 { + self.ppf(p) + } + fn tsupport(&self) -> (f64, f64) { + self.support() + } + fn tccdf(&self, x: f64) -> f64 { + self.ccdf(x) + } + fn tlogpdf(&self, x: f64) -> f64 { + self.logpdf(x) + } + fn tlogcdf(&self, x: f64) -> f64 { + self.logcdf(x) + } + fn tlogccdf(&self, x: f64) -> f64 { + self.logccdf(x) + } + } + + fn as_box(d: T) -> Box { + Box::new(d) + } + + use super::{Distribution, Exponential}; + + fn get_dists() -> [Box; 3] { + [ + as_box(Exponential::unit()), + as_box(ExponentialEstimator::unit()), + as_box(HalfT::unit()), + ] + } + + fn is_close(a: f64, b: f64) -> bool { + let rel_tol = 1e-9; + let abs_tol = 0.0; + (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol) + } + + fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { + (0..steps) + .map(move |n| n as f64 / (steps as f64 - 1.0)) + .map(move |n| start * (1.0 - n) + stop * n) + } + + #[test] + fn basic_distribution_propery_checks() { + for dist in get_dists() { + println!("Testing distribution: {:?}", dist); + let (mut low, mut high) = dist.tsupport(); + + if high == f64::INFINITY { + high = 5.0; + } + if low == f64::INFINITY { + low = -5.0; + } + + for x in linspace(low, high, 100) { + // Basic properties... + assert!(is_close(dist.tpdf(x), dist.tlogpdf(x).exp())); + assert!(is_close(dist.tcdf(x), dist.tlogcdf(x).exp())); + assert!(is_close(dist.tccdf(x), dist.tlogccdf(x).exp())); + assert!(is_close(dist.tccdf(x), 1.0 - dist.tcdf(x))); + assert!(is_close(dist.tppf(dist.tcdf(x)), x)); + } + } + } + + #[test] + fn test_exponential_distribution() { + let dist = Exponential::unit(); + } +} From a0236ac30e188cc0a74240aa0851c4bbf887f431 Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 12 May 2026 03:24:17 -0600 Subject: [PATCH 20/39] P2 Estimator wip for doing quantile estimators. --- Cargo.toml | 1 + scripts/plot_distributions_aurora.py | 23 +++++----- src/main.rs | 1 + src/p2estimator.rs | 67 ++++++++++++++++++++++++++++ src/statistics.rs | 1 + 5 files changed, 81 insertions(+), 12 deletions(-) create mode 100644 src/p2estimator.rs diff --git a/Cargo.toml b/Cargo.toml index c5b43b6..9055fa9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ itertools = "0.11.0" rayon = "1.8.0" base64 = "0.22.1" puruspe = "0.4.4" +num-traits = "0.2.19" [target.'cfg(not(target_env = "msvc"))'.dependencies] tikv-jemallocator = "0.5" diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py index d76eb97..26fa437 100644 --- a/scripts/plot_distributions_aurora.py +++ b/scripts/plot_distributions_aurora.py @@ -5,7 +5,8 @@ import matplotlib.pyplot as plt import numpy as np -from scipy.optimize import curve_fit +from matplotlib.colors import to_rgba +from scipy.optimize import curve_fit, minimize from scipy.stats import ( ecdf, expon, @@ -184,7 +185,6 @@ def _relative_consensus_dist(a, b, ai, bi): stats_to_compute = { - "Consensus Distance": consensus_dist, "Target Distance": _target_distance, "Divergence Change": _kimura_dist, "Relative Consensus Distance": _relative_consensus_dist, @@ -225,12 +225,9 @@ def logcdf(self, x, *args): "Relative Consensus Distance": Distribution( laplace_asymmetric, (1.0, 0.0, 1.0), False ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), - "Consensus Distance": Distribution( - laplace_asymmetric, (1.0, 0.0, 1.0), False - ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), "Target Distance": Distribution( - weibull_min, (1.0, 10000) - ), # Distribution(expon, (1.0,)), # Distribution(genpareto, (0.0, 1.0)), + genpareto, (0.0, 1.0) + ), # Distribution(expon, (1.0,)), # Distribution(genpareto, (0.0, 1.0)), Distribution(weibull_min, (1.0, 10000) "Divergence Change": Distribution(halfnorm, (1.0,)), } @@ -322,7 +319,7 @@ def fit_dist(data, dist): for query_name, _ in sorted( - join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1]) + next(iter(join_stats.values())).items(), key=lambda k: -len(k[1]) ): # if not query_name.startswith("sin"): # continue @@ -357,7 +354,7 @@ def fit_dist(data, dist): random_samples = np.array(random_stats[name][query_name]) sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000) - fit2 = fit_dist(random_samples, est) + fit2 = fit_dist(random_samples[not_join_indexes], est) ax2.set_title(f"All {name}") ax2.plot( [0], @@ -372,19 +369,21 @@ def fit_dist(data, dist): label=["Not Joined", "Joined"], density=True, stacked=True, + color=[to_rgba(c) for c in ["tab:orange", "tab:blue"]], ) ax2.plot( sx2, est.pdf(sx2, *fit2), + "black", label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}", ) ax2.legend(fontsize="xx-small") ax3.set_title("CDFs") - ax3.ecdf(join_stats[name][query_name], label="Joins CDF") - ax3.ecdf(random_stats[name][query_name], label="All CDF") + ax3.ecdf(join_samples, label="Joins CDF") + ax3.ecdf(random_samples[not_join_indexes], label="No Joins CDF") ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF") - ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF") + ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. No Join CDF") ax3.legend(fontsize="xx-small") fig.set_size_inches(16, 8) diff --git a/src/main.rs b/src/main.rs index 927fbf9..c0ba526 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ mod confidence; mod history_tracing; mod join_estimation; mod matrix; +mod p2estimator; mod pipeline; mod score_params; mod segment_groups; diff --git a/src/p2estimator.rs b/src/p2estimator.rs new file mode 100644 index 0000000..8bdf897 --- /dev/null +++ b/src/p2estimator.rs @@ -0,0 +1,67 @@ +/// Implementation of P2 estimator. +/// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" +/// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf +use num_traits::{AsPrimitive, Float, Unsigned}; + +struct P2HistogramPoint { + value: F, + rank: I, +} + +fn linear_prediction, I: Unsigned + Copy + Ord + Into>( + points: &[P2HistogramPoint; 3], + d: isize, +) -> F { + let n: [F; 3] = points.each_ref().map(|v| v.rank.into()); + let q: [F; 3] = points.each_ref().map(|v| v.value); + let d_f: F = d.into(); + let d_off = 1 + d as usize; + + q[1] + d_f * ((q[d_off] - q[1]) / (n[d_off] - n[1])) +} + +fn parabolic_prediction, I: Unsigned + Copy + Ord + Into>( + points: &[P2HistogramPoint; 3], + d: isize, +) -> F { + let n: [F; 3] = points.each_ref().map(|v| v.rank.into()); + let q: [F; 3] = points.each_ref().map(|v| v.value); + let d = d.into(); + + let left = (n[1] - n[0] + d) * ((q[2] - q[1]) / (n[2] - n[1])); + let right = (n[2] - n[1] - d) * ((q[1] - q[0]) / (n[1] - n[0])); + q[1] + (d / (n[2] - n[0])) * (left + right) +} + +fn p2update, I: Unsigned + Ord + Into>( + points: &[P2HistogramPoint], + center_index: I, + observations: I, + total_points: I, + proposal: F, +) -> (F, I) { + // Actual rank desired for the given quantile... + let rank_proposal: F = + (center_index * (observations - I::one())).into() / (total_points - I::one()).into(); + let d: F = rank_proposal - points[1].rank.into(); + + if d >= F::one() && (points[2].rank - points[1].rank) > I::one() + || (d <= -F::one()) && points[1].rank - points[0].rank > I::one() + { + let d: isize = if d >= F::zero() { 1 } else { -1 }; + let n: [F; 3]; + for i in 0..3 { + n[i] = points[i].rank.into(); + } + + let est = points[1].value + (d / (points[2].rank - points[0].rank).into()); + est + } else { + (points[1].value, I::zero()) + } +} + +struct P2HistogramData<'a, F: Float, I: Unsigned + Into> { + observations: I, + points: &'a mut [P2HistogramPoint], +} diff --git a/src/statistics.rs b/src/statistics.rs index 3a02ce8..078bcbb 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -2,6 +2,7 @@ use core::f64; use puruspe::{beta, betai, invbetai}; use std::fmt::Debug; +// TODO: Support for generic floating types... #[allow(dead_code)] pub trait Distribution: Clone + Debug { fn unit() -> Self; From aba5104aa3550bee7f99db488e79057c92c695f0 Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 12 May 2026 17:32:19 -0600 Subject: [PATCH 21/39] Init p2 est done.. --- src/p2estimator.rs | 136 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 21 deletions(-) diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 8bdf897..1bfe387 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -1,67 +1,161 @@ +use std::{cmp::Ordering, ops::Neg}; + /// Implementation of P2 estimator. /// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" /// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf -use num_traits::{AsPrimitive, Float, Unsigned}; +use num_traits::{float::TotalOrder, Float, Num, Unsigned}; struct P2HistogramPoint { value: F, rank: I, } -fn linear_prediction, I: Unsigned + Copy + Ord + Into>( +fn get_sign, B: Num + PartialOrd + Neg>( + val: A, +) -> B { + if val >= A::zero() { + B::one() + } else { + -B::one() + } +} + +fn inc_or_dec>(val: A, delta: B) -> A { + match delta.partial_cmp(&B::zero()) { + Some(Ordering::Less) => val - A::one(), + Some(Ordering::Greater) => val + A::one(), + _ => val, + } +} + +fn linear_prediction>( points: &[P2HistogramPoint; 3], d: isize, ) -> F { let n: [F; 3] = points.each_ref().map(|v| v.rank.into()); let q: [F; 3] = points.each_ref().map(|v| v.value); - let d_f: F = d.into(); - let d_off = 1 + d as usize; + let d_f: F = get_sign(d); + let d_off = (1 + d) as usize; q[1] + d_f * ((q[d_off] - q[1]) / (n[d_off] - n[1])) } -fn parabolic_prediction, I: Unsigned + Copy + Ord + Into>( +fn parabolic_prediction>( points: &[P2HistogramPoint; 3], d: isize, ) -> F { let n: [F; 3] = points.each_ref().map(|v| v.rank.into()); let q: [F; 3] = points.each_ref().map(|v| v.value); - let d = d.into(); + let d: F = get_sign(d); let left = (n[1] - n[0] + d) * ((q[2] - q[1]) / (n[2] - n[1])); let right = (n[2] - n[1] - d) * ((q[1] - q[0]) / (n[1] - n[0])); q[1] + (d / (n[2] - n[0])) * (left + right) } -fn p2update, I: Unsigned + Ord + Into>( - points: &[P2HistogramPoint], - center_index: I, +fn _p2update + From>( + points: &mut [P2HistogramPoint], + center_index: usize, observations: I, total_points: I, - proposal: F, -) -> (F, I) { +) { // Actual rank desired for the given quantile... + let ci: I = center_index.into(); let rank_proposal: F = - (center_index * (observations - I::one())).into() / (total_points - I::one()).into(); + (ci * (observations - I::one())).into() / (total_points - I::one()).into(); let d: F = rank_proposal - points[1].rank.into(); if d >= F::one() && (points[2].rank - points[1].rank) > I::one() || (d <= -F::one()) && points[1].rank - points[0].rank > I::one() { - let d: isize = if d >= F::zero() { 1 } else { -1 }; - let n: [F; 3]; - for i in 0..3 { - n[i] = points[i].rank.into(); + let d: isize = get_sign(d); + let mut p_est = parabolic_prediction( + (&points[center_index - 1..center_index + 1]) + .as_array() + .unwrap(), + d, + ); + if p_est <= points[center_index - 1].value || p_est >= points[center_index + 1].value { + p_est = linear_prediction( + (&points[center_index - 1..center_index + 1]) + .as_array() + .unwrap(), + d, + ); } - let est = points[1].value + (d / (points[2].rank - points[0].rank).into()); - est - } else { - (points[1].value, I::zero()) + points[center_index].value = p_est; + points[center_index].rank = inc_or_dec(points[center_index].rank, d); } } -struct P2HistogramData<'a, F: Float, I: Unsigned + Into> { +struct P2HistogramData<'a, F: Float, I: Unsigned + Copy + Ord + Into> { observations: I, points: &'a mut [P2HistogramPoint], } + +impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into + From + Into> + P2HistogramData<'a, F, I> +{ + fn _standard_update(&mut self, sample: F) { + // Find where sample falls within distribution... + let p = self.points.partition_point(|v| v.value <= sample); + let bound_p = p.min(self.points.len() - 1); + + // Update extremes... + if bound_p == 0 { + self.points[bound_p].value = self.points[bound_p].value.min(sample); + } else if bound_p == (self.points.len() - 1) { + self.points[bound_p].value = self.points[bound_p].value.max(sample); + } + + // Increment ranks of markers above newly inserted sample... + for i in (bound_p + 1)..self.points.len() { + self.points[i].rank = self.points[i].rank + I::one(); + } + + // Adjust inner markers to within 1 of their target quantile using p2 formula... + for i in 1..(self.points.len() - 1) { + _p2update(self.points, i, self.observations, self.points.len().into()); + } + + self.observations = self.observations + I::one(); + } + + fn _pre_init_update(&mut self, sample: F) { + let nxt_idx: usize = self.observations.into(); + self.points[nxt_idx].value = sample; + self.observations = self.observations + I::one(); + } + + fn _initialize(&mut self) { + self.points.sort_by(|a, b| a.value.total_cmp(&b.value)); + self.points.iter_mut().enumerate().for_each(|(i, p)| { + p.rank = i.into(); + }); + } + + pub fn update(&mut self, sample: F) { + let obs: usize = self.observations.into(); + match (obs + 1).cmp(&self.points.len()) { + Ordering::Less => self._pre_init_update(sample), + Ordering::Equal => { + self._pre_init_update(sample); + self._initialize(); + } + Ordering::Greater => { + self._standard_update(sample); + } + } + } + + pub fn is_initialized(&self) -> bool { + let obs: usize = self.observations.into(); + obs >= self.points.len() + } + + fn combine(&mut self, other: &P2HistogramData) { + // TODO: Need to think about how to do this efficiently while maintaining accuracy... + panic!("Not implemented!") + } +} From 88c0a9f354d8213dd574b0a65b21d805e18ed332 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 14 May 2026 01:48:41 -0600 Subject: [PATCH 22/39] First refactor with new scoring done... --- src/assembly.rs | 225 ++++++++++++++++++++++++++++------------ src/join_estimation.rs | 142 +++++++++++++------------ src/main.rs | 19 ++-- src/p2estimator.rs | 2 +- src/pipeline.rs | 18 ++-- src/segments.rs | 39 ++++--- src/statistics.rs | 10 +- src/trace_statistics.rs | 48 +++++---- 8 files changed, 316 insertions(+), 187 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 97e49a7..fa4b6e2 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -4,9 +4,9 @@ use itertools::Itertools; use crate::{ alignment::{Alignment, Strand}, + join_estimation::{JoinEstimator, JoinStatisticsCollector}, score_params::ScoreParams, - segments::{Block, SegmentedMatrix}, - statistics::Distribution, + segments::{Block, SegmentedMatrix, SegmentedMatrixView}, trace_statistics::{QueryStatistics, RegionStatistics}, AnnotationArgs, }; @@ -100,9 +100,8 @@ fn piecewise_linear_cost( fn get_link_cost( annotation_args: &AnnotationArgs, score_params: &ScoreParams, - target_gap_distribution: &impl Distribution, - consensus_gap: f64, - target_gap: f64, + consensus_gap: isize, + join_prob: f64, ) -> f64 { // Minimum cost (a query loop) let min_value = score_params.query_loop_score; @@ -123,11 +122,9 @@ fn get_link_cost( -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs(); let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs(); - // Compute target gap penalty. // Doing this as the expected value over the transition scores... - let target_random_prob = target_gap_distribution.cdf(target_gap); - let target_expected_score = target_random_prob * score_params.query_jump_score - + (1.0 - target_random_prob) * score_params.query_loop_score; + let expected_score = join_prob * score_params.query_loop_score + + (1.0 - join_prob) * score_params.query_jump_score; // Cost = linear consensus cost + linear target gap cost... min_value @@ -136,9 +133,9 @@ fn get_link_cost( (annotation_args.free_join_consensus_gap as f64).abs(), alpha, beta, - consensus_gap, + consensus_gap as f64, ) - + target_expected_score + + expected_score } pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize { @@ -187,12 +184,126 @@ pub fn block_consensus_distance(first_block: &Block, second_block: &Block) -> (i } } -fn link_assemblies( +pub fn block_length_on_query(b: &Block) -> usize { + b.query_end.abs_diff(b.query_start) + 1 +} + +fn is_joinable( + target_distance: isize, + consensus_distance: isize, + link_type: LinkType, + min_block_length: usize, + args: &AnnotationArgs, +) -> bool { + let within_target_distance_threshold = + target_distance < args.target_join_distance as isize && target_distance >= 0; + + let consensus_is_colinear = if link_type.is_inversion() { + consensus_distance.abs() < args.inversion_distance + } else { + consensus_distance > -args.consensus_join_overlap + && consensus_distance < args.consensus_join_distance + }; + + // TODO: Hardcoded, change later... + let is_significant = + min_block_length >= 10 && -consensus_distance <= ((min_block_length / 2) as isize); + + within_target_distance_threshold && consensus_is_colinear && is_significant +} + +fn new_alignment_to_blocks_map( + segments: SegmentedMatrixView, + alignments: &[Alignment], +) -> Vec> { + let mut alignment_block_map = vec![Vec::::new(); alignments.len()]; + + for (s_idx, segment) in segments.iter().enumerate() { + for (b_idx, block) in segment.blocks.iter().enumerate() { + if block.row_idx > 0 && block.row_idx <= alignments.len() { + alignment_block_map[block.row_idx - 1].push((s_idx, b_idx)); + } + } + } + + alignment_block_map +} + +pub fn gather_join_statistics( + alignments: &[Alignment], + annotation_args: &AnnotationArgs, +) -> Vec<(usize, T)> { + let mut query_ids: Vec = alignments.iter().map(|a| a.query_id).unique().collect(); + query_ids.sort(); + + let mut query_stats: Vec<(usize, T)> = Vec::with_capacity(query_ids.len()); + + query_ids + .iter() + // grab the alignments for this ID + .map(|id| { + ( + *id, + alignments + .iter() + .enumerate() + .filter(|&(_, a)| a.query_id == *id) + .map(|(i, a)| Block::from_alignment(a, i, 0.0, 0.0)), + ) + }) + .for_each(|(id, compat_alignments)| { + let mut new_stats = T::new(); + + gather_join_statistics_single_family( + compat_alignments, + annotation_args, + &mut new_stats, + ); + + query_stats.push((id, new_stats)); + }); + + query_stats +} + +fn gather_join_statistics_single_family<'a>( + compatable_alignments: impl Iterator, + args: &AnnotationArgs, + join_stats: &mut impl JoinStatisticsCollector, +) { + let compatable_blocks = compatable_alignments + .sorted_by_key(|a| a.col_start) + .collect_vec(); + + compatable_blocks + .iter() + .enumerate() + .for_each(|(idx, a_block)| { + compatable_blocks[idx + 1..] + .iter() + .enumerate() + .for_each(|(idx2, b_block)| { + let (consensus_distance, link_type) = + block_consensus_distance(a_block, b_block); + let joinable = is_joinable( + block_target_distance(a_block, b_block), + consensus_distance, + link_type, + block_length_on_query(a_block).min(block_length_on_query(b_block)), + args, + ); + + join_stats.add(a_block, b_block, idx + 1 == idx2, joinable); + }) + }) +} + +fn link_assemblies( graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>, compatable_blocks: impl Iterator, segments: &SegmentedMatrix, query_statistics: &QueryStatistics, - region_statistics: &RegionStatistics, + _region_statistics: &RegionStatistics, score_params: &ScoreParams, args: &AnnotationArgs, ) { @@ -210,54 +321,39 @@ fn link_assemblies( let b_block = &segments[b.0].blocks[b.1]; let target_distance = block_target_distance(a_block, b_block); - - let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1; - let b_length = b_block.query_end.abs_diff(b_block.query_start) + 1; - let min_length = a_length.min(b_length); - - // Query bounds are reversed for reverse sequences, so the start is actually greater than the end (Ex. start: 1510 -> end: 105) + let min_block_length = + block_length_on_query(a_block).min(block_length_on_query(b_block)); let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block); - // Within target distance??? - let within_target_distance_threshold = (target_distance - < args.target_join_distance as isize) - && (query_statistics.distribution.ccdf(target_distance as f64) - >= args.target_distance_likelihood_threshold); - - let consensus_is_colinear = if link_type.is_inversion() { - consensus_distance.abs() < args.inversion_distance - } else { - consensus_distance > -args.consensus_join_overlap - && consensus_distance < args.consensus_join_distance - }; - - // TODO: Hardcoded, change later... - let is_significant = - min_length >= 10 && -consensus_distance <= ((min_length / 2) as isize); - - let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { - score_params.query_loop_score - } else { - get_link_cost( - args, - score_params, - &query_statistics.distribution, - consensus_distance as f64, - target_distance as f64, - ) - }; - - if within_target_distance_threshold && consensus_is_colinear && is_significant { - graph.insert( - ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), - Edge { - weight, - first_sparse_row: a.1, - second_sparse_row: b.1, - link_type, - }, - ); + if is_joinable( + target_distance, + consensus_distance, + link_type, + min_block_length, + args, + ) { + if let Some(estimator) = &query_statistics.estimator { + let join_prob = estimator.predict(a_block, b_block, false); + + if join_prob >= args.join_likelihood_threshold { + let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { + score_params.query_loop_score + } else { + get_link_cost(args, score_params, consensus_distance, join_prob) + }; + + graph.insert( + ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), + Edge { + weight, + first_sparse_row: a.1, + second_sparse_row: b.1, + link_type, + }, + ); + } + } } }); }); @@ -274,7 +370,7 @@ pub struct SegmentAssemblyGraph { } impl SegmentAssemblyGraph { - pub fn new( + pub fn new( alignments: &[Alignment], segments: &SegmentedMatrix, region_statistics: &RegionStatistics, @@ -282,16 +378,7 @@ impl SegmentAssemblyGraph { score_params: &ScoreParams, annotation_args: &AnnotationArgs, ) -> Self { - let mut alignment_block_map = vec![Vec::::new(); alignments.len()]; - - for (s_idx, segment) in segments.iter().enumerate() { - for (b_idx, block) in segment.blocks.iter().enumerate() { - if block.row_idx > 0 && block.row_idx <= alignments.len() { - alignment_block_map[block.row_idx - 1].push((s_idx, b_idx)); - } - } - } - + let alignment_block_map = new_alignment_to_blocks_map(segments, alignments); let mut query_ids: Vec = alignments.iter().map(|a| a.query_id).unique().collect(); query_ids.sort(); diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 7dd9d56..7b0cd3e 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,63 +1,43 @@ use crate::{ assembly::block_target_distance, segments::Block, - statistics::{Distribution, ExponentialEstimator, HalfT}, + statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT}, }; -pub trait JoinEstimator { +pub trait JoinEstimator: Clone { fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64; - fn from_statistics(statistics: T) -> Self; } -pub trait JoinStatistics { +pub trait JoinStatisticsCollector: Clone { fn new() -> Self; fn combine(&self, other: &Self) -> Self; fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); } +#[derive(Debug, Clone)] pub struct BayesianJoinEstimator { target_distance_join: ExponentialEstimator, - target_distance_background: ExponentialEstimator, + target_distance_nojoin: ExponentialEstimator, divergence_join: HalfT, - divergence_background: HalfT, + divergence_nojoin: HalfT, + join_prior: f64, } -impl JoinEstimator for BayesianJoinEstimator { - fn from_statistics(statistics: BayesianJoinStatistics) -> Self { - let join_td_mean = - statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64; - let all_td_mean = statistics.all_target_distance_sum as f64 / statistics.all_count as f64; - - // Divergence distributions should have a mean of 0, so we assume that... - let join_div_std = statistics.join_divergence_square_sum / statistics.joinable_count as f64; - let all_div_std = statistics.divergence_square_sum / statistics.all_count as f64; - - Self { - target_distance_join: ExponentialEstimator::new( - join_td_mean, - statistics.joinable_count, - ), - target_distance_background: ExponentialEstimator::new( - all_td_mean, - statistics.all_count, - ), - divergence_join: HalfT::new(join_div_std, statistics.joinable_count), - divergence_background: HalfT::new(all_div_std, statistics.all_count), - } - } - +impl JoinEstimator for BayesianJoinEstimator { fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64 { - let prior_acc: f64 = 0.95; // Accuracy of the prior estimator of joins... let target_dist = block_target_distance(first_block, second_block) as f64; // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0... let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); - let target_likelihood = self.target_distance_join.logccdf(target_dist) - - self.target_distance_background.logccdf(target_dist); - let diverg_likelihood = self.divergence_join.logccdf(divergence_diff) - - self.divergence_background.logccdf(divergence_diff); + let join_score = self.join_prior.ln() + + self.target_distance_join.logpdf(target_dist) + + self.divergence_join.logpdf(divergence_diff); + let nojoin_score = (-self.join_prior).ln_1p() + + self.target_distance_nojoin.logpdf(target_dist) + + self.divergence_nojoin.logpdf(target_dist); - let score = target_likelihood + diverg_likelihood + prior_acc.ln(); + let score_norm = ln_add_exp(join_score, nojoin_score); + let score = join_score - score_norm; if log_space { score @@ -67,60 +47,92 @@ impl JoinEstimator for BayesianJoinEstimator { } } +impl From for BayesianJoinEstimator { + fn from(value: BayesianJoinStatistics) -> Self { + Self::from(&value) + } +} + +impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { + fn from(statistics: &BayesianJoinStatistics) -> Self { + let join_td_mean = + statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64; + let nojoin_td_mean = + statistics.unjoinable_target_distance_sum as f64 / statistics.unjoinable_count as f64; + + // Divergence distributions should have a mean of 0, so we assume that... + let join_div_mean = statistics.joinable_divergence_sum / statistics.joinable_count as f64; + let nojoin_div_mean = + statistics.unjoinable_divergence_sum / statistics.joinable_count as f64; + + Self { + target_distance_join: ExponentialEstimator::new( + join_td_mean, + statistics.joinable_count, + ), + target_distance_nojoin: ExponentialEstimator::new( + nojoin_td_mean, + statistics.unjoinable_count, + ), + divergence_join: HalfT::from_sample_mean(join_div_mean, statistics.joinable_count), + divergence_nojoin: HalfT::from_sample_mean( + nojoin_div_mean, + statistics.unjoinable_count, + ), + join_prior: statistics.joinable_count as f64 + / (statistics.joinable_count + statistics.unjoinable_count) as f64, + } + } +} + +#[derive(Debug, Clone)] pub struct BayesianJoinStatistics { joinable_target_distance_sum: usize, - all_target_distance_sum: usize, - divergence_sum: f64, - divergence_square_sum: f64, - join_divergence_sum: f64, - join_divergence_square_sum: f64, + unjoinable_target_distance_sum: usize, + joinable_divergence_sum: f64, + unjoinable_divergence_sum: f64, joinable_count: usize, - all_count: usize, + unjoinable_count: usize, } -impl JoinStatistics for BayesianJoinStatistics { +impl JoinStatisticsCollector for BayesianJoinStatistics { fn new() -> Self { Self { joinable_target_distance_sum: 0, - all_target_distance_sum: 0, - divergence_sum: 0.0, - divergence_square_sum: 0.0, - join_divergence_sum: 0.0, - join_divergence_square_sum: 0.0, + unjoinable_target_distance_sum: 0, + joinable_divergence_sum: 0.0, + unjoinable_divergence_sum: 0.0, joinable_count: 0, - all_count: 0, + unjoinable_count: 0, } } - fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool) { + fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) { let target_dist = block_target_distance(first_block, second_block).abs() as usize; - let divergence_diff = second_block.kimura80 - first_block.kimura80; + let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); if joinable { self.joinable_target_distance_sum += target_dist; - self.join_divergence_sum += divergence_diff; - self.join_divergence_square_sum += divergence_diff * divergence_diff; + self.joinable_divergence_sum += divergence_diff; self.joinable_count += 1; + } else { + self.unjoinable_target_distance_sum += target_dist; + self.unjoinable_divergence_sum += divergence_diff; + self.unjoinable_count += 1; } - - self.all_target_distance_sum += target_dist; - self.divergence_sum += divergence_diff; - self.divergence_square_sum += divergence_diff * divergence_diff; - self.all_count += 1; } fn combine(&self, other: &Self) -> Self { Self { joinable_target_distance_sum: self.joinable_target_distance_sum + other.joinable_target_distance_sum, - all_target_distance_sum: self.all_target_distance_sum + other.all_target_distance_sum, - divergence_sum: self.divergence_sum + other.divergence_sum, - divergence_square_sum: self.divergence_square_sum + other.divergence_square_sum, - join_divergence_sum: self.join_divergence_sum + other.join_divergence_sum, - join_divergence_square_sum: self.join_divergence_square_sum - + other.join_divergence_square_sum, + unjoinable_target_distance_sum: self.unjoinable_target_distance_sum + + other.unjoinable_target_distance_sum, + joinable_divergence_sum: self.joinable_divergence_sum + other.joinable_divergence_sum, + unjoinable_divergence_sum: self.unjoinable_divergence_sum + + other.unjoinable_divergence_sum, joinable_count: self.joinable_count + other.joinable_count, - all_count: self.all_count + other.all_count, + unjoinable_count: self.unjoinable_count + other.unjoinable_count, } } } diff --git a/src/main.rs b/src/main.rs index c0ba526..0063815 100644 --- a/src/main.rs +++ b/src/main.rs @@ -46,8 +46,9 @@ use viz::VizConstraint; use crate::{ annotation::AmbiguousAnnotation, chunks::validate_groups, + join_estimation::{BayesianJoinEstimator, BayesianJoinStatistics}, pipeline::{run_history_trace, run_naive_trace, NaiveTraceResults}, - trace_statistics::{trace_statistics, OccuranceCountingMode}, + trace_statistics::{trace_statistics, OccuranceCountingMode, TraceStatistics}, viz::{ stats::{write_family_statistics, write_inversion_statistics}, write_index_file, ICON_SVG, @@ -138,16 +139,14 @@ pub struct AnnotationArgs { )] pub target_join_distance: usize, - /// Removes joins across positions - /// in the target (genome) at which a join is - /// less than this likely to not be generated - /// at random. + /// Removes joins that fall below this threshold of occuring. + /// Value can be set between 0 and 1. #[arg( - long = "target-join-likelihood-threshold", - default_value = "0.5", + long = "join-likelihood-threshold", + default_value = "0.25", value_name = "f" )] - pub target_distance_likelihood_threshold: f64, + pub join_likelihood_threshold: f64, /// The maximum overlap in the consensus at which /// a join is considered between compatible alignments. @@ -459,10 +458,10 @@ fn main() -> Result<()> { .panic_fuse() .enumerate() .map(|(region_idx, group)| run_naive_trace(group, &alignment_data, region_idx, &args)) - .collect::>(); + .collect::>>(); naive_results.sort_by_key(|v| v.region_index); - let trace_stats = trace_statistics( + let trace_stats: TraceStatistics = trace_statistics( &naive_results, &alignment_data, OccuranceCountingMode::Segments, diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 1bfe387..fbcdeb2 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -154,7 +154,7 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into + From obs >= self.points.len() } - fn combine(&mut self, other: &P2HistogramData) { + fn combine(&mut self, _other: &P2HistogramData) { // TODO: Need to think about how to do this efficiently while maintaining accuracy... panic!("Not implemented!") } diff --git a/src/pipeline.rs b/src/pipeline.rs index 377266f..5ca4f2a 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -5,15 +5,16 @@ use itertools::Itertools; use crate::{ alignment::{AlignmentData, Strand}, annotation::{AmbiguousAnnotation, SimpleAnnotation}, + assembly::gather_join_statistics, chunks::ProximityGroup, confidence::confidence, history_tracing::{ backtrace_histories, history_viterbi_on_segments, History, RefinedTraceSegment, }, + join_estimation::{JoinEstimator, JoinStatisticsCollector}, matrix::{Matrix, MatrixDef}, score_params::{approximate_ideal_skip_state_score, ScoreParams}, segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments}, - statistics::Distribution, support::windowed_confidence, trace_statistics::TraceStatistics, viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment}, @@ -141,7 +142,7 @@ fn get_active_columns(matrix: &Matrix) -> Vec<(u active_cols } -pub struct NaiveTraceResults { +pub struct NaiveTraceResults { pub target_start: usize, pub target_end: usize, pub trace_segments: Vec, @@ -149,16 +150,17 @@ pub struct NaiveTraceResults { pub score_params: ScoreParams, pub alignment_confidences: Vec, pub active_columns: Vec<(usize, usize)>, + pub query_join_statistics: Vec<(usize, T)>, pub viz_writer: AdjudicationSodaWriter, pub region_index: usize, } -pub fn run_naive_trace( +pub fn run_naive_trace( proximity_group: &ProximityGroup, alignment_data: &AlignmentData, region_idx: usize, args: &AuroraArgs, -) -> NaiveTraceResults { +) -> NaiveTraceResults { let annot_args = &args.annotation_args; let score_params = ScoreParams::new( @@ -250,6 +252,9 @@ pub fn run_naive_trace( .expect("Unable to write confidences!!!"); } + let query_join_statistics = + gather_join_statistics(proximity_group.alignments, &args.annotation_args); + NaiveTraceResults { target_start: proximity_group.target_start, target_end: proximity_group.target_end, @@ -258,16 +263,17 @@ pub fn run_naive_trace( score_params, alignment_confidences: confidence_by_row, active_columns: get_active_columns(&confidence_matrix), + query_join_statistics, viz_writer, region_index: region_idx, } } -pub fn run_history_trace( +pub fn run_history_trace( proximity_group: &ProximityGroup, alignment_data: &AlignmentData, trace_statistics: &TraceStatistics, - naive_trace: &mut NaiveTraceResults, + naive_trace: &mut NaiveTraceResults, args: &AuroraArgs, ) -> Vec { let vis_args = &args.visualization_args; diff --git a/src/segments.rs b/src/segments.rs index a9aa282..30de252 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -2,12 +2,12 @@ use core::f64; use std::{cmp::Ordering, fmt::Debug, iter::Fuse}; use crate::{ - alignment::Strand, + alignment::{Alignment, Strand}, assembly::SegmentAssemblyGraph, chunks::ProximityGroup, + join_estimation::JoinEstimator, matrix::Matrix, score_params::ScoreParams, - statistics::Distribution, trace_statistics::{QueryStatistics, RegionStatistics}, viterbi::TraceSegment, AnnotationArgs, @@ -99,6 +99,23 @@ impl Block { pub fn to_comparable(&self) -> (Option, usize) { (self.query_id, self.row_idx) } + + pub fn from_alignment(alignment: &Alignment, row: usize, confidence: f64, score: f64) -> Self { + Self { + row_idx: row, + block_type: BlockType::Alignment, + strand: alignment.strand, + query_id: Some(alignment.query_id), + col_start: alignment.target_start, + col_end: alignment.target_end, + query_start: alignment.query_start, + query_end: alignment.query_end, + avg_confidence: confidence, + alignment_score: score, + kimura80: alignment.kimura80(alignment.query_start, alignment.query_end), + can_join_up_to: 0, + } + } } #[derive(Debug)] @@ -111,6 +128,7 @@ pub struct Segment { } pub type SegmentedMatrix = Vec; +pub type SegmentedMatrixView<'a> = &'a [Segment]; #[derive(Copy, Clone, Debug)] enum MergeEntry { @@ -213,21 +231,10 @@ pub struct InitialSegments { initial_trace_scores: Vec, } -#[allow(dead_code)] -pub struct SegmentView<'a> { - pub start_col: usize, - pub end_col: usize, - pub blocks: &'a [Block], -} - #[allow(dead_code)] impl InitialSegments { - pub fn iter_segments(&self) -> impl Iterator> { - self.segments.iter().map(|v| SegmentView { - start_col: v.start_col, - end_col: v.end_col, - blocks: &v.blocks, - }) + pub fn view_segments(&self) -> SegmentedMatrixView<'_> { + return &self.segments; } pub fn len(&self) -> usize { @@ -582,7 +589,7 @@ pub fn segments_from_matrix_trace( } } -pub fn assemble_and_link_segments<'a, T: Distribution>( +pub fn assemble_and_link_segments<'a, T: JoinEstimator>( proximity_group: &ProximityGroup, initial_segments: &'a mut InitialSegments, trace_segments: &[TraceSegment], diff --git a/src/statistics.rs b/src/statistics.rs index 078bcbb..9f2722b 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -2,6 +2,13 @@ use core::f64; use puruspe::{beta, betai, invbetai}; use std::fmt::Debug; +pub fn ln_add_exp(a: f64, b: f64) -> f64 { + let max = a.max(b); + let min = a.min(b); + // TODO: Possibly use more stable ln_1p_exp at https://github.com/JuliaStats/LogExpFunctions.jl/files/8218470/log1pexp.pdf (Implemented at https://github.com/JuliaStats/LogExpFunctions.jl/blob/master/src/basicfuns.jl#L263) + max + (min - max).exp().ln_1p() +} + // TODO: Support for generic floating types... #[allow(dead_code)] pub trait Distribution: Clone + Debug { @@ -144,6 +151,7 @@ pub struct HalfT { } impl HalfT { + #[allow(dead_code)] pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self { Self { standard_deviation, @@ -312,6 +320,6 @@ mod test { #[test] fn test_exponential_distribution() { - let dist = Exponential::unit(); + let _dist = Exponential::unit(); } } diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index f8183fd..537ff1d 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -1,8 +1,12 @@ +use std::fmt::Debug; + +use itertools::izip; + use crate::{ alignment::AlignmentData, + join_estimation::{JoinEstimator, JoinStatisticsCollector}, pipeline::NaiveTraceResults, - segments::SegmentView, - statistics::{Distribution, ExponentialEstimator}, + segments::Segment, }; #[derive(Debug)] @@ -12,15 +16,15 @@ pub struct RegionStatistics { } #[derive(Debug, Clone)] -pub struct QueryStatistics { +pub struct QueryStatistics { pub occurances: usize, pub coverage: usize, pub target_span: usize, - pub distribution: T, + pub estimator: Option, } #[derive(Debug)] -pub struct TraceStatistics { +pub struct TraceStatistics { #[allow(dead_code)] pub total_bases: usize, pub query_statistics: Vec>, @@ -33,11 +37,11 @@ pub enum OccuranceCountingMode { Trace, } -pub fn trace_statistics( - naive_traces: &[NaiveTraceResults], +pub fn trace_statistics, E: JoinEstimator>( + naive_traces: &[NaiveTraceResults], alignment_data: &AlignmentData, count_mode: OccuranceCountingMode, -) -> TraceStatistics { +) -> TraceStatistics { // Asumption... All regions are sorted, no gaps. At least 1 region expected... debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0)); debug_assert!(naive_traces @@ -50,12 +54,12 @@ pub fn trace_statistics( .zip(naive_traces.iter().skip(1)) .all(|(v1, v2)| v1.region_index + 1 == v2.region_index && v1.target_end < v2.target_start)); - let mut query_stats = vec![ + let mut query_stats: Vec> = vec![ QueryStatistics { occurances: 0, coverage: 0, target_span: 0, - distribution: ExponentialEstimator::unit(), + estimator: None, }; alignment_data.query_name_map.size() ]; @@ -64,11 +68,19 @@ pub fn trace_statistics( vec![None; alignment_data.query_name_map.size()]; let mut all_region_stats: Vec = Vec::with_capacity(naive_traces.len()); + let mut all_join_stats: Vec> = vec![None; alignment_data.query_name_map.size()]; for trace_results in naive_traces.iter() { + for (query_id, stats) in trace_results.query_join_statistics.iter() { + all_join_stats[*query_id] = match &all_join_stats[*query_id] { + None => Some(stats.clone()), + Some(other_stats) => Some(other_stats.combine(stats)), + }; + } + match count_mode { OccuranceCountingMode::Segments => { - for seg in trace_results.segments.iter_segments() { + for seg in trace_results.segments.view_segments().iter() { for blk in seg.blocks.iter() { if let Some(query_id) = blk.query_id { query_stats[query_id].occurances += 1; @@ -113,9 +125,9 @@ pub fn trace_statistics( }; let mut unexplained_bases_up_to: usize = 0; - let mut prior_segment: Option = None; + let mut prior_segment: Option<&Segment> = None; - for seg in trace_results.segments.iter_segments() { + for seg in trace_results.segments.view_segments() { if let Some(prior_segment) = prior_segment { // If a skip block was the prior block, add it's bases as unexplained. if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 { @@ -133,15 +145,13 @@ pub fn trace_statistics( all_region_stats.push(region_stat); } - for (query_info, query_span) in query_stats.iter_mut().zip(query_span.iter()) { + for (query_info, query_span, join_stat) in + izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats) + { if let Some((start, end)) = query_span { query_info.target_span = end - start + 1; - // We subtract 1 because were looking at distances between each occurance as a sample value. - query_info.distribution = ExponentialEstimator::new( - query_info.target_span as f64 / query_info.occurances.saturating_sub(1) as f64, - query_info.occurances.saturating_sub(1), - ); } + query_info.estimator = join_stat.map(|v| v.into()); } TraceStatistics { From a8e923c4bd11f7a3d09a8e1db57fd66a9898ac47 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 14 May 2026 15:38:28 -0600 Subject: [PATCH 23/39] Target distance and divergence bayesian scoring working... Very clear bug in history backtrace. --- src/assembly.rs | 82 +++++++++++++++++++++++++++++++---------- src/history_tracing.rs | 1 + src/join_estimation.rs | 67 +++++++++++++++++++++------------ src/statistics.rs | 25 +++++++++---- src/trace_statistics.rs | 27 +++++++++----- 5 files changed, 142 insertions(+), 60 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index fa4b6e2..df96601 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -326,6 +326,47 @@ fn link_assemblies( let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block); + if b_block.row_idx == 583 { + println!("Block: {}", a_block.row_idx); + println!( + "Score: {}", + query_statistics.estimator.predict(a_block, b_block, false) + ); + + println!( + "Is Joinable: {}", + is_joinable( + target_distance, + consensus_distance, + link_type, + min_block_length, + args, + ) + ); + + println!( + "Weight: {}", + if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { + score_params.query_loop_score + } else { + get_link_cost( + args, + score_params, + consensus_distance, + query_statistics.estimator.predict(a_block, b_block, false), + ) + } + ); + + println!("Estimator: {:#?}", query_statistics.estimator); + println!( + "Target Dist: {}, Div: {}, Cons Dist: {}", + target_distance, + (a_block.kimura80 - b_block.kimura80).abs(), + consensus_distance + ) + } + if is_joinable( target_distance, consensus_distance, @@ -333,26 +374,29 @@ fn link_assemblies( min_block_length, args, ) { - if let Some(estimator) = &query_statistics.estimator { - let join_prob = estimator.predict(a_block, b_block, false); - - if join_prob >= args.join_likelihood_threshold { - let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { - score_params.query_loop_score - } else { - get_link_cost(args, score_params, consensus_distance, join_prob) - }; - - graph.insert( - ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), - Edge { - weight, - first_sparse_row: a.1, - second_sparse_row: b.1, - link_type, - }, - ); + let join_prob = query_statistics.estimator.predict(a_block, b_block, false); + + if join_prob >= args.join_likelihood_threshold { + let mut weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { + score_params.query_loop_score + } else { + get_link_cost(args, score_params, consensus_distance, join_prob) + }; + + if b_block.query_id == Some(196) { + println!("Setting Weight to 1 for {}", b_block.row_idx); + weight = score_params.query_loop_score } + + graph.insert( + ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), + Edge { + weight, + first_sparse_row: a.1, + second_sparse_row: b.1, + link_type, + }, + ); } } }); diff --git a/src/history_tracing.rs b/src/history_tracing.rs index 8f39011..436b330 100644 --- a/src/history_tracing.rs +++ b/src/history_tracing.rs @@ -1357,6 +1357,7 @@ pub fn backtrace_histories( let mut current_entry = &history.entries[current_idx]; while let HistoryEntry::Join(entry_info) | HistoryEntry::Append(entry_info) = current_entry { + println!("{:#?}", current_entry); // Append current entry to segment stack... let blocks = history.segment_groups[entry_info.segment] .get_group(entry_info.group_index) diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 7b0cd3e..975536c 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,20 +1,23 @@ +use std::fmt::Debug; + use crate::{ assembly::block_target_distance, segments::Block, statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT}, }; -pub trait JoinEstimator: Clone { +pub trait JoinEstimator: Clone + Default + Debug { fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64; } -pub trait JoinStatisticsCollector: Clone { +pub trait JoinStatisticsCollector: Clone + Debug { fn new() -> Self; + fn new_from_prior(bayesian_prior: &Self) -> Self; fn combine(&self, other: &Self) -> Self; fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct BayesianJoinEstimator { target_distance_join: ExponentialEstimator, target_distance_nojoin: ExponentialEstimator, @@ -34,7 +37,7 @@ impl JoinEstimator for BayesianJoinEstimator { + self.divergence_join.logpdf(divergence_diff); let nojoin_score = (-self.join_prior).ln_1p() + self.target_distance_nojoin.logpdf(target_dist) - + self.divergence_nojoin.logpdf(target_dist); + + self.divergence_nojoin.logpdf(divergence_diff); let score_norm = ln_add_exp(join_score, nojoin_score); let score = join_score - score_norm; @@ -55,32 +58,30 @@ impl From for BayesianJoinEstimator { impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { fn from(statistics: &BayesianJoinStatistics) -> Self { + let join_psuedo_count = statistics.joinable_count.max(1); + let nojoin_psuedo_count = statistics.unjoinable_count.max(1); + let join_td_mean = - statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64; - let nojoin_td_mean = - statistics.unjoinable_target_distance_sum as f64 / statistics.unjoinable_count as f64; + (statistics.joinable_target_distance_sum as f64 / join_psuedo_count as f64).max(1.0); + let nojoin_td_mean = (statistics.unjoinable_target_distance_sum as f64 + / nojoin_psuedo_count as f64) + .max(join_td_mean); // Divergence distributions should have a mean of 0, so we assume that... - let join_div_mean = statistics.joinable_divergence_sum / statistics.joinable_count as f64; + let join_div_mean = + (statistics.joinable_divergence_sum / join_psuedo_count as f64).max(1.0); let nojoin_div_mean = - statistics.unjoinable_divergence_sum / statistics.joinable_count as f64; + (statistics.unjoinable_divergence_sum / nojoin_psuedo_count as f64).max(join_div_mean); Self { - target_distance_join: ExponentialEstimator::new( - join_td_mean, - statistics.joinable_count, - ), - target_distance_nojoin: ExponentialEstimator::new( - nojoin_td_mean, - statistics.unjoinable_count, - ), - divergence_join: HalfT::from_sample_mean(join_div_mean, statistics.joinable_count), - divergence_nojoin: HalfT::from_sample_mean( - nojoin_div_mean, - statistics.unjoinable_count, - ), - join_prior: statistics.joinable_count as f64 - / (statistics.joinable_count + statistics.unjoinable_count) as f64, + target_distance_join: ExponentialEstimator::new(join_td_mean, join_psuedo_count), + target_distance_nojoin: ExponentialEstimator::new(nojoin_td_mean, nojoin_psuedo_count), + divergence_join: HalfT::from_sample_mean(join_div_mean, join_psuedo_count), + divergence_nojoin: HalfT::from_sample_mean(nojoin_div_mean, nojoin_psuedo_count), + // We take sqrt since we count all pairs, not just neighbors. + join_prior: (join_psuedo_count as f64 + / (nojoin_psuedo_count + join_psuedo_count) as f64) + .sqrt(), } } } @@ -107,6 +108,24 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { } } + fn new_from_prior(bayesian_prior: &Self) -> Self { + let join_psuedo_count = bayesian_prior.joinable_count.max(1); + let nojoin_psuedo_count = bayesian_prior.unjoinable_count.max(1); + + Self { + joinable_target_distance_sum: bayesian_prior.joinable_target_distance_sum + / join_psuedo_count, + unjoinable_target_distance_sum: bayesian_prior.unjoinable_target_distance_sum + / nojoin_psuedo_count, + joinable_divergence_sum: bayesian_prior.joinable_divergence_sum + / join_psuedo_count as f64, + unjoinable_divergence_sum: bayesian_prior.unjoinable_divergence_sum + / nojoin_psuedo_count as f64, + joinable_count: 1, + unjoinable_count: 1, + } + } + fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) { let target_dist = block_target_distance(first_block, second_block).abs() as usize; let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); diff --git a/src/statistics.rs b/src/statistics.rs index 9f2722b..77e5ae6 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -11,8 +11,7 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 { // TODO: Support for generic floating types... #[allow(dead_code)] -pub trait Distribution: Clone + Debug { - fn unit() -> Self; +pub trait Distribution: Clone + Debug + Default { fn pdf(&self, x: f64) -> f64; fn cdf(&self, x: f64) -> f64; fn ppf(&self, p: f64) -> f64; @@ -21,6 +20,10 @@ pub trait Distribution: Clone + Debug { fn logpdf(&self, x: f64) -> f64; fn logcdf(&self, x: f64) -> f64; fn logccdf(&self, x: f64) -> f64; + + fn unit() -> Self { + Self::default() + } } #[derive(Clone, Debug)] @@ -38,11 +41,13 @@ impl Exponential { } } -impl Distribution for Exponential { - fn unit() -> Self { +impl Default for Exponential { + fn default() -> Self { Self::new(1.0) } +} +impl Distribution for Exponential { fn pdf(&self, x: f64) -> f64 { self.lambda * (-self.lambda * x).exp() } @@ -97,14 +102,16 @@ impl From for Exponential { } } -impl Distribution for ExponentialEstimator { - fn unit() -> Self { +impl Default for ExponentialEstimator { + fn default() -> Self { Self { sample_mean: 1.0, degrees_of_freedom: 1, } } +} +impl Distribution for ExponentialEstimator { fn logpdf(&self, x: f64) -> f64 { let n = self.degrees_of_freedom as f64; let sm = self.sample_mean; @@ -167,14 +174,16 @@ impl HalfT { } } -impl Distribution for HalfT { - fn unit() -> Self { +impl Default for HalfT { + fn default() -> Self { Self { standard_deviation: 1.0, degrees_of_freedom: 1, } } +} +impl Distribution for HalfT { fn logpdf(&self, x: f64) -> f64 { let v = self.degrees_of_freedom as f64; let s = self.standard_deviation; diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index 537ff1d..705f761 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -20,7 +20,7 @@ pub struct QueryStatistics { pub occurances: usize, pub coverage: usize, pub target_span: usize, - pub estimator: Option, + pub estimator: T, } #[derive(Debug)] @@ -59,7 +59,7 @@ pub fn trace_statistics, E: JoinEst occurances: 0, coverage: 0, target_span: 0, - estimator: None, + estimator: E::default(), }; alignment_data.query_name_map.size() ]; @@ -68,14 +68,12 @@ pub fn trace_statistics, E: JoinEst vec![None; alignment_data.query_name_map.size()]; let mut all_region_stats: Vec = Vec::with_capacity(naive_traces.len()); - let mut all_join_stats: Vec> = vec![None; alignment_data.query_name_map.size()]; + // We combine stats for all families to use as a prior (psuedo-count, single sample) for all stats... + let mut all_family_stats: S = S::new(); for trace_results in naive_traces.iter() { - for (query_id, stats) in trace_results.query_join_statistics.iter() { - all_join_stats[*query_id] = match &all_join_stats[*query_id] { - None => Some(stats.clone()), - Some(other_stats) => Some(other_stats.combine(stats)), - }; + for (_query_id, stats) in trace_results.query_join_statistics.iter() { + all_family_stats = all_family_stats.combine(stats); } match count_mode { @@ -145,13 +143,24 @@ pub fn trace_statistics, E: JoinEst all_region_stats.push(region_stat); } + // Calculate join statistics for all families using combined prior as a starting point... + let mut all_join_stats: Vec = vec![S::new(); alignment_data.query_name_map.size()]; + + for trace_results in naive_traces.iter() { + for (query_id, stats) in trace_results.query_join_statistics.iter() { + all_join_stats[*query_id] = all_join_stats[*query_id].combine(stats); + } + } + + println!("{:#?}", all_join_stats); + for (query_info, query_span, join_stat) in izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats) { if let Some((start, end)) = query_span { query_info.target_span = end - start + 1; } - query_info.estimator = join_stat.map(|v| v.into()); + query_info.estimator = join_stat.into(); } TraceStatistics { From 88dabb8a82312712546d0f4faee45f5f0778a9a8 Mon Sep 17 00:00:00 2001 From: isaacr Date: Mon, 18 May 2026 18:52:52 -0600 Subject: [PATCH 24/39] Bug fix: properly adjust history when join is made. --- src/assembly.rs | 48 +---------------------------------------- src/history_tracing.rs | 40 +++++++++++++++++++++++----------- src/trace_statistics.rs | 5 ++--- 3 files changed, 31 insertions(+), 62 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index df96601..a58875a 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -326,47 +326,6 @@ fn link_assemblies( let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block); - if b_block.row_idx == 583 { - println!("Block: {}", a_block.row_idx); - println!( - "Score: {}", - query_statistics.estimator.predict(a_block, b_block, false) - ); - - println!( - "Is Joinable: {}", - is_joinable( - target_distance, - consensus_distance, - link_type, - min_block_length, - args, - ) - ); - - println!( - "Weight: {}", - if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { - score_params.query_loop_score - } else { - get_link_cost( - args, - score_params, - consensus_distance, - query_statistics.estimator.predict(a_block, b_block, false), - ) - } - ); - - println!("Estimator: {:#?}", query_statistics.estimator); - println!( - "Target Dist: {}, Div: {}, Cons Dist: {}", - target_distance, - (a_block.kimura80 - b_block.kimura80).abs(), - consensus_distance - ) - } - if is_joinable( target_distance, consensus_distance, @@ -377,17 +336,12 @@ fn link_assemblies( let join_prob = query_statistics.estimator.predict(a_block, b_block, false); if join_prob >= args.join_likelihood_threshold { - let mut weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { + let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { score_params.query_loop_score } else { get_link_cost(args, score_params, consensus_distance, join_prob) }; - if b_block.query_id == Some(196) { - println!("Setting Weight to 1 for {}", b_block.row_idx); - weight = score_params.query_loop_score - } - graph.insert( ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), Edge { diff --git a/src/history_tracing.rs b/src/history_tracing.rs index 436b330..5348aa1 100644 --- a/src/history_tracing.rs +++ b/src/history_tracing.rs @@ -659,6 +659,15 @@ fn get_join_endpoints_from_links( (left_side, right_side) } +fn prior_history_index(entry: &HistoryEntry) -> usize { + if let HistoryEntry::Append(info) | HistoryEntry::Join(info) = entry { + info.prior_history + } else { + // 0 for the root of the histories... + 0 + } +} + fn add_single_join( histories: &mut Vec, segments: &[Segment], @@ -679,11 +688,13 @@ fn add_single_join( let new_group_index = segment_groups[segment_idx].add_group(&segments[segment_idx], join_blocks); - let join_prior_index = match (left_join_link, right_join_link) { - (Some(lv), Some(rv)) => lv.origin_history.min(rv.origin_history), - (Some(v), None) | (None, Some(v)) => v.origin_history, - _ => prior_hist_idx, - }; + let join_prior_index = prior_history_index( + &histories[match (left_join_link, right_join_link) { + (Some(lv), Some(rv)) => lv.origin_history.min(rv.origin_history), + (Some(v), None) | (None, Some(v)) => v.origin_history, + _ => panic!("Unreachable branch here, something went really wrong..."), + }], + ); // Clean expired history entries from the join path.... let simplified_join_index = remove_expired_history_entries( @@ -1158,6 +1169,7 @@ fn get_joinable_extensions<'a>( .collect_vec() } +#[derive(Debug)] struct JoinStackEntry { joined_history_offset: usize, trace_segment_offset: usize, @@ -1170,6 +1182,7 @@ struct AddedBlockInfo { join_index: usize, } +#[derive(Debug)] struct JoinStack { pub stack: Vec, pub next_join_index: usize, @@ -1193,7 +1206,7 @@ impl JoinStack { /// Try adding one or two joins to the join stack if this block is a join and has linked edges. fn try_push(&mut self, entry: &HistoryEntry, added_block_info: Option<&AddedBlockInfo>) { if let (HistoryEntry::Join(val), Some(info)) = (entry, added_block_info) { - let mut top_stack_entries = 0; + let top_offset = self.stack.len(); if val.join_left_block.caused_by_history != info.history_index { self.stack.push(JoinStackEntry { @@ -1201,7 +1214,6 @@ impl JoinStack { trace_segment_offset: info.trace_stack_index, join_index: info.join_index, }); - top_stack_entries += 1; } if val.join_right_block.caused_by_history != info.history_index { @@ -1210,11 +1222,9 @@ impl JoinStack { trace_segment_offset: info.trace_stack_index, join_index: info.join_index, }); - top_stack_entries += 1; } - let top_vals_offset = self.stack.len() - top_stack_entries; - self.stack[top_vals_offset..].sort_unstable_by_key(|v| v.joined_history_offset); + self.stack[top_offset..].sort_unstable_by_key(|v| v.joined_history_offset); } } @@ -1278,7 +1288,7 @@ fn history_backtrace_append_block( .any(|&b| matches!(b.block_type, BlockType::Alignment | BlockType::TandemRepeat)) { match joiner.check_for_join(current_history_index) { - // Case 2: Involved in a join, add new block, but don't + // Case 2: Involved in a join, add new block. BlockAction::Join(join_index, stack_pos) => { let joins = get_joinable_extensions( blocks.iter().copied(), @@ -1357,7 +1367,6 @@ pub fn backtrace_histories( let mut current_entry = &history.entries[current_idx]; while let HistoryEntry::Join(entry_info) | HistoryEntry::Append(entry_info) = current_entry { - println!("{:#?}", current_entry); // Append current entry to segment stack... let blocks = history.segment_groups[entry_info.segment] .get_group(entry_info.group_index) @@ -1384,6 +1393,13 @@ pub fn backtrace_histories( current_entry = &history.entries[current_idx]; } + if joiner.stack.len() != 0 { + panic!( + "Backtrace not done properly, there are {} leftover values on the join stack!", + joiner.stack.len() + ); + } + // Reverse so trace segments go from start to end instead of end to start. refined_segments.reverse(); refined_segments diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index 705f761..20e66f1 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -144,7 +144,8 @@ pub fn trace_statistics, E: JoinEst } // Calculate join statistics for all families using combined prior as a starting point... - let mut all_join_stats: Vec = vec![S::new(); alignment_data.query_name_map.size()]; + let mut all_join_stats: Vec = + vec![S::new_from_prior(&all_family_stats); alignment_data.query_name_map.size()]; for trace_results in naive_traces.iter() { for (query_id, stats) in trace_results.query_join_statistics.iter() { @@ -152,8 +153,6 @@ pub fn trace_statistics, E: JoinEst } } - println!("{:#?}", all_join_stats); - for (query_info, query_span, join_stat) in izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats) { From 8a3e2680cce325f0b36422deb279296cade44088 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 21 May 2026 11:45:53 -0600 Subject: [PATCH 25/39] Tested est for frechet, stinks so temp disabled. --- scripts/plot_distributions_aurora.py | 4 +- src/join_estimation.rs | 257 ++++++++++++++++++++------- src/p2estimator.rs | 2 +- src/statistics.rs | 174 +++++++++++++++++- src/trace_statistics.rs | 2 +- 5 files changed, 364 insertions(+), 75 deletions(-) diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py index 26fa437..dfe8961 100644 --- a/scripts/plot_distributions_aurora.py +++ b/scripts/plot_distributions_aurora.py @@ -223,8 +223,8 @@ def logcdf(self, x, *args): estimator = { "Relative Consensus Distance": Distribution( - laplace_asymmetric, (1.0, 0.0, 1.0), False - ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), + invweibull, (1.0, 0.0, 1.0), False + ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), Distribution(laplace_asymmetric, (1.0, 0.0, 1.0), False) "Target Distance": Distribution( genpareto, (0.0, 1.0) ), # Distribution(expon, (1.0,)), # Distribution(genpareto, (0.0, 1.0)), Distribution(weibull_min, (1.0, 10000) diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 975536c..b688ed9 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,9 +1,9 @@ -use std::fmt::Debug; +use std::{fmt::Debug, ops}; use crate::{ - assembly::block_target_distance, + assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType}, segments::Block, - statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT}, + statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, HalfT, Laplace}, }; pub trait JoinEstimator: Clone + Default + Debug { @@ -12,7 +12,7 @@ pub trait JoinEstimator: Clone + Default + Debug { pub trait JoinStatisticsCollector: Clone + Debug { fn new() -> Self; - fn new_from_prior(bayesian_prior: &Self) -> Self; + fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self; fn combine(&self, other: &Self) -> Self; fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); } @@ -23,6 +23,8 @@ pub struct BayesianJoinEstimator { target_distance_nojoin: ExponentialEstimator, divergence_join: HalfT, divergence_nojoin: HalfT, + consensus_distance_join: Frechet, + consensus_distance_nojoin: Laplace, join_prior: f64, } @@ -31,13 +33,39 @@ impl JoinEstimator for BayesianJoinEstimator { let target_dist = block_target_distance(first_block, second_block) as f64; // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0... let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); + let (consensus_dist, _join_type) = block_consensus_distance(first_block, second_block); + let rel_con_dist = consensus_dist as f64 + / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64; + + /* + println!("{:#?}", self); + println!( + "{} {} {}", + rel_con_dist, + self.consensus_distance_join.pdf(rel_con_dist), + self.consensus_distance_nojoin.pdf(rel_con_dist) + ); + println!( + "{} {} {}", + target_dist, + self.target_distance_join.pdf(target_dist), + self.target_distance_nojoin.pdf(target_dist) + ); + println!( + "{} {} {}", + divergence_diff, + self.divergence_join.pdf(divergence_diff), + self.divergence_nojoin.pdf(divergence_diff) + );*/ let join_score = self.join_prior.ln() + self.target_distance_join.logpdf(target_dist) + self.divergence_join.logpdf(divergence_diff); + //+ self.consensus_distance_join.logpdf(rel_con_dist); let nojoin_score = (-self.join_prior).ln_1p() + self.target_distance_nojoin.logpdf(target_dist) + self.divergence_nojoin.logpdf(divergence_diff); + //+ self.consensus_distance_nojoin.logpdf(rel_con_dist); let score_norm = ln_add_exp(join_score, nojoin_score); let score = join_score - score_norm; @@ -56,102 +84,195 @@ impl From for BayesianJoinEstimator { } } -impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { - fn from(statistics: &BayesianJoinStatistics) -> Self { - let join_psuedo_count = statistics.joinable_count.max(1); - let nojoin_psuedo_count = statistics.unjoinable_count.max(1); +#[derive(Debug, Clone, Copy)] +struct MomentEstimator { + sum_square: f64, + sum: f64, + samples: usize, +} + +impl MomentEstimator { + fn new() -> Self { + Self { + sum_square: 0.0, + sum: 0.0, + samples: 0, + } + } + + fn to_psuedo_count(&self, count: usize) -> Self { + Self { + sum_square: (self.sum_square / self.samples.max(1) as f64) * count as f64, + sum: (self.sum / self.samples.max(1) as f64) * count as f64, + samples: count, + } + } + + fn mean(&self) -> f64 { + self.sum / self.samples.max(1) as f64 + } + + fn variance(&self) -> f64 { + // TODO: Use shifted data alg for more accuracy... + (self.sum_square - (self.sum * self.sum) / self.samples.max(1) as f64) + / (self.samples.max(2) as f64 - 1.0) + } + + fn standard_deviation(&self) -> f64 { + self.variance().sqrt() + } + + fn samples(&self) -> usize { + self.samples + } +} + +impl Default for MomentEstimator { + fn default() -> Self { + Self::new() + } +} - let join_td_mean = - (statistics.joinable_target_distance_sum as f64 / join_psuedo_count as f64).max(1.0); - let nojoin_td_mean = (statistics.unjoinable_target_distance_sum as f64 - / nojoin_psuedo_count as f64) - .max(join_td_mean); +impl ops::Add for MomentEstimator { + type Output = MomentEstimator; - // Divergence distributions should have a mean of 0, so we assume that... - let join_div_mean = - (statistics.joinable_divergence_sum / join_psuedo_count as f64).max(1.0); - let nojoin_div_mean = - (statistics.unjoinable_divergence_sum / nojoin_psuedo_count as f64).max(join_div_mean); + fn add(self, rhs: MomentEstimator) -> Self::Output { + Self { + sum_square: self.sum_square + rhs.sum_square, + sum: self.sum + rhs.sum, + samples: self.samples + rhs.samples, + } + } +} + +impl ops::AddAssign for MomentEstimator { + fn add_assign(&mut self, rhs: MomentEstimator) { + self.sum_square += rhs.sum_square; + self.sum += rhs.sum; + self.samples += rhs.samples; + } +} + +impl ops::AddAssign for MomentEstimator { + fn add_assign(&mut self, rhs: f64) { + self.sum_square += rhs * rhs; + self.sum += rhs; + self.samples += 1; + } +} + +impl From for ExponentialEstimator { + fn from(value: MomentEstimator) -> Self { + Self::new(value.mean(), value.samples().max(1)) + } +} +impl From for HalfT { + fn from(value: MomentEstimator) -> Self { + Self::from_sample_mean(value.mean(), value.samples().max(1)) + } +} + +impl From for Laplace { + fn from(value: MomentEstimator) -> Self { + Self::from_moments(value.mean(), value.standard_deviation()) + } +} + +impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { + fn from(statistics: &BayesianJoinStatistics) -> Self { Self { - target_distance_join: ExponentialEstimator::new(join_td_mean, join_psuedo_count), - target_distance_nojoin: ExponentialEstimator::new(nojoin_td_mean, nojoin_psuedo_count), - divergence_join: HalfT::from_sample_mean(join_div_mean, join_psuedo_count), - divergence_nojoin: HalfT::from_sample_mean(nojoin_div_mean, nojoin_psuedo_count), + target_distance_join: statistics.joinable_target_distance.into(), + target_distance_nojoin: statistics.unjoinable_target_distance.into(), + divergence_join: statistics.joinable_divergence.into(), + divergence_nojoin: statistics.unjoinable_divergence.into(), + consensus_distance_join: Frechet::from_log_moments( + statistics.joinable_consensus_log.mean(), + statistics.joinable_consensus_log.standard_deviation(), + -0.05, + ), + consensus_distance_nojoin: statistics.unjoinable_consensus.into(), // We take sqrt since we count all pairs, not just neighbors. - join_prior: (join_psuedo_count as f64 - / (nojoin_psuedo_count + join_psuedo_count) as f64) + join_prior: (statistics.joinable_target_distance.samples() as f64 + / (statistics.joinable_target_distance.samples() + + statistics.unjoinable_target_distance.samples()) + .max(1) as f64) .sqrt(), } } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct BayesianJoinStatistics { - joinable_target_distance_sum: usize, - unjoinable_target_distance_sum: usize, - joinable_divergence_sum: f64, - unjoinable_divergence_sum: f64, - joinable_count: usize, - unjoinable_count: usize, + joinable_target_distance: MomentEstimator, + unjoinable_target_distance: MomentEstimator, + joinable_divergence: MomentEstimator, + unjoinable_divergence: MomentEstimator, + joinable_consensus_log: MomentEstimator, + unjoinable_consensus: MomentEstimator, } impl JoinStatisticsCollector for BayesianJoinStatistics { fn new() -> Self { - Self { - joinable_target_distance_sum: 0, - unjoinable_target_distance_sum: 0, - joinable_divergence_sum: 0.0, - unjoinable_divergence_sum: 0.0, - joinable_count: 0, - unjoinable_count: 0, - } + Self::default() } - fn new_from_prior(bayesian_prior: &Self) -> Self { - let join_psuedo_count = bayesian_prior.joinable_count.max(1); - let nojoin_psuedo_count = bayesian_prior.unjoinable_count.max(1); - + fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self { Self { - joinable_target_distance_sum: bayesian_prior.joinable_target_distance_sum - / join_psuedo_count, - unjoinable_target_distance_sum: bayesian_prior.unjoinable_target_distance_sum - / nojoin_psuedo_count, - joinable_divergence_sum: bayesian_prior.joinable_divergence_sum - / join_psuedo_count as f64, - unjoinable_divergence_sum: bayesian_prior.unjoinable_divergence_sum - / nojoin_psuedo_count as f64, - joinable_count: 1, - unjoinable_count: 1, + joinable_target_distance: bayesian_prior + .joinable_target_distance + .to_psuedo_count(pseudo_count), + unjoinable_target_distance: bayesian_prior + .unjoinable_target_distance + .to_psuedo_count(pseudo_count), + joinable_divergence: bayesian_prior + .joinable_divergence + .to_psuedo_count(pseudo_count), + unjoinable_divergence: bayesian_prior + .unjoinable_divergence + .to_psuedo_count(pseudo_count), + joinable_consensus_log: bayesian_prior + .joinable_consensus_log + .to_psuedo_count(pseudo_count), + unjoinable_consensus: bayesian_prior + .unjoinable_consensus + .to_psuedo_count(pseudo_count), } } fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) { let target_dist = block_target_distance(first_block, second_block).abs() as usize; let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); + let (consensus_dist, join_type) = block_consensus_distance(first_block, second_block); + let rel_con_dist = consensus_dist as f64 + / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64; if joinable { - self.joinable_target_distance_sum += target_dist; - self.joinable_divergence_sum += divergence_diff; - self.joinable_count += 1; + self.joinable_target_distance += target_dist as f64; + self.joinable_divergence += divergence_diff; + if matches!(join_type, LinkType::Forward | LinkType::Reverse) { + //println!("CDist: {}", rel_con_dist); + self.joinable_consensus_log += (rel_con_dist + 0.05).max(1e-50).ln(); + } } else { - self.unjoinable_target_distance_sum += target_dist; - self.unjoinable_divergence_sum += divergence_diff; - self.unjoinable_count += 1; + self.unjoinable_target_distance += target_dist as f64; + self.unjoinable_divergence += divergence_diff; + if matches!(join_type, LinkType::Forward | LinkType::Reverse) { + self.unjoinable_consensus += rel_con_dist; + } } } fn combine(&self, other: &Self) -> Self { Self { - joinable_target_distance_sum: self.joinable_target_distance_sum - + other.joinable_target_distance_sum, - unjoinable_target_distance_sum: self.unjoinable_target_distance_sum - + other.unjoinable_target_distance_sum, - joinable_divergence_sum: self.joinable_divergence_sum + other.joinable_divergence_sum, - unjoinable_divergence_sum: self.unjoinable_divergence_sum - + other.unjoinable_divergence_sum, - joinable_count: self.joinable_count + other.joinable_count, - unjoinable_count: self.unjoinable_count + other.unjoinable_count, + joinable_target_distance: self.joinable_target_distance + + other.joinable_target_distance, + unjoinable_target_distance: self.unjoinable_target_distance + + other.unjoinable_target_distance, + joinable_divergence: self.joinable_divergence + other.joinable_divergence, + unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence, + joinable_consensus_log: self.joinable_consensus_log + other.joinable_consensus_log, + unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus, } } } diff --git a/src/p2estimator.rs b/src/p2estimator.rs index fbcdeb2..1bfe387 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -154,7 +154,7 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into + From obs >= self.points.len() } - fn combine(&mut self, _other: &P2HistogramData) { + fn combine(&mut self, other: &P2HistogramData) { // TODO: Need to think about how to do this efficiently while maintaining accuracy... panic!("Not implemented!") } diff --git a/src/statistics.rs b/src/statistics.rs index 77e5ae6..7670c93 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -234,9 +234,173 @@ impl Distribution for HalfT { } } +#[derive(Debug, Clone)] +pub struct Frechet { + alpha: f64, + scale: f64, + minimum: f64, +} + +impl Frechet { + pub fn new(alpha: f64, scale: f64, minimum: f64) -> Self { + Self { + alpha, + scale, + minimum, + } + } + + pub fn from_log_moments(log_mean: f64, log_std: f64, minimum: f64) -> Self { + let alpha = f64::consts::PI / (6.0 * log_std); + let lambda = (alpha * log_mean - f64::consts::EULER_GAMMA).exp(); + let scale = lambda.powf(1.0 / alpha); + Self { + alpha, + scale, + minimum, + } + } +} + +impl Default for Frechet { + fn default() -> Self { + Self { + alpha: 1.0, + scale: 1.0, + minimum: 0.0, + } + } +} + +impl Distribution for Frechet { + fn logpdf(&self, x: f64) -> f64 { + let a = self.alpha; + let s = self.scale; + let m = self.minimum; + if x > m { + (a / s).ln() + -(a + 1.0) * ((x - m) / s).ln() + -((x - m) / s).powf(-a) + } else { + f64::NEG_INFINITY + } + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + self.logcdf(x).exp() + } + + fn logcdf(&self, x: f64) -> f64 { + let a = self.alpha; + let s = self.scale; + let m = self.minimum; + if x > m { + -((x - m) / s).powf(-a) + } else { + f64::NEG_INFINITY + } + } + + fn ppf(&self, p: f64) -> f64 { + let a = self.alpha; + let s = self.scale; + let m = self.minimum; + if p >= 1.0 { + f64::INFINITY + } else if p <= 0.0 { + m + } else { + m + s * (-p.min(1.0).ln()).powf(1.0 / -a) + } + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + self.ccdf(x).ln() + } + + fn support(&self) -> (f64, f64) { + (self.minimum, f64::INFINITY) + } +} + +#[derive(Debug, Clone)] +pub struct Laplace { + mean: f64, + scale: f64, +} + +impl Laplace { + pub fn new(mean: f64, scale: f64) -> Self { + Self { mean, scale } + } + + pub fn from_moments(mean: f64, standard_deviation: f64) -> Self { + Self { + mean, + scale: standard_deviation / f64::consts::SQRT_2, + } + } +} + +impl Default for Laplace { + fn default() -> Self { + Self { + mean: 0.0, + scale: 1.0, + } + } +} + +impl Distribution for Laplace { + fn logpdf(&self, x: f64) -> f64 { + let mu = self.mean; + let b = self.scale; + (0.5 / b).ln() + -((x - mu).abs() / b) + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + let mu = self.mean; + let b = self.scale; + 0.5 + 0.5 * (x - mu).signum() * (1.0 - (-(x - mu).abs() / b).exp()) + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + self.ccdf(x).ln() + } + + fn ppf(&self, p: f64) -> f64 { + let mu = self.mean; + let b = self.scale; + let p = p.clamp(0.0, 1.0); + mu - b * (p - 0.5).signum() * (1.0 - 2.0 * (p - 0.5).abs()).ln() + } + + fn support(&self) -> (f64, f64) { + (f64::NEG_INFINITY, f64::INFINITY) + } +} + #[cfg(test)] mod test { - use crate::statistics::{ExponentialEstimator, HalfT}; + use crate::statistics::{ExponentialEstimator, Frechet, HalfT, Laplace}; use std::fmt::Debug; pub trait TestDistribution: Debug { @@ -283,11 +447,13 @@ mod test { use super::{Distribution, Exponential}; - fn get_dists() -> [Box; 3] { + fn get_dists() -> [Box; 5] { [ as_box(Exponential::unit()), as_box(ExponentialEstimator::unit()), as_box(HalfT::unit()), + as_box(Frechet::unit()), + as_box(Laplace::unit()), ] } @@ -312,16 +478,18 @@ mod test { if high == f64::INFINITY { high = 5.0; } - if low == f64::INFINITY { + if low == f64::NEG_INFINITY { low = -5.0; } for x in linspace(low, high, 100) { // Basic properties... + // println!("{x} -> {} vs {}", dist.tpdf(x), dist.tlogpdf(x).exp()); assert!(is_close(dist.tpdf(x), dist.tlogpdf(x).exp())); assert!(is_close(dist.tcdf(x), dist.tlogcdf(x).exp())); assert!(is_close(dist.tccdf(x), dist.tlogccdf(x).exp())); assert!(is_close(dist.tccdf(x), 1.0 - dist.tcdf(x))); + // println!("{x} -> {}", dist.tppf(dist.tcdf(x))); assert!(is_close(dist.tppf(dist.tcdf(x)), x)); } } diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index 20e66f1..cb49630 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -145,7 +145,7 @@ pub fn trace_statistics, E: JoinEst // Calculate join statistics for all families using combined prior as a starting point... let mut all_join_stats: Vec = - vec![S::new_from_prior(&all_family_stats); alignment_data.query_name_map.size()]; + vec![S::new_from_prior(&all_family_stats, 1); alignment_data.query_name_map.size()]; for trace_results in naive_traces.iter() { for (query_id, stats) in trace_results.query_join_statistics.iter() { From c065af2515a2181d6471d5349067689e19c1a9f3 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 21 May 2026 18:05:04 -0600 Subject: [PATCH 26/39] WIP on quantile estimator. --- src/p2estimator.rs | 162 +++++++++++++++++++++++++++++---------------- 1 file changed, 106 insertions(+), 56 deletions(-) diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 1bfe387..74873ba 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -3,11 +3,14 @@ use std::{cmp::Ordering, ops::Neg}; /// Implementation of P2 estimator. /// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" /// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf -use num_traits::{float::TotalOrder, Float, Num, Unsigned}; +/// +/// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021) +use num_traits::{float::TotalOrder, Float, FromPrimitive, Num, Unsigned}; struct P2HistogramPoint { value: F, rank: I, + target: F, } fn get_sign, B: Num + PartialOrd + Neg>( @@ -28,74 +31,88 @@ fn inc_or_dec>(val: A } } -fn linear_prediction>( - points: &[P2HistogramPoint; 3], - d: isize, +fn cubic_hermite_spline( + x0: F, + y0: F, + x1: F, + y1: F, + m0: F, + m1: F, + x: F, ) -> F { - let n: [F; 3] = points.each_ref().map(|v| v.rank.into()); - let q: [F; 3] = points.each_ref().map(|v| v.value); - let d_f: F = get_sign(d); - let d_off = (1 + d) as usize; + let t = (x - x0) / (x1 - x0); + let ms0 = (x1 - x0) * m0; + let ms1 = (x1 - x0) * m1; - q[1] + d_f * ((q[d_off] - q[1]) / (n[d_off] - n[1])) -} + let _1 = F::one(); + let _2 = F::from_i32(2).unwrap(); + let _3 = F::from_i32(3).unwrap(); -fn parabolic_prediction>( - points: &[P2HistogramPoint; 3], - d: isize, -) -> F { - let n: [F; 3] = points.each_ref().map(|v| v.rank.into()); - let q: [F; 3] = points.each_ref().map(|v| v.value); - let d: F = get_sign(d); + let h0: F = (_2 * t - _3) * t * t + _1; + let h1 = ((t - _1) * t + _1) * t; + let h2 = (_2 * t + _3) * t * t; + let h3 = (t - _1) * t * t; - let left = (n[1] - n[0] + d) * ((q[2] - q[1]) / (n[2] - n[1])); - let right = (n[2] - n[1] - d) * ((q[1] - q[0]) / (n[1] - n[0])); - q[1] + (d / (n[2] - n[0])) * (left + right) + h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1 } -fn _p2update + From>( - points: &mut [P2HistogramPoint], - center_index: usize, - observations: I, - total_points: I, -) { - // Actual rank desired for the given quantile... - let ci: I = center_index.into(); - let rank_proposal: F = - (ci * (observations - I::one())).into() / (total_points - I::one()).into(); - let d: F = rank_proposal - points[1].rank.into(); - - if d >= F::one() && (points[2].rank - points[1].rank) > I::one() - || (d <= -F::one()) && points[1].rank - points[0].rank > I::one() - { - let d: isize = get_sign(d); - let mut p_est = parabolic_prediction( - (&points[center_index - 1..center_index + 1]) - .as_array() - .unwrap(), - d, - ); - if p_est <= points[center_index - 1].value || p_est >= points[center_index + 1].value { - p_est = linear_prediction( - (&points[center_index - 1..center_index + 1]) - .as_array() - .unwrap(), - d, - ); - } +fn pchip_point_derivative(dx0: F, dy0: F, dx1: F, dy1: F) -> F { + if dy0 * dy1 > F::zero() { + let _1 = F::one(); + let one_third = _1 / F::from_i32(3).unwrap(); + let alpha = one_third * (_1 + dx1 / (dx0 + dx1)); + dy0 * dy1 / (alpha * dy1 + (_1 - alpha) * dy0) + } else { + F::zero() + } +} - points[center_index].value = p_est; - points[center_index].rank = inc_or_dec(points[center_index].rank, d); +fn secant_diff>( + point0: Option<&P2HistogramPoint>, + point1: Option<&P2HistogramPoint>, +) -> (F, F) { + if let (Some(p0), Some(p1)) = (point0, point1) { + ((p1.rank - p0.rank).into(), p1.value - p0.value) + } else { + // Assume slope at endpoints of CDF is 0... + (F::zero(), F::zero()) } } -struct P2HistogramData<'a, F: Float, I: Unsigned + Copy + Ord + Into> { +fn pchip_prediction>( + point0: Option<&P2HistogramPoint>, + point1: &P2HistogramPoint, + point2: &P2HistogramPoint, + point3: Option<&P2HistogramPoint>, + x: F, +) -> F { + let s0 = secant_diff(point0, Some(point1)); + let s1 = secant_diff(Some(point1), Some(point2)); + let s2 = secant_diff(Some(point2), point3); + let m0 = pchip_point_derivative(s0.0, s0.1, s1.0, s1.1); + let m1 = pchip_point_derivative(s1.0, s1.1, s2.0, s2.1); + + cubic_hermite_spline( + point1.rank.into(), + point1.value, + point2.rank.into(), + point2.value, + m0, + m1, + x, + ) +} + +struct QuantileEstimator<'a, F: Float, I: Unsigned + Copy + Ord + Into> { observations: I, points: &'a mut [P2HistogramPoint], } -impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into + From + Into> - P2HistogramData<'a, F, I> +impl< + 'a, + F: Float + TotalOrder + Into + FromPrimitive, + I: Unsigned + Copy + Ord + Into + From + Into, + > QuantileEstimator<'a, F, I> { fn _standard_update(&mut self, sample: F) { // Find where sample falls within distribution... @@ -116,7 +133,38 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into + From // Adjust inner markers to within 1 of their target quantile using p2 formula... for i in 1..(self.points.len() - 1) { - _p2update(self.points, i, self.observations, self.points.len().into()); + let target_rank: F = (self.points[i].target * self.observations.into()).floor(); + let true_rank: F = self.points[i].rank.into(); + let true_rank_int: usize = self.points[i].rank.into(); + if (true_rank - target_rank).abs() > F::one() { + let target_rank_int: usize = target_rank.into(); + let lower_rank: usize = self.points[i - 1].rank.into(); + let upper_rank: usize = self.points[i + 1].rank.into(); + let new_rank: usize = target_rank_int + .clamp(lower_rank.saturating_add(1), upper_rank.saturating_sub(1)); + if new_rank == true_rank_int { + continue; + } + + let shift = if new_rank > target_rank_int { 1 } else { 0 }; + + self.points[i].rank = new_rank.into(); + self.points[i].value = pchip_prediction( + if i + shift > 2 { + Some(&self.points[i + shift - 2]) + } else { + None + }, + &self.points[i - shift - 1], + &self.points[i + shift], + if i + shift + 1 < self.points.len() { + Some(&self.points[i + shift + 1]) + } else { + None + }, + F::from_usize(new_rank).unwrap(), + ); + } } self.observations = self.observations + I::one(); @@ -129,6 +177,8 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into + From } fn _initialize(&mut self) { + panic!("Fix!"); + // TODO: Fix this... self.points.sort_by(|a, b| a.value.total_cmp(&b.value)); self.points.iter_mut().enumerate().for_each(|(i, p)| { p.rank = i.into(); From 9c261d38303e253572222f3bcfd960c3cec9514e Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 22 May 2026 19:50:57 -0600 Subject: [PATCH 27/39] New quantile estimator updates done. --- Cargo.toml | 1 - src/p2estimator.rs | 497 ++++++++++++++++++++++++++++++--------------- src/segments.rs | 10 +- 3 files changed, 340 insertions(+), 168 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9055fa9..c5b43b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ itertools = "0.11.0" rayon = "1.8.0" base64 = "0.22.1" puruspe = "0.4.4" -num-traits = "0.2.19" [target.'cfg(not(target_env = "msvc"))'.dependencies] tikv-jemallocator = "0.5" diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 74873ba..5be3edb 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -1,211 +1,378 @@ -use std::{cmp::Ordering, ops::Neg}; - -/// Implementation of P2 estimator. -/// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" -/// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf -/// -/// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021) -use num_traits::{float::TotalOrder, Float, FromPrimitive, Num, Unsigned}; - -struct P2HistogramPoint { - value: F, - rank: I, - target: F, -} +use std::cmp::Ordering; -fn get_sign, B: Num + PartialOrd + Neg>( - val: A, -) -> B { - if val >= A::zero() { - B::one() - } else { - -B::one() - } +use crate::segments::MergeIterator; +use itertools::izip; +// Implementation of P2 estimator. +// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" +// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf +// +// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021) + +struct QuantileEstimatorData<'a> { + ranks: &'a [usize], + values: &'a [f64], + targets: &'a [f64], + observations: &'a usize, } -fn inc_or_dec>(val: A, delta: B) -> A { - match delta.partial_cmp(&B::zero()) { - Some(Ordering::Less) => val - A::one(), - Some(Ordering::Greater) => val + A::one(), - _ => val, - } +struct MutableQuantileEstimatorData<'a> { + ranks: &'a mut [usize], + values: &'a mut [f64], + targets: &'a [f64], + observations: &'a mut usize, } -fn cubic_hermite_spline( - x0: F, - y0: F, - x1: F, - y1: F, - m0: F, - m1: F, - x: F, -) -> F { +fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x: f64) -> f64 { let t = (x - x0) / (x1 - x0); let ms0 = (x1 - x0) * m0; let ms1 = (x1 - x0) * m1; - let _1 = F::one(); - let _2 = F::from_i32(2).unwrap(); - let _3 = F::from_i32(3).unwrap(); - - let h0: F = (_2 * t - _3) * t * t + _1; - let h1 = ((t - _1) * t + _1) * t; - let h2 = (_2 * t + _3) * t * t; - let h3 = (t - _1) * t * t; + let h0 = (2.0 * t - 3.0) * t * t + 1.0; + let h1 = ((t - 1.0) * t + 1.0) * t; + let h2 = (2.0 * t + 3.0) * t * t; + let h3 = (t - 1.0) * t * t; h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1 } -fn pchip_point_derivative(dx0: F, dy0: F, dx1: F, dy1: F) -> F { - if dy0 * dy1 > F::zero() { - let _1 = F::one(); - let one_third = _1 / F::from_i32(3).unwrap(); - let alpha = one_third * (_1 + dx1 / (dx0 + dx1)); - dy0 * dy1 / (alpha * dy1 + (_1 - alpha) * dy0) +fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 { + if dy0 * dy1 > 0.0 { + let alpha = (1.0 / 3.0) * (1.0 + dx1 / (dx0 + dx1)); + dy0 * dy1 / (alpha * dy1 + (1.0 - alpha) * dy0) } else { - F::zero() + 0.0 } } -fn secant_diff>( - point0: Option<&P2HistogramPoint>, - point1: Option<&P2HistogramPoint>, -) -> (F, F) { - if let (Some(p0), Some(p1)) = (point0, point1) { - ((p1.rank - p0.rank).into(), p1.value - p0.value) - } else { - // Assume slope at endpoints of CDF is 0... - (F::zero(), F::zero()) - } +fn pchip_prediction(ranks: &[f64; 4], values: &[f64; 4], x: f64) -> f64 { + let m0 = pchip_point_derivative( + ranks[1] - ranks[0], + values[1] - values[0], + ranks[2] - ranks[1], + values[2] - values[1], + ); + let m1 = pchip_point_derivative( + ranks[2] - ranks[1], + values[2] - values[1], + ranks[3] - ranks[2], + values[3] - values[2], + ); + + cubic_hermite_spline(ranks[1], values[1], ranks[2], values[2], m0, m1, x) } -fn pchip_prediction>( - point0: Option<&P2HistogramPoint>, - point1: &P2HistogramPoint, - point2: &P2HistogramPoint, - point3: Option<&P2HistogramPoint>, - x: F, -) -> F { - let s0 = secant_diff(point0, Some(point1)); - let s1 = secant_diff(Some(point1), Some(point2)); - let s2 = secant_diff(Some(point2), point3); - let m0 = pchip_point_derivative(s0.0, s0.1, s1.0, s1.1); - let m1 = pchip_point_derivative(s1.0, s1.1, s2.0, s2.1); - - cubic_hermite_spline( - point1.rank.into(), - point1.value, - point2.rank.into(), - point2.value, - m0, - m1, - x, - ) +fn debug_check_valid_estimator( + ranks: &[usize], + values: &[f64], + targets: &[f64], + observations: usize, +) { + debug_assert!(ranks.len() > 2); + debug_assert!(ranks.len() == values.len() && ranks.len() == targets.len()); + debug_assert!(values.is_sorted() && ranks.is_sorted() && targets.is_sorted()); + debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0)); + debug_assert!(observations >= ranks.len()); + debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&observations)); } -struct QuantileEstimator<'a, F: Float, I: Unsigned + Copy + Ord + Into> { - observations: I, - points: &'a mut [P2HistogramPoint], +fn debug_check_uninitialized_estimator(ranks: &[usize], values: &[f64], targets: &[f64]) { + debug_assert!(ranks.len() > 2); + debug_assert!(ranks.len() == values.len() && ranks.len() == targets.len()); + debug_assert!(targets.is_sorted()); + debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0)); } -impl< - 'a, - F: Float + TotalOrder + Into + FromPrimitive, - I: Unsigned + Copy + Ord + Into + From + Into, - > QuantileEstimator<'a, F, I> -{ - fn _standard_update(&mut self, sample: F) { - // Find where sample falls within distribution... - let p = self.points.partition_point(|v| v.value <= sample); - let bound_p = p.min(self.points.len() - 1); - - // Update extremes... - if bound_p == 0 { - self.points[bound_p].value = self.points[bound_p].value.min(sample); - } else if bound_p == (self.points.len() - 1) { - self.points[bound_p].value = self.points[bound_p].value.max(sample); +fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { + let MutableQuantileEstimatorData { + ranks, + values, + targets, + observations, + } = data; + debug_check_valid_estimator(ranks, values, targets, *observations); + // Find where sample falls within distribution... + let p = values.partition_point(|&v| v <= sample); + let bound_p = p.min(values.len() - 1); + + // Update extremes... + if bound_p == 0 { + values[bound_p] = values[bound_p].min(sample); + } else if bound_p == (values.len() - 1) { + values[bound_p] = values[bound_p].max(sample); + } + + // Increment ranks of markers above newly inserted sample... + for i in (bound_p + 1)..ranks.len() { + ranks[i] = ranks[1] + 1; + } + + // Adjust inner markers to within 1 of their target quantile using p2 formula... + for i in 1..(values.len() - 1) { + let target_rank = (targets[i] * (*observations) as f64) as usize; + let true_rank = ranks[i]; + if true_rank.abs_diff(target_rank) > 1 { + let new_rank: usize = target_rank.clamp( + ranks[i - 1].saturating_add(1), + ranks[i + 1].saturating_sub(1), + ); + if new_rank == true_rank { + continue; + } + + let idx_shift = if new_rank > target_rank { 1 } else { 0 }; + let indexes = [ + (i + idx_shift).saturating_sub(2), + (i + idx_shift).saturating_sub(1), + (i + idx_shift), + (i + idx_shift).saturating_add(1).min(ranks.len() - 1), + ]; + + ranks[i] = new_rank; + values[i] = pchip_prediction( + &indexes.map(|i| ranks[i] as f64), + &indexes.map(|i| values[i]), + new_rank as f64, + ); } + } + + *observations += 1; +} + +fn _merge_estimators( + q1: QuantileEstimatorData, + q2: QuantileEstimatorData, + mut new_estimator: MutableQuantileEstimatorData, +) { + debug_check_valid_estimator(q1.ranks, q1.values, q1.targets, *q1.observations); + debug_check_valid_estimator(q2.ranks, q2.values, q2.targets, *q2.observations); + debug_check_uninitialized_estimator( + new_estimator.ranks, + new_estimator.values, + new_estimator.targets, + ); + + assert!(new_estimator.targets.len() <= (*q1.observations + *q2.observations)); + + fn get_at(a: &QuantileEstimatorData, i: usize) -> (usize, f64) { + (a.ranks[i], a.values[i]) + } + + fn set_at(a: &mut MutableQuantileEstimatorData, i: usize, data: (usize, f64)) { + a.ranks[i] = data.0; + a.values[i] = data.1; + } + + // Initialize the min/max quantiles... + if q1.values[0] <= q2.values[0] { + set_at(&mut new_estimator, 0, get_at(&q1, 0)); + } else { + set_at(&mut new_estimator, 0, get_at(&q2, 0)); + } + + let new_est_len = new_estimator.ranks.len(); + if q1.values[q1.values.len() - 1] >= q2.values[q2.values.len() - 1] { + set_at( + &mut new_estimator, + new_est_len - 1, + get_at(&q1, q1.ranks.len() - 1), + ); + } else { + set_at( + &mut new_estimator, + new_est_len - 1, + get_at(&q2, q2.ranks.len() - 1), + ); + } + + // May eventually replace with algorithm that doesn't use extra memory... + // Calculate a "merged" quantiles by linearly iterpolating ranks based on the values we see... + let mut dual_est_quants: Vec<(f64, f64)> = Vec::with_capacity(q1.ranks.len() + q2.ranks.len()); + + let mut q1_prior: Option<(usize, f64)> = None; + let mut q2_prior: Option<(usize, f64)> = None; - // Increment ranks of markers above newly inserted sample... - for i in (bound_p + 1)..self.points.len() { - self.points[i].rank = self.points[i].rank + I::one(); + let mut q1_idx = 0; + let mut q2_idx = 0; + + loop { + let q1_past_end = q1_idx >= q1.values.len(); + let q2_past_end = q2_idx >= q2.values.len(); + + if q1_past_end && q2_past_end { + break; + } else if q1_past_end { + let next = get_at(&q2, q2_idx); + dual_est_quants.push(((next.0 + q1_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1)); + q2_prior = Some(next); + q2_idx += 1; + } else if q2_past_end { + let next = get_at(&q1, q1_idx); + dual_est_quants.push(((next.0 + q2_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1)); + q1_prior = Some(next); + q1_idx += 1; + } else if q1.values[q1_idx] <= q2.values[q2_idx] { + let other_next = get_at(&q2, q2_idx); + let next = get_at(&q1, q1_idx); + let w = q2_prior + .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1)) + .unwrap_or(0.0); + let other_rank_est = q2_prior + .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w) + .unwrap_or(0.0); + dual_est_quants.push((next.0 as f64 + other_rank_est, next.1)); + q1_prior = Some(next); + q1_idx += 1; + } else { + let other_next = get_at(&q1, q1_idx); + let next = get_at(&q2, q2_idx); + let w = q1_prior + .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1)) + .unwrap_or(0.0); + let other_rank_est = q1_prior + .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w) + .unwrap_or(0.0); + dual_est_quants.push((next.0 as f64 + other_rank_est, next.1)); + q2_prior = Some(next); + q2_idx += 1; } + } - // Adjust inner markers to within 1 of their target quantile using p2 formula... - for i in 1..(self.points.len() - 1) { - let target_rank: F = (self.points[i].target * self.observations.into()).floor(); - let true_rank: F = self.points[i].rank.into(); - let true_rank_int: usize = self.points[i].rank.into(); - if (true_rank - target_rank).abs() > F::one() { - let target_rank_int: usize = target_rank.into(); - let lower_rank: usize = self.points[i - 1].rank.into(); - let upper_rank: usize = self.points[i + 1].rank.into(); - let new_rank: usize = target_rank_int - .clamp(lower_rank.saturating_add(1), upper_rank.saturating_sub(1)); - if new_rank == true_rank_int { - continue; - } + // New number of observations is the sum of both... + *(new_estimator.observations) = *(q1.observations) + *(q2.observations); - let shift = if new_rank > target_rank_int { 1 } else { 0 }; - - self.points[i].rank = new_rank.into(); - self.points[i].value = pchip_prediction( - if i + shift > 2 { - Some(&self.points[i + shift - 2]) - } else { - None - }, - &self.points[i - shift - 1], - &self.points[i + shift], - if i + shift + 1 < self.points.len() { - Some(&self.points[i + shift + 1]) - } else { - None - }, - F::from_usize(new_rank).unwrap(), - ); - } + // Solve all inner quantiles using traditional interpolation... + let mut index_between = 0; + + for ti in 1..new_estimator.targets.len() - 1 { + // Calculate new rank... + let target = new_estimator.targets[ti]; + let approx_obs_rank = ((target * *(new_estimator.observations) as f64) as usize).clamp( + 1 + ti, + *(new_estimator.observations) - (new_estimator.targets.len() - (ti + 1)), + ); + + // Find where it lands in cdf... + while index_between < dual_est_quants.len() + && (approx_obs_rank as f64) < dual_est_quants[index_between].0 + { + index_between += 1; } - self.observations = self.observations + I::one(); + // Get pchip estimate for the value... + let indexes = [ + index_between.saturating_sub(2), + index_between.saturating_sub(1), + index_between, + index_between + .saturating_add(1) + .min(dual_est_quants.len() - 1), + ]; + + new_estimator.ranks[ti] = approx_obs_rank; + new_estimator.values[ti] = pchip_prediction( + &indexes.map(|i| dual_est_quants[i].0), + &indexes.map(|i| dual_est_quants[i].1), + approx_obs_rank as f64, + ) } +} - fn _pre_init_update(&mut self, sample: F) { - let nxt_idx: usize = self.observations.into(); - self.points[nxt_idx].value = sample; - self.observations = self.observations + I::one(); +pub trait QuantileEstimator { + fn update(&mut self, sample: f64); + fn update_all(&mut self, samples: &[f64]) { + for &s in samples.iter() { + self.update(s); + } } + fn combine(&self, other: &Self) -> Self; +} + +#[derive(Clone)] +struct FixedSizeQuantileEstimator { + values: [f64; N], + ranks: [usize; N], + targets: [f64; N], + observations: usize, +} - fn _initialize(&mut self) { - panic!("Fix!"); - // TODO: Fix this... - self.points.sort_by(|a, b| a.value.total_cmp(&b.value)); - self.points.iter_mut().enumerate().for_each(|(i, p)| { - p.rank = i.into(); - }); +impl FixedSizeQuantileEstimator { + pub fn new(targets: &[f64; N]) -> Self { + Self { + values: [0.0; N], + ranks: [0; N], + targets: targets.clone(), + observations: 0, + } } - pub fn update(&mut self, sample: F) { - let obs: usize = self.observations.into(); - match (obs + 1).cmp(&self.points.len()) { - Ordering::Less => self._pre_init_update(sample), + fn _as_data(&self) -> QuantileEstimatorData<'_> { + QuantileEstimatorData { + ranks: &self.ranks, + values: &self.values, + targets: &self.targets, + observations: &self.observations, + } + } + + fn _as_mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + MutableQuantileEstimatorData { + ranks: &mut self.ranks, + values: &mut self.values, + targets: &self.targets, + observations: &mut self.observations, + } + } + + fn _is_initialized(&self) -> bool { + self.observations >= N + } +} + +impl QuantileEstimator for FixedSizeQuantileEstimator { + fn update(&mut self, sample: f64) { + match (self.observations + 1).cmp(&self.values.len()) { + Ordering::Less => { + self.values[self.observations] = sample; + self.observations += 1; + } Ordering::Equal => { - self._pre_init_update(sample); - self._initialize(); + self.values[self.observations] = sample; + self.values.sort_by(|a, b| a.total_cmp(b)); + for i in 0..self.ranks.len() { + self.ranks[i] = i / (self.ranks.len() - 1); + } + self.observations += 1; } Ordering::Greater => { - self._standard_update(sample); + _add_sample_to_estimator(self._as_mut_data(), sample); } } } - pub fn is_initialized(&self) -> bool { - let obs: usize = self.observations.into(); - obs >= self.points.len() - } + fn combine(&self, other: &Self) -> Self { + match (self._is_initialized(), other._is_initialized()) { + (true, true) => { + let mut new_quant_est = Self::new(&self.targets); + + _merge_estimators( + self._as_data(), + other._as_data(), + new_quant_est._as_mut_data(), + ); - fn combine(&mut self, other: &P2HistogramData) { - // TODO: Need to think about how to do this efficiently while maintaining accuracy... - panic!("Not implemented!") + new_quant_est + } + (true, false) | (false, false) => { + let mut new_quant_est = self.clone(); + new_quant_est.update_all(&other.values[..other.observations]); + new_quant_est + } + (false, true) => { + let mut new_quant_est = other.clone(); + new_quant_est.update_all(&self.values[..self.observations]); + new_quant_est + } + } } } diff --git a/src/segments.rs b/src/segments.rs index 30de252..81cdb3e 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -199,6 +199,7 @@ pub struct MergeIterator< val1: MergeEntry, val2: MergeEntry, prior_val: MergeEntry, + only_unique: bool, comparator: F, } @@ -207,13 +208,14 @@ impl, F: Fn(&I::Item, &I::Item) -> Orde where I::Item: Copy, { - pub fn new(iter1: I, iter2: J, comparator: F) -> Self { + pub fn new(iter1: I, iter2: J, comparator: F, only_unique: bool) -> Self { Self { iter1: iter1.fuse(), iter2: iter2.fuse(), val1: MergeEntry::Start, val2: MergeEntry::Start, prior_val: MergeEntry::Start, + only_unique, comparator, } } @@ -269,6 +271,10 @@ where next_val = self.val2; self.val2 = self.iter2.next().into(); } + + if !self.only_unique { + break; + } } self.prior_val = next_val; @@ -295,7 +301,7 @@ pub fn unique_merging_iterator>( where I::Item: Copy + Ord, { - MergeIterator::new(list1, list2, |a, b| a.cmp(b)) + MergeIterator::new(list1, list2, |a, b| a.cmp(b), true) } #[derive(Debug)] From 958b3da138b65f04b9d10e2c9745d5317f47629f Mon Sep 17 00:00:00 2001 From: isaacr Date: Sat, 23 May 2026 01:49:18 -0600 Subject: [PATCH 28/39] More work on quantile estimation. --- src/p2estimator.rs | 138 +++++++++++++++++++++++++++++++++++++-------- src/statistics.rs | 20 +++++-- 2 files changed, 132 insertions(+), 26 deletions(-) diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 5be3edb..3a2455e 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; -use crate::segments::MergeIterator; +use crate::{segments::MergeIterator, statistics::Distribution}; use itertools::izip; // Implementation of P2 estimator. // See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" @@ -36,29 +36,39 @@ fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x: } fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 { - if dy0 * dy1 > 0.0 { + let s0 = if dx0 != 0.0 { dy0 / dx0 } else { 0.0 }; + let s1 = if dx1 != 0.0 { dy1 / dx1 } else { 0.0 }; + if s0 * s1 > 0.0 { let alpha = (1.0 / 3.0) * (1.0 + dx1 / (dx0 + dx1)); - dy0 * dy1 / (alpha * dy1 + (1.0 - alpha) * dy0) + s0 * s1 / (alpha * s1 + (1.0 - alpha) * s0) } else { 0.0 } } -fn pchip_prediction(ranks: &[f64; 4], values: &[f64; 4], x: f64) -> f64 { +fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 { let m0 = pchip_point_derivative( - ranks[1] - ranks[0], - values[1] - values[0], - ranks[2] - ranks[1], - values[2] - values[1], + x_points[1] - x_points[0], + y_points[1] - y_points[0], + x_points[2] - x_points[1], + y_points[2] - y_points[1], ); let m1 = pchip_point_derivative( - ranks[2] - ranks[1], - values[2] - values[1], - ranks[3] - ranks[2], - values[3] - values[2], + x_points[2] - x_points[1], + y_points[2] - y_points[1], + x_points[3] - x_points[2], + y_points[3] - y_points[2], ); - cubic_hermite_spline(ranks[1], values[1], ranks[2], values[2], m0, m1, x) + cubic_hermite_spline( + x_points[1], + y_points[1], + x_points[2], + x_points[2], + m0, + m1, + x, + ) } fn debug_check_valid_estimator( @@ -72,7 +82,7 @@ fn debug_check_valid_estimator( debug_assert!(values.is_sorted() && ranks.is_sorted() && targets.is_sorted()); debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0)); debug_assert!(observations >= ranks.len()); - debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&observations)); + debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&(observations - 1))); } fn debug_check_uninitialized_estimator(ranks: &[usize], values: &[f64], targets: &[f64]) { @@ -91,7 +101,7 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { } = data; debug_check_valid_estimator(ranks, values, targets, *observations); // Find where sample falls within distribution... - let p = values.partition_point(|&v| v <= sample); + let p = values.partition_point(|&v| v < sample); let bound_p = p.min(values.len() - 1); // Update extremes... @@ -102,12 +112,13 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { } // Increment ranks of markers above newly inserted sample... - for i in (bound_p + 1)..ranks.len() { + for i in bound_p.max(1)..ranks.len() { ranks[i] = ranks[1] + 1; } // Adjust inner markers to within 1 of their target quantile using p2 formula... for i in 1..(values.len() - 1) { + // Observations hasn't been incremented yet, don't need to subtract 1... let target_rank = (targets[i] * (*observations) as f64) as usize; let true_rank = ranks[i]; if true_rank.abs_diff(target_rank) > 1 { @@ -240,6 +251,7 @@ fn _merge_estimators( // New number of observations is the sum of both... *(new_estimator.observations) = *(q1.observations) + *(q2.observations); + let rank_range = *(new_estimator.observations) - 1; // Solve all inner quantiles using traditional interpolation... let mut index_between = 0; @@ -247,10 +259,8 @@ fn _merge_estimators( for ti in 1..new_estimator.targets.len() - 1 { // Calculate new rank... let target = new_estimator.targets[ti]; - let approx_obs_rank = ((target * *(new_estimator.observations) as f64) as usize).clamp( - 1 + ti, - *(new_estimator.observations) - (new_estimator.targets.len() - (ti + 1)), - ); + let approx_obs_rank = ((target * rank_range as f64) as usize) + .clamp(ti, rank_range - (new_estimator.targets.len() - (ti + 1))); // Find where it lands in cdf... while index_between < dual_est_quants.len() @@ -278,7 +288,7 @@ fn _merge_estimators( } } -pub trait QuantileEstimator { +pub trait QuantileEstimator: Distribution { fn update(&mut self, sample: f64); fn update_all(&mut self, samples: &[f64]) { for &s in samples.iter() { @@ -329,6 +339,90 @@ impl FixedSizeQuantileEstimator { } } +impl Distribution for FixedSizeQuantileEstimator { + fn cdf(&self, x: f64) -> f64 { + let upper_p = self.values.partition_point(|&v| v < x); + if upper_p > self.values.len() { + 1.0 + } else if upper_p == 0 { + 0.0 + } else { + let indexes = [ + upper_p.saturating_sub(2), + upper_p.saturating_sub(1), + upper_p, + upper_p.saturating_add(1).min(self.values.len()), + ]; + + pchip_prediction( + &indexes.map(|i| self.values[i]), + &indexes.map(|i| self.ranks[i] as f64), + x, + ) / (self.observations - 1) as f64 + } + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + (-self.cdf(x)).ln_1p() + } + + fn ppf(&self, p: f64) -> f64 { + let est_rank = p.clamp(0.0, 1.0) * (self.observations - 1) as f64; + let upper_p = self.ranks.partition_point(|&r| (r as f64) < est_rank); + if upper_p > self.values.len() { + self.values[self.values.len() - 1] + } else if upper_p == 0 { + self.values[0] + } else { + let indexes = [ + upper_p.saturating_sub(2), + upper_p.saturating_sub(1), + upper_p, + upper_p.saturating_add(1).min(self.values.len()), + ]; + + pchip_prediction( + &indexes.map(|i| self.ranks[i] as f64), + &indexes.map(|i| self.values[i]), + est_rank, + ) + } + } + + fn pdf(&self, x: f64) -> f64 { + // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)... + // May be worth splitting out into different class to allow pre-processing this stuff... + panic!("TODO!"); + let upper_p = self.values.partition_point(|&v| v < x); + if upper_p > self.values.len() { + 0.0 + } else if upper_p == 0 { + 0.0 + } else { + 0.0 + } + } + + fn logpdf(&self, x: f64) -> f64 { + self.pdf(x).ln() + } + + fn support(&self) -> (f64, f64) { + ( + *self.values.first().unwrap_or(&f64::NEG_INFINITY), + *self.values.last().unwrap_or(&f64::INFINITY), + ) + } +} + impl QuantileEstimator for FixedSizeQuantileEstimator { fn update(&mut self, sample: f64) { match (self.observations + 1).cmp(&self.values.len()) { @@ -340,7 +434,7 @@ impl QuantileEstimator for FixedSizeQuantileEstimator { self.values[self.observations] = sample; self.values.sort_by(|a, b| a.total_cmp(b)); for i in 0..self.ranks.len() { - self.ranks[i] = i / (self.ranks.len() - 1); + self.ranks[i] = i; } self.observations += 1; } diff --git a/src/statistics.rs b/src/statistics.rs index 7670c93..6fc6e40 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -11,7 +11,7 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 { // TODO: Support for generic floating types... #[allow(dead_code)] -pub trait Distribution: Clone + Debug + Default { +pub trait Distribution: Clone { fn pdf(&self, x: f64) -> f64; fn cdf(&self, x: f64) -> f64; fn ppf(&self, p: f64) -> f64; @@ -20,7 +20,9 @@ pub trait Distribution: Clone + Debug + Default { fn logpdf(&self, x: f64) -> f64; fn logcdf(&self, x: f64) -> f64; fn logccdf(&self, x: f64) -> f64; +} +pub trait ParameterizedDistribution: Distribution + Debug + Default { fn unit() -> Self { Self::default() } @@ -31,6 +33,8 @@ pub struct Exponential { lambda: f64, } +impl ParameterizedDistribution for Exponential {} + impl Exponential { pub fn new(lambda: f64) -> Self { Self { lambda } @@ -87,6 +91,8 @@ pub struct ExponentialEstimator { degrees_of_freedom: usize, } +impl ParameterizedDistribution for ExponentialEstimator {} + impl ExponentialEstimator { pub fn new(sample_mean: f64, sample_size: usize) -> Self { Self { @@ -157,6 +163,8 @@ pub struct HalfT { degrees_of_freedom: usize, } +impl ParameterizedDistribution for HalfT {} + impl HalfT { #[allow(dead_code)] pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self { @@ -241,6 +249,8 @@ pub struct Frechet { minimum: f64, } +impl ParameterizedDistribution for Frechet {} + impl Frechet { pub fn new(alpha: f64, scale: f64, minimum: f64) -> Self { Self { @@ -335,6 +345,8 @@ pub struct Laplace { scale: f64, } +impl ParameterizedDistribution for Laplace {} + impl Laplace { pub fn new(mean: f64, scale: f64) -> Self { Self { mean, scale } @@ -414,7 +426,7 @@ mod test { fn tlogccdf(&self, x: f64) -> f64; } - impl TestDistribution for T { + impl TestDistribution for T { fn tpdf(&self, x: f64) -> f64 { self.pdf(x) } @@ -441,11 +453,11 @@ mod test { } } - fn as_box(d: T) -> Box { + fn as_box(d: T) -> Box { Box::new(d) } - use super::{Distribution, Exponential}; + use super::{Exponential, ParameterizedDistribution}; fn get_dists() -> [Box; 5] { [ From 44fde429d13a23f36c41ff955b3dbb315581b955 Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 26 May 2026 18:10:14 -0600 Subject: [PATCH 29/39] Quantile est working, need to fix merging. --- Cargo.toml | 3 + src/p2estimator.rs | 509 +++++++++++++++++++++++++++++++-------------- src/statistics.rs | 14 +- 3 files changed, 367 insertions(+), 159 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c5b43b6..1a636d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,3 +29,6 @@ opt-level = 3 lto = "thin" codegen-units = 1 debug = false + +[dev-dependencies] +rand = "0.10.1" diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 3a2455e..7e3b702 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -1,7 +1,8 @@ use std::cmp::Ordering; use crate::{segments::MergeIterator, statistics::Distribution}; -use itertools::izip; +use itertools::{izip, Itertools}; +use rayon::iter::Interleave; // Implementation of P2 estimator. // See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" // at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf @@ -28,8 +29,8 @@ fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x: let ms1 = (x1 - x0) * m1; let h0 = (2.0 * t - 3.0) * t * t + 1.0; - let h1 = ((t - 1.0) * t + 1.0) * t; - let h2 = (2.0 * t + 3.0) * t * t; + let h1 = ((t - 2.0) * t + 1.0) * t; + let h2 = (-2.0 * t + 3.0) * t * t; let h3 = (t - 1.0) * t * t; h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1 @@ -47,6 +48,8 @@ fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 { } fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 { + println!("{x_points:?}, {y_points:?}, {x}"); + debug_assert!(x_points.is_sorted() && x <= x_points[2] && x >= x_points[1]); let m0 = pchip_point_derivative( x_points[1] - x_points[0], y_points[1] - y_points[0], @@ -64,7 +67,7 @@ fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 { x_points[1], y_points[1], x_points[2], - x_points[2], + y_points[2], m0, m1, x, @@ -113,24 +116,26 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { // Increment ranks of markers above newly inserted sample... for i in bound_p.max(1)..ranks.len() { - ranks[i] = ranks[1] + 1; + ranks[i] = ranks[i] + 1; } // Adjust inner markers to within 1 of their target quantile using p2 formula... for i in 1..(values.len() - 1) { // Observations hasn't been incremented yet, don't need to subtract 1... let target_rank = (targets[i] * (*observations) as f64) as usize; - let true_rank = ranks[i]; - if true_rank.abs_diff(target_rank) > 1 { + let current_rank = ranks[i]; + if current_rank.abs_diff(target_rank) > 1 { + //println!("{:?}, {}, {}", ranks, i, ranks[i]); let new_rank: usize = target_rank.clamp( ranks[i - 1].saturating_add(1), ranks[i + 1].saturating_sub(1), ); - if new_rank == true_rank { + if new_rank == current_rank { continue; } - let idx_shift = if new_rank > target_rank { 1 } else { 0 }; + let idx_shift = if new_rank > current_rank { 1 } else { 0 }; + let indexes = [ (i + idx_shift).saturating_sub(2), (i + idx_shift).saturating_sub(1), @@ -138,12 +143,12 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { (i + idx_shift).saturating_add(1).min(ranks.len() - 1), ]; - ranks[i] = new_rank; values[i] = pchip_prediction( &indexes.map(|i| ranks[i] as f64), &indexes.map(|i| values[i]), new_rank as f64, ); + ranks[i] = new_rank; } } @@ -169,33 +174,6 @@ fn _merge_estimators( (a.ranks[i], a.values[i]) } - fn set_at(a: &mut MutableQuantileEstimatorData, i: usize, data: (usize, f64)) { - a.ranks[i] = data.0; - a.values[i] = data.1; - } - - // Initialize the min/max quantiles... - if q1.values[0] <= q2.values[0] { - set_at(&mut new_estimator, 0, get_at(&q1, 0)); - } else { - set_at(&mut new_estimator, 0, get_at(&q2, 0)); - } - - let new_est_len = new_estimator.ranks.len(); - if q1.values[q1.values.len() - 1] >= q2.values[q2.values.len() - 1] { - set_at( - &mut new_estimator, - new_est_len - 1, - get_at(&q1, q1.ranks.len() - 1), - ); - } else { - set_at( - &mut new_estimator, - new_est_len - 1, - get_at(&q2, q2.ranks.len() - 1), - ); - } - // May eventually replace with algorithm that doesn't use extra memory... // Calculate a "merged" quantiles by linearly iterpolating ranks based on the values we see... let mut dual_est_quants: Vec<(f64, f64)> = Vec::with_capacity(q1.ranks.len() + q2.ranks.len()); @@ -214,12 +192,18 @@ fn _merge_estimators( break; } else if q1_past_end { let next = get_at(&q2, q2_idx); - dual_est_quants.push(((next.0 + q1_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1)); + dual_est_quants.push(( + (next.0 + q1_prior.map(|v| v.0 + 1).unwrap_or(0)) as f64, + next.1, + )); q2_prior = Some(next); q2_idx += 1; } else if q2_past_end { let next = get_at(&q1, q1_idx); - dual_est_quants.push(((next.0 + q2_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1)); + dual_est_quants.push(( + (next.0 + q2_prior.map(|v| v.0 + 1).unwrap_or(0)) as f64, + next.1, + )); q1_prior = Some(next); q1_idx += 1; } else if q1.values[q1_idx] <= q2.values[q2_idx] { @@ -252,10 +236,13 @@ fn _merge_estimators( // New number of observations is the sum of both... *(new_estimator.observations) = *(q1.observations) + *(q2.observations); let rank_range = *(new_estimator.observations) - 1; + println!("{rank_range}"); // Solve all inner quantiles using traditional interpolation... let mut index_between = 0; + println!("{dual_est_quants:?}"); + for ti in 1..new_estimator.targets.len() - 1 { // Calculate new rank... let target = new_estimator.targets[ti]; @@ -279,11 +266,68 @@ fn _merge_estimators( .min(dual_est_quants.len() - 1), ]; - new_estimator.ranks[ti] = approx_obs_rank; new_estimator.values[ti] = pchip_prediction( &indexes.map(|i| dual_est_quants[i].0), &indexes.map(|i| dual_est_quants[i].1), approx_obs_rank as f64, + ); + new_estimator.ranks[ti] = approx_obs_rank; + } +} + +trait PrimativeCast { + fn as_(&self) -> T; +} + +impl PrimativeCast for f64 { + #[inline] + fn as_(&self) -> f64 { + *self + } +} + +impl PrimativeCast for usize { + #[inline] + fn as_(&self) -> f64 { + *self as f64 + } +} + +fn _interpolated_value_prediction< + I: PartialOrd + PrimativeCast + Copy, + O: PartialOrd + PrimativeCast + Copy, +>( + xs: &[I], + ys: &[O], + x: f64, + lower_val: f64, + upper_val: f64, + not_enough_data_value: f64, +) -> f64 { + debug_assert!(xs.is_sorted()); + debug_assert!(xs.len() == ys.len()); + + if xs.len() < 1 { + return not_enough_data_value; + } + + let idx = xs.partition_point(|&v| v.as_() < x); + if idx > xs.len() { + upper_val + } else if idx == 0 { + lower_val + } else { + let indexes = [ + idx.saturating_sub(2), + idx.saturating_sub(1), + idx, + idx.saturating_add(1).min(xs.len() - 1), + ]; + + pchip_prediction( + &indexes.map(|i| xs[i].as_()), + &indexes.map(|i| ys[i].as_()), + x, ) } } @@ -298,7 +342,171 @@ pub trait QuantileEstimator: Distribution { fn combine(&self, other: &Self) -> Self; } -#[derive(Clone)] +trait SimpleQuantileEstimatorRepresentation: Clone { + fn new_like(other: &Self) -> Self; + fn _data(&self) -> QuantileEstimatorData; + fn _mut_data(&mut self) -> MutableQuantileEstimatorData; + fn _is_initialized(&self) -> bool { + let data = self._data(); + *data.observations >= data.ranks.len() + } +} + +impl QuantileEstimator for Q { + fn update(&mut self, sample: f64) { + let data = self._mut_data(); + + match (*data.observations + 1).cmp(&data.values.len()) { + Ordering::Less => { + data.values[*data.observations] = sample; + *data.observations += 1; + } + Ordering::Equal => { + data.values[*data.observations] = sample; + data.values.sort_by(|a, b| a.total_cmp(b)); + for i in 0..data.ranks.len() { + data.ranks[i] = i; + } + *data.observations += 1; + } + Ordering::Greater => { + _add_sample_to_estimator(data, sample); + } + } + } + + fn combine(&self, other: &Self) -> Self { + match (self._is_initialized(), other._is_initialized()) { + (true, true) => { + let mut new_quant_est = Self::new_like(&self); + + _merge_estimators(self._data(), other._data(), new_quant_est._mut_data()); + + new_quant_est + } + (true, false) | (false, false) => { + let other_data = other._data(); + let mut new_quant_est = self.clone(); + new_quant_est.update_all(&other_data.values[..*other_data.observations]); + new_quant_est + } + (false, true) => { + let self_data = self._data(); + let mut new_quant_est = other.clone(); + new_quant_est.update_all(&self_data.values[..*self_data.observations]); + new_quant_est + } + } + } +} + +impl Distribution for Q { + fn cdf(&self, x: f64) -> f64 { + let data = self._data(); + if self._is_initialized() { + _interpolated_value_prediction( + data.values, + data.ranks, + x, + 0.0, + (*data.observations - 1) as f64, + 0.0, + ) / (*data.observations - 1).max(1) as f64 + } else { + let xs_sorted = data.values[..*data.observations] + .iter() + .copied() + .sorted_by(|a, b| a.total_cmp(b)) + .collect_vec(); + let ys = (0..*data.observations).collect_vec(); + _interpolated_value_prediction( + &xs_sorted, + &ys, + x, + 0.0, + (*data.observations - 1) as f64, + 0.0, + ) + } + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + (-self.cdf(x)).ln_1p() + } + + fn ppf(&self, p: f64) -> f64 { + let data = self._data(); + let est_rank = p.clamp(0.0, 1.0) * (*data.observations - 1) as f64; + + let data = self._data(); + let (min_val, max_val) = self.support(); + + if self._is_initialized() { + _interpolated_value_prediction( + data.ranks, + data.values, + est_rank, + min_val, + max_val, + 0.0_f64.clamp(min_val, max_val), + ) + } else { + let ys_sorted = data.values[..*data.observations] + .iter() + .copied() + .sorted_by(|a, b| a.total_cmp(b)) + .collect_vec(); + let xs = (0..*data.observations).collect_vec(); + _interpolated_value_prediction( + &xs, + &ys_sorted, + est_rank, + min_val, + max_val, + 0.0_f64.clamp(min_val, max_val), + ) + } + } + + fn pdf(&self, _x: f64) -> f64 { + // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)... + // May be worth splitting out into different class to allow pre-processing this stuff... + // TODO: Would prefer quintic splines for this... Allows us to avoid normalization... + panic!("Currently not supported!"); + } + + fn logpdf(&self, x: f64) -> f64 { + self.pdf(x).ln() + } + + fn support(&self) -> (f64, f64) { + let data = self._data(); + + if self._is_initialized() { + ( + *data.values.first().unwrap_or(&f64::NEG_INFINITY), + *data.values.last().unwrap_or(&f64::INFINITY), + ) + } else { + data.values[..*data.observations] + .iter() + .copied() + .minmax() + .into_option() + .unwrap_or((f64::NEG_INFINITY, f64::INFINITY)) + } + } +} + +#[derive(Clone, Debug)] struct FixedSizeQuantileEstimator { values: [f64; N], ranks: [usize; N], @@ -308,6 +516,9 @@ struct FixedSizeQuantileEstimator { impl FixedSizeQuantileEstimator { pub fn new(targets: &[f64; N]) -> Self { + assert!( + targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) + ); Self { values: [0.0; N], ranks: [0; N], @@ -315,8 +526,14 @@ impl FixedSizeQuantileEstimator { observations: 0, } } +} + +impl SimpleQuantileEstimatorRepresentation for FixedSizeQuantileEstimator { + fn new_like(other: &Self) -> Self { + Self::new(&other.targets) + } - fn _as_data(&self) -> QuantileEstimatorData<'_> { + fn _data(&self) -> QuantileEstimatorData<'_> { QuantileEstimatorData { ranks: &self.ranks, values: &self.values, @@ -325,7 +542,7 @@ impl FixedSizeQuantileEstimator { } } - fn _as_mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { MutableQuantileEstimatorData { ranks: &mut self.ranks, values: &mut self.values, @@ -333,140 +550,128 @@ impl FixedSizeQuantileEstimator { observations: &mut self.observations, } } - - fn _is_initialized(&self) -> bool { - self.observations >= N - } } -impl Distribution for FixedSizeQuantileEstimator { - fn cdf(&self, x: f64) -> f64 { - let upper_p = self.values.partition_point(|&v| v < x); - if upper_p > self.values.len() { - 1.0 - } else if upper_p == 0 { - 0.0 - } else { - let indexes = [ - upper_p.saturating_sub(2), - upper_p.saturating_sub(1), - upper_p, - upper_p.saturating_add(1).min(self.values.len()), - ]; +#[derive(Clone, Debug)] +struct VectorQuantileEstimator { + values: Vec, + ranks: Vec, + targets: Vec, + observations: usize, +} - pchip_prediction( - &indexes.map(|i| self.values[i]), - &indexes.map(|i| self.ranks[i] as f64), - x, - ) / (self.observations - 1) as f64 +impl VectorQuantileEstimator { + fn new(targets: &[f64]) -> Self { + assert!( + targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) + ); + Self { + values: vec![0.0; targets.len()], + ranks: (0..targets.len()).collect_vec(), + targets: Vec::from(targets), + observations: 0, } } +} - fn logcdf(&self, x: f64) -> f64 { - self.cdf(x).ln() - } - - fn ccdf(&self, x: f64) -> f64 { - 1.0 - self.cdf(x) +impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator { + fn new_like(other: &Self) -> Self { + Self::new(&other.targets) } - fn logccdf(&self, x: f64) -> f64 { - (-self.cdf(x)).ln_1p() + fn _data(&self) -> QuantileEstimatorData<'_> { + QuantileEstimatorData { + ranks: &self.ranks, + values: &self.values, + targets: &self.targets, + observations: &self.observations, + } } - fn ppf(&self, p: f64) -> f64 { - let est_rank = p.clamp(0.0, 1.0) * (self.observations - 1) as f64; - let upper_p = self.ranks.partition_point(|&r| (r as f64) < est_rank); - if upper_p > self.values.len() { - self.values[self.values.len() - 1] - } else if upper_p == 0 { - self.values[0] - } else { - let indexes = [ - upper_p.saturating_sub(2), - upper_p.saturating_sub(1), - upper_p, - upper_p.saturating_add(1).min(self.values.len()), - ]; - - pchip_prediction( - &indexes.map(|i| self.ranks[i] as f64), - &indexes.map(|i| self.values[i]), - est_rank, - ) + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + MutableQuantileEstimatorData { + ranks: &mut self.ranks, + values: &mut self.values, + targets: &self.targets, + observations: &mut self.observations, } } +} - fn pdf(&self, x: f64) -> f64 { - // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)... - // May be worth splitting out into different class to allow pre-processing this stuff... - panic!("TODO!"); - let upper_p = self.values.partition_point(|&v| v < x); - if upper_p > self.values.len() { - 0.0 - } else if upper_p == 0 { - 0.0 +#[cfg(test)] +mod test { + use crate::{ + p2estimator::{FixedSizeQuantileEstimator, QuantileEstimator, VectorQuantileEstimator}, + statistics::{linspace, Distribution, Exponential}, + }; + use itertools::Itertools; + use rand::{rngs::Xoshiro256PlusPlus, RngExt, SeedableRng}; + + fn is_close(a: f64, b: f64) -> bool { + let rel_tol = 1e-9; + let abs_tol = 0.0; + if a == b { + true } else { - 0.0 + (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol) } } - fn logpdf(&self, x: f64) -> f64 { - self.pdf(x).ln() - } + #[test] + fn quantiles_on_exponential_dist() { + let expon = Exponential::new(1.0); + let mut estimator = + FixedSizeQuantileEstimator::new(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]); - fn support(&self) -> (f64, f64) { - ( - *self.values.first().unwrap_or(&f64::NEG_INFINITY), - *self.values.last().unwrap_or(&f64::INFINITY), - ) - } -} + let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321); -impl QuantileEstimator for FixedSizeQuantileEstimator { - fn update(&mut self, sample: f64) { - match (self.observations + 1).cmp(&self.values.len()) { - Ordering::Less => { - self.values[self.observations] = sample; - self.observations += 1; - } - Ordering::Equal => { - self.values[self.observations] = sample; - self.values.sort_by(|a, b| a.total_cmp(b)); - for i in 0..self.ranks.len() { - self.ranks[i] = i; - } - self.observations += 1; - } - Ordering::Greater => { - _add_sample_to_estimator(self._as_mut_data(), sample); - } + for _ in 0..10_000 { + let sample = expon.ppf(rng.random()); + + estimator.update(sample); + } + + for val in linspace(0.0, 0.90, 90) { + assert!((estimator.ppf(val) - expon.ppf(val)).abs() <= 0.04); + let dist_val = expon.ppf(val); + assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.2); + + // Basic probability distribution checks... + assert!(is_close( + estimator.ccdf(dist_val), + 1.0 - estimator.cdf(dist_val) + )); + + assert!(is_close( + estimator.logcdf(dist_val), + estimator.cdf(dist_val).ln() + )); + assert!(is_close( + estimator.logccdf(dist_val), + estimator.ccdf(dist_val).ln() + )); + + assert!((estimator.cdf(estimator.ppf(val)) - val).abs() <= 0.06); } } - fn combine(&self, other: &Self) -> Self { - match (self._is_initialized(), other._is_initialized()) { - (true, true) => { - let mut new_quant_est = Self::new(&self.targets); + #[test] + fn test_quantile_merging() { + let expon = Exponential::new(1.0); + let mut merged_estimator = + VectorQuantileEstimator::new(&linspace(0.0, 1.0, 10).collect_vec()); - _merge_estimators( - self._as_data(), - other._as_data(), - new_quant_est._as_mut_data(), - ); + let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321); - new_quant_est - } - (true, false) | (false, false) => { - let mut new_quant_est = self.clone(); - new_quant_est.update_all(&other.values[..other.observations]); - new_quant_est - } - (false, true) => { - let mut new_quant_est = other.clone(); - new_quant_est.update_all(&self.values[..self.observations]); - new_quant_est + for _ in 0..100 { + let targets: Vec = linspace(0.0, 1.0, rng.random_range(5..15)).collect(); + let mut estimator = VectorQuantileEstimator::new(&targets); + + for _ in 0..100 { + estimator.update(expon.ppf(rng.random())); } + + merged_estimator = merged_estimator.combine(&estimator); } } } diff --git a/src/statistics.rs b/src/statistics.rs index 6fc6e40..909bb21 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -410,9 +410,15 @@ impl Distribution for Laplace { } } +pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { + (0..steps) + .map(move |n| n as f64 / (steps as f64 - 1.0)) + .map(move |n| start * (1.0 - n) + stop * n) +} + #[cfg(test)] mod test { - use crate::statistics::{ExponentialEstimator, Frechet, HalfT, Laplace}; + use crate::statistics::{linspace, ExponentialEstimator, Frechet, HalfT, Laplace}; use std::fmt::Debug; pub trait TestDistribution: Debug { @@ -475,12 +481,6 @@ mod test { (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol) } - fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { - (0..steps) - .map(move |n| n as f64 / (steps as f64 - 1.0)) - .map(move |n| start * (1.0 - n) + stop * n) - } - #[test] fn basic_distribution_propery_checks() { for dist in get_dists() { From 117fcfc31ed06eb296c85201b42e28aaba716679 Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 27 May 2026 01:29:24 -0600 Subject: [PATCH 30/39] Final touches on quantile estimator... --- src/p2estimator.rs | 78 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 7e3b702..aa19e29 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -1,8 +1,7 @@ use std::cmp::Ordering; -use crate::{segments::MergeIterator, statistics::Distribution}; -use itertools::{izip, Itertools}; -use rayon::iter::Interleave; +use crate::statistics::Distribution; +use itertools::Itertools; // Implementation of P2 estimator. // See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" // at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf @@ -48,7 +47,6 @@ fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 { } fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 { - println!("{x_points:?}, {y_points:?}, {x}"); debug_assert!(x_points.is_sorted() && x <= x_points[2] && x >= x_points[1]); let m0 = pchip_point_derivative( x_points[1] - x_points[0], @@ -158,7 +156,7 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { fn _merge_estimators( q1: QuantileEstimatorData, q2: QuantileEstimatorData, - mut new_estimator: MutableQuantileEstimatorData, + new_estimator: MutableQuantileEstimatorData, ) { debug_check_valid_estimator(q1.ranks, q1.values, q1.targets, *q1.observations); debug_check_valid_estimator(q2.ranks, q2.values, q2.targets, *q2.observations); @@ -213,7 +211,9 @@ fn _merge_estimators( .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1)) .unwrap_or(0.0); let other_rank_est = q2_prior - .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w) + .map(|other_prior| { + (other_prior.0 + 1) as f64 * (1.0 - w) + (other_next.0 + 1) as f64 * w + }) .unwrap_or(0.0); dual_est_quants.push((next.0 as f64 + other_rank_est, next.1)); q1_prior = Some(next); @@ -225,7 +225,9 @@ fn _merge_estimators( .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1)) .unwrap_or(0.0); let other_rank_est = q1_prior - .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w) + .map(|other_prior| { + (other_prior.0 + 1) as f64 * (1.0 - w) + (other_next.0 + 1) as f64 * w + }) .unwrap_or(0.0); dual_est_quants.push((next.0 as f64 + other_rank_est, next.1)); q2_prior = Some(next); @@ -236,12 +238,20 @@ fn _merge_estimators( // New number of observations is the sum of both... *(new_estimator.observations) = *(q1.observations) + *(q2.observations); let rank_range = *(new_estimator.observations) - 1; - println!("{rank_range}"); // Solve all inner quantiles using traditional interpolation... let mut index_between = 0; - println!("{dual_est_quants:?}"); + new_estimator.ranks.first_mut().map(|r| *r = 0); + new_estimator.ranks.last_mut().map(|r| *r = rank_range); + new_estimator + .values + .first_mut() + .map(|v| *v = dual_est_quants[0].1); + new_estimator + .values + .last_mut() + .map(|v| *v = dual_est_quants[dual_est_quants.len() - 1].1); for ti in 1..new_estimator.targets.len() - 1 { // Calculate new rank... @@ -251,7 +261,7 @@ fn _merge_estimators( // Find where it lands in cdf... while index_between < dual_est_quants.len() - && (approx_obs_rank as f64) < dual_est_quants[index_between].0 + && (approx_obs_rank as f64) > dual_est_quants[index_between].0 { index_between += 1; } @@ -260,7 +270,7 @@ fn _merge_estimators( let indexes = [ index_between.saturating_sub(2), index_between.saturating_sub(1), - index_between, + index_between.min(dual_est_quants.len() - 1), index_between .saturating_add(1) .min(dual_est_quants.len() - 1), @@ -333,6 +343,7 @@ fn _interpolated_value_prediction< } pub trait QuantileEstimator: Distribution { + fn from_prior(prior: &Self, count: usize) -> Self; fn update(&mut self, sample: f64); fn update_all(&mut self, samples: &[f64]) { for &s in samples.iter() { @@ -340,12 +351,13 @@ pub trait QuantileEstimator: Distribution { } } fn combine(&self, other: &Self) -> Self; + fn samples(&self) -> usize; } trait SimpleQuantileEstimatorRepresentation: Clone { fn new_like(other: &Self) -> Self; - fn _data(&self) -> QuantileEstimatorData; - fn _mut_data(&mut self) -> MutableQuantileEstimatorData; + fn _data(&self) -> QuantileEstimatorData<'_>; + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>; fn _is_initialized(&self) -> bool { let data = self._data(); *data.observations >= data.ranks.len() @@ -353,6 +365,32 @@ trait SimpleQuantileEstimatorRepresentation: Clone { } impl QuantileEstimator for Q { + fn samples(&self) -> usize { + *self._data().observations + } + + fn from_prior(prior: &Self, count_per_entry: usize) -> Self { + let prior_data = prior._data(); + let mut new_self = Self::new_like(prior); + let new_data = new_self._mut_data(); + + let new_observations = count_per_entry * prior_data.ranks.len(); + + for i in 0..new_data.targets.len() { + let closest_rank = ((new_data.targets[i] * (new_observations - 1) as f64) as usize) + .clamp( + i, + (new_observations - 1) - (new_data.targets.len() - (i + 1)), + ); + + new_data.ranks[i] = closest_rank; + new_data.values[i] = prior.ppf(closest_rank as f64 / (new_observations - 1) as f64) + } + *new_data.observations = new_observations; + + new_self + } + fn update(&mut self, sample: f64) { let data = self._mut_data(); @@ -631,10 +669,12 @@ mod test { estimator.update(sample); } + assert!(estimator.samples() == 10_000); + for val in linspace(0.0, 0.90, 90) { assert!((estimator.ppf(val) - expon.ppf(val)).abs() <= 0.04); let dist_val = expon.ppf(val); - assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.2); + assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.04); // Basic probability distribution checks... assert!(is_close( @@ -650,8 +690,6 @@ mod test { estimator.logccdf(dist_val), estimator.ccdf(dist_val).ln() )); - - assert!((estimator.cdf(estimator.ppf(val)) - val).abs() <= 0.06); } } @@ -673,5 +711,13 @@ mod test { merged_estimator = merged_estimator.combine(&estimator); } + + assert!(merged_estimator.samples() == 10_000); + + for val in linspace(0.0, 0.75, 75) { + assert!((merged_estimator.ppf(val) - expon.ppf(val)).abs() <= 0.1); + let dist_val = expon.ppf(val); + assert!((merged_estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.04) + } } } From 1a3b1657a56d6e96452c8fdd724a0fc0ceed00b6 Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 27 May 2026 18:08:40 -0600 Subject: [PATCH 31/39] Better fretchet fitting, need to test.. --- src/join_estimation.rs | 64 ++++++++++++++++++++++++---------- src/p2estimator.rs | 78 +++++++++++++++++++++++++++++++++++++++--- src/statistics.rs | 11 ------ 3 files changed, 120 insertions(+), 33 deletions(-) diff --git a/src/join_estimation.rs b/src/join_estimation.rs index b688ed9..2d9f970 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,7 +1,12 @@ -use std::{fmt::Debug, ops}; +use std::{ + f64::{self, consts::E}, + fmt::Debug, + ops, +}; use crate::{ assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType}, + p2estimator::{custom_quantile_estimator::FrechetQuant, QuantileEstimator}, segments::Block, statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, HalfT, Laplace}, }; @@ -37,7 +42,6 @@ impl JoinEstimator for BayesianJoinEstimator { let rel_con_dist = consensus_dist as f64 / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64; - /* println!("{:#?}", self); println!( "{} {} {}", @@ -56,16 +60,16 @@ impl JoinEstimator for BayesianJoinEstimator { divergence_diff, self.divergence_join.pdf(divergence_diff), self.divergence_nojoin.pdf(divergence_diff) - );*/ + ); let join_score = self.join_prior.ln() + self.target_distance_join.logpdf(target_dist) - + self.divergence_join.logpdf(divergence_diff); - //+ self.consensus_distance_join.logpdf(rel_con_dist); + + self.divergence_join.logpdf(divergence_diff) + + self.consensus_distance_join.logpdf(rel_con_dist); let nojoin_score = (-self.join_prior).ln_1p() + self.target_distance_nojoin.logpdf(target_dist) - + self.divergence_nojoin.logpdf(divergence_diff); - //+ self.consensus_distance_nojoin.logpdf(rel_con_dist); + + self.divergence_nojoin.logpdf(divergence_diff) + + self.consensus_distance_nojoin.logpdf(rel_con_dist); let score_norm = ln_add_exp(join_score, nojoin_score); let score = join_score - score_norm; @@ -179,6 +183,36 @@ impl From for Laplace { } } +impl From<&FrechetQuant> for Frechet { + fn from(value: &FrechetQuant) -> Self { + // Technique developed in notebooks, should write down... + fn unscaled_fretchet_ppf(x: f64, a: f64) -> f64 { + (-(x.ln())).powf(-1.0 / a) + } + + // Chosen so p2 (middle quantile) is close to median... + let power_scale = 1.5; + + let p1 = 1.0 / E; // -ln(1/e)... + let p2 = (1.0 / E).powf(1.0 / power_scale); + let p3 = (1.0 / E).powf(1.0 / (power_scale * power_scale)); + + let q1 = value.ppf(p1); + let q2 = value.ppf(p2); + let q3 = value.ppf(p3); + + let relative_q = (q2 - q1) / (q3 - q1); + + // Because we carefully chose quantiles... Solution for a simplifies to below... + let a = power_scale.ln() / (1.0 / relative_q + 1.0).ln(); + let s = (q2 - q1) / (unscaled_fretchet_ppf(p2, a) - unscaled_fretchet_ppf(p1, a)); + // Rather than directly estimating m, we assume the mode of the distribution is at 0... + let m = -(a / (1.0 + a)).powf(1.0 / a); + + Frechet::new(a, s, m) + } +} + impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { fn from(statistics: &BayesianJoinStatistics) -> Self { Self { @@ -186,11 +220,7 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { target_distance_nojoin: statistics.unjoinable_target_distance.into(), divergence_join: statistics.joinable_divergence.into(), divergence_nojoin: statistics.unjoinable_divergence.into(), - consensus_distance_join: Frechet::from_log_moments( - statistics.joinable_consensus_log.mean(), - statistics.joinable_consensus_log.standard_deviation(), - -0.05, - ), + consensus_distance_join: (&statistics.joinable_consensus).into(), consensus_distance_nojoin: statistics.unjoinable_consensus.into(), // We take sqrt since we count all pairs, not just neighbors. join_prior: (statistics.joinable_target_distance.samples() as f64 @@ -208,7 +238,7 @@ pub struct BayesianJoinStatistics { unjoinable_target_distance: MomentEstimator, joinable_divergence: MomentEstimator, unjoinable_divergence: MomentEstimator, - joinable_consensus_log: MomentEstimator, + joinable_consensus: FrechetQuant, unjoinable_consensus: MomentEstimator, } @@ -231,9 +261,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { unjoinable_divergence: bayesian_prior .unjoinable_divergence .to_psuedo_count(pseudo_count), - joinable_consensus_log: bayesian_prior - .joinable_consensus_log - .to_psuedo_count(pseudo_count), + joinable_consensus: FrechetQuant::from_prior(&bayesian_prior.joinable_consensus, 1), unjoinable_consensus: bayesian_prior .unjoinable_consensus .to_psuedo_count(pseudo_count), @@ -252,7 +280,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { self.joinable_divergence += divergence_diff; if matches!(join_type, LinkType::Forward | LinkType::Reverse) { //println!("CDist: {}", rel_con_dist); - self.joinable_consensus_log += (rel_con_dist + 0.05).max(1e-50).ln(); + self.joinable_consensus.update(rel_con_dist); } } else { self.unjoinable_target_distance += target_dist as f64; @@ -271,7 +299,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { + other.unjoinable_target_distance, joinable_divergence: self.joinable_divergence + other.joinable_divergence, unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence, - joinable_consensus_log: self.joinable_consensus_log + other.joinable_consensus_log, + joinable_consensus: self.joinable_consensus.combine(&other.joinable_consensus), unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus, } } diff --git a/src/p2estimator.rs b/src/p2estimator.rs index aa19e29..2a8c89c 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -351,6 +351,7 @@ pub trait QuantileEstimator: Distribution { } } fn combine(&self, other: &Self) -> Self; + #[allow(dead_code)] fn samples(&self) -> usize; } @@ -374,7 +375,7 @@ impl QuantileEstimator for Q { let mut new_self = Self::new_like(prior); let new_data = new_self._mut_data(); - let new_observations = count_per_entry * prior_data.ranks.len(); + let new_observations = count_per_entry.max(1) * prior_data.ranks.len(); for i in 0..new_data.targets.len() { let closest_rank = ((new_data.targets[i] * (new_observations - 1) as f64) as usize) @@ -545,7 +546,7 @@ impl Distribution for Q { } #[derive(Clone, Debug)] -struct FixedSizeQuantileEstimator { +pub struct FixedSizeQuantileEstimator { values: [f64; N], ranks: [usize; N], targets: [f64; N], @@ -591,7 +592,7 @@ impl SimpleQuantileEstimatorRepresentation for FixedSizeQuantile } #[derive(Clone, Debug)] -struct VectorQuantileEstimator { +pub struct VectorQuantileEstimator { values: Vec, ranks: Vec, targets: Vec, @@ -599,7 +600,7 @@ struct VectorQuantileEstimator { } impl VectorQuantileEstimator { - fn new(targets: &[f64]) -> Self { + pub fn new(targets: &[f64]) -> Self { assert!( targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) ); @@ -636,6 +637,75 @@ impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator { } } +pub mod custom_quantile_estimator { + use super::*; + use std::f64::consts::E; + + macro_rules! replace_expr { + ($_t:tt,$sub:expr) => { + $sub + }; + } + + macro_rules! count_exprs { + ($($val:expr),+) => {<[()]>::len(&[$(replace_expr!($val,())),+])}; + } + + macro_rules! implement_fixed_quantile_estimator { + ($name:ident[$($val:expr),+]) => { + #[derive(Clone, Debug)] + pub struct $name { + values: [f64; Self::COUNT], + ranks: [usize; Self::COUNT], + observations: usize, + } + + impl $name { + const TARGETS: [f64; count_exprs!($($val),+) + 2] = [0.0, $($val),+, 1.0]; + const COUNT: usize = Self::TARGETS.len(); + + pub fn new() -> Self { + Self { + values: [0.0; _], + ranks: [0; _], + observations: 0 + } + } + } + + impl Default for $name { + fn default() -> Self { + Self::new() + } + } + + impl SimpleQuantileEstimatorRepresentation for $name { + fn new_like(_other: &Self) -> Self { + Self::default() + } + fn _data(&self) -> QuantileEstimatorData<'_> { + QuantileEstimatorData { + ranks: &self.ranks, + values: &self.values, + targets: &Self::TARGETS, + observations: &self.observations, + } + } + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + MutableQuantileEstimatorData { + ranks: &mut self.ranks, + values: &mut self.values, + targets: &Self::TARGETS, + observations: &mut self.observations, + } + } + } + }; + } + + implement_fixed_quantile_estimator!(FrechetQuant[0.5 / E, 0.25, 1.0 / E, 0.5, 0.5 + 1.0 / 2.0 * E, 0.75]); +} + #[cfg(test)] mod test { use crate::{ diff --git a/src/statistics.rs b/src/statistics.rs index 909bb21..1a8701e 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -259,17 +259,6 @@ impl Frechet { minimum, } } - - pub fn from_log_moments(log_mean: f64, log_std: f64, minimum: f64) -> Self { - let alpha = f64::consts::PI / (6.0 * log_std); - let lambda = (alpha * log_mean - f64::consts::EULER_GAMMA).exp(); - let scale = lambda.powf(1.0 / alpha); - Self { - alpha, - scale, - minimum, - } - } } impl Default for Frechet { From 2f7c9f569b21bf8a4c81a5daa0689b3f73b52030 Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 28 May 2026 00:57:21 -0600 Subject: [PATCH 32/39] Test gumbel... --- src/assembly.rs | 4 +-- src/join_estimation.rs | 32 ++++++++++++-------- src/statistics.rs | 67 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 86 insertions(+), 17 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index a58875a..4cbaf6b 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -128,13 +128,13 @@ fn get_link_cost( // Cost = linear consensus cost + linear target gap cost... min_value - + piecewise_linear_cost( + /*+ piecewise_linear_cost( -(annotation_args.free_join_consensus_overlap as f64).abs(), (annotation_args.free_join_consensus_gap as f64).abs(), alpha, beta, consensus_gap as f64, - ) + )*/ + expected_score } diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 2d9f970..3bbd204 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -8,7 +8,7 @@ use crate::{ assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType}, p2estimator::{custom_quantile_estimator::FrechetQuant, QuantileEstimator}, segments::Block, - statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, HalfT, Laplace}, + statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, Gumbel, HalfT, Laplace}, }; pub trait JoinEstimator: Clone + Default + Debug { @@ -28,7 +28,7 @@ pub struct BayesianJoinEstimator { target_distance_nojoin: ExponentialEstimator, divergence_join: HalfT, divergence_nojoin: HalfT, - consensus_distance_join: Frechet, + consensus_distance_join: Gumbel, consensus_distance_nojoin: Laplace, join_prior: f64, } @@ -64,12 +64,12 @@ impl JoinEstimator for BayesianJoinEstimator { let join_score = self.join_prior.ln() + self.target_distance_join.logpdf(target_dist) - + self.divergence_join.logpdf(divergence_diff) - + self.consensus_distance_join.logpdf(rel_con_dist); + + self.divergence_join.logpdf(divergence_diff); + //+ self.consensus_distance_join.logpdf(rel_con_dist); let nojoin_score = (-self.join_prior).ln_1p() + self.target_distance_nojoin.logpdf(target_dist) - + self.divergence_nojoin.logpdf(divergence_diff) - + self.consensus_distance_nojoin.logpdf(rel_con_dist); + + self.divergence_nojoin.logpdf(divergence_diff); + //+ self.consensus_distance_nojoin.logpdf(rel_con_dist); let score_norm = ln_add_exp(join_score, nojoin_score); let score = join_score - score_norm; @@ -191,7 +191,7 @@ impl From<&FrechetQuant> for Frechet { } // Chosen so p2 (middle quantile) is close to median... - let power_scale = 1.5; + let power_scale = 2.0; let p1 = 1.0 / E; // -ln(1/e)... let p2 = (1.0 / E).powf(1.0 / power_scale); @@ -207,7 +207,7 @@ impl From<&FrechetQuant> for Frechet { let a = power_scale.ln() / (1.0 / relative_q + 1.0).ln(); let s = (q2 - q1) / (unscaled_fretchet_ppf(p2, a) - unscaled_fretchet_ppf(p1, a)); // Rather than directly estimating m, we assume the mode of the distribution is at 0... - let m = -(a / (1.0 + a)).powf(1.0 / a); + let m = -s * (a / (1.0 + a)).powf(1.0 / a); Frechet::new(a, s, m) } @@ -220,7 +220,11 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { target_distance_nojoin: statistics.unjoinable_target_distance.into(), divergence_join: statistics.joinable_divergence.into(), divergence_nojoin: statistics.unjoinable_divergence.into(), - consensus_distance_join: (&statistics.joinable_consensus).into(), + consensus_distance_join: Gumbel::new( + 0.0, + 6.0_f64.sqrt() * statistics.joinable_consensus.standard_deviation() + / f64::consts::PI, + ), consensus_distance_nojoin: statistics.unjoinable_consensus.into(), // We take sqrt since we count all pairs, not just neighbors. join_prior: (statistics.joinable_target_distance.samples() as f64 @@ -238,7 +242,7 @@ pub struct BayesianJoinStatistics { unjoinable_target_distance: MomentEstimator, joinable_divergence: MomentEstimator, unjoinable_divergence: MomentEstimator, - joinable_consensus: FrechetQuant, + joinable_consensus: MomentEstimator, unjoinable_consensus: MomentEstimator, } @@ -261,7 +265,9 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { unjoinable_divergence: bayesian_prior .unjoinable_divergence .to_psuedo_count(pseudo_count), - joinable_consensus: FrechetQuant::from_prior(&bayesian_prior.joinable_consensus, 1), + joinable_consensus: bayesian_prior + .joinable_consensus + .to_psuedo_count(pseudo_count), unjoinable_consensus: bayesian_prior .unjoinable_consensus .to_psuedo_count(pseudo_count), @@ -280,7 +286,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { self.joinable_divergence += divergence_diff; if matches!(join_type, LinkType::Forward | LinkType::Reverse) { //println!("CDist: {}", rel_con_dist); - self.joinable_consensus.update(rel_con_dist); + self.joinable_consensus += rel_con_dist; } } else { self.unjoinable_target_distance += target_dist as f64; @@ -299,7 +305,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { + other.unjoinable_target_distance, joinable_divergence: self.joinable_divergence + other.joinable_divergence, unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence, - joinable_consensus: self.joinable_consensus.combine(&other.joinable_consensus), + joinable_consensus: self.joinable_consensus + other.joinable_consensus, unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus, } } diff --git a/src/statistics.rs b/src/statistics.rs index 1a8701e..dc0e728 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -328,6 +328,68 @@ impl Distribution for Frechet { } } +#[derive(Debug, Clone)] +pub struct Gumbel { + location: f64, + scale: f64, +} + +impl Gumbel { + pub fn new(location: f64, scale: f64) -> Self { + Self { location, scale } + } +} + +impl Default for Gumbel { + fn default() -> Self { + Self::new(0.0, 1.0) + } +} + +impl ParameterizedDistribution for Gumbel {} + +impl Distribution for Gumbel { + fn logpdf(&self, x: f64) -> f64 { + let mu = self.location; + let beta = self.scale; + let z = (x - mu) / beta; + (1.0 / beta).ln() - (z + (-z).exp()) + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + self.logcdf(x).exp() + } + + fn logcdf(&self, x: f64) -> f64 { + let mu = self.location; + let beta = self.scale; + let z = (x - mu) / beta; + -((-z).exp()) + } + + fn ppf(&self, p: f64) -> f64 { + let mu = self.location; + let beta = self.scale; + mu - beta * (-p.ln()).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + self.ccdf(x).ln() + } + + fn support(&self) -> (f64, f64) { + (f64::NEG_INFINITY, f64::INFINITY) + } +} + #[derive(Debug, Clone)] pub struct Laplace { mean: f64, @@ -407,7 +469,7 @@ pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator [Box; 5] { + fn get_dists() -> [Box; 6] { [ as_box(Exponential::unit()), as_box(ExponentialEstimator::unit()), as_box(HalfT::unit()), as_box(Frechet::unit()), as_box(Laplace::unit()), + as_box(Gumbel::unit()), ] } From 52dc778ee52ed9bf757197ef4d9de8fc74fb6f12 Mon Sep 17 00:00:00 2001 From: isaacr Date: Fri, 29 May 2026 10:01:01 -0600 Subject: [PATCH 33/39] Normalize by consensus. --- .../plot_consensus_distance_repeatmasker.py | 26 ++- src/assembly.rs | 51 +++++- src/join_estimation.rs | 165 ++++++++++++------ src/p2estimator.rs | 14 +- src/pipeline.rs | 8 +- src/segments.rs | 4 +- src/statistics.rs | 73 +++++++- 7 files changed, 268 insertions(+), 73 deletions(-) diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py index 2bd07bd..23dab81 100644 --- a/scripts/plot_consensus_distance_repeatmasker.py +++ b/scripts/plot_consensus_distance_repeatmasker.py @@ -12,16 +12,28 @@ sequences = [] -def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True): +def get_gap( + pstart, + pend, + qstart, + qend, + qremaining, + ppos, + is_pos, + check_valid: bool = True, +): + c_len = max(qremaining + qend, qremaining + qstart) try: if is_pos and ppos: if check_valid: assert pstart <= qstart <= qend and pstart <= pend <= qend gap = qstart - pend + gap /= c_len elif not is_pos and not ppos: if check_valid: assert qstart <= qend <= pend and qstart <= pstart <= pend gap = pstart - qend + gap /= c_len else: gap = None except AssertionError: @@ -61,9 +73,11 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True): if is_pos: qstart = int(tokens[11]) qend = int(tokens[12]) + qremaining = int(tokens[13].strip("()")) else: qstart = int(tokens[13]) qend = int(tokens[12]) + qremaining = int(tokens[11].strip("()")) assert qend >= qstart length += qend - qstart @@ -73,12 +87,16 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True): if pname != name: gap = None else: - gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos) + gap = get_gap(pstart, pend, qstart, qend, qremaining, ppos, is_pos) if gap is not None: if shared_name not in gap_info: gap_info[shared_name] = [] + if gap > 2: + print( + f"Gap > 2, Gap Value {gap}: {seq}\n\t {'\n\t'.join(seqs)}" + ) gap_info[shared_name].append(gap) prior = (name, is_pos, qstart, qend) @@ -106,9 +124,11 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True): if is_pos: qstart = int(tokens[11]) qend = int(tokens[12]) + qremaining = int(tokens[13].strip("()")) else: qstart = int(tokens[13]) qend = int(tokens[12]) + qremaining = int(tokens[11].strip("()")) random_prior = other_priors.get(name, None) if random_prior is not None: @@ -116,7 +136,7 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True): if pjoin_id == join_id: gap = None else: - gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos, False) + gap = get_gap(pstart, pend, qstart, qend, qremaining, ppos, is_pos, False) target_gap = tstart - p_tstart diff --git a/src/assembly.rs b/src/assembly.rs index 4cbaf6b..5f49087 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -188,6 +188,34 @@ pub fn block_length_on_query(b: &Block) -> usize { b.query_end.abs_diff(b.query_start) + 1 } +pub enum ConsensusDistanceNormalization { + Max, + Min, + Sum, + WithLength(usize), +} + +pub fn relative_consensus_distance( + first_block: &Block, + second_block: &Block, + mode: ConsensusDistanceNormalization, +) -> (f64, LinkType) { + let (dist, link_type) = block_consensus_distance(first_block, second_block); + let div = match mode { + ConsensusDistanceNormalization::Sum => { + block_length_on_query(first_block) + block_length_on_query(second_block) + } + ConsensusDistanceNormalization::Max => { + block_length_on_query(first_block).max(block_length_on_query(second_block)) + } + ConsensusDistanceNormalization::Min => { + block_length_on_query(first_block).min(block_length_on_query(second_block)) + } + ConsensusDistanceNormalization::WithLength(length) => length, + }; + (dist as f64 / div as f64, link_type) +} + fn is_joinable( target_distance: isize, consensus_distance: isize, @@ -231,6 +259,7 @@ fn new_alignment_to_blocks_map( pub fn gather_join_statistics( alignments: &[Alignment], + query_lengths: &HashMap, annotation_args: &AnnotationArgs, ) -> Vec<(usize, T)> { let mut query_ids: Vec = alignments.iter().map(|a| a.query_id).unique().collect(); @@ -256,6 +285,9 @@ pub fn gather_join_statistics( gather_join_statistics_single_family( compat_alignments, + *query_lengths + .get(&id) + .expect("Query length missing for alignment!"), annotation_args, &mut new_stats, ); @@ -268,6 +300,7 @@ pub fn gather_join_statistics( fn gather_join_statistics_single_family<'a>( compatable_alignments: impl Iterator, + consensus_length: usize, args: &AnnotationArgs, join_stats: &mut impl JoinStatisticsCollector, ) { @@ -293,7 +326,13 @@ fn gather_join_statistics_single_family<'a>( args, ); - join_stats.add(a_block, b_block, idx + 1 == idx2, joinable); + join_stats.add( + a_block, + b_block, + consensus_length, + idx + 1 == idx2, + joinable, + ); }) }) } @@ -301,6 +340,7 @@ fn gather_join_statistics_single_family<'a>( fn link_assemblies( graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>, compatable_blocks: impl Iterator, + consensus_length: usize, segments: &SegmentedMatrix, query_statistics: &QueryStatistics, _region_statistics: &RegionStatistics, @@ -333,7 +373,10 @@ fn link_assemblies( min_block_length, args, ) { - let join_prob = query_statistics.estimator.predict(a_block, b_block, false); + let join_prob = + query_statistics + .estimator + .predict(a_block, b_block, consensus_length, false); if join_prob >= args.join_likelihood_threshold { let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { @@ -370,6 +413,7 @@ pub struct SegmentAssemblyGraph { impl SegmentAssemblyGraph { pub fn new( alignments: &[Alignment], + query_lengths: &HashMap, segments: &SegmentedMatrix, region_statistics: &RegionStatistics, query_statistics: &[QueryStatistics], @@ -400,6 +444,9 @@ impl SegmentAssemblyGraph { link_assemblies( &mut link_graph, compat_blocks, + *query_lengths + .get(&id) + .expect("Unable to find query length for alignment!"), segments, &query_statistics[id], region_statistics, diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 3bbd204..827f98b 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -1,25 +1,44 @@ use std::{ - f64::{self, consts::E}, + f64::{self}, fmt::Debug, ops, }; use crate::{ - assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType}, - p2estimator::{custom_quantile_estimator::FrechetQuant, QuantileEstimator}, + assembly::{ + block_target_distance, relative_consensus_distance, ConsensusDistanceNormalization, + LinkType, + }, + p2estimator::{ + custom_quantile_estimator::{LomaxQuant, MedianEstimator}, + QuantileEstimator, + }, segments::Block, - statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, Gumbel, HalfT, Laplace}, + statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT, Laplace, Lomax}, }; pub trait JoinEstimator: Clone + Default + Debug { - fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64; + fn predict( + &self, + first_block: &Block, + second_block: &Block, + consensus_length: usize, + log_space: bool, + ) -> f64; } pub trait JoinStatisticsCollector: Clone + Debug { fn new() -> Self; fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self; fn combine(&self, other: &Self) -> Self; - fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool); + fn add( + &mut self, + first_block: &Block, + second_block: &Block, + consenus_length: usize, + neighbors: bool, + joinable: bool, + ); } #[derive(Debug, Clone, Default)] @@ -28,25 +47,44 @@ pub struct BayesianJoinEstimator { target_distance_nojoin: ExponentialEstimator, divergence_join: HalfT, divergence_nojoin: HalfT, - consensus_distance_join: Gumbel, + consensus_distance_join_pos: ExponentialEstimator, + consensus_distance_join_neg: ExponentialEstimator, + consensus_norm_pos: f64, + consensus_norm_neg: f64, consensus_distance_nojoin: Laplace, join_prior: f64, } impl JoinEstimator for BayesianJoinEstimator { - fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64 { + fn predict( + &self, + first_block: &Block, + second_block: &Block, + consensus_distance: usize, + log_space: bool, + ) -> f64 { let target_dist = block_target_distance(first_block, second_block) as f64; // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0... let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); - let (consensus_dist, _join_type) = block_consensus_distance(first_block, second_block); - let rel_con_dist = consensus_dist as f64 - / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64; + let (rel_con_dist, _join_type) = relative_consensus_distance( + first_block, + second_block, + ConsensusDistanceNormalization::WithLength(consensus_distance), + ); + + let consensus_join_dist_logpdf = |x: f64| { + if x < 0.0 { + self.consensus_distance_join_neg.logpdf(x.abs()) + self.consensus_norm_neg.ln() + } else { + self.consensus_distance_join_pos.logpdf(x.abs()) + self.consensus_norm_pos.ln() + } + }; println!("{:#?}", self); println!( "{} {} {}", rel_con_dist, - self.consensus_distance_join.pdf(rel_con_dist), + consensus_join_dist_logpdf(rel_con_dist).exp(), self.consensus_distance_nojoin.pdf(rel_con_dist) ); println!( @@ -183,48 +221,45 @@ impl From for Laplace { } } -impl From<&FrechetQuant> for Frechet { - fn from(value: &FrechetQuant) -> Self { - // Technique developed in notebooks, should write down... - fn unscaled_fretchet_ppf(x: f64, a: f64) -> f64 { - (-(x.ln())).powf(-1.0 / a) - } - - // Chosen so p2 (middle quantile) is close to median... - let power_scale = 2.0; - - let p1 = 1.0 / E; // -ln(1/e)... - let p2 = (1.0 / E).powf(1.0 / power_scale); - let p3 = (1.0 / E).powf(1.0 / (power_scale * power_scale)); +impl From<&LomaxQuant> for Lomax { + fn from(value: &LomaxQuant) -> Self { + // Using quantile selection trick originally developed for frechet... (quant ratio formula) + // Choose first quantile p, then second such that p2 = 1 - (1 - p)^2 and you get this nice closed form for alpha... + let p1 = LomaxQuant::PROB1; + let p2 = LomaxQuant::PROB2; - let q1 = value.ppf(p1); - let q2 = value.ppf(p2); - let q3 = value.ppf(p3); + let v1 = value.ppf(p1); + let v2 = value.ppf(p2); - let relative_q = (q2 - q1) / (q3 - q1); - - // Because we carefully chose quantiles... Solution for a simplifies to below... - let a = power_scale.ln() / (1.0 / relative_q + 1.0).ln(); - let s = (q2 - q1) / (unscaled_fretchet_ppf(p2, a) - unscaled_fretchet_ppf(p1, a)); - // Rather than directly estimating m, we assume the mode of the distribution is at 0... - let m = -s * (a / (1.0 + a)).powf(1.0 / a); + let a = -(1.0 - p1).ln() / (v2 / v1 - 1.0).ln(); + let y = v1 / ((1.0 - p1).powf(-1.0 / a) - 1.0); + Lomax::new(a, y) + } +} - Frechet::new(a, s, m) +impl From<&MedianEstimator> for ExponentialEstimator { + fn from(value: &MedianEstimator) -> Self { + Self::new(value.ppf(0.5) / 2.0_f64.ln(), value.samples()) } } impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { fn from(statistics: &BayesianJoinStatistics) -> Self { + let cons_total = (statistics.joinable_consensus_pos.samples() + + statistics.joinable_consensus_neg.samples()) + .max(1); + let cons_pos_perc = statistics.joinable_consensus_pos.samples() as f64 / cons_total as f64; + let cons_neg_perc = statistics.joinable_consensus_neg.samples() as f64 / cons_total as f64; + Self { target_distance_join: statistics.joinable_target_distance.into(), target_distance_nojoin: statistics.unjoinable_target_distance.into(), divergence_join: statistics.joinable_divergence.into(), divergence_nojoin: statistics.unjoinable_divergence.into(), - consensus_distance_join: Gumbel::new( - 0.0, - 6.0_f64.sqrt() * statistics.joinable_consensus.standard_deviation() - / f64::consts::PI, - ), + consensus_distance_join_pos: (&statistics.joinable_consensus_pos).into(), + consensus_distance_join_neg: (&statistics.joinable_consensus_neg).into(), + consensus_norm_pos: 0.5 * cons_pos_perc, + consensus_norm_neg: 0.5 * cons_neg_perc, consensus_distance_nojoin: statistics.unjoinable_consensus.into(), // We take sqrt since we count all pairs, not just neighbors. join_prior: (statistics.joinable_target_distance.samples() as f64 @@ -242,7 +277,8 @@ pub struct BayesianJoinStatistics { unjoinable_target_distance: MomentEstimator, joinable_divergence: MomentEstimator, unjoinable_divergence: MomentEstimator, - joinable_consensus: MomentEstimator, + joinable_consensus_pos: MedianEstimator, + joinable_consensus_neg: MedianEstimator, unjoinable_consensus: MomentEstimator, } @@ -265,33 +301,51 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { unjoinable_divergence: bayesian_prior .unjoinable_divergence .to_psuedo_count(pseudo_count), - joinable_consensus: bayesian_prior - .joinable_consensus - .to_psuedo_count(pseudo_count), + joinable_consensus_pos: MedianEstimator::from_prior( + &bayesian_prior.joinable_consensus_pos, + pseudo_count, + ), + joinable_consensus_neg: MedianEstimator::from_prior( + &bayesian_prior.joinable_consensus_neg, + pseudo_count, + ), unjoinable_consensus: bayesian_prior .unjoinable_consensus .to_psuedo_count(pseudo_count), } } - fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) { + fn add( + &mut self, + first_block: &Block, + second_block: &Block, + consensus_length: usize, + neighbors: bool, + joinable: bool, + ) { let target_dist = block_target_distance(first_block, second_block).abs() as usize; let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); - let (consensus_dist, join_type) = block_consensus_distance(first_block, second_block); - let rel_con_dist = consensus_dist as f64 - / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64; + let (rel_con_dist, join_type) = relative_consensus_distance( + first_block, + second_block, + ConsensusDistanceNormalization::WithLength(consensus_length), + ); if joinable { self.joinable_target_distance += target_dist as f64; self.joinable_divergence += divergence_diff; - if matches!(join_type, LinkType::Forward | LinkType::Reverse) { + if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) { //println!("CDist: {}", rel_con_dist); - self.joinable_consensus += rel_con_dist; + if rel_con_dist >= 0.0 { + self.joinable_consensus_pos.update(rel_con_dist.abs()); + } else { + self.joinable_consensus_neg.update(rel_con_dist.abs()); + } } } else { self.unjoinable_target_distance += target_dist as f64; self.unjoinable_divergence += divergence_diff; - if matches!(join_type, LinkType::Forward | LinkType::Reverse) { + if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) { self.unjoinable_consensus += rel_con_dist; } } @@ -305,7 +359,12 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { + other.unjoinable_target_distance, joinable_divergence: self.joinable_divergence + other.joinable_divergence, unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence, - joinable_consensus: self.joinable_consensus + other.joinable_consensus, + joinable_consensus_pos: self + .joinable_consensus_pos + .combine(&other.joinable_consensus_pos), + joinable_consensus_neg: self + .joinable_consensus_neg + .combine(&other.joinable_consensus_neg), unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus, } } diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 2a8c89c..0616cbc 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -322,7 +322,7 @@ fn _interpolated_value_prediction< } let idx = xs.partition_point(|&v| v.as_() < x); - if idx > xs.len() { + if idx >= xs.len() { upper_val } else if idx == 0 { lower_val @@ -639,7 +639,6 @@ impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator { pub mod custom_quantile_estimator { use super::*; - use std::f64::consts::E; macro_rules! replace_expr { ($_t:tt,$sub:expr) => { @@ -666,8 +665,8 @@ pub mod custom_quantile_estimator { pub fn new() -> Self { Self { - values: [0.0; _], - ranks: [0; _], + values: [0.0; Self::COUNT], + ranks: [0; Self::COUNT], observations: 0 } } @@ -703,7 +702,12 @@ pub mod custom_quantile_estimator { }; } - implement_fixed_quantile_estimator!(FrechetQuant[0.5 / E, 0.25, 1.0 / E, 0.5, 0.5 + 1.0 / 2.0 * E, 0.75]); + implement_fixed_quantile_estimator!(LomaxQuant[0.18, 0.36, 0.4752, 0.5904, 0.7952]); + impl LomaxQuant { + pub const PROB1: f64 = 0.36; + pub const PROB2: f64 = 0.59; + } + implement_fixed_quantile_estimator!(MedianEstimator[0.25, 0.5, 0.75]); } #[cfg(test)] diff --git a/src/pipeline.rs b/src/pipeline.rs index 5ca4f2a..c3ef770 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -252,8 +252,11 @@ pub fn run_naive_trace( .expect("Unable to write confidences!!!"); } - let query_join_statistics = - gather_join_statistics(proximity_group.alignments, &args.annotation_args); + let query_join_statistics = gather_join_statistics( + proximity_group.alignments, + &alignment_data.query_lengths, + &args.annotation_args, + ); NaiveTraceResults { target_start: proximity_group.target_start, @@ -286,6 +289,7 @@ pub fn run_history_trace( &trace_statistics.query_statistics, &naive_trace.score_params, &args.annotation_args, + &alignment_data.query_lengths, ); let history = history_viterbi_on_segments( diff --git a/src/segments.rs b/src/segments.rs index 81cdb3e..74e987b 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -1,5 +1,5 @@ use core::f64; -use std::{cmp::Ordering, fmt::Debug, iter::Fuse}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, iter::Fuse}; use crate::{ alignment::{Alignment, Strand}, @@ -603,9 +603,11 @@ pub fn assemble_and_link_segments<'a, T: JoinEstimator>( query_statistics: &[QueryStatistics], score_params: &ScoreParams, annotation_args: &AnnotationArgs, + query_lengths: &HashMap, ) -> (&'a SegmentedMatrix, SegmentAssemblyGraph) { let assembly_graph = SegmentAssemblyGraph::new( proximity_group.alignments, + query_lengths, &initial_segments.segments, region_statistics, query_statistics, diff --git a/src/statistics.rs b/src/statistics.rs index dc0e728..3271f27 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -135,7 +135,7 @@ impl Distribution for ExponentialEstimator { } fn logcdf(&self, x: f64) -> f64 { - self.cdf(x).ln() + (-self.logccdf(x).exp()).ln_1p() } fn cdf(&self, x: f64) -> f64 { @@ -390,6 +390,66 @@ impl Distribution for Gumbel { } } +#[derive(Debug, Clone)] +pub struct Lomax { + alpha: f64, + lambda: f64, +} + +impl Lomax { + pub fn new(alpha: f64, lambda: f64) -> Self { + Self { alpha, lambda } + } +} + +impl ParameterizedDistribution for Lomax {} + +impl Default for Lomax { + fn default() -> Self { + Self::new(1.0, 1.0) + } +} + +impl Distribution for Lomax { + fn logpdf(&self, x: f64) -> f64 { + let a = self.alpha; + let y = self.lambda; + (a / y).ln() - (a + 1.0) * (1.0 + x / y).ln() + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn logccdf(&self, x: f64) -> f64 { + let a = self.alpha; + let y = self.lambda; + -a * (1.0 + x / y).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + self.logccdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + -(self.logccdf(x).exp_m1()) + } + + fn logcdf(&self, x: f64) -> f64 { + (-self.ccdf(x)).ln_1p() + } + + fn ppf(&self, p: f64) -> f64 { + let a = self.alpha; + let y = self.lambda; + y * ((1.0 - p).powf(-1.0 / a) - 1.0) + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} + #[derive(Debug, Clone)] pub struct Laplace { mean: f64, @@ -516,7 +576,7 @@ mod test { use super::{Exponential, ParameterizedDistribution}; - fn get_dists() -> [Box; 6] { + fn get_dists() -> [Box; 7] { [ as_box(Exponential::unit()), as_box(ExponentialEstimator::unit()), @@ -524,6 +584,7 @@ mod test { as_box(Frechet::unit()), as_box(Laplace::unit()), as_box(Gumbel::unit()), + as_box(Lomax::unit()), ] } @@ -539,6 +600,9 @@ mod test { println!("Testing distribution: {:?}", dist); let (mut low, mut high) = dist.tsupport(); + assert!(dist.tcdf(low) == 0.0); + assert!(dist.tcdf(high) == 1.0); + if high == f64::INFINITY { high = 5.0; } @@ -558,9 +622,4 @@ mod test { } } } - - #[test] - fn test_exponential_distribution() { - let _dist = Exponential::unit(); - } } From bce495e891878a8ab2c33e32622e2cd4f1a2254b Mon Sep 17 00:00:00 2001 From: isaacr Date: Tue, 2 Jun 2026 18:59:31 -0600 Subject: [PATCH 34/39] New version working... --- src/assembly.rs | 202 +++++++++++++++++++++------------------- src/join_estimation.rs | 159 +++++++++++-------------------- src/main.rs | 32 +------ src/pipeline.rs | 3 +- src/segments.rs | 12 ++- src/statistics.rs | 141 +++++++++++++--------------- src/trace_statistics.rs | 54 ++++++----- 7 files changed, 269 insertions(+), 334 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 5f49087..35eb626 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -4,10 +4,11 @@ use itertools::Itertools; use crate::{ alignment::{Alignment, Strand}, - join_estimation::{JoinEstimator, JoinStatisticsCollector}, + chunks::ProximityGroup, + join_estimation::{JoinEstimator, JoinStatisticsCollector, LinkInfo}, score_params::ScoreParams, - segments::{Block, SegmentedMatrix, SegmentedMatrixView}, - trace_statistics::{QueryStatistics, RegionStatistics}, + segments::{Block, InitialSegments, SegmentedMatrix, SegmentedMatrixView}, + trace_statistics::{calculate_region_statistics, QueryStatistics, RegionStatistics}, AnnotationArgs, }; @@ -81,61 +82,13 @@ pub struct Edge { pub link_type: LinkType, } -fn piecewise_linear_cost( - neg_start: f64, - pos_start: f64, - neg_slope: f64, - pos_slope: f64, - value: f64, -) -> f64 { - if value < neg_start { - (value - neg_start).abs() * neg_slope - } else if value > pos_slope { - (value - pos_start).abs() * pos_slope - } else { - 0.0 - } -} - -fn get_link_cost( - annotation_args: &AnnotationArgs, - score_params: &ScoreParams, - consensus_gap: isize, - join_prob: f64, -) -> f64 { - // Minimum cost (a query loop) - let min_value = score_params.query_loop_score; - let value_range = (score_params.query_loop_score - score_params.query_jump_score).abs(); - - // Get overlap and gap ranges with free areas incorperated in, otherwise math is not quite right. - let overlap_range = ((annotation_args.consensus_join_overlap as f64) - - (annotation_args.free_join_consensus_overlap as f64)) - .abs() - .max(1.0); - let gap_range = ((annotation_args.consensus_join_distance as f64) - - (annotation_args.free_join_consensus_gap as f64)) - .abs() - .max(1.0); - - // Compute slopes.... - let alpha = - -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs(); - let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs(); - +fn get_link_cost(score_params: &ScoreParams, join_prob: f64) -> f64 { // Doing this as the expected value over the transition scores... let expected_score = join_prob * score_params.query_loop_score + (1.0 - join_prob) * score_params.query_jump_score; // Cost = linear consensus cost + linear target gap cost... - min_value - /*+ piecewise_linear_cost( - -(annotation_args.free_join_consensus_overlap as f64).abs(), - (annotation_args.free_join_consensus_gap as f64).abs(), - alpha, - beta, - consensus_gap as f64, - )*/ - + expected_score + expected_score } pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize { @@ -188,6 +141,7 @@ pub fn block_length_on_query(b: &Block) -> usize { b.query_end.abs_diff(b.query_start) + 1 } +#[allow(dead_code)] pub enum ConsensusDistanceNormalization { Max, Min, @@ -257,14 +211,31 @@ fn new_alignment_to_blocks_map( alignment_block_map } +fn calculate_unexplained_bases( + segments: SegmentedMatrixView, + region_statistics: &RegionStatistics, + first_block_segment: usize, + second_block_segment: usize, + second_block_target_start: usize, +) -> usize { + let ub = region_statistics.unexplained_bases[second_block_segment] + .abs_diff(region_statistics.unexplained_bases[first_block_segment]) + + (second_block_target_start - segments[second_block_segment].start_col); + ub +} + pub fn gather_join_statistics( - alignments: &[Alignment], + group: &ProximityGroup, + initial_segments: &InitialSegments, query_lengths: &HashMap, annotation_args: &AnnotationArgs, ) -> Vec<(usize, T)> { + let alignments = group.alignments; + let mut query_ids: Vec = alignments.iter().map(|a| a.query_id).unique().collect(); query_ids.sort(); + let region_stats = calculate_region_statistics(initial_segments); let mut query_stats: Vec<(usize, T)> = Vec::with_capacity(query_ids.len()); query_ids @@ -277,7 +248,14 @@ pub fn gather_join_statistics( .iter() .enumerate() .filter(|&(_, a)| a.query_id == *id) - .map(|(i, a)| Block::from_alignment(a, i, 0.0, 0.0)), + .map(|(i, a)| { + let b = Block::from_alignment(a, group.target_start, i, 0.0, 0.0); + let seg_i = initial_segments + .view_segments() + .partition_point(|v| v.start_col <= b.col_start) + .saturating_sub(1); + (seg_i, b) + }), ) }) .for_each(|(id, compat_alignments)| { @@ -288,6 +266,8 @@ pub fn gather_join_statistics( *query_lengths .get(&id) .expect("Query length missing for alignment!"), + initial_segments.view_segments(), + ®ion_stats, annotation_args, &mut new_stats, ); @@ -298,42 +278,72 @@ pub fn gather_join_statistics( query_stats } +fn link_info( + first_block: &Block, + second_block: &Block, + annotation_args: &AnnotationArgs, + unexplained_bases: usize, + consensus_length: usize, + neighbors: bool, +) -> LinkInfo { + let (consensus_distance, link_type) = block_consensus_distance(first_block, second_block); + let joinable = is_joinable( + block_target_distance(first_block, second_block), + consensus_distance, + link_type, + block_length_on_query(first_block).min(block_length_on_query(second_block)), + annotation_args, + ); + + LinkInfo { + target_distance: block_target_distance(first_block, second_block), + consensus_distance, + link_type, + consensus_length, + unexplained_bases, + neighbors, + joinable, + } +} + fn gather_join_statistics_single_family<'a>( - compatable_alignments: impl Iterator, + compatable_alignments: impl Iterator, consensus_length: usize, + segments: SegmentedMatrixView, + region_statistics: &RegionStatistics, args: &AnnotationArgs, join_stats: &mut impl JoinStatisticsCollector, ) { let compatable_blocks = compatable_alignments - .sorted_by_key(|a| a.col_start) + .sorted_by_key(|(_u_b, a)| a.col_start) .collect_vec(); compatable_blocks .iter() .enumerate() - .for_each(|(idx, a_block)| { - compatable_blocks[idx + 1..] - .iter() - .enumerate() - .for_each(|(idx2, b_block)| { - let (consensus_distance, link_type) = - block_consensus_distance(a_block, b_block); - let joinable = is_joinable( - block_target_distance(a_block, b_block), - consensus_distance, - link_type, - block_length_on_query(a_block).min(block_length_on_query(b_block)), - args, - ); - + .for_each(|(idx, (a_segment_idx, a_block))| { + compatable_blocks[idx + 1..].iter().enumerate().for_each( + |(idx2, (b_segment_idx, b_block))| { join_stats.add( a_block, b_block, - consensus_length, - idx + 1 == idx2, - joinable, + &link_info( + a_block, + b_block, + args, + calculate_unexplained_bases( + segments, + region_statistics, + *a_segment_idx, + *b_segment_idx, + b_block.col_start, + ), + consensus_length, + idx + 1 == idx2, + ), ); - }) + }, + ) }) } @@ -343,7 +353,7 @@ fn link_assemblies( consensus_length: usize, segments: &SegmentedMatrix, query_statistics: &QueryStatistics, - _region_statistics: &RegionStatistics, + region_statistics: &RegionStatistics, score_params: &ScoreParams, args: &AnnotationArgs, ) { @@ -360,29 +370,31 @@ fn link_assemblies( let a_block = &segments[a.0].blocks[a.1]; let b_block = &segments[b.0].blocks[b.1]; - let target_distance = block_target_distance(a_block, b_block); - let min_block_length = - block_length_on_query(a_block).min(block_length_on_query(b_block)); - - let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block); - - if is_joinable( - target_distance, - consensus_distance, - link_type, - min_block_length, + let link = link_info( + a_block, + b_block, args, - ) { - let join_prob = - query_statistics - .estimator - .predict(a_block, b_block, consensus_length, false); + calculate_unexplained_bases( + segments, + region_statistics, + a.0, + b.0, + b_block.col_start, + ), + consensus_length, + a.0 + 1 == b.0, + ); + + if link.joinable { + let join_prob = query_statistics + .estimator + .predict(a_block, b_block, &link, false); if join_prob >= args.join_likelihood_threshold { let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { score_params.query_loop_score } else { - get_link_cost(args, score_params, consensus_distance, join_prob) + get_link_cost(score_params, join_prob) }; graph.insert( @@ -391,7 +403,7 @@ fn link_assemblies( weight, first_sparse_row: a.1, second_sparse_row: b.1, - link_type, + link_type: link.link_type, }, ); } diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 827f98b..f28fc62 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -5,16 +5,13 @@ use std::{ }; use crate::{ - assembly::{ - block_target_distance, relative_consensus_distance, ConsensusDistanceNormalization, - LinkType, - }, + assembly::{relative_consensus_distance, ConsensusDistanceNormalization, LinkType}, p2estimator::{ custom_quantile_estimator::{LomaxQuant, MedianEstimator}, QuantileEstimator, }, segments::Block, - statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT, Laplace, Lomax}, + statistics::{ln_add_exp, AssymetricLaplace, Distribution, ExponentialEstimator, HalfT, Lomax}, }; pub trait JoinEstimator: Clone + Default + Debug { @@ -22,23 +19,28 @@ pub trait JoinEstimator: Clone + Default + Debug { &self, first_block: &Block, second_block: &Block, - consensus_length: usize, + link_info: &LinkInfo, log_space: bool, ) -> f64; } +pub struct LinkInfo { + #[allow(dead_code)] + pub target_distance: isize, + #[allow(dead_code)] + pub consensus_distance: isize, + pub link_type: LinkType, + pub consensus_length: usize, + pub unexplained_bases: usize, + pub neighbors: bool, + pub joinable: bool, +} + pub trait JoinStatisticsCollector: Clone + Debug { fn new() -> Self; fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self; fn combine(&self, other: &Self) -> Self; - fn add( - &mut self, - first_block: &Block, - second_block: &Block, - consenus_length: usize, - neighbors: bool, - joinable: bool, - ); + fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo); } #[derive(Debug, Clone, Default)] @@ -47,11 +49,8 @@ pub struct BayesianJoinEstimator { target_distance_nojoin: ExponentialEstimator, divergence_join: HalfT, divergence_nojoin: HalfT, - consensus_distance_join_pos: ExponentialEstimator, - consensus_distance_join_neg: ExponentialEstimator, - consensus_norm_pos: f64, - consensus_norm_neg: f64, - consensus_distance_nojoin: Laplace, + consensus_distance_join: AssymetricLaplace, + consensus_distance_nojoin: AssymetricLaplace, join_prior: f64, } @@ -60,54 +59,26 @@ impl JoinEstimator for BayesianJoinEstimator { &self, first_block: &Block, second_block: &Block, - consensus_distance: usize, + link_info: &LinkInfo, log_space: bool, ) -> f64 { - let target_dist = block_target_distance(first_block, second_block) as f64; + let target_dist = link_info.unexplained_bases as f64; // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0... let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); let (rel_con_dist, _join_type) = relative_consensus_distance( first_block, second_block, - ConsensusDistanceNormalization::WithLength(consensus_distance), - ); - - let consensus_join_dist_logpdf = |x: f64| { - if x < 0.0 { - self.consensus_distance_join_neg.logpdf(x.abs()) + self.consensus_norm_neg.ln() - } else { - self.consensus_distance_join_pos.logpdf(x.abs()) + self.consensus_norm_pos.ln() - } - }; - - println!("{:#?}", self); - println!( - "{} {} {}", - rel_con_dist, - consensus_join_dist_logpdf(rel_con_dist).exp(), - self.consensus_distance_nojoin.pdf(rel_con_dist) - ); - println!( - "{} {} {}", - target_dist, - self.target_distance_join.pdf(target_dist), - self.target_distance_nojoin.pdf(target_dist) - ); - println!( - "{} {} {}", - divergence_diff, - self.divergence_join.pdf(divergence_diff), - self.divergence_nojoin.pdf(divergence_diff) + ConsensusDistanceNormalization::WithLength(link_info.consensus_length), ); let join_score = self.join_prior.ln() + self.target_distance_join.logpdf(target_dist) - + self.divergence_join.logpdf(divergence_diff); - //+ self.consensus_distance_join.logpdf(rel_con_dist); + + self.divergence_join.logpdf(divergence_diff) + + self.consensus_distance_join.logpdf(rel_con_dist); let nojoin_score = (-self.join_prior).ln_1p() + self.target_distance_nojoin.logpdf(target_dist) - + self.divergence_nojoin.logpdf(divergence_diff); - //+ self.consensus_distance_nojoin.logpdf(rel_con_dist); + + self.divergence_nojoin.logpdf(divergence_diff) + + self.consensus_distance_nojoin.logpdf(rel_con_dist); let score_norm = ln_add_exp(join_score, nojoin_score); let score = join_score - score_norm; @@ -215,12 +186,6 @@ impl From for HalfT { } } -impl From for Laplace { - fn from(value: MomentEstimator) -> Self { - Self::from_moments(value.mean(), value.standard_deviation()) - } -} - impl From<&LomaxQuant> for Lomax { fn from(value: &LomaxQuant) -> Self { // Using quantile selection trick originally developed for frechet... (quant ratio formula) @@ -245,22 +210,20 @@ impl From<&MedianEstimator> for ExponentialEstimator { impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { fn from(statistics: &BayesianJoinStatistics) -> Self { - let cons_total = (statistics.joinable_consensus_pos.samples() - + statistics.joinable_consensus_neg.samples()) - .max(1); - let cons_pos_perc = statistics.joinable_consensus_pos.samples() as f64 / cons_total as f64; - let cons_neg_perc = statistics.joinable_consensus_neg.samples() as f64 / cons_total as f64; - Self { target_distance_join: statistics.joinable_target_distance.into(), target_distance_nojoin: statistics.unjoinable_target_distance.into(), divergence_join: statistics.joinable_divergence.into(), divergence_nojoin: statistics.unjoinable_divergence.into(), - consensus_distance_join_pos: (&statistics.joinable_consensus_pos).into(), - consensus_distance_join_neg: (&statistics.joinable_consensus_neg).into(), - consensus_norm_pos: 0.5 * cons_pos_perc, - consensus_norm_neg: 0.5 * cons_neg_perc, - consensus_distance_nojoin: statistics.unjoinable_consensus.into(), + consensus_distance_join: AssymetricLaplace::from_exponential_halves( + 0.0, + statistics.joinable_consensus_neg.mean(), + statistics.joinable_consensus_pos.mean(), + ), + consensus_distance_nojoin: AssymetricLaplace::symmetric_from_moments( + statistics.unjoinable_consensus.mean(), + statistics.unjoinable_consensus.standard_deviation(), + ), // We take sqrt since we count all pairs, not just neighbors. join_prior: (statistics.joinable_target_distance.samples() as f64 / (statistics.joinable_target_distance.samples() @@ -277,8 +240,8 @@ pub struct BayesianJoinStatistics { unjoinable_target_distance: MomentEstimator, joinable_divergence: MomentEstimator, unjoinable_divergence: MomentEstimator, - joinable_consensus_pos: MedianEstimator, - joinable_consensus_neg: MedianEstimator, + joinable_consensus_pos: MomentEstimator, + joinable_consensus_neg: MomentEstimator, unjoinable_consensus: MomentEstimator, } @@ -301,51 +264,45 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { unjoinable_divergence: bayesian_prior .unjoinable_divergence .to_psuedo_count(pseudo_count), - joinable_consensus_pos: MedianEstimator::from_prior( - &bayesian_prior.joinable_consensus_pos, - pseudo_count, - ), - joinable_consensus_neg: MedianEstimator::from_prior( - &bayesian_prior.joinable_consensus_neg, - pseudo_count, - ), + joinable_consensus_pos: bayesian_prior + .joinable_consensus_pos + .to_psuedo_count(pseudo_count), + joinable_consensus_neg: bayesian_prior + .joinable_consensus_neg + .to_psuedo_count(pseudo_count), unjoinable_consensus: bayesian_prior .unjoinable_consensus .to_psuedo_count(pseudo_count), } } - fn add( - &mut self, - first_block: &Block, - second_block: &Block, - consensus_length: usize, - neighbors: bool, - joinable: bool, - ) { - let target_dist = block_target_distance(first_block, second_block).abs() as usize; + fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) { + if !link_info.neighbors { + return; + } + + let target_dist = link_info.unexplained_bases; let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); let (rel_con_dist, join_type) = relative_consensus_distance( first_block, second_block, - ConsensusDistanceNormalization::WithLength(consensus_length), + ConsensusDistanceNormalization::WithLength(link_info.consensus_length), ); - if joinable { + if link_info.joinable { self.joinable_target_distance += target_dist as f64; self.joinable_divergence += divergence_diff; - if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) { - //println!("CDist: {}", rel_con_dist); + if matches!(join_type, LinkType::Forward | LinkType::Reverse) { if rel_con_dist >= 0.0 { - self.joinable_consensus_pos.update(rel_con_dist.abs()); + self.joinable_consensus_pos += rel_con_dist.abs(); } else { - self.joinable_consensus_neg.update(rel_con_dist.abs()); + self.joinable_consensus_neg += rel_con_dist.abs(); } } } else { self.unjoinable_target_distance += target_dist as f64; self.unjoinable_divergence += divergence_diff; - if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) { + if matches!(join_type, LinkType::Forward | LinkType::Reverse) { self.unjoinable_consensus += rel_con_dist; } } @@ -359,12 +316,8 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { + other.unjoinable_target_distance, joinable_divergence: self.joinable_divergence + other.joinable_divergence, unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence, - joinable_consensus_pos: self - .joinable_consensus_pos - .combine(&other.joinable_consensus_pos), - joinable_consensus_neg: self - .joinable_consensus_neg - .combine(&other.joinable_consensus_neg), + joinable_consensus_pos: self.joinable_consensus_pos + other.joinable_consensus_pos, + joinable_consensus_neg: self.joinable_consensus_neg + other.joinable_consensus_neg, unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus, } } diff --git a/src/main.rs b/src/main.rs index 0063815..5b01b6d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -170,7 +170,7 @@ pub struct AnnotationArgs { /// The maximum seperation or overlap in nucleotides on both target and consensus /// for a join to be allowed between inverted alignments. - #[arg(long = "inversion-distance", default_value = "50", value_name = "n")] + #[arg(long = "inversion-distance", default_value = "200", value_name = "n")] pub inversion_distance: isize, /// The size of the window looked at to determine a single alignment score in nucleotides. @@ -223,36 +223,6 @@ pub struct AnnotationArgs { /// Set to 0 or greater to disable. #[arg(long = "min-history-score", default_value = "-500.0", value_name = "f")] pub min_relative_history_score: f64, - - /// The amount of overlap between two joinable sequences in the consensus - /// before a penalty starts being applied to the join. - #[arg(long = "free-join-overlap", default_value = "4", value_name = "n")] - pub free_join_consensus_overlap: usize, - - /// The amount of gap between two joinable sequences - /// before a penalty starts being applied to the join. - #[arg(long = "free-join-gap", default_value = "10", value_name = "n")] - pub free_join_consensus_gap: usize, - - /// The amount of penalty to apply to a join at the maximum allowed consensus overlap - /// A value of 1 means to apply a penalty equal to a query jump. - /// The cost grows linearly to this value as the overlap increases. - #[arg( - long = "consensus-overlap-penalty", - default_value = "1.0", - value_name = "f" - )] - pub join_consensus_overlap_penalty: f64, - - /// The amount of penalty to apply to a join at the maximum allowed consensus gap - /// A value of 1 means to apply a penalty equal to a query jump. - /// The cost grows linearly to this value as the gap increases. - #[arg( - long = "consensus-gap-penalty", - default_value = "0.5", - value_name = "f" - )] - pub join_consensus_gap_penalty: f64, } #[derive(Args, Debug, Clone, Default)] diff --git a/src/pipeline.rs b/src/pipeline.rs index c3ef770..7200f78 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -253,7 +253,8 @@ pub fn run_naive_trace( } let query_join_statistics = gather_join_statistics( - proximity_group.alignments, + proximity_group, + &segments, &alignment_data.query_lengths, &args.annotation_args, ); diff --git a/src/segments.rs b/src/segments.rs index 74e987b..b65964f 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -100,14 +100,20 @@ impl Block { (self.query_id, self.row_idx) } - pub fn from_alignment(alignment: &Alignment, row: usize, confidence: f64, score: f64) -> Self { + pub fn from_alignment( + alignment: &Alignment, + group_start: usize, + row: usize, + confidence: f64, + score: f64, + ) -> Self { Self { row_idx: row, block_type: BlockType::Alignment, strand: alignment.strand, query_id: Some(alignment.query_id), - col_start: alignment.target_start, - col_end: alignment.target_end, + col_start: alignment.target_start.saturating_sub(group_start), + col_end: alignment.target_end.saturating_sub(group_start), query_start: alignment.query_start, query_end: alignment.query_end, avg_confidence: confidence, diff --git a/src/statistics.rs b/src/statistics.rs index 3271f27..448c118 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -11,7 +11,7 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 { // TODO: Support for generic floating types... #[allow(dead_code)] -pub trait Distribution: Clone { +pub trait Distribution { fn pdf(&self, x: f64) -> f64; fn cdf(&self, x: f64) -> f64; fn ppf(&self, p: f64) -> f64; @@ -22,7 +22,7 @@ pub trait Distribution: Clone { fn logccdf(&self, x: f64) -> f64; } -pub trait ParameterizedDistribution: Distribution + Debug + Default { +pub trait ParameterizedDistribution: Distribution + Debug + Default + Clone { fn unit() -> Self { Self::default() } @@ -451,40 +451,54 @@ impl Distribution for Lomax { } #[derive(Debug, Clone)] -pub struct Laplace { - mean: f64, +pub struct AssymetricLaplace { + mode: f64, scale: f64, + mode_quantile: f64, } -impl ParameterizedDistribution for Laplace {} +impl ParameterizedDistribution for AssymetricLaplace {} -impl Laplace { - pub fn new(mean: f64, scale: f64) -> Self { - Self { mean, scale } - } - - pub fn from_moments(mean: f64, standard_deviation: f64) -> Self { +impl AssymetricLaplace { + pub fn new(mode: f64, scale: f64, mode_quantile: f64) -> Self { Self { - mean, - scale: standard_deviation / f64::consts::SQRT_2, + mode, + scale, + mode_quantile, } } + + pub fn from_exponential_halves(mode: f64, negative_mean: f64, positive_mean: f64) -> Self { + Self::new( + mode, + (negative_mean * positive_mean) / (negative_mean + positive_mean), + 1.0 / (positive_mean / negative_mean + 1.0), + ) + } + + pub fn symmetric_from_moments(mean: f64, standard_deviation: f64) -> Self { + Self::new(mean, standard_deviation / (8.0_f64.sqrt()), 0.5) + } } -impl Default for Laplace { +impl Default for AssymetricLaplace { fn default() -> Self { - Self { - mean: 0.0, - scale: 1.0, - } + Self::symmetric_from_moments(0.0, 1.0) } } -impl Distribution for Laplace { +impl Distribution for AssymetricLaplace { fn logpdf(&self, x: f64) -> f64 { - let mu = self.mean; - let b = self.scale; - (0.5 / b).ln() + -((x - mu).abs() / b) + let m = self.mode; + let l = self.scale; + let p = self.mode_quantile; + let exp_comp = if x <= m { + ((1.0 - p) / l) * (x - m) + } else { + -(p / l) * (x - m) + }; + + ((p * (1.0 - p)) / l).ln() + exp_comp } fn pdf(&self, x: f64) -> f64 { @@ -492,9 +506,14 @@ impl Distribution for Laplace { } fn cdf(&self, x: f64) -> f64 { - let mu = self.mean; - let b = self.scale; - 0.5 + 0.5 * (x - mu).signum() * (1.0 - (-(x - mu).abs() / b).exp()) + let m = self.mode; + let l = self.scale; + let p = self.mode_quantile; + if x <= m { + p * (((1.0 - p) / l) * (x - m)).exp() + } else { + 1.0 - (1.0 - p) * (-(p / l) * (x - m)).exp() + } } fn logcdf(&self, x: f64) -> f64 { @@ -510,10 +529,14 @@ impl Distribution for Laplace { } fn ppf(&self, p: f64) -> f64 { - let mu = self.mean; - let b = self.scale; - let p = p.clamp(0.0, 1.0); - mu - b * (p - 0.5).signum() * (1.0 - 2.0 * (p - 0.5).abs()).ln() + let m = self.mode; + let l = self.scale; + let pm = self.mode_quantile; + if p <= pm { + m + (l / (1.0 - pm)) * (p / pm).ln() + } else { + m - (l / pm) * ((1.0 - p) / (1.0 - pm)).ln() + } } fn support(&self) -> (f64, f64) { @@ -532,45 +555,11 @@ mod test { use super::*; use std::fmt::Debug; - pub trait TestDistribution: Debug { - fn tpdf(&self, x: f64) -> f64; - fn tcdf(&self, x: f64) -> f64; - fn tppf(&self, p: f64) -> f64; - fn tsupport(&self) -> (f64, f64); - fn tccdf(&self, x: f64) -> f64; - fn tlogpdf(&self, x: f64) -> f64; - fn tlogcdf(&self, x: f64) -> f64; - fn tlogccdf(&self, x: f64) -> f64; - } - - impl TestDistribution for T { - fn tpdf(&self, x: f64) -> f64 { - self.pdf(x) - } - fn tcdf(&self, x: f64) -> f64 { - self.cdf(x) - } - fn tppf(&self, p: f64) -> f64 { - self.ppf(p) - } - fn tsupport(&self) -> (f64, f64) { - self.support() - } - fn tccdf(&self, x: f64) -> f64 { - self.ccdf(x) - } - fn tlogpdf(&self, x: f64) -> f64 { - self.logpdf(x) - } - fn tlogcdf(&self, x: f64) -> f64 { - self.logcdf(x) - } - fn tlogccdf(&self, x: f64) -> f64 { - self.logccdf(x) - } - } + // Add debug trait to allow for printout... + pub trait TestDistribution: Distribution + Debug {} + impl TestDistribution for T {} - fn as_box(d: T) -> Box { + fn as_box(d: T) -> Box { Box::new(d) } @@ -582,7 +571,7 @@ mod test { as_box(ExponentialEstimator::unit()), as_box(HalfT::unit()), as_box(Frechet::unit()), - as_box(Laplace::unit()), + as_box(AssymetricLaplace::unit()), as_box(Gumbel::unit()), as_box(Lomax::unit()), ] @@ -598,10 +587,10 @@ mod test { fn basic_distribution_propery_checks() { for dist in get_dists() { println!("Testing distribution: {:?}", dist); - let (mut low, mut high) = dist.tsupport(); + let (mut low, mut high) = dist.support(); - assert!(dist.tcdf(low) == 0.0); - assert!(dist.tcdf(high) == 1.0); + assert!(dist.cdf(low) == 0.0); + assert!(dist.cdf(high) == 1.0); if high == f64::INFINITY { high = 5.0; @@ -613,12 +602,12 @@ mod test { for x in linspace(low, high, 100) { // Basic properties... // println!("{x} -> {} vs {}", dist.tpdf(x), dist.tlogpdf(x).exp()); - assert!(is_close(dist.tpdf(x), dist.tlogpdf(x).exp())); - assert!(is_close(dist.tcdf(x), dist.tlogcdf(x).exp())); - assert!(is_close(dist.tccdf(x), dist.tlogccdf(x).exp())); - assert!(is_close(dist.tccdf(x), 1.0 - dist.tcdf(x))); + assert!(is_close(dist.pdf(x), dist.logpdf(x).exp())); + assert!(is_close(dist.cdf(x), dist.logcdf(x).exp())); + assert!(is_close(dist.ccdf(x), dist.logccdf(x).exp())); + assert!(is_close(dist.ccdf(x), 1.0 - dist.cdf(x))); // println!("{x} -> {}", dist.tppf(dist.tcdf(x))); - assert!(is_close(dist.tppf(dist.tcdf(x)), x)); + assert!(is_close(dist.ppf(dist.cdf(x)), x)); } } } diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs index cb49630..f71fa39 100644 --- a/src/trace_statistics.rs +++ b/src/trace_statistics.rs @@ -6,7 +6,7 @@ use crate::{ alignment::AlignmentData, join_estimation::{JoinEstimator, JoinStatisticsCollector}, pipeline::NaiveTraceResults, - segments::Segment, + segments::{InitialSegments, Segment}, }; #[derive(Debug)] @@ -37,6 +37,33 @@ pub enum OccuranceCountingMode { Trace, } +pub fn calculate_region_statistics(segments: &InitialSegments) -> RegionStatistics { + let mut region_stat = RegionStatistics { + total_bases: 0, + unexplained_bases: Vec::with_capacity(segments.len()), + }; + + let mut unexplained_bases_up_to: usize = 0; + let mut prior_segment: Option<&Segment> = None; + + for seg in segments.view_segments() { + if let Some(prior_segment) = prior_segment { + // If a skip block was the prior block, add it's bases as unexplained. + if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 { + unexplained_bases_up_to += seg.end_col - seg.start_col + 1; + } + unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1; + region_stat.total_bases += seg.start_col - prior_segment.end_col - 1; + } + region_stat.total_bases += seg.end_col - seg.start_col + 1; + region_stat.unexplained_bases.push(unexplained_bases_up_to); + + prior_segment = Some(seg); + } + + region_stat +} + pub fn trace_statistics, E: JoinEstimator>( naive_traces: &[NaiveTraceResults], alignment_data: &AlignmentData, @@ -117,30 +144,7 @@ pub fn trace_statistics, E: JoinEst } } - let mut region_stat = RegionStatistics { - total_bases: 0, - unexplained_bases: Vec::with_capacity(trace_results.segments.len()), - }; - - let mut unexplained_bases_up_to: usize = 0; - let mut prior_segment: Option<&Segment> = None; - - for seg in trace_results.segments.view_segments() { - if let Some(prior_segment) = prior_segment { - // If a skip block was the prior block, add it's bases as unexplained. - if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 { - unexplained_bases_up_to += seg.end_col - seg.start_col + 1; - } - unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1; - region_stat.total_bases += seg.start_col - prior_segment.end_col - 1; - } - region_stat.total_bases += seg.end_col - seg.start_col + 1; - region_stat.unexplained_bases.push(unexplained_bases_up_to); - - prior_segment = Some(seg); - } - - all_region_stats.push(region_stat); + all_region_stats.push(calculate_region_statistics(&trace_results.segments)); } // Calculate join statistics for all families using combined prior as a starting point... From 22654e4b728da32274f2d63bc6da5fcf30b8241e Mon Sep 17 00:00:00 2001 From: isaacr Date: Wed, 3 Jun 2026 17:53:03 -0600 Subject: [PATCH 35/39] Overhaul table viz to work better with very large runs. --- fixtures/soda/table.html | 618 +++++++++++++++++++++++++++------------ src/assembly.rs | 1 + src/join_estimation.rs | 18 +- src/p2estimator.rs | 189 +++++++----- src/statistics.rs | 8 +- src/viz/stats.rs | 46 +-- 6 files changed, 572 insertions(+), 308 deletions(-) diff --git a/fixtures/soda/table.html b/fixtures/soda/table.html index 73ccfeb..db87869 100644 --- a/fixtures/soda/table.html +++ b/fixtures/soda/table.html @@ -2,16 +2,18 @@ PAGE_TITLE - + +

PAGE_TITLE

-
+
< Back -
- TABLE_TARGET +
+
+ +
+ +
+ + + + + +
- \ No newline at end of file + diff --git a/src/assembly.rs b/src/assembly.rs index 35eb626..9485691 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -357,6 +357,7 @@ fn link_assemblies( score_params: &ScoreParams, args: &AnnotationArgs, ) { + println!("{:#?}", query_statistics); // this relies on the alignments being sorted by target start let compatable_blocks = compatable_blocks.sorted().collect_vec(); diff --git a/src/join_estimation.rs b/src/join_estimation.rs index f28fc62..89f3763 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -6,10 +6,7 @@ use std::{ use crate::{ assembly::{relative_consensus_distance, ConsensusDistanceNormalization, LinkType}, - p2estimator::{ - custom_quantile_estimator::{LomaxQuant, MedianEstimator}, - QuantileEstimator, - }, + p2estimator::custom_quantile_estimator::{LomaxQuant, MedianEstimator}, segments::Block, statistics::{ln_add_exp, AssymetricLaplace, Distribution, ExponentialEstimator, HalfT, Lomax}, }; @@ -210,11 +207,18 @@ impl From<&MedianEstimator> for ExponentialEstimator { impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { fn from(statistics: &BayesianJoinStatistics) -> Self { + println!("{:#?}", statistics); Self { target_distance_join: statistics.joinable_target_distance.into(), target_distance_nojoin: statistics.unjoinable_target_distance.into(), divergence_join: statistics.joinable_divergence.into(), - divergence_nojoin: statistics.unjoinable_divergence.into(), + divergence_nojoin: HalfT::from_sample_mean( + statistics + .unjoinable_divergence + .mean() + .max(statistics.joinable_divergence.mean()), + statistics.unjoinable_divergence.samples(), + ), consensus_distance_join: AssymetricLaplace::from_exponential_halves( 0.0, statistics.joinable_consensus_neg.mean(), @@ -277,10 +281,6 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { } fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) { - if !link_info.neighbors { - return; - } - let target_dist = link_info.unexplained_bases; let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); let (rel_con_dist, join_type) = relative_consensus_distance( diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 0616cbc..9da5da7 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -1,4 +1,8 @@ -use std::cmp::Ordering; +use std::{ + cmp::Ordering, + ops::{Add, AddAssign}, + usize, +}; use crate::statistics::Distribution; use itertools::Itertools; @@ -8,14 +12,14 @@ use itertools::Itertools; // // We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021) -struct QuantileEstimatorData<'a> { +pub struct QuantileEstimatorData<'a> { ranks: &'a [usize], values: &'a [f64], targets: &'a [f64], observations: &'a usize, } -struct MutableQuantileEstimatorData<'a> { +pub struct MutableQuantileEstimatorData<'a> { ranks: &'a mut [usize], values: &'a mut [f64], targets: &'a [f64], @@ -342,40 +346,20 @@ fn _interpolated_value_prediction< } } -pub trait QuantileEstimator: Distribution { - fn from_prior(prior: &Self, count: usize) -> Self; - fn update(&mut self, sample: f64); - fn update_all(&mut self, samples: &[f64]) { - for &s in samples.iter() { - self.update(s); - } - } - fn combine(&self, other: &Self) -> Self; - #[allow(dead_code)] - fn samples(&self) -> usize; -} - -trait SimpleQuantileEstimatorRepresentation: Clone { - fn new_like(other: &Self) -> Self; - fn _data(&self) -> QuantileEstimatorData<'_>; - fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>; - fn _is_initialized(&self) -> bool { - let data = self._data(); - *data.observations >= data.ranks.len() - } -} +#[derive(Clone)] +pub struct QuantileEstimator(T); -impl QuantileEstimator for Q { - fn samples(&self) -> usize { - *self._data().observations +impl QuantileEstimator { + pub fn samples(&self) -> usize { + *self.0._data().observations } - fn from_prior(prior: &Self, count_per_entry: usize) -> Self { - let prior_data = prior._data(); - let mut new_self = Self::new_like(prior); - let new_data = new_self._mut_data(); + pub fn to_psuedo_count(&self, count: usize) -> Self { + let prior_data = self.0._data(); + let mut new_self = Self(T::new_like(&self.0)); + let new_data = new_self.0._mut_data(); - let new_observations = count_per_entry.max(1) * prior_data.ranks.len(); + let new_observations = count.max(1) * prior_data.ranks.len(); for i in 0..new_data.targets.len() { let closest_rank = ((new_data.targets[i] * (new_observations - 1) as f64) as usize) @@ -385,15 +369,41 @@ impl QuantileEstimator for Q { ); new_data.ranks[i] = closest_rank; - new_data.values[i] = prior.ppf(closest_rank as f64 / (new_observations - 1) as f64) + new_data.values[i] = self.ppf(closest_rank as f64 / (new_observations - 1) as f64) } *new_data.observations = new_observations; new_self } +} + +impl QuantileEstimator { + pub fn new_from_slice(targets: &[f64]) -> Self { + Self(VectorQuantileRepresentation::new(targets)) + } +} + +impl QuantileEstimator> { + pub fn new_from_array(targets: &[f64; N]) -> Self { + Self(ArrayQuantileRepresentation::::new(targets)) + } +} + +impl Default for QuantileEstimator { + fn default() -> Self { + Self(T::default()) + } +} - fn update(&mut self, sample: f64) { - let data = self._mut_data(); +impl QuantileEstimator { + fn new() -> Self { + Self::default() + } +} + +impl AddAssign for QuantileEstimator { + fn add_assign(&mut self, sample: f64) { + let data = self.0._mut_data(); match (*data.observations + 1).cmp(&data.values.len()) { Ordering::Less => { @@ -413,36 +423,48 @@ impl QuantileEstimator for Q { } } } +} + +impl AddAssign<&[f64]> for QuantileEstimator { + fn add_assign(&mut self, samples: &[f64]) { + for &s in samples.iter() { + *self += s; + } + } +} + +impl Add> for QuantileEstimator { + type Output = QuantileEstimator; - fn combine(&self, other: &Self) -> Self { - match (self._is_initialized(), other._is_initialized()) { + fn add(self, rhs: QuantileEstimator) -> Self::Output { + match (self.0._is_initialized(), rhs.0._is_initialized()) { (true, true) => { - let mut new_quant_est = Self::new_like(&self); + let mut new_quant_est = Self(T::new_like(&self.0)); - _merge_estimators(self._data(), other._data(), new_quant_est._mut_data()); + _merge_estimators(self.0._data(), rhs.0._data(), new_quant_est.0._mut_data()); new_quant_est } (true, false) | (false, false) => { - let other_data = other._data(); + let other_data = rhs.0._data(); let mut new_quant_est = self.clone(); - new_quant_est.update_all(&other_data.values[..*other_data.observations]); + new_quant_est += &other_data.values[..*other_data.observations]; new_quant_est } (false, true) => { - let self_data = self._data(); - let mut new_quant_est = other.clone(); - new_quant_est.update_all(&self_data.values[..*self_data.observations]); + let self_data = self.0._data(); + let mut new_quant_est = rhs.clone(); + new_quant_est += &self_data.values[..*self_data.observations]; new_quant_est } } } } -impl Distribution for Q { +impl Distribution for QuantileEstimator { fn cdf(&self, x: f64) -> f64 { - let data = self._data(); - if self._is_initialized() { + let data = self.0._data(); + if self.0._is_initialized() { _interpolated_value_prediction( data.values, data.ranks, @@ -482,13 +504,13 @@ impl Distribution for Q { } fn ppf(&self, p: f64) -> f64 { - let data = self._data(); + let data = self.0._data(); let est_rank = p.clamp(0.0, 1.0) * (*data.observations - 1) as f64; - let data = self._data(); + let data = self.0._data(); let (min_val, max_val) = self.support(); - if self._is_initialized() { + if self.0._is_initialized() { _interpolated_value_prediction( data.ranks, data.values, @@ -527,9 +549,9 @@ impl Distribution for Q { } fn support(&self) -> (f64, f64) { - let data = self._data(); + let data = self.0._data(); - if self._is_initialized() { + if self.0._is_initialized() { ( *data.values.first().unwrap_or(&f64::NEG_INFINITY), *data.values.last().unwrap_or(&f64::INFINITY), @@ -545,16 +567,26 @@ impl Distribution for Q { } } +pub trait QuantileEstimatorRepresentation: Clone { + fn new_like(other: &Self) -> Self; + fn _data(&self) -> QuantileEstimatorData<'_>; + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>; + fn _is_initialized(&self) -> bool { + let data = self._data(); + *data.observations >= data.ranks.len() + } +} + #[derive(Clone, Debug)] -pub struct FixedSizeQuantileEstimator { +pub struct ArrayQuantileRepresentation { values: [f64; N], ranks: [usize; N], targets: [f64; N], observations: usize, } -impl FixedSizeQuantileEstimator { - pub fn new(targets: &[f64; N]) -> Self { +impl ArrayQuantileRepresentation { + fn new(targets: &[f64; N]) -> Self { assert!( targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) ); @@ -567,7 +599,7 @@ impl FixedSizeQuantileEstimator { } } -impl SimpleQuantileEstimatorRepresentation for FixedSizeQuantileEstimator { +impl QuantileEstimatorRepresentation for ArrayQuantileRepresentation { fn new_like(other: &Self) -> Self { Self::new(&other.targets) } @@ -592,15 +624,15 @@ impl SimpleQuantileEstimatorRepresentation for FixedSizeQuantile } #[derive(Clone, Debug)] -pub struct VectorQuantileEstimator { +pub struct VectorQuantileRepresentation { values: Vec, ranks: Vec, targets: Vec, observations: usize, } -impl VectorQuantileEstimator { - pub fn new(targets: &[f64]) -> Self { +impl VectorQuantileRepresentation { + fn new(targets: &[f64]) -> Self { assert!( targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) ); @@ -613,7 +645,7 @@ impl VectorQuantileEstimator { } } -impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator { +impl QuantileEstimatorRepresentation for VectorQuantileRepresentation { fn new_like(other: &Self) -> Self { Self::new(&other.targets) } @@ -651,15 +683,15 @@ pub mod custom_quantile_estimator { } macro_rules! implement_fixed_quantile_estimator { - ($name:ident[$($val:expr),+]) => { + ($name:ident, $repr_name:ident, [$($val:expr), +]) => { #[derive(Clone, Debug)] - pub struct $name { + pub struct $repr_name { values: [f64; Self::COUNT], ranks: [usize; Self::COUNT], observations: usize, } - impl $name { + impl $repr_name { const TARGETS: [f64; count_exprs!($($val),+) + 2] = [0.0, $($val),+, 1.0]; const COUNT: usize = Self::TARGETS.len(); @@ -672,13 +704,13 @@ pub mod custom_quantile_estimator { } } - impl Default for $name { + impl Default for $repr_name { fn default() -> Self { Self::new() } } - impl SimpleQuantileEstimatorRepresentation for $name { + impl QuantileEstimatorRepresentation for $repr_name { fn new_like(_other: &Self) -> Self { Self::default() } @@ -699,21 +731,27 @@ pub mod custom_quantile_estimator { } } } + + pub type $name = QuantileEstimator<$repr_name>; }; } - implement_fixed_quantile_estimator!(LomaxQuant[0.18, 0.36, 0.4752, 0.5904, 0.7952]); + implement_fixed_quantile_estimator!( + LomaxQuant, + LomaxQuantRepr, + [0.18, 0.36, 0.4752, 0.5904, 0.7952] + ); impl LomaxQuant { pub const PROB1: f64 = 0.36; pub const PROB2: f64 = 0.59; } - implement_fixed_quantile_estimator!(MedianEstimator[0.25, 0.5, 0.75]); + implement_fixed_quantile_estimator!(MedianEstimator, MedianEstimatorRepr, [0.25, 0.5, 0.75]); } #[cfg(test)] mod test { use crate::{ - p2estimator::{FixedSizeQuantileEstimator, QuantileEstimator, VectorQuantileEstimator}, + p2estimator::QuantileEstimator, statistics::{linspace, Distribution, Exponential}, }; use itertools::Itertools; @@ -733,14 +771,13 @@ mod test { fn quantiles_on_exponential_dist() { let expon = Exponential::new(1.0); let mut estimator = - FixedSizeQuantileEstimator::new(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]); + QuantileEstimator::new_from_array(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]); let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321); for _ in 0..10_000 { let sample = expon.ppf(rng.random()); - - estimator.update(sample); + estimator += sample; } assert!(estimator.samples() == 10_000); @@ -771,19 +808,19 @@ mod test { fn test_quantile_merging() { let expon = Exponential::new(1.0); let mut merged_estimator = - VectorQuantileEstimator::new(&linspace(0.0, 1.0, 10).collect_vec()); + QuantileEstimator::new_from_slice(&linspace(0.0, 1.0, 10).collect_vec()); let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321); for _ in 0..100 { let targets: Vec = linspace(0.0, 1.0, rng.random_range(5..15)).collect(); - let mut estimator = VectorQuantileEstimator::new(&targets); + let mut estimator = QuantileEstimator::new_from_slice(&targets); for _ in 0..100 { - estimator.update(expon.ppf(rng.random())); + estimator += expon.ppf(rng.random()); } - merged_estimator = merged_estimator.combine(&estimator); + merged_estimator = merged_estimator + estimator; } assert!(merged_estimator.samples() == 10_000); diff --git a/src/statistics.rs b/src/statistics.rs index 448c118..fa0373a 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -469,11 +469,9 @@ impl AssymetricLaplace { } pub fn from_exponential_halves(mode: f64, negative_mean: f64, positive_mean: f64) -> Self { - Self::new( - mode, - (negative_mean * positive_mean) / (negative_mean + positive_mean), - 1.0 / (positive_mean / negative_mean + 1.0), - ) + let nm = negative_mean.max(1e-8); + let pm = positive_mean.max(1e-8); + Self::new(mode, (nm * pm) / (nm + pm), 1.0 / (pm / nm + 1.0)) } pub fn symmetric_from_moments(mean: f64, standard_deviation: f64) -> Self { diff --git a/src/viz/stats.rs b/src/viz/stats.rs index 4b74fb1..4a3a202 100644 --- a/src/viz/stats.rs +++ b/src/viz/stats.rs @@ -20,29 +20,17 @@ struct InversionInfo { pub normal_joins: usize, } -fn write_table( +fn write_tsv( writer: &mut impl Write, header: &[A; N], data: &[[B; N]], ) -> std::io::Result<()> { - writeln!(writer, "\n")?; - - for heading in header.iter() { - writeln!(writer, "", heading)?; - } - - writeln!(writer, "\n")?; + writeln!(writer, "{}", header.iter().join("\t"))?; for row in data.iter() { - write!(writer, "")?; - for entry in row.iter() { - write!(writer, "", entry)?; - } - writeln!(writer, "")?; + writeln!(writer, "{}", row.iter().join("\t"))?; } - writeln!(writer, "\n
{}
{}
")?; - Ok(()) } @@ -54,13 +42,13 @@ fn write_statistics_table_page std::io::Result<()> { let mut tmp_writer = Vec::::new(); - write_table(&mut tmp_writer, header, data)?; + write_tsv(&mut tmp_writer, header, data)?; let table_page = TABLE_HTML .replace("PAGE_TITLE", title) .replace("SODA_TARGET", SODA_JS) .replace( - "TABLE_TARGET", + "TSV_TARGET", str::from_utf8(&tmp_writer).expect("UTF8 decoding failed!"), ); @@ -101,11 +89,11 @@ pub fn write_family_statistics( stats_writer, "Family Statistics", &[ - "Family", - "Occurrences", - "Coverage", - "Kimura80 Boxplot", - "Kimura80 KDE", + "Family_string", + "Occurrences_int", + "Coverage_int", + "Kimura80_Boxplot_boxplot", + "Kimura80_KDE_violin", ], &family_stats .iter() @@ -115,14 +103,8 @@ pub fn write_family_statistics( k.to_string(), v.occurrences.to_string(), v.coverage.to_string(), - format!( - "
", - v.kimura80_values.0.iter().join(",") - ), - format!( - "
", - v.kimura80_values.0.iter().join(",") - ), + v.kimura80_values.0.iter().join(":"), + v.kimura80_values.0.iter().join(":"), ] }) .collect_vec(), @@ -165,13 +147,13 @@ pub fn write_inversion_statistics( write_statistics_table_page( stats_writer, "Inversion Statistics", - &["Region", "Inversions", "Normal Joins"], + &["Region_region", "Inversions_int", "Normal_Joins_int"], &inversion_stats .iter() .sorted_by(|v1, v2| v2.1.cmp(v1.1)) .map(|(k, v)| { [ - format!("{}", k, k), + k.to_string(), v.inversions.to_string(), v.normal_joins.to_string(), ] From 5e409749d118b37565d1db955334d74b8a0e913a Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 4 Jun 2026 02:00:45 -0600 Subject: [PATCH 36/39] Clean up implementation... --- src/assembly.rs | 1 - src/join_estimation.rs | 52 ++++++++----- src/main.rs | 4 + src/p2estimator.rs | 18 +++-- src/statistics.rs | 165 ++--------------------------------------- src/viz/stats.rs | 2 - 6 files changed, 56 insertions(+), 186 deletions(-) diff --git a/src/assembly.rs b/src/assembly.rs index 9485691..35eb626 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -357,7 +357,6 @@ fn link_assemblies( score_params: &ScoreParams, args: &AnnotationArgs, ) { - println!("{:#?}", query_statistics); // this relies on the alignments being sorted by target start let compatable_blocks = compatable_blocks.sorted().collect_vec(); diff --git a/src/join_estimation.rs b/src/join_estimation.rs index 89f3763..2eba480 100644 --- a/src/join_estimation.rs +++ b/src/join_estimation.rs @@ -143,11 +143,11 @@ impl Default for MomentEstimator { } } -impl ops::Add for MomentEstimator { +impl ops::Add<&MomentEstimator> for &MomentEstimator { type Output = MomentEstimator; - fn add(self, rhs: MomentEstimator) -> Self::Output { - Self { + fn add(self, rhs: &MomentEstimator) -> Self::Output { + MomentEstimator { sum_square: self.sum_square + rhs.sum_square, sum: self.sum + rhs.sum, samples: self.samples + rhs.samples, @@ -171,14 +171,14 @@ impl ops::AddAssign for MomentEstimator { } } -impl From for ExponentialEstimator { - fn from(value: MomentEstimator) -> Self { +impl From<&MomentEstimator> for ExponentialEstimator { + fn from(value: &MomentEstimator) -> Self { Self::new(value.mean(), value.samples().max(1)) } } -impl From for HalfT { - fn from(value: MomentEstimator) -> Self { +impl From<&MomentEstimator> for HalfT { + fn from(value: &MomentEstimator) -> Self { Self::from_sample_mean(value.mean(), value.samples().max(1)) } } @@ -205,13 +205,21 @@ impl From<&MedianEstimator> for ExponentialEstimator { } } +impl From<&MedianEstimator> for HalfT { + fn from(value: &MedianEstimator) -> Self { + Self::new( + value.ppf(0.5) / Self::new(1.0, value.samples()).ppf(0.5), + value.samples(), + ) + } +} + impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { fn from(statistics: &BayesianJoinStatistics) -> Self { - println!("{:#?}", statistics); Self { - target_distance_join: statistics.joinable_target_distance.into(), - target_distance_nojoin: statistics.unjoinable_target_distance.into(), - divergence_join: statistics.joinable_divergence.into(), + target_distance_join: (&statistics.joinable_target_distance).into(), + target_distance_nojoin: (&statistics.unjoinable_target_distance).into(), + divergence_join: (&statistics.joinable_divergence).into(), divergence_nojoin: HalfT::from_sample_mean( statistics .unjoinable_divergence @@ -281,6 +289,10 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { } fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) { + if !link_info.neighbors { + return; + } + let target_dist = link_info.unexplained_bases; let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); let (rel_con_dist, join_type) = relative_consensus_distance( @@ -310,15 +322,15 @@ impl JoinStatisticsCollector for BayesianJoinStatistics { fn combine(&self, other: &Self) -> Self { Self { - joinable_target_distance: self.joinable_target_distance - + other.joinable_target_distance, - unjoinable_target_distance: self.unjoinable_target_distance - + other.unjoinable_target_distance, - joinable_divergence: self.joinable_divergence + other.joinable_divergence, - unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence, - joinable_consensus_pos: self.joinable_consensus_pos + other.joinable_consensus_pos, - joinable_consensus_neg: self.joinable_consensus_neg + other.joinable_consensus_neg, - unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus, + joinable_target_distance: &self.joinable_target_distance + + &other.joinable_target_distance, + unjoinable_target_distance: &self.unjoinable_target_distance + + &other.unjoinable_target_distance, + joinable_divergence: &self.joinable_divergence + &other.joinable_divergence, + unjoinable_divergence: &self.unjoinable_divergence + &other.unjoinable_divergence, + joinable_consensus_pos: &self.joinable_consensus_pos + &other.joinable_consensus_pos, + joinable_consensus_neg: &self.joinable_consensus_neg + &other.joinable_consensus_neg, + unjoinable_consensus: &self.unjoinable_consensus + &other.unjoinable_consensus, } } } diff --git a/src/main.rs b/src/main.rs index 5b01b6d..0243187 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,11 @@ mod confidence; mod history_tracing; mod join_estimation; mod matrix; + +// Keeping around for enhanced parameter estimation work... +#[allow(dead_code)] mod p2estimator; + mod pipeline; mod score_params; mod segment_groups; diff --git a/src/p2estimator.rs b/src/p2estimator.rs index 9da5da7..ede85fe 100644 --- a/src/p2estimator.rs +++ b/src/p2estimator.rs @@ -346,7 +346,7 @@ fn _interpolated_value_prediction< } } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct QuantileEstimator(T); impl QuantileEstimator { @@ -433,13 +433,13 @@ impl AddAssign<&[f64]> for QuantileEstimator } } -impl Add> for QuantileEstimator { +impl Add<&QuantileEstimator> for &QuantileEstimator { type Output = QuantileEstimator; - fn add(self, rhs: QuantileEstimator) -> Self::Output { + fn add(self, rhs: &QuantileEstimator) -> Self::Output { match (self.0._is_initialized(), rhs.0._is_initialized()) { (true, true) => { - let mut new_quant_est = Self(T::new_like(&self.0)); + let mut new_quant_est = QuantileEstimator::(T::new_like(&self.0)); _merge_estimators(self.0._data(), rhs.0._data(), new_quant_est.0._mut_data()); @@ -752,11 +752,17 @@ pub mod custom_quantile_estimator { mod test { use crate::{ p2estimator::QuantileEstimator, - statistics::{linspace, Distribution, Exponential}, + statistics::{Distribution, Exponential}, }; use itertools::Itertools; use rand::{rngs::Xoshiro256PlusPlus, RngExt, SeedableRng}; + pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { + (0..steps) + .map(move |n| n as f64 / (steps as f64 - 1.0)) + .map(move |n| start * (1.0 - n) + stop * n) + } + fn is_close(a: f64, b: f64) -> bool { let rel_tol = 1e-9; let abs_tol = 0.0; @@ -820,7 +826,7 @@ mod test { estimator += expon.ppf(rng.random()); } - merged_estimator = merged_estimator + estimator; + merged_estimator = &merged_estimator + &estimator; } assert!(merged_estimator.samples() == 10_000); diff --git a/src/statistics.rs b/src/statistics.rs index fa0373a..f5014b8 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -22,6 +22,7 @@ pub trait Distribution { fn logccdf(&self, x: f64) -> f64; } +#[allow(dead_code)] pub trait ParameterizedDistribution: Distribution + Debug + Default + Clone { fn unit() -> Self { Self::default() @@ -242,154 +243,6 @@ impl Distribution for HalfT { } } -#[derive(Debug, Clone)] -pub struct Frechet { - alpha: f64, - scale: f64, - minimum: f64, -} - -impl ParameterizedDistribution for Frechet {} - -impl Frechet { - pub fn new(alpha: f64, scale: f64, minimum: f64) -> Self { - Self { - alpha, - scale, - minimum, - } - } -} - -impl Default for Frechet { - fn default() -> Self { - Self { - alpha: 1.0, - scale: 1.0, - minimum: 0.0, - } - } -} - -impl Distribution for Frechet { - fn logpdf(&self, x: f64) -> f64 { - let a = self.alpha; - let s = self.scale; - let m = self.minimum; - if x > m { - (a / s).ln() + -(a + 1.0) * ((x - m) / s).ln() + -((x - m) / s).powf(-a) - } else { - f64::NEG_INFINITY - } - } - - fn pdf(&self, x: f64) -> f64 { - self.logpdf(x).exp() - } - - fn cdf(&self, x: f64) -> f64 { - self.logcdf(x).exp() - } - - fn logcdf(&self, x: f64) -> f64 { - let a = self.alpha; - let s = self.scale; - let m = self.minimum; - if x > m { - -((x - m) / s).powf(-a) - } else { - f64::NEG_INFINITY - } - } - - fn ppf(&self, p: f64) -> f64 { - let a = self.alpha; - let s = self.scale; - let m = self.minimum; - if p >= 1.0 { - f64::INFINITY - } else if p <= 0.0 { - m - } else { - m + s * (-p.min(1.0).ln()).powf(1.0 / -a) - } - } - - fn ccdf(&self, x: f64) -> f64 { - 1.0 - self.cdf(x) - } - - fn logccdf(&self, x: f64) -> f64 { - self.ccdf(x).ln() - } - - fn support(&self) -> (f64, f64) { - (self.minimum, f64::INFINITY) - } -} - -#[derive(Debug, Clone)] -pub struct Gumbel { - location: f64, - scale: f64, -} - -impl Gumbel { - pub fn new(location: f64, scale: f64) -> Self { - Self { location, scale } - } -} - -impl Default for Gumbel { - fn default() -> Self { - Self::new(0.0, 1.0) - } -} - -impl ParameterizedDistribution for Gumbel {} - -impl Distribution for Gumbel { - fn logpdf(&self, x: f64) -> f64 { - let mu = self.location; - let beta = self.scale; - let z = (x - mu) / beta; - (1.0 / beta).ln() - (z + (-z).exp()) - } - - fn pdf(&self, x: f64) -> f64 { - self.logpdf(x).exp() - } - - fn cdf(&self, x: f64) -> f64 { - self.logcdf(x).exp() - } - - fn logcdf(&self, x: f64) -> f64 { - let mu = self.location; - let beta = self.scale; - let z = (x - mu) / beta; - -((-z).exp()) - } - - fn ppf(&self, p: f64) -> f64 { - let mu = self.location; - let beta = self.scale; - mu - beta * (-p.ln()).ln() - } - - fn ccdf(&self, x: f64) -> f64 { - 1.0 - self.cdf(x) - } - - fn logccdf(&self, x: f64) -> f64 { - self.ccdf(x).ln() - } - - fn support(&self) -> (f64, f64) { - (f64::NEG_INFINITY, f64::INFINITY) - } -} - #[derive(Debug, Clone)] pub struct Lomax { alpha: f64, @@ -542,17 +395,17 @@ impl Distribution for AssymetricLaplace { } } -pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { - (0..steps) - .map(move |n| n as f64 / (steps as f64 - 1.0)) - .map(move |n| start * (1.0 - n) + stop * n) -} - #[cfg(test)] mod test { use super::*; use std::fmt::Debug; + pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { + (0..steps) + .map(move |n| n as f64 / (steps as f64 - 1.0)) + .map(move |n| start * (1.0 - n) + stop * n) + } + // Add debug trait to allow for printout... pub trait TestDistribution: Distribution + Debug {} impl TestDistribution for T {} @@ -563,14 +416,12 @@ mod test { use super::{Exponential, ParameterizedDistribution}; - fn get_dists() -> [Box; 7] { + fn get_dists() -> [Box; 5] { [ as_box(Exponential::unit()), as_box(ExponentialEstimator::unit()), as_box(HalfT::unit()), - as_box(Frechet::unit()), as_box(AssymetricLaplace::unit()), - as_box(Gumbel::unit()), as_box(Lomax::unit()), ] } diff --git a/src/viz/stats.rs b/src/viz/stats.rs index 4a3a202..a9e2877 100644 --- a/src/viz/stats.rs +++ b/src/viz/stats.rs @@ -92,7 +92,6 @@ pub fn write_family_statistics( "Family_string", "Occurrences_int", "Coverage_int", - "Kimura80_Boxplot_boxplot", "Kimura80_KDE_violin", ], &family_stats @@ -104,7 +103,6 @@ pub fn write_family_statistics( v.occurrences.to_string(), v.coverage.to_string(), v.kimura80_values.0.iter().join(":"), - v.kimura80_values.0.iter().join(":"), ] }) .collect_vec(), From 06923ff8e96e920c5e4a719cf965d110dee8581f Mon Sep 17 00:00:00 2001 From: isaacr Date: Thu, 4 Jun 2026 11:09:55 -0600 Subject: [PATCH 37/39] Enhance statistics displayed on family page. --- fixtures/soda/table.html | 17 ++++++++-- src/main.rs | 6 +++- src/segments.rs | 24 ++++++++++++++ src/viz/stats.rs | 69 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 110 insertions(+), 6 deletions(-) diff --git a/fixtures/soda/table.html b/fixtures/soda/table.html index db87869..26ca293 100644 --- a/fixtures/soda/table.html +++ b/fixtures/soda/table.html @@ -36,6 +36,10 @@ figure { margin: 0; } + + th input[type="text"] { + width: 6em; + }