From 6b2ee0b45ba85a223a4bcbfcf14e31f9ec64175d Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 8 Apr 2026 15:32:15 -0600
Subject: [PATCH 01/39] Fix warnings, bug in visual...

---
 Cargo.toml                 | 1 -
 src/balanced_tree.rs       | 4 ++--
 src/segments.rs            | 2 +-
 src/substitution_matrix.rs | 1 +
 src/util.rs                | 2 +-
 src/viz/mod.rs             | 4 ++--
 src/windowed_scores.rs     | 1 +
 7 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index fbf7952..24d0d77 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,4 +28,3 @@ opt-level = 3
 lto = "thin"
 codegen-units = 1
 debug = false
-
diff --git a/src/balanced_tree.rs b/src/balanced_tree.rs
index 603b68d..80629cf 100644
--- a/src/balanced_tree.rs
+++ b/src/balanced_tree.rs
@@ -181,7 +181,7 @@ impl<T: Into<usize> + TryFrom<usize> + Copy + Debug> AVLIndexSet<T> {
     }
 
     /// Iterate over the elements in the tree, in sorted order. Returns the element index and the depth of the element in the tree.
-    pub fn iter(&self) -> AVLInOrderSetIterator<T> {
+    pub fn iter(&self) -> AVLInOrderSetIterator<'_, T> {
         let mut stack = Vec::with_capacity(self.depth() + 1);
         if let Some(root) = self.root {
             stack.push((root, 0_u8));
@@ -190,7 +190,7 @@ impl<T: Into<usize> + TryFrom<usize> + Copy + Debug> AVLIndexSet<T> {
         AVLInOrderSetIterator { tree: self, stack }
     }
 
-    pub fn bfs(&self) -> AVLBFSSetIterator<T> {
+    pub fn bfs(&self) -> AVLBFSSetIterator<'_, T> {
         let mut queue = VecDeque::new();
 
         if let Some(root) = self.root {
diff --git a/src/segments.rs b/src/segments.rs
index f0914f1..6d7ddd2 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -213,7 +213,7 @@ pub struct SegmentView<'a> {
 
 #[allow(dead_code)]
 impl InitialSegments {
-    pub fn iter_segments(&self) -> impl Iterator<Item = SegmentView> {
+    pub fn iter_segments(&self) -> impl Iterator<Item = SegmentView<'_>> {
         self.segments.iter().map(|v| SegmentView {
             start_col: v.start_col,
             end_col: v.end_col,
diff --git a/src/substitution_matrix.rs b/src/substitution_matrix.rs
index 6364857..40127f5 100644
--- a/src/substitution_matrix.rs
+++ b/src/substitution_matrix.rs
@@ -9,6 +9,7 @@ use crate::alphabet::{
 };
 
 pub trait AlignmentScore {
+    #[allow(dead_code)]
     fn score(&self, target_char: u8, query_char: u8) -> f64;
     fn score_with_background(&self, target_char: u8, query_char: u8, frequencies: &[f64; 4])
         -> f64;
diff --git a/src/util.rs b/src/util.rs
index 62cabe0..1c7fe5c 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -15,7 +15,7 @@ impl<T: std::cmp::PartialEq> VecMap<T> {
         Self { values }
     }
 
-    pub fn values(&self) -> std::slice::Iter<T> {
+    pub fn values(&self) -> std::slice::Iter<'_, T> {
         self.values.iter()
     }
 
diff --git a/src/viz/mod.rs b/src/viz/mod.rs
index ad03f7f..410b86c 100644
--- a/src/viz/mod.rs
+++ b/src/viz/mod.rs
@@ -58,7 +58,7 @@ pub fn write_index_file(
 
     proximity_groups.iter().enumerate().for_each(|(idx, g)| {
         index_links.push_str(&format!(
-            "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{idx}/index.html\"><h3>region {idx} | {name} {start}:{end}</h3></a>\n",
+            "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{idx}/index.html\"><h3>region {idx} | {name} {start}:{end}</h3></a></div><br>\n",
             name = alignment_data.target_name_map.get(g.target_id),
             start = g.target_start,
             end = g.target_end,
@@ -746,7 +746,7 @@ impl<'a> AdjudicationSodaData<'a> {
         let columns = self
             .active_columns
             .iter()
-            .flat_map(|&(start, end)| (start..=end))
+            .flat_map(|&(start, end)| start..=end)
             .collect_vec();
         vec![columns]
     }
diff --git a/src/windowed_scores.rs b/src/windowed_scores.rs
index 6044e30..39456b4 100644
--- a/src/windowed_scores.rs
+++ b/src/windowed_scores.rs
@@ -45,6 +45,7 @@ impl BackgroundFrequencies for Background<'_> {
     }
 }
 
+#[allow(dead_code)]
 pub struct DummyBackground {}
 
 impl BackgroundFrequencies for DummyBackground {

From 67a11004848a47ce1d423906756ee2143ecfd620 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 8 Apr 2026 17:27:33 -0600
Subject: [PATCH 02/39] Fix index page html.

---
 src/viz/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/viz/mod.rs b/src/viz/mod.rs
index 410b86c..cf535cf 100644
--- a/src/viz/mod.rs
+++ b/src/viz/mod.rs
@@ -48,7 +48,7 @@ pub fn write_index_file(
         .enumerate()
         .for_each(|(idx, c)| {
             index_links.push_str(&format!(
-                "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{name}-{start}-{end}.html\">slice {idx} | {name} {start}:{end}</a></div><br>\n",
+                "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{name}-{start}-{end}.html\">slice {idx} | {name} {start}:{end}</a><br></div>\n",
                 name = c.target_name,
                 start = c.target_start,
                 end = c.target_end,
@@ -58,7 +58,7 @@ pub fn write_index_file(
 
     proximity_groups.iter().enumerate().for_each(|(idx, g)| {
         index_links.push_str(&format!(
-            "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{idx}/index.html\"><h3>region {idx} | {name} {start}:{end}</h3></a></div><br>\n",
+            "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{idx}/index.html\"><h3>region {idx} | {name} {start}:{end}</h3></a></div>\n",
             name = alignment_data.target_name_map.get(g.target_id),
             start = g.target_start,
             end = g.target_end,

From c2bf76405d08d62fdd30862c05a425290b5dbd67 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 8 Apr 2026 17:51:00 -0600
Subject: [PATCH 03/39] Further index page fixes.

---
 src/viz/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/viz/mod.rs b/src/viz/mod.rs
index cf535cf..06e12c3 100644
--- a/src/viz/mod.rs
+++ b/src/viz/mod.rs
@@ -48,7 +48,7 @@ pub fn write_index_file(
         .enumerate()
         .for_each(|(idx, c)| {
             index_links.push_str(&format!(
-                "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{name}-{start}-{end}.html\">slice {idx} | {name} {start}:{end}</a><br></div>\n",
+                "<div class=\"region\" data-target=\"{name}\" data-start=\"{start}\" data-end=\"{end}\"><a href=\"{name}-{start}-{end}.html\"><h3>slice {idx} | {name} {start}:{end}</h3></a></div>\n",
                 name = c.target_name,
                 start = c.target_start,
                 end = c.target_end,

From 32e8d23b1e2f3af84aea7417f939785298cc390a Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 9 Apr 2026 16:01:14 -0600
Subject: [PATCH 04/39] Gather basic global trace statistics.

---
 src/main.rs             | 31 ++++++++-----
 src/pipeline.rs         |  2 +
 src/segments.rs         |  4 ++
 src/trace_statistics.rs | 99 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 125 insertions(+), 11 deletions(-)
 create mode 100644 src/trace_statistics.rs

diff --git a/src/main.rs b/src/main.rs
index 9d6b56b..02c71f9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -18,6 +18,7 @@ mod support;
 #[allow(dead_code)]
 mod union_find;
 
+mod trace_statistics;
 mod util;
 mod viterbi;
 mod viz;
@@ -43,6 +44,7 @@ use crate::{
     annotation::AmbiguousAnnotation,
     chunks::validate_groups,
     pipeline::{run_history_trace, run_naive_trace, NaiveTraceResults},
+    trace_statistics::{trace_statistics, OccuranceCountingMode},
     viz::{
         stats::{write_family_statistics, write_inversion_statistics},
         write_index_file, ICON_SVG,
@@ -448,22 +450,29 @@ fn main() -> Result<()> {
         .par_iter()
         .panic_fuse()
         .enumerate()
-        .map(|(region_idx, group)| {
-            (
-                region_idx,
-                run_naive_trace(group, &alignment_data, region_idx, &args),
-            )
-        })
-        .collect::<Vec<(usize, NaiveTraceResults)>>();
-    naive_results.sort_by_key(|v| v.0);
+        .map(|(region_idx, group)| run_naive_trace(group, &alignment_data, region_idx, &args))
+        .collect::<Vec<NaiveTraceResults>>();
+    naive_results.sort_by_key(|v| v.region_index);
+
+    let trace_stats = trace_statistics(
+        &naive_results,
+        &alignment_data,
+        OccuranceCountingMode::Segments,
+    );
 
     let mut results: Vec<(usize, Vec<AmbiguousAnnotation>)> = proximity_groups
         .par_iter()
         .zip(naive_results)
-        .map(|(group, (region_idx, mut naive_trace))| {
+        .map(|(group, mut naive_trace)| {
             (
-                region_idx,
-                run_history_trace(group, &alignment_data, &mut naive_trace, &args),
+                naive_trace.region_index,
+                run_history_trace(
+                    group,
+                    &alignment_data,
+                    &trace_stats,
+                    &mut naive_trace,
+                    &args,
+                ),
             )
         })
         .collect();
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 64e8d11..7f7c0fd 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -14,6 +14,7 @@ use crate::{
     score_params::{approximate_ideal_skip_state_score, ScoreParams},
     segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments},
     support::windowed_confidence,
+    trace_statistics::TraceStatistics,
     viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment},
     viz::{
         debug::{dump_debug_history_info, dump_final_trace_statistics},
@@ -260,6 +261,7 @@ pub fn run_naive_trace(
 pub fn run_history_trace(
     proximity_group: &ProximityGroup,
     alignment_data: &AlignmentData,
+    trace_statistics: &TraceStatistics,
     naive_trace: &mut NaiveTraceResults,
     args: &AuroraArgs,
 ) -> Vec<AmbiguousAnnotation> {
diff --git a/src/segments.rs b/src/segments.rs
index 6d7ddd2..a2d8e20 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -220,6 +220,10 @@ impl InitialSegments {
             blocks: &v.blocks,
         })
     }
+
+    pub fn len(&self) -> usize {
+        self.segments.len()
+    }
 }
 
 impl<I: Iterator, J: Iterator<Item = I::Item>, F: Fn(&I::Item, &I::Item) -> Ordering> Iterator
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
new file mode 100644
index 0000000..7a73daf
--- /dev/null
+++ b/src/trace_statistics.rs
@@ -0,0 +1,99 @@
+use crate::{alignment::AlignmentData, pipeline::NaiveTraceResults, segments::SegmentView};
+
+pub struct RegionStatistics {
+    pub total_bases: usize,
+    pub unexplained_bases: Vec<usize>,
+}
+
+#[derive(Debug, Clone)]
+pub struct QueryStatistics {
+    pub occurances: usize,
+    pub coverage: usize,
+}
+
+pub struct TraceStatistics {
+    pub total_bases: usize,
+    pub query_statistics: Vec<QueryStatistics>,
+    pub region_statistics: Vec<RegionStatistics>,
+}
+
+pub enum OccuranceCountingMode {
+    Segments,
+    Trace,
+}
+
+pub fn trace_statistics(
+    naive_traces: &[NaiveTraceResults],
+    alignment_data: &AlignmentData,
+    count_mode: OccuranceCountingMode,
+) -> TraceStatistics {
+    // Asumption... All regions are sorted, no gaps. At least 1 region expected...
+    debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0));
+    debug_assert!(naive_traces
+        .iter()
+        .zip(naive_traces.iter().skip(1))
+        .all(|(v1, v2)| v1.region_index + 1 == v2.region_index));
+
+    let mut query_stats = vec![
+        QueryStatistics {
+            occurances: 0,
+            coverage: 0
+        };
+        alignment_data.query_name_map.size()
+    ];
+
+    let mut all_region_stats: Vec<RegionStatistics> = Vec::with_capacity(naive_traces.len());
+
+    for trace_results in naive_traces.iter() {
+        match count_mode {
+            OccuranceCountingMode::Segments => {
+                for seg in trace_results.segments.iter_segments() {
+                    for blk in seg.blocks.iter() {
+                        if let Some(query_id) = blk.query_id {
+                            query_stats[query_id].occurances += 1;
+                            query_stats[query_id].coverage += blk.col_end - blk.col_start + 1;
+                        }
+                    }
+                }
+            }
+            OccuranceCountingMode::Trace => {
+                for trace_blk in trace_results.trace_segments.iter() {
+                    query_stats[trace_blk.query_id].occurances += 1;
+                    query_stats[trace_blk.query_id].coverage +=
+                        trace_blk.col_end - trace_blk.col_start + 1;
+                }
+            }
+        }
+
+        let mut region_stat = RegionStatistics {
+            total_bases: 0,
+            unexplained_bases: Vec::with_capacity(trace_results.segments.len()),
+        };
+
+        let mut unexplained_bases_up_to: usize = 0;
+        let mut prior_segment: Option<SegmentView> = None;
+
+        for seg in trace_results.segments.iter_segments() {
+            if let Some(prior_segment) = prior_segment {
+                // If a skip block was the prior block, add its bases as unexplained.
+                if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 {
+                    unexplained_bases_up_to += prior_segment.end_col - prior_segment.start_col + 1;
+                }
+                unexplained_bases_up_to += seg.start_col.saturating_sub(prior_segment.end_col + 1);
+                region_stat.total_bases += seg.start_col.saturating_sub(prior_segment.end_col + 1);
+            }
+            region_stat.total_bases += seg.end_col - seg.start_col + 1;
+            region_stat.unexplained_bases.push(unexplained_bases_up_to);
+
+            prior_segment = Some(seg);
+        }
+
+        all_region_stats.push(region_stat);
+    }
+
+    TraceStatistics {
+        total_bases: all_region_stats.iter().map(|v| v.total_bases).sum(),
+        query_statistics: query_stats,
+        region_statistics: all_region_stats,
+    }
+}

From e62c2801293654e91099c6b726f0e1b71127eff6 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 16 Apr 2026 17:56:31 -0600
Subject: [PATCH 05/39] New statistics module, incorporating into linking...

---
 scripts/plot_distance_distribution.py | 233 ++++++++++++++++++++++++++
 src/assembly.rs                       |  31 ++--
 src/main.rs                           |   1 +
 src/pipeline.rs                       |   5 +-
 src/statistics.rs                     |  75 +++++++++
 src/trace_statistics.rs               |  46 ++++-
 6 files changed, 372 insertions(+), 19 deletions(-)
 create mode 100755 scripts/plot_distance_distribution.py
 create mode 100644 src/statistics.rs

diff --git a/scripts/plot_distance_distribution.py b/scripts/plot_distance_distribution.py
new file mode 100755
index 0000000..3e64a73
--- /dev/null
+++ b/scripts/plot_distance_distribution.py
@@ -0,0 +1,233 @@
+# Plots distributions of distances between sequences of the same family in the genome.
+# Takes a single BED file as an argument.
+# Results vary between exponential and power-law depending on the family. Basically all look exponential, with fatter tails for a good chunk of families.
+#
+# Most likely, based on these results and an understanding of the genome (and the shape!), the actual distributions are hyper-exponential distributions.
+# Basically, it means each family is sampled from one of a weighted set of exponential distributions with different rates.
+# This would make sense, as TEs have different rates depending on what part of the genome you're in (basically, there are functional domains or regions).
+#
+# Testing this, though, would require an implementation of Prony's method to determine the fit. I wasn't able to find any existing implementations,
+# and it seems implementing such a method would take quite some time. It may be worth adding eventually if better fits are needed.
+#
+# For now, an exponential distribution seems to provide a good enough approximation for use in aurora. It also is easy to fit well.
+# Here's an interesting paper on the topic that seems to have landed in the same space I've been in: https://www.columbia.edu/~ww2040/FittingMixturesPerfEval98.pdf
+
+import sys
+from inspect import signature
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.stats import (
+    betaprime,
+    burr12,
+    ecdf,
+    expon,
+    fisk,
+    genextreme,
+    genpareto,
+    invweibull,
+    linregress,
+    lognorm,
+    lomax,
+    weibull_min,
+)
+
+bed_file = sys.argv[1]
+
+seq_info = {}
+cs = np.inf
+ce = -np.inf
+
+with open(bed_file, "r") as f:
+    for line in f:
+        tokens = line.strip().split()
+        start = int(tokens[1])
+        end = int(tokens[2])
+        join_id = int(tokens[12])
+        name = str(tokens[3])
+
+        if name.endswith("Simple_repeat"):
+            continue
+
+        name = name.upper()
+
+        if name not in seq_info:
+            seq_info[name] = []
+        seq_info[name].append((start, end, join_id))
+
+        cs = min(cs, start)
+        ce = max(ce, end)
+
+
+class Distribution:
+    def __init__(self, dist, defaults, exclude_location=True):
+        self._dist = dist
+        self.DEFAULTS = list(defaults)
+        self._excl_loc = exclude_location
+        self.NAME = getattr(
+            dist, "__name__", getattr(type(dist), "__qualname__", repr(dist))
+        )
+
+    def pdf(self, x, *args):
+        # print(*args)
+        if self._excl_loc:
+            return self._dist.pdf(x, *args[:-1], 0.0, args[-1])
+        else:
+            return self._dist.pdf(x, *args)
+
+    def cdf(self, x, *args):
+        if self._excl_loc:
+            return self._dist.cdf(x, *args[:-1], 0.0, args[-1])
+        else:
+            return self._dist.cdf(x, *args)
+
+    def logcdf(self, x, *args):
+        # Same argument convention as pdf/cdf: loc is pinned to 0.0 when excluded.
+        if self._excl_loc:
+            return self._dist.logcdf(x, *args[:-1], 0.0, args[-1])
+        else:
+            return self._dist.logcdf(x, *args)
+
+
+distributions = [
+    Distribution(expon, (1000.0,)),
+    Distribution(weibull_min, (2.0, 1000.0)),
+    # Distribution(invweibull, (1.0, 20000.0)),
+    Distribution(lomax, (1.0, 1000.0)),
+    # Distribution(burr12, (1.0, 1.0, 1000.0)),
+    # Distribution(lognorm, (1.0, 1000.0)),
+    # Distribution(fisk, (1.0, 1000.0)),
+    Distribution(genpareto, (0.0, 1000.0)),
+    Distribution(betaprime, (1.0, 2.0, 1000.0)),
+]
+
+
+for name, info in sorted(seq_info.items(), key=lambda k: -len(k[1])):
+    # if not name.startswith("CHARLIE1#"):
+    #    continue
+
+    info = np.array(info)
+    print(info.shape)
+    info = info[np.argsort(info[:, 0])]
+    starts = info[:, 0]
+    ends = info[:, 1]
+
+    # Get distances between consecutive sequences...
+    _, indexes = np.unique(info[:, 2], return_index=True)
+    indexes = np.sort(indexes)
+
+    dists = np.diff(starts[indexes])
+
+    counts, bins = np.histogram(dists, "auto", density=True)
+    x = (bins[1:] + bins[:-1]) / 2
+
+    # Get fit for exponential dist...
+    beta = np.mean(dists)  # np.median(dists) / np.log(2)
+
+    # Get fit for weibull dist... (and CDF)...
+    cdf = ecdf(dists).cdf
+
+    y_transform = lambda y: np.log(-np.log(1 - y))
+
+    with np.errstate(divide="ignore"):
+        wcdfx = np.log(cdf.quantiles)
+        wcdfy = y_transform(cdf.probabilities)
+        # Estimate the error that the transform adds to the line. This makes linear fit better fit a CDF...
+        wcdfy_p1 = y_transform(
+            cdf.probabilities - np.sign(cdf.probabilities - 0.5) * 0.01
+        )
+        wcdfy_err = np.maximum(np.abs((wcdfy_p1 - wcdfy) / 0.01), 1e-8)
+
+    valid_values = np.isfinite(wcdfx) & np.isfinite(wcdfy)
+    fit_line = curve_fit(
+        lambda x, m, b: m * x + b,
+        wcdfx[valid_values],
+        wcdfy[valid_values],
+        sigma=wcdfy_err[valid_values],
+    )[0]
+
+    wk = fit_line[0]
+    w_beta = np.exp(-fit_line[1] / wk)
+
+    cdf_fits = [
+        curve_fit(
+            distrib.cdf,
+            cdf.quantiles,
+            cdf.probabilities,
+            distrib.DEFAULTS,
+            full_output=True,
+        )[0]
+        for distrib in distributions
+    ]
+    # print(cdf_fits)
+    # raise ValueError
+
+    # Printout results...
+    print(f"Count: {starts.shape[0]}")
+    print(f"Genome Size: {ce - cs}")
+    est_scale = (ce - cs) / starts.shape[0]
+    print(f"Avg Occurance Rate (in nucleotides): {est_scale}")
+    print(f"Exponential Fit: Scale (beta): {beta}, Rate (lambda): {1 / beta}")
+    print(
+        f"Weibull Line Fit: Shape (k): {wk}, Scale (beta): {w_beta}, Rate (lambda): {1 / w_beta}"
+    )
+    with np.printoptions(suppress=True):
+        for distrib, fit in zip(distributions, cdf_fits):
+            print(f"{distrib.NAME} CDF Fit: {fit}")
+
+    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
+    fig.suptitle(name)
+
+    ax1.set_title("Histogram and Predicted PDFs")
+    ax1.set_ylabel("Density")
+    ax1.set_xlabel("Distance (nucleotides)")
+    ax1.stairs(counts, bins, fill=True, color="tan", label="Histogram")
+    ax1.plot(
+        x, weibull_min.pdf(x, wk, 0.0, w_beta), color="orange", label="Weibull Line Fit"
+    )
+    ax1.plot(x, expon.pdf(x, 0.0, est_scale), color="green", label="Naive Fit Exp")
+    for distrib, fit in zip(distributions, cdf_fits):
+        ax1.plot(x, distrib.pdf(x, *fit), label=f"{distrib.NAME} CDF Fit")
+    ax1.legend()
+
+    ax2.set_title("Weibull-Space of EDF")
+    ax2.set_ylabel(r"$ \ln(-\ln(1 - F(x))) $")
+    ax2.set_xlabel(r"$ \ln(x) $")
+    ax2.scatter(wcdfx, wcdfy, label="Empirical Distribution Function")
+    ax2.plot(
+        wcdfx,
+        fit_line[1] + fit_line[0] * wcdfx,
+        label=f"Line: {fit_line[0]:.02f}x + {fit_line[1]:.02f}",
+    )
+    ax2.legend()
+
+    ax3.set_title("Cumulative Distribution Functions")
+    ax3.set_ylabel(r"$ P(X <= x) $")
+    ax3.set_xlabel("Distance (nucleotides)")
+    ax3.scatter(cdf.quantiles, cdf.probabilities, color="tan", label="EDF")
+    ax3.plot(
+        cdf.quantiles,
+        weibull_min.cdf(cdf.quantiles, wk, 0.0, w_beta),
+        color="orange",
+        label="Weibull Line Fit",
+    )
+    ax3.plot(
+        cdf.quantiles,
+        expon.cdf(cdf.quantiles, 0.0, est_scale),
+        color="green",
+        label="Naive Fit Exp",
+    )
+    for distrib, fit in zip(distributions, cdf_fits):
+        ax3.plot(
+            cdf.quantiles,
+            distrib.cdf(cdf.quantiles, *fit),
+            label=f"{distrib.NAME} CDF Fit",
+        )
+    ax3.legend()
+
+    ax4.plot(cdf.quantiles, cdf.probabilities)
+
+    fig.set_size_inches(12, 12)
+    fig.tight_layout()
+    plt.show()
diff --git a/src/assembly.rs b/src/assembly.rs
index 527e08f..ff92f33 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -6,6 +6,8 @@ use crate::{
     alignment::{Alignment, Strand},
     score_params::ScoreParams,
     segments::SegmentedMatrix,
+    statistics::Distribution,
+    trace_statistics::{self, TraceStatistics},
     AnnotationArgs,
 };
 
@@ -164,10 +166,10 @@ fn link_assemblies(
             //    return;
             //}
 
-            let target_distance = b_block.col_start as isize - a_block.col_end as isize;
+            let target_distance = b_block.col_start as isize - a_block.col_end as isize - 1;
 
-            let a_length = a_block.query_end.abs_diff(a_block.query_start);
-            let b_length = b_block.query_end.abs_diff(b_block.query_start);
+            let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1;
+            let b_length = b_block.query_end.abs_diff(b_block.query_start) + 1;
             let min_length = a_length.min(b_length);
 
             let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| {
@@ -266,11 +268,13 @@ pub struct SegmentAssemblyGraph {
 }
 
 impl SegmentAssemblyGraph {
-    pub fn new(
+    pub fn new<T: Distribution>(
         alignments: &[Alignment],
         segments: &SegmentedMatrix,
+        trace_statistics: &TraceStatistics<T>,
         score_params: &ScoreParams,
         annotation_args: &AnnotationArgs,
+        region_idx: usize,
     ) -> Self {
         let mut alignment_block_map = vec![Vec::<SegmentAndDenseRow>::new(); alignments.len()];
 
@@ -291,19 +295,24 @@ impl SegmentAssemblyGraph {
         query_ids
             .iter()
             // grab the alignments for this ID
-            .map(|id| {
-                alignments
-                    .iter()
-                    .enumerate()
-                    .filter(|&(_, a)| a.query_id == *id)
-                    .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied())
+            .map(|&id| {
+                (
+                    id,
+                    alignments
+                        .iter()
+                        .enumerate()
+                        .filter(|&(_, a)| a.query_id == id)
+                        .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()),
+                )
             })
-            .for_each(|compat_blocks| {
+            .for_each(|(id, compat_blocks)| {
                 link_assemblies(
                     &mut link_graph,
                     compat_blocks,
                     alignments,
                     segments,
+                    trace_statistics.query_statistics[id],
+                    trace_statistics.region_statistics[region_idx],
                     score_params,
                     annotation_args,
                 );
diff --git a/src/main.rs b/src/main.rs
index 02c71f9..dd411ef 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -11,6 +11,7 @@ mod pipeline;
 mod score_params;
 mod segment_groups;
 mod segments;
+mod statistics;
 mod substitution_matrix;
 mod support;
 
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 7f7c0fd..358c617 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -13,6 +13,7 @@ use crate::{
     matrix::{Matrix, MatrixDef},
     score_params::{approximate_ideal_skip_state_score, ScoreParams},
     segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments},
+    statistics::Distribution,
     support::windowed_confidence,
     trace_statistics::TraceStatistics,
     viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment},
@@ -258,10 +259,10 @@ pub fn run_naive_trace(
     }
 }
 
-pub fn run_history_trace(
+pub fn run_history_trace<T: Distribution>(
     proximity_group: &ProximityGroup,
     alignment_data: &AlignmentData,
-    trace_statistics: &TraceStatistics,
+    trace_statistics: &TraceStatistics<T>,
     naive_trace: &mut NaiveTraceResults,
     args: &AuroraArgs,
 ) -> Vec<AmbiguousAnnotation> {
diff --git a/src/statistics.rs b/src/statistics.rs
new file mode 100644
index 0000000..fa3b4ef
--- /dev/null
+++ b/src/statistics.rs
@@ -0,0 +1,75 @@
+#[allow(dead_code)]
+pub trait Distribution: Clone {
+    fn unit() -> Self;
+    fn pdf(&self, x: f64) -> f64;
+    fn cdf(&self, x: f64) -> f64;
+    fn ppf(&self, p: f64) -> f64;
+    fn support(&self) -> (f64, f64);
+
+    fn ccdf(&self, x: f64) -> f64 {
+        1.0 - self.cdf(x)
+    }
+
+    fn logpdf(&self, x: f64) -> f64 {
+        self.pdf(x).ln()
+    }
+    fn logcdf(&self, x: f64) -> f64 {
+        self.cdf(x).ln()
+    }
+    fn logccdf(&self, x: f64) -> f64 {
+        self.ccdf(x).ln() // ln of the complementary CDF, i.e. ln(1 - cdf(x))
+    }
+}
+
+#[derive(Clone)]
+pub struct Exponential {
+    lambda: f64,
+}
+
+impl Exponential {
+    pub fn new(lambda: f64) -> Self {
+        Self { lambda }
+    }
+
+    pub fn from_scale(beta: f64) -> Self {
+        Self::new(1.0 / beta)
+    }
+}
+
+impl Distribution for Exponential {
+    fn unit() -> Self {
+        Self::new(1.0)
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.lambda * (-self.lambda * x).exp()
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        1.0 - (-self.lambda * x).exp()
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        -(1.0 - p).ln() / self.lambda
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        (-self.lambda * x).exp()
+    }
+
+    fn logpdf(&self, x: f64) -> f64 {
+        self.lambda.ln() - self.lambda * x
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        (-(-self.lambda * x).exp()).ln_1p()
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        -self.lambda * x
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (0.0, f64::INFINITY)
+    }
+}
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index 7a73daf..39db395 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -1,4 +1,9 @@
-use crate::{alignment::AlignmentData, pipeline::NaiveTraceResults, segments::SegmentView};
+use crate::{
+    alignment::AlignmentData,
+    pipeline::NaiveTraceResults,
+    segments::SegmentView,
+    statistics::{Distribution, Exponential},
+};
 
 pub struct RegionStatistics {
     pub total_bases: usize,
@@ -6,14 +11,16 @@ pub struct RegionStatistics {
 }
 
 #[derive(Debug, Clone)]
-pub struct QueryStatistics {
+pub struct QueryStatistics<T: Distribution> {
     pub occurances: usize,
     pub coverage: usize,
+    pub target_span: usize,
+    pub distribution: T,
 }
 
-pub struct TraceStatistics {
+pub struct TraceStatistics<T: Distribution> {
     pub total_bases: usize,
-    pub query_statistics: Vec<QueryStatistics>,
+    pub query_statistics: Vec<QueryStatistics<T>>,
     pub region_statistics: Vec<RegionStatistics>,
 }
 
@@ -26,7 +33,7 @@ pub fn trace_statistics(
     naive_traces: &[NaiveTraceResults],
     alignment_data: &AlignmentData,
     count_mode: OccuranceCountingMode,
-) -> TraceStatistics {
+) -> TraceStatistics<Exponential> {
     // Asumption... All regions are sorted, no gaps. At least 1 region expected...
     debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0));
     debug_assert!(naive_traces
@@ -37,11 +44,16 @@ pub fn trace_statistics(
     let mut query_stats = vec![
         QueryStatistics {
             occurances: 0,
-            coverage: 0
+            coverage: 0,
+            target_span: 0,
+            distribution: Exponential::unit(),
         };
         alignment_data.query_name_map.size()
     ];
 
+    let mut query_span: Vec<Option<(usize, usize)>> =
+        vec![None; alignment_data.query_name_map.size()];
+
     let mut all_region_stats: Vec<RegionStatistics> = Vec::with_capacity(naive_traces.len());
 
     for trace_results in naive_traces.iter() {
@@ -52,6 +64,12 @@ pub fn trace_statistics(
                         if let Some(query_id) = blk.query_id {
                             query_stats[query_id].occurances += 1;
                             query_stats[query_id].coverage += blk.col_end - blk.col_start + 1;
+                            query_span[query_id] = match query_span[query_id] {
+                                None => Some((blk.col_start, blk.col_end)),
+                                Some((start, end)) => {
+                                    Some((start.min(blk.col_start), end.max(blk.col_end)))
+                                }
+                            }
                         }
                     }
                 }
@@ -61,6 +79,13 @@ pub fn trace_statistics(
                     query_stats[trace_blk.query_id].occurances += 1;
                     query_stats[trace_blk.query_id].coverage +=
                         trace_blk.col_end - trace_blk.col_start + 1;
+
+                    query_span[trace_blk.query_id] = match query_span[trace_blk.query_id] {
+                        None => Some((trace_blk.col_start, trace_blk.col_end)),
+                        Some((start, end)) => {
+                            Some((start.min(trace_blk.col_start), end.min(trace_blk.col_end)))
+                        }
+                    }
                 }
             }
         }
@@ -91,6 +116,15 @@ pub fn trace_statistics(
         all_region_stats.push(region_stat);
     }
 
+    for (query_info, query_span) in query_stats.iter_mut().zip(query_span.iter()) {
+        if let Some((start, end)) = query_span {
+            query_info.target_span = end - start + 1;
+            query_info.distribution = Exponential::from_scale(
+                query_info.occurances as f64 / query_info.target_span as f64,
+            );
+        }
+    }
+
     TraceStatistics {
         total_bases: all_region_stats.iter().map(|v| v.total_bases).sum(),
         query_statistics: query_stats,

From decf2329140f5350f2822669a9a0a3437a910c55 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 17 Apr 2026 01:14:12 -0600
Subject: [PATCH 06/39] Fix errors.

---
 src/assembly.rs         | 32 +++++++++++++++++---------------
 src/pipeline.rs         |  6 ++++++
 src/segments.rs         | 16 +++++++++++++---
 src/statistics.rs       |  6 ++++--
 src/trace_statistics.rs | 24 ++++++++++++++++--------
 5 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index ff92f33..449ec68 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -7,7 +7,7 @@ use crate::{
     score_params::ScoreParams,
     segments::SegmentedMatrix,
     statistics::Distribution,
-    trace_statistics::{self, TraceStatistics},
+    trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics},
     AnnotationArgs,
 };
 
@@ -138,11 +138,13 @@ fn get_link_cost(
         + lambda * target_gap
 }
 
-fn link_assemblies(
+fn link_assemblies<T: Distribution>(
     graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>,
     compatable_blocks: impl Iterator<Item = (usize, usize)>,
     alignments: &[Alignment],
     segments: &SegmentedMatrix,
+    query_statistics: &QueryStatistics<T>,
+    region_statistics: &RegionStatistics,
     score_params: &ScoreParams,
     args: &AnnotationArgs,
 ) {
@@ -187,30 +189,30 @@ fn link_assemblies(
                 alignments[b_block.row_idx - 1].strand,
             ) {
                 (Strand::Forward, Strand::Forward) => (
-                    b_block.query_start as isize - a_block.query_end as isize,
+                    b_block.query_start as isize - a_block.query_end as isize - 1,
                     LinkType::Forward,
                 ),
                 (Strand::Reverse, Strand::Reverse) => (
-                    a_block.query_end as isize - b_block.query_start as isize,
+                    a_block.query_end as isize - b_block.query_start as isize - 1,
                     LinkType::Reverse,
                 ),
                 (Strand::Forward, Strand::Reverse) => select_closest(
                     (
-                        a_block.query_start as isize - b_block.query_start as isize,
+                        a_block.query_start as isize - b_block.query_start as isize - 1,
                         LinkType::FRInversion1,
                     ),
                     (
-                        b_block.query_end as isize - a_block.query_end as isize,
+                        b_block.query_end as isize - a_block.query_end as isize - 1,
                         LinkType::FRInversion2,
                     ),
                 ),
                 (Strand::Reverse, Strand::Forward) => select_closest(
                     (
-                        b_block.query_start as isize - a_block.query_start as isize,
+                        b_block.query_start as isize - a_block.query_start as isize - 1,
                         LinkType::RFInversion1,
                     ),
                     (
-                        a_block.query_end as isize - b_block.query_end as isize,
+                        a_block.query_end as isize - b_block.query_end as isize - 1,
                         LinkType::RFInversion2,
                     ),
                 ),
@@ -271,10 +273,10 @@ impl SegmentAssemblyGraph {
     pub fn new<T: Distribution>(
         alignments: &[Alignment],
         segments: &SegmentedMatrix,
-        trace_statistics: &TraceStatistics<T>,
+        region_statistics: &RegionStatistics,
+        query_statistics: &[QueryStatistics<T>],
         score_params: &ScoreParams,
         annotation_args: &AnnotationArgs,
-        region_idx: usize,
     ) -> Self {
         let mut alignment_block_map = vec![Vec::<SegmentAndDenseRow>::new(); alignments.len()];
 
@@ -295,13 +297,13 @@ impl SegmentAssemblyGraph {
         query_ids
             .iter()
             // grab the alignments for this ID
-            .map(|&id| {
+            .map(|id| {
                 (
-                    id,
+                    *id,
                     alignments
                         .iter()
                         .enumerate()
-                        .filter(|&(_, a)| a.query_id == id)
+                        .filter(|&(_, a)| a.query_id == *id)
                         .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()),
                 )
             })
@@ -311,8 +313,8 @@ impl SegmentAssemblyGraph {
                     compat_blocks,
                     alignments,
                     segments,
-                    trace_statistics.query_statistics[id],
-                    trace_statistics.region_statistics[region_idx],
+                    &query_statistics[id],
+                    region_statistics,
                     score_params,
                     annotation_args,
                 );
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 358c617..42851a4 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -142,6 +142,8 @@ fn get_active_columns<T: Copy + Default + Display>(matrix: &Matrix<T>) -> Vec<(u
 }
 
 pub struct NaiveTraceResults {
+    pub target_start: usize,
+    pub target_end: usize,
     pub trace_segments: Vec<TraceSegment>,
     pub segments: InitialSegments,
     pub score_params: ScoreParams,
@@ -249,6 +251,8 @@ pub fn run_naive_trace(
     }
 
     NaiveTraceResults {
+        target_start: proximity_group.target_start,
+        target_end: proximity_group.target_end,
         trace_segments: simple_trace,
         segments,
         score_params,
@@ -272,6 +276,8 @@ pub fn run_history_trace<T: Distribution>(
         proximity_group,
         &mut naive_trace.segments,
         &naive_trace.trace_segments,
+        &trace_statistics.region_statistics[naive_trace.region_index],
+        &trace_statistics.query_statistics,
         &naive_trace.score_params,
         &args.annotation_args,
     );
diff --git a/src/segments.rs b/src/segments.rs
index a2d8e20..c990b25 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -2,8 +2,14 @@ use core::f64;
 use std::{cmp::Ordering, fmt::Debug, iter::Fuse};
 
 use crate::{
-    assembly::SegmentAssemblyGraph, chunks::ProximityGroup, matrix::Matrix,
-    score_params::ScoreParams, viterbi::TraceSegment, AnnotationArgs,
+    assembly::SegmentAssemblyGraph,
+    chunks::ProximityGroup,
+    matrix::Matrix,
+    score_params::ScoreParams,
+    statistics::Distribution,
+    trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics},
+    viterbi::TraceSegment,
+    AnnotationArgs,
 };
 use itertools::Itertools;
 
@@ -556,16 +562,20 @@ pub fn segments_from_matrix_trace(
     }
 }
 
-pub fn assemble_and_link_segments<'a>(
+pub fn assemble_and_link_segments<'a, T: Distribution>(
     proximity_group: &ProximityGroup,
     initial_segments: &'a mut InitialSegments,
     trace_segments: &[TraceSegment],
+    region_statistics: &RegionStatistics,
+    query_statistics: &[QueryStatistics<T>],
     score_params: &ScoreParams,
     annotation_args: &AnnotationArgs,
 ) -> (&'a SegmentedMatrix, SegmentAssemblyGraph) {
     let assembly_graph = SegmentAssemblyGraph::new(
         proximity_group.alignments,
         &initial_segments.segments,
+        region_statistics,
+        query_statistics,
         score_params,
         annotation_args,
     );
diff --git a/src/statistics.rs b/src/statistics.rs
index fa3b4ef..5db05b7 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -1,5 +1,7 @@
+use std::fmt::Debug;
+
 #[allow(dead_code)]
-pub trait Distribution: Clone {
+pub trait Distribution: Clone + Debug {
     fn unit() -> Self;
     fn pdf(&self, x: f64) -> f64;
     fn cdf(&self, x: f64) -> f64;
@@ -21,7 +23,7 @@ pub trait Distribution: Clone {
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct Exponential {
     lambda: f64,
 }
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index 39db395..7d7f9c6 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -65,10 +65,14 @@ pub fn trace_statistics(
                             query_stats[query_id].occurances += 1;
                             query_stats[query_id].coverage += blk.col_end - blk.col_start + 1;
                             query_span[query_id] = match query_span[query_id] {
-                                None => Some((blk.col_start, blk.col_end)),
-                                Some((start, end)) => {
-                                    Some((start.min(blk.col_start), end.min(blk.col_end)))
-                                }
+                                None => Some((
+                                    trace_results.target_start + blk.col_start,
+                                    trace_results.target_start + blk.col_end,
+                                )),
+                                Some((start, end)) => Some((
+                                    start.min(trace_results.target_start + blk.col_start),
+                                    end.max(trace_results.target_start + blk.col_end),
+                                )),
                             }
                         }
                     }
@@ -81,10 +85,14 @@ pub fn trace_statistics(
                         trace_blk.col_end - trace_blk.col_start + 1;
 
                     query_span[trace_blk.query_id] = match query_span[trace_blk.query_id] {
-                        None => Some((trace_blk.col_start, trace_blk.col_end)),
-                        Some((start, end)) => {
-                            Some((start.min(trace_blk.col_start), end.min(trace_blk.col_end)))
-                        }
+                        None => Some((
+                            trace_results.target_start + trace_blk.col_start,
+                            trace_results.target_start + trace_blk.col_end,
+                        )),
+                        Some((start, end)) => Some((
+                            start.min(trace_results.target_start + trace_blk.col_start),
+                            end.max(trace_results.target_start + trace_blk.col_end),
+                        )),
                     }
                 }
             }

From 2a39a859b9993af0df7ec20dae45407ad55e4456 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 17 Apr 2026 22:27:35 -0600
Subject: [PATCH 07/39] remove unused imports

---
 src/assembly.rs | 2 +-
 src/segments.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index 449ec68..b35948c 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -7,7 +7,7 @@ use crate::{
     score_params::ScoreParams,
     segments::SegmentedMatrix,
     statistics::Distribution,
-    trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics},
+    trace_statistics::{QueryStatistics, RegionStatistics},
     AnnotationArgs,
 };
 
diff --git a/src/segments.rs b/src/segments.rs
index c990b25..b9a7349 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -7,7 +7,7 @@ use crate::{
     matrix::Matrix,
     score_params::ScoreParams,
     statistics::Distribution,
-    trace_statistics::{QueryStatistics, RegionStatistics, TraceStatistics},
+    trace_statistics::{QueryStatistics, RegionStatistics},
     viterbi::TraceSegment,
     AnnotationArgs,
 };

From 6ce0e30278a5b52ce282c95111d6f89e1c24735c Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 21 Apr 2026 00:15:19 -0600
Subject: [PATCH 08/39] New target scoring scheme is done, also added
 unexplained base removal.

---
 .gitignore              |  4 ++-
 src/assembly.rs         | 52 ++++++++++++++++++++----------
 src/main.rs             | 19 +++++++----
 src/statistics.rs       | 71 +++++++++++++++++++++++++++++++++++++++--
 src/trace_statistics.rs | 25 +++++++++++----
 5 files changed, 137 insertions(+), 34 deletions(-)

diff --git a/.gitignore b/.gitignore
index a744cfb..801aca5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,6 @@ Cargo.lock
 
 # Visuals generated by aurora...
 /viz/
-out.txt
+
+# Temporary output files...
+/out*.txt
diff --git a/src/assembly.rs b/src/assembly.rs
index b35948c..1061361 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -100,6 +100,7 @@ fn piecewise_linear_cost(
 fn get_link_cost(
     annotation_args: &AnnotationArgs,
     score_params: &ScoreParams,
+    target_gap_distribution: &impl Distribution,
     consensus_gap: f64,
     target_gap: f64,
 ) -> f64 {
@@ -118,14 +119,16 @@ fn get_link_cost(
         .max(1.0);
 
     // Compute slopes....
-    let lambda = -value_range
-        * (annotation_args.join_target_gap_penalty
-            / annotation_args.target_join_distance.max(1) as f64)
-            .abs();
     let alpha =
         -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs();
     let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs();
 
+    // Compute target gap penalty.
+    // Doing this as the expected value over the transition scores...
+    let target_random_prob = target_gap_distribution.cdf(target_gap);
+    let target_expected_score = target_random_prob * score_params.query_jump_score
+        + (1.0 - target_random_prob) * score_params.query_loop_score;
+
     // Cost = linear consensus cost + linear target gap cost...
     min_value
         + piecewise_linear_cost(
@@ -135,7 +138,7 @@ fn get_link_cost(
             beta,
             consensus_gap,
         )
-        + lambda * target_gap
+        + target_expected_score
 }
 
 fn link_assemblies<T: Distribution>(
@@ -149,7 +152,6 @@ fn link_assemblies<T: Distribution>(
     args: &AnnotationArgs,
 ) {
     // this relies on the alignments being sorted by target start
-    // note: this assertion iter will only run in debug mode
     let compatable_blocks = compatable_blocks.sorted().collect_vec();
 
     compatable_blocks.iter().enumerate().for_each(|(idx, a)| {
@@ -162,12 +164,6 @@ fn link_assemblies<T: Distribution>(
             let a_block = &segments[a.0].blocks[a.1];
             let b_block = &segments[b.0].blocks[b.1];
 
-            // We allow this now, otherwise inversions might not properly join...
-            // If same alignment, and neighboring segments, don't join...
-            //if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
-            //    return;
-            //}
-
             let target_distance = b_block.col_start as isize - a_block.col_end as isize - 1;
 
             let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1;
@@ -219,14 +215,23 @@ fn link_assemblies<T: Distribution>(
                 _ => panic!("Invalid strand types!"),
             };
 
-            let within_target_distance_threshold =
-                target_distance < args.target_join_distance as isize;
+            // Incorperate unexplained bases into query distance...
+            let unexplained_bases =
+                region_statistics.unexplained_bases[b.0] - region_statistics.unexplained_bases[a.0];
+            let corrected_consensus_distance =
+                (consensus_distance - unexplained_bases as isize).max(consensus_distance.min(0));
+
+            // Within target distance???
+            let within_target_distance_threshold = (target_distance
+                < args.target_join_distance as isize)
+                && (query_statistics.distribution.ccdf(target_distance as f64)
+                    >= args.target_distance_likelihood_threshold);
 
             let consensus_is_colinear = if link_type.is_inversion() {
-                consensus_distance.abs() < args.inversion_distance
+                corrected_consensus_distance.abs() < args.inversion_distance
             } else {
-                consensus_distance > -args.consensus_join_overlap
-                    && consensus_distance < args.consensus_join_distance
+                corrected_consensus_distance > -args.consensus_join_overlap
+                    && corrected_consensus_distance < args.consensus_join_distance
             };
 
             // TODO: Hardcoded, change later...
@@ -239,12 +244,25 @@ fn link_assemblies<T: Distribution>(
                 get_link_cost(
                     args,
                     score_params,
+                    &query_statistics.distribution,
                     consensus_distance as f64,
                     target_distance as f64,
                 )
             };
 
             if within_target_distance_threshold && consensus_is_colinear && is_significant {
+                println!("{:?}->{:?}", a, b);
+                println!(
+                    "CD: {}, UEB: {} => Corrected: {}",
+                    consensus_distance, unexplained_bases, corrected_consensus_distance
+                );
+
+                println!(
+                    "Target distance = {} => Prob not seeing at random = {}",
+                    target_distance,
+                    query_statistics.distribution.ccdf(target_distance as f64)
+                );
+
                 graph.insert(
                     ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
                     Edge {
diff --git a/src/main.rs b/src/main.rs
index dd411ef..3767ccc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -125,7 +125,7 @@ pub struct AnnotationArgs {
     )]
     pub num_skip_loops_eq_to_jump: usize,
 
-    /// The max distance across unaligned positions
+    /// The max distance across positions
     /// in the target (genome) at which a join is
     /// considered between compatible alignments
     #[arg(
@@ -136,6 +136,17 @@ pub struct AnnotationArgs {
     )]
     pub target_join_distance: usize,
 
+    /// Removes joins across positions
+    /// in the target (genome) at which a join is
+    /// less than this likely to not be generated
+    /// at random.
+    #[arg(
+        long = "target-join-likelihood-threshold",
+        default_value = "0.5",
+        value_name = "f"
+    )]
+    pub target_distance_likelihood_threshold: f64,
+
     /// The maximum overlap in the consensus at which
     /// a join is considered between compatible alignments.
     #[arg(
@@ -241,12 +252,6 @@ pub struct AnnotationArgs {
         value_name = "f"
     )]
     pub join_consensus_gap_penalty: f64,
-
-    /// The amount of penalty to apply to a join at the maximum allowed target gap
-    /// A value of 1 means to apply a penalty equal to a query jump.
-    /// The cost grows linearly to this value as the gap between the sequences in the target space increases.
-    #[arg(long = "target-gap-penalty", default_value = "0.4", value_name = "f")]
-    pub join_target_gap_penalty: f64,
 }
 
 #[derive(Args, Debug, Clone, Default)]
diff --git a/src/statistics.rs b/src/statistics.rs
index 5db05b7..309c7ff 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -11,7 +11,6 @@ pub trait Distribution: Clone + Debug {
     fn ccdf(&self, x: f64) -> f64 {
         1.0 - self.cdf(x)
     }
-
     fn logpdf(&self, x: f64) -> f64 {
         self.pdf(x).ln()
     }
@@ -19,7 +18,7 @@ pub trait Distribution: Clone + Debug {
         self.cdf(x).ln()
     }
     fn logccdf(&self, x: f64) -> f64 {
-        (1.0 - self.ccdf(x)).ln()
+        self.ccdf(x).ln()
     }
 }
 
@@ -75,3 +74,98 @@ impl Distribution for Exponential {
         (0.0, f64::INFINITY)
     }
 }
+
+/// Tail model for gaps whose exponential mean is only known from a sample.
+///
+/// With `n = degrees_of_freedom` and `m = sample_mean`, the survival function
+/// is `ccdf(x) = (n * m / (n * m + x))^n` for `x >= 0`, i.e. a Lomax
+/// (Pareto type II) law with shape `n` and scale `n * m`.
+#[derive(Debug, Clone)]
+pub struct ExponentialEstimator {
+    sample_mean: f64,
+    degrees_of_freedom: usize,
+}
+
+impl ExponentialEstimator {
+    /// Builds an estimator from the mean of `sample_size` observed values.
+    ///
+    /// `sample_mean` is expected to be finite and positive and `sample_size`
+    /// non-zero; other inputs make the log-based formulas return NaN or
+    /// infinities.
+    pub fn new(sample_mean: f64, sample_size: usize) -> Self {
+        Self {
+            sample_mean,
+            degrees_of_freedom: sample_size,
+        }
+    }
+}
+
+impl From<ExponentialEstimator> for Exponential {
+    /// Collapses the estimator to a plain exponential by passing the sample
+    /// mean to `Exponential::from_scale`; the sample size is dropped.
+    fn from(value: ExponentialEstimator) -> Self {
+        Self::from_scale(value.sample_mean)
+    }
+}
+
+impl Distribution for ExponentialEstimator {
+    /// Estimator with a sample mean of 1 backed by a single observation.
+    fn unit() -> Self {
+        Self {
+            sample_mean: 1.0,
+            degrees_of_freedom: 1,
+        }
+    }
+
+    /// Log density; `-inf` left of the support.
+    fn logpdf(&self, x: f64) -> f64 {
+        if x < 0.0 {
+            return f64::NEG_INFINITY;
+        }
+        let n = self.degrees_of_freedom as f64;
+        let sm = self.sample_mean;
+        ((n + 1.0) * n.ln() + n * sm.ln()) - ((n + 1.0) * (n * sm + x).ln())
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.logpdf(x).exp()
+    }
+
+    /// Log survival function, `n * (ln(n * m) - ln(n * m + x))`.
+    ///
+    /// `cdf` and `ccdf` are both derived from this value.
+    fn logccdf(&self, x: f64) -> f64 {
+        let n = self.degrees_of_freedom as f64;
+        let sm = self.sample_mean;
+        // Everything left of the support survives with probability 1. Without
+        // this clamp a negative `x` (e.g. an overlap expressed as a negative
+        // distance) yields probabilities outside [0, 1], or NaN once
+        // `n * sm + x` is no longer positive. A NaN `x` still passes through.
+        let x = if x < 0.0 { 0.0 } else { x };
+        n * ((n * sm).ln() - (n * sm + x).ln())
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        self.cdf(x).ln()
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        // `exp_m1` keeps precision when the tail probability is close to 1.
+        -(self.logccdf(x).exp_m1())
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        self.logccdf(x).exp()
+    }
+
+    /// Quantile: the `x` with `cdf(x) == p`, for `p` in `[0, 1)`.
+    fn ppf(&self, p: f64) -> f64 {
+        let n = self.degrees_of_freedom as f64;
+        let sm = self.sample_mean;
+        (n * sm) * ((1.0 - p).powf(-1.0 / n) - 1.0)
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (0.0, f64::INFINITY)
+    }
+}
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index 7d7f9c6..f8183fd 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -2,9 +2,10 @@ use crate::{
     alignment::AlignmentData,
     pipeline::NaiveTraceResults,
     segments::SegmentView,
-    statistics::{Distribution, Exponential},
+    statistics::{Distribution, ExponentialEstimator},
 };
 
+#[derive(Debug)]
 pub struct RegionStatistics {
     pub total_bases: usize,
     pub unexplained_bases: Vec<usize>,
@@ -18,7 +19,9 @@ pub struct QueryStatistics<T: Distribution> {
     pub distribution: T,
 }
 
+#[derive(Debug)]
 pub struct TraceStatistics<T: Distribution> {
+    #[allow(dead_code)]
     pub total_bases: usize,
     pub query_statistics: Vec<QueryStatistics<T>>,
     pub region_statistics: Vec<RegionStatistics>,
@@ -26,6 +29,7 @@ pub struct TraceStatistics<T: Distribution> {
 
 pub enum OccuranceCountingMode {
     Segments,
+    #[allow(dead_code)]
     Trace,
 }
 
@@ -33,7 +37,7 @@ pub fn trace_statistics(
     naive_traces: &[NaiveTraceResults],
     alignment_data: &AlignmentData,
     count_mode: OccuranceCountingMode,
-) -> TraceStatistics<Exponential> {
+) -> TraceStatistics<ExponentialEstimator> {
     // Asumption... All regions are sorted, no gaps. At least 1 region expected...
     debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0));
     debug_assert!(naive_traces
@@ -41,12 +45,17 @@ pub fn trace_statistics(
         .zip(naive_traces.iter().skip(1))
         .all(|(v1, v2)| v1.region_index + 1 == v2.region_index));
 
+    assert!(naive_traces
+        .iter()
+        .zip(naive_traces.iter().skip(1))
+        .all(|(v1, v2)| v1.region_index + 1 == v2.region_index && v1.target_end < v2.target_start));
+
     let mut query_stats = vec![
         QueryStatistics {
             occurances: 0,
             coverage: 0,
             target_span: 0,
-            distribution: Exponential::unit(),
+            distribution: ExponentialEstimator::unit(),
         };
         alignment_data.query_name_map.size()
     ];
@@ -112,8 +121,8 @@ pub fn trace_statistics(
                 if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 {
                     unexplained_bases_up_to += seg.end_col - seg.start_col + 1;
                 }
-                unexplained_bases_up_to += prior_segment.end_col - seg.start_col - 1;
-                region_stat.total_bases += prior_segment.end_col - seg.start_col - 1;
+                unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1;
+                region_stat.total_bases += seg.start_col - prior_segment.end_col - 1;
             }
             region_stat.total_bases += seg.end_col - seg.start_col + 1;
             region_stat.unexplained_bases.push(unexplained_bases_up_to);
@@ -127,8 +136,10 @@ pub fn trace_statistics(
     for (query_info, query_span) in query_stats.iter_mut().zip(query_span.iter()) {
         if let Some((start, end)) = query_span {
             query_info.target_span = end - start + 1;
-            query_info.distribution = Exponential::from_scale(
-                query_info.occurances as f64 / query_info.target_span as f64,
+            // We subtract 1 because we're looking at distances between each occurrence as a sample value.
+            query_info.distribution = ExponentialEstimator::new(
+                query_info.target_span as f64 / query_info.occurances.saturating_sub(1) as f64,
+                query_info.occurances.saturating_sub(1),
             );
         }
     }

From cf319e384f3d38e702766109407385d923ba8609 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 21 Apr 2026 16:48:53 -0600
Subject: [PATCH 09/39] Remove unexplained gaps.

---
 src/assembly.rs | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index 1061361..f8604d0 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -215,12 +215,6 @@ fn link_assemblies<T: Distribution>(
                 _ => panic!("Invalid strand types!"),
             };
 
-            // Incorperate unexplained bases into query distance...
-            let unexplained_bases =
-                region_statistics.unexplained_bases[b.0] - region_statistics.unexplained_bases[a.0];
-            let corrected_consensus_distance =
-                (consensus_distance - unexplained_bases as isize).max(consensus_distance.min(0));
-
             // Within target distance???
             let within_target_distance_threshold = (target_distance
                 < args.target_join_distance as isize)
@@ -228,10 +222,10 @@ fn link_assemblies<T: Distribution>(
                     >= args.target_distance_likelihood_threshold);
 
             let consensus_is_colinear = if link_type.is_inversion() {
-                corrected_consensus_distance.abs() < args.inversion_distance
+                consensus_distance.abs() < args.inversion_distance
             } else {
-                corrected_consensus_distance > -args.consensus_join_overlap
-                    && corrected_consensus_distance < args.consensus_join_distance
+                consensus_distance > -args.consensus_join_overlap
+                    && consensus_distance < args.consensus_join_distance
             };
 
             // TODO: Hardcoded, change later...
@@ -251,18 +245,6 @@ fn link_assemblies<T: Distribution>(
             };
 
             if within_target_distance_threshold && consensus_is_colinear && is_significant {
-                println!("{:?}->{:?}", a, b);
-                println!(
-                    "CD: {}, UEB: {} => Corrected: {}",
-                    consensus_distance, unexplained_bases, corrected_consensus_distance
-                );
-
-                println!(
-                    "Target distance = {} => Prob not seeing at random = {}",
-                    target_distance,
-                    query_statistics.distribution.ccdf(target_distance as f64)
-                );
-
                 graph.insert(
                     ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
                     Edge {

From 4aab08d9bce1d5a90512f6a45f8e9679470eeeef Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 30 Apr 2026 11:07:49 -0600
Subject: [PATCH 10/39] Fix reading of bed files.

---
 scripts/plot_consensus_distance_caf.py        | 44 ++++++++++++
 .../plot_consensus_distance_repeatmasker.py   | 71 +++++++++++++++++++
 src/viz/block.rs                              |  2 +-
 3 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 scripts/plot_consensus_distance_caf.py
 create mode 100644 scripts/plot_consensus_distance_repeatmasker.py

diff --git a/scripts/plot_consensus_distance_caf.py b/scripts/plot_consensus_distance_caf.py
new file mode 100644
index 0000000..18d4937
--- /dev/null
+++ b/scripts/plot_consensus_distance_caf.py
@@ -0,0 +1,44 @@
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np  # NOTE(review): `np` is never referenced in this script
+
+caf_file = sys.argv[1]  # input path: first command-line argument
+
+gap_info = {}  # name -> list of query-coordinate gaps to the previous record with that name
+prior = {}  # name -> (qstart, qend, is_neg_strand, tstart) of the last record stored for it
+
+with open(caf_file, "r") as f:
+    for line in f:
+        tokens = line.strip().split(",")  # one comma-separated record per line
+        name = tokens[8].strip()
+        qstart = int(tokens[10])
+        qend = int(tokens[11])
+        tstart = int(tokens[5])
+        is_neg_strand = int(tokens[13])  # only used as a truth value below
+
+        if name in prior:
+            pstart, pend, p_is_neg, p_tstart = prior[name]
+
+            if tstart - p_tstart < 10000:  # only pair records whose target starts differ by < 10000
+                if name not in gap_info:
+                    gap_info[name] = []
+
+                if not is_neg_strand and not p_is_neg:  # both records on the positive strand
+                    gap = qstart - pend
+                elif is_neg_strand and p_is_neg:  # both records on the negative strand
+                    gap = qend - pstart
+                else:
+                    continue  # NOTE(review): mixed strands; this also skips the prior[name] update below - confirm intended
+
+                gap_info[name].append(gap)
+
+        prior[name] = (qstart, qend, is_neg_strand, tstart)  # remember this record for the next one with the same name
+
+
+for gap_name, gap_vals in sorted(  # names with the most recorded gaps come first
+    gap_info.items(), key=lambda k: len(k[1]), reverse=True
+):
+    plt.title(gap_name)
+    plt.hist(gap_vals, 500)  # second positional argument is the bin count
+    plt.show()  # one histogram per name
diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py
new file mode 100644
index 0000000..ebef9cd
--- /dev/null
+++ b/scripts/plot_consensus_distance_repeatmasker.py
@@ -0,0 +1,71 @@
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+caf_file = sys.argv[1]
+
+gap_info = {}
+prior = {}
+
+with open(caf_file, "r") as f:
+    for line in f:
+        shared_name = line.strip().split("\t")[3].split("#")[-1]
+        seqs = line.strip().split("\t")[-1].split(",")
+
+        if len(seqs) <= 1:
+            continue
+
+        prior = None
+
+        for seq in seqs:
+            tokens = seq.split(" ")
+            name = f"{tokens[9]}#{tokens[10]}"
+            is_pos = tokens[8] == "+"
+
+            if is_pos:
+                qstart = int(tokens[11])
+                qend = int(tokens[12])
+            else:
+                qstart = int(tokens[13])
+                qend = int(tokens[12])
+
+            assert qend >= qstart
+
+            if prior is not None:
+                pname, ppos, pstart, pend = prior
+
+                try:
+                    if is_pos and ppos:
+                        assert pstart <= qstart <= qend and pstart <= pend <= qend
+                        gap = qstart - pend
+                    elif not is_pos and not ppos:
+                        assert qstart <= qend <= pstart and qstart <= pstart <= pend
+                        gap = pstart - qend
+                    else:
+                        gap = None
+                except AssertionError:
+                    if is_pos and ppos:
+                        print(f"Violation: {pstart} => {pend} => {qstart} => {qend}")
+                    else:
+                        print(f"Violation: {pstart} <= {pend} <= {qstart} <= {qend}")
+
+                    print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}")
+                    print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}")
+                    gap = None
+
+                if gap is not None:
+                    if shared_name not in gap_info:
+                        gap_info[shared_name] = []
+
+                    gap_info[shared_name].append(gap)
+
+            prior = (name, is_pos, qstart, qend)
+
+
+for gap_name, gap_vals in sorted(
+    gap_info.items(), key=lambda k: len(k[1]), reverse=True
+):
+    plt.title(gap_name)
+    plt.hist(gap_vals, 100)
+    plt.show()
diff --git a/src/viz/block.rs b/src/viz/block.rs
index 6005b35..4065e6d 100644
--- a/src/viz/block.rs
+++ b/src/viz/block.rs
@@ -189,7 +189,7 @@ impl BlockGroup {
                     None
                 } else {
                     match elems[8] {
-                        "C" => Some(bed.strand),
+                        "C" => Some(Strand::Reverse),
                         _ => Some(Strand::from_str(elems[8])),
                     }
                 }

From 03c24b4226dde25450f9ac7a72465efe9012e260 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 30 Apr 2026 17:39:59 -0600
Subject: [PATCH 11/39] Add divergence calculation for segments, to use in
 linking. Adjust max consensus distance to match what is used in repeat masker

---
 .../plot_consensus_distance_repeatmasker.py   | 89 ++++++++++++++-----
 src/main.rs                                   |  2 +-
 src/segments.rs                               | 16 +++-
 3 files changed, 80 insertions(+), 27 deletions(-)

diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py
index ebef9cd..755dc3e 100644
--- a/scripts/plot_consensus_distance_repeatmasker.py
+++ b/scripts/plot_consensus_distance_repeatmasker.py
@@ -1,3 +1,5 @@
+# This file plots histograms of join distance per tandem repeat family given a repeatmasker formatted bed file as an argument.
+
 import sys
 
 import matplotlib.pyplot as plt
@@ -6,6 +8,7 @@
 caf_file = sys.argv[1]
 
 gap_info = {}
+length_estimate = {}
 prior = {}
 
 with open(caf_file, "r") as f:
@@ -16,7 +19,11 @@
         if len(seqs) <= 1:
             continue
 
+        # Sort by location on the target...
+        seqs = sorted(seqs, key=lambda k: int(k.split(" ")[5]))
+
         prior = None
+        length = 0
 
         for seq in seqs:
             tokens = seq.split(" ")
@@ -31,41 +38,75 @@
                 qend = int(tokens[12])
 
             assert qend >= qstart
+            length += qend - qstart
 
             if prior is not None:
                 pname, ppos, pstart, pend = prior
-
-                try:
-                    if is_pos and ppos:
-                        assert pstart <= qstart <= qend and pstart <= pend <= qend
-                        gap = qstart - pend
-                    elif not is_pos and not ppos:
-                        assert qstart <= qend <= pstart and qstart <= pstart <= pend
-                        gap = pstart - qend
-                    else:
-                        gap = None
-                except AssertionError:
-                    if is_pos and ppos:
-                        print(f"Violation: {pstart} => {pend} => {qstart} => {qend}")
-                    else:
-                        print(f"Violation: {pstart} <= {pend} <= {qstart} <= {qend}")
-
-                    print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}")
-                    print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}")
+                if pname != name:
                     gap = None
+                else:
+                    try:
+                        if is_pos and ppos:
+                            assert pstart <= qstart <= qend and pstart <= pend <= qend
+                            gap = qstart - pend
+                        elif not is_pos and not ppos:
+                            assert qstart <= qend <= pend and qstart <= pstart <= pend
+                            gap = pstart - qend
+                        else:
+                            gap = None
+                    except AssertionError:
+                        if is_pos and ppos:
+                            print(
+                                f"Violation: {pstart} => {pend} => {qstart} => {qend}"
+                            )
+                        else:
+                            print(
+                                f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}"
+                            )
+
+                        print(
+                            f"Offending annotation: {' '.join(line.split('\t')[:-1])}"
+                        )
+                        print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}")
+                        gap = None
 
-                if gap is not None:
-                    if shared_name not in gap_info:
-                        gap_info[shared_name] = []
+                    if gap is not None:
+                        if shared_name not in gap_info:
+                            gap_info[shared_name] = []
 
-                    gap_info[shared_name].append(gap)
+                        gap_info[shared_name].append(gap)
 
             prior = (name, is_pos, qstart, qend)
 
+        old_length_est = length_estimate.get(shared_name, (0, 0, 0))
+        length_estimate[shared_name] = (
+            old_length_est[0] + length,
+            old_length_est[1] + 1,
+            max(old_length_est[2], length),
+        )
+
 
 for gap_name, gap_vals in sorted(
     gap_info.items(), key=lambda k: len(k[1]), reverse=True
 ):
-    plt.title(gap_name)
-    plt.hist(gap_vals, 100)
+    gap_vals = np.array(gap_vals)
+    # gap_vals = gap_vals[np.abs(gap_vals) > 1]
+
+    avg_len = length_estimate[gap_name][0] / length_estimate[gap_name][1]
+    max_len = length_estimate[gap_name][2]
+
+    plt.title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})")
+
+    avg_pos_d = np.mean(gap_vals[gap_vals >= 0])
+    avg_neg_d = np.mean(gap_vals[gap_vals <= 0])
+
+    info = f"Positive Avg: {avg_pos_d:.02f}, Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})"
+
+    range = (0, np.max(gap_vals))
+    x = np.arange(*range)
+    lamb = 1 / avg_pos_d
+
+    plt.hist(gap_vals, 100, label=info)
+    # plt.plot(x, lamb * np.exp(-lamb * x))
+    plt.legend()
     plt.show()
diff --git a/src/main.rs b/src/main.rs
index 3767ccc..2f1e11a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -162,7 +162,7 @@ pub struct AnnotationArgs {
     #[arg(
         short = 'C',
         long = "consensus-join-distance",
-        default_value = "2000",
+        default_value = "3750",
         value_name = "n"
     )]
     pub consensus_join_distance: isize,
diff --git a/src/segments.rs b/src/segments.rs
index b9a7349..fff8da8 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -66,6 +66,7 @@ pub struct Block {
     pub query_end: usize,
     pub avg_confidence: f64,
     pub alignment_score: f64,
+    pub kimura80: f64,
     pub can_join_up_to: usize,
 }
 
@@ -534,17 +535,28 @@ pub fn segments_from_matrix_trace(
                         _ => None,
                     };
 
+                    let query_start = confidence_matrix.consensus_position(row_idx, start);
+                    let query_end = confidence_matrix.consensus_position(row_idx, end);
+
+                    let kimura80 = match block_type {
+                        BlockType::Alignment => {
+                            group.alignments[row_idx - 1].kimura80(query_start, query_end)
+                        }
+                        _ => 0.0,
+                    };
+
                     Block {
                         row_idx,
                         block_type,
                         query_id,
                         col_start: start,
                         col_end: end,
-                        query_start: confidence_matrix.consensus_position(row_idx, start),
-                        query_end: confidence_matrix.consensus_position(row_idx, end),
+                        query_start,
+                        query_end,
                         avg_confidence: row_conf_sum[row_idx]
                             / (row_valid_cell_count[row_idx].max(1) as f64),
                         alignment_score: row_scores[row_idx],
+                        kimura80,
                         can_join_up_to: s_idx,
                     }
                 })

From 8ba72ebf271d379f634bf336127d67473038b555 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 1 May 2026 01:19:30 -0600
Subject: [PATCH 12/39] move consensus distance out into its own function.

---
 src/assembly.rs | 95 +++++++++++++++++++++++++------------------------
 src/segments.rs |  8 +++++
 2 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index f8604d0..97e49a7 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -5,7 +5,7 @@ use itertools::Itertools;
 use crate::{
     alignment::{Alignment, Strand},
     score_params::ScoreParams,
-    segments::SegmentedMatrix,
+    segments::{Block, SegmentedMatrix},
     statistics::Distribution,
     trace_statistics::{QueryStatistics, RegionStatistics},
     AnnotationArgs,
@@ -141,10 +141,55 @@ fn get_link_cost(
         + target_expected_score
 }
 
+pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize {
+    second_block.col_start as isize - first_block.col_end as isize - 1
+}
+
+pub fn block_consensus_distance(first_block: &Block, second_block: &Block) -> (isize, LinkType) {
+    let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| {
+        if prop1.0.abs() < prop2.0.abs() {
+            prop1
+        } else {
+            prop2
+        }
+    };
+
+    match (first_block.strand, second_block.strand) {
+        (Strand::Forward, Strand::Forward) => (
+            second_block.query_start as isize - first_block.query_end as isize - 1,
+            LinkType::Forward,
+        ),
+        (Strand::Reverse, Strand::Reverse) => (
+            first_block.query_end as isize - second_block.query_start as isize - 1,
+            LinkType::Reverse,
+        ),
+        (Strand::Forward, Strand::Reverse) => select_closest(
+            (
+                first_block.query_start as isize - second_block.query_start as isize - 1,
+                LinkType::FRInversion1,
+            ),
+            (
+                second_block.query_end as isize - first_block.query_end as isize - 1,
+                LinkType::FRInversion2,
+            ),
+        ),
+        (Strand::Reverse, Strand::Forward) => select_closest(
+            (
+                second_block.query_start as isize - first_block.query_start as isize - 1,
+                LinkType::RFInversion1,
+            ),
+            (
+                first_block.query_end as isize - second_block.query_end as isize - 1,
+                LinkType::RFInversion2,
+            ),
+        ),
+        _ => panic!("Invalid strand types!"),
+    }
+}
+
 fn link_assemblies<T: Distribution>(
     graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>,
     compatable_blocks: impl Iterator<Item = (usize, usize)>,
-    alignments: &[Alignment],
     segments: &SegmentedMatrix,
     query_statistics: &QueryStatistics<T>,
     region_statistics: &RegionStatistics,
@@ -164,56 +209,15 @@ fn link_assemblies<T: Distribution>(
             let a_block = &segments[a.0].blocks[a.1];
             let b_block = &segments[b.0].blocks[b.1];
 
-            let target_distance = b_block.col_start as isize - a_block.col_end as isize - 1;
+            let target_distance = block_target_distance(a_block, b_block);
 
             let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1;
             let b_length = b_block.query_end.abs_diff(b_block.query_start) + 1;
             let min_length = a_length.min(b_length);
 
-            let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| {
-                if prop1.0.abs() < prop2.0.abs() {
-                    prop1
-                } else {
-                    prop2
-                }
-            };
-
             // Query bounds are reversed for reverse sequences, so the start is actually greater than the end (Ex. start: 1510 -> end: 105)
 
-            let (consensus_distance, link_type) = match (
-                alignments[a_block.row_idx - 1].strand,
-                alignments[b_block.row_idx - 1].strand,
-            ) {
-                (Strand::Forward, Strand::Forward) => (
-                    b_block.query_start as isize - a_block.query_end as isize - 1,
-                    LinkType::Forward,
-                ),
-                (Strand::Reverse, Strand::Reverse) => (
-                    a_block.query_end as isize - b_block.query_start as isize - 1,
-                    LinkType::Reverse,
-                ),
-                (Strand::Forward, Strand::Reverse) => select_closest(
-                    (
-                        a_block.query_start as isize - b_block.query_start as isize - 1,
-                        LinkType::FRInversion1,
-                    ),
-                    (
-                        b_block.query_end as isize - a_block.query_end as isize - 1,
-                        LinkType::FRInversion2,
-                    ),
-                ),
-                (Strand::Reverse, Strand::Forward) => select_closest(
-                    (
-                        b_block.query_start as isize - a_block.query_start as isize - 1,
-                        LinkType::RFInversion1,
-                    ),
-                    (
-                        a_block.query_end as isize - b_block.query_end as isize - 1,
-                        LinkType::RFInversion2,
-                    ),
-                ),
-                _ => panic!("Invalid strand types!"),
-            };
+            let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block);
 
             // Within target distance???
             let within_target_distance_threshold = (target_distance
@@ -311,7 +315,6 @@ impl SegmentAssemblyGraph {
                 link_assemblies(
                     &mut link_graph,
                     compat_blocks,
-                    alignments,
                     segments,
                     &query_statistics[id],
                     region_statistics,
diff --git a/src/segments.rs b/src/segments.rs
index fff8da8..a9aa282 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -2,6 +2,7 @@ use core::f64;
 use std::{cmp::Ordering, fmt::Debug, iter::Fuse};
 
 use crate::{
+    alignment::Strand,
     assembly::SegmentAssemblyGraph,
     chunks::ProximityGroup,
     matrix::Matrix,
@@ -59,6 +60,7 @@ pub enum BlockType {
 pub struct Block {
     pub row_idx: usize,
     pub block_type: BlockType,
+    pub strand: Strand,
     pub query_id: Option<usize>,
     pub col_start: usize,
     pub col_end: usize,
@@ -545,9 +547,15 @@ pub fn segments_from_matrix_trace(
                         _ => 0.0,
                     };
 
+                    let strand = match block_type {
+                        BlockType::Alignment => group.alignments[row_idx - 1].strand,
+                        _ => Strand::Forward,
+                    };
+
                     Block {
                         row_idx,
                         block_type,
+                        strand,
                         query_id,
                         col_start: start,
                         col_end: end,

From 75a319ab19532c75586ed079580da062ce068ca1 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 1 May 2026 17:16:48 -0600
Subject: [PATCH 13/39] Work on stat extraction from aurora.

---
 scripts/plot_consensus_distance_repeatmasker.py |  2 +-
 src/annotation.rs                               | 10 ++++++++--
 src/pipeline.rs                                 |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py
index 755dc3e..5390df2 100644
--- a/scripts/plot_consensus_distance_repeatmasker.py
+++ b/scripts/plot_consensus_distance_repeatmasker.py
@@ -13,7 +13,7 @@
 
 with open(caf_file, "r") as f:
     for line in f:
-        shared_name = line.strip().split("\t")[3].split("#")[-1]
+        shared_name = line.strip().split("\t")[3]  # .split("#")[-1]
         seqs = line.strip().split("\t")[-1].split(",")
 
         if len(seqs) <= 1:
diff --git a/src/annotation.rs b/src/annotation.rs
index c904a57..0c97aba 100644
--- a/src/annotation.rs
+++ b/src/annotation.rs
@@ -108,7 +108,12 @@ fn get_strings(simple_annotations: &[SimpleAnnotation], simplify: bool) -> [Stri
     [
         get_mutli_option_string(simple_annotations, |v| v.target_start, simplify),
         get_mutli_option_string(simple_annotations, |v| v.target_end, simplify),
-        get_mutli_option_string(simple_annotations, |v| v.query_name.clone(), simplify),
+        get_mutli_option_string(
+            simple_annotations,
+            // Remove spaces as it breaks the format...
+            |v| v.query_name.replace(" ", "-"),
+            simplify,
+        ),
         get_mutli_option_string(simple_annotations, |v| v.query_start, simplify),
         get_mutli_option_string(simple_annotations, |v| v.query_end, simplify),
         get_mutli_option_string(simple_annotations, |v| v.strand, simplify),
@@ -127,7 +132,8 @@ impl AmbiguousAnnotation {
         format!(
             "{:w0$} {:w1$} {:w2$} {:w3$} {:w4$} {:w5$} {:w6$} {:w7$} {:4.3} {:w8$} {:w9$} {}",
             self.annotations.len(),
-            self.target_name,
+            // Remove spaces as it breaks the format...
+            self.target_name.replace(" ", "-"),
             ts,
             te,
             qn,
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 42851a4..377266f 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -61,7 +61,7 @@ pub fn to_annotations(
                                         a.row_idx - proximity_group.alignments.len() - 1;
                                     let repeat = &proximity_group.tandem_repeats[tandem_repeat_idx];
                                     format!(
-                                        "({}:{})#tandem repeat",
+                                        "({}:{})#tandem-repeat",
                                         repeat.period, repeat.consensus_pattern,
                                     )
                                 }

From 4ce5845e0c460b5c72b33b01dd938293e158476a Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Mon, 4 May 2026 18:36:41 -0600
Subject: [PATCH 14/39] Work on parameter analysis for joins.

---
 scripts/plot_distributions_aurora.py          | 266 ++++++++++++++++++
 ...y => plot_target_distance_distribution.py} |   0
 2 files changed, 266 insertions(+)
 create mode 100644 scripts/plot_distributions_aurora.py
 rename scripts/{plot_distance_distribution.py => plot_target_distance_distribution.py} (100%)

diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py
new file mode 100644
index 0000000..b0f368d
--- /dev/null
+++ b/scripts/plot_distributions_aurora.py
@@ -0,0 +1,266 @@
+import sys
+import typing
+from dataclasses import dataclass, fields
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.stats import ecdf, expon, genpareto, gumbel_r, invweibull, norm, weibull_min
+
+
+@dataclass
+class AuroraEntry:
+    annotation_count: int
+    target: str
+    target_start: list[int]
+    target_end: list[int]
+    query: list[str]
+    query_start: list[int]
+    query_end: list[int]
+    strand: list[str]
+    score: float
+    kimera80: list[float]
+    join_id: int
+    region: int
+
+    @classmethod
+    def from_line(cls, line: str) -> typing.Self:
+        parts = line.strip().split()
+
+        parsed = []
+        max_list_len = 0
+
+        for field, part in zip(fields(cls), parts):
+            if isinstance(field.type, str):
+                raise ValueError("Can't parse strings!")
+
+            origin = typing.get_origin(field.type)
+            if origin is not None and issubclass(origin, list):
+                tp = typing.get_args(field.type)[0]
+                parsed.append([tp(v) for v in part.split(",")])
+                max_list_len = max(max_list_len, len(parsed[-1]))
+            else:
+                parsed.append(field.type(part))
+
+        if max_list_len > 0:
+            parsed_new = []
+
+            for v in parsed:
+                if isinstance(v, list):
+                    if len(v) == max_list_len:
+                        parsed_new.append(v)
+                    elif len(v) == 1:
+                        parsed_new.append(v * max_list_len)
+                    else:
+                        raise ValueError(
+                            f"One of the elements has and invalid length: {v}"
+                        )
+                else:
+                    parsed_new.append(v)
+
+            parsed = parsed_new
+
+        return cls(*parsed)
+
+
+aurora_file = sys.argv[1]
+
+joined_annots: dict[tuple[int, int], list[AuroraEntry]] = {}
+
+with open(aurora_file, "r") as f:
+    for line in f:
+        entry = AuroraEntry.from_line(line)
+        key = (entry.region, entry.join_id)
+        if key not in joined_annots:
+            joined_annots[key] = []
+        joined_annots[key].append(entry)
+
+for key, annots in joined_annots.items():
+    annots.sort(key=lambda k: min(k.target_start))
+
+
+random_stats = {}
+join_stats = {}
+prior_vals = {}
+seq_size = {}
+
+
+def consensus_dist(a: AuroraEntry, b: AuroraEntry, a_idx: int, b_idx: int):
+    def best(*a):
+        return min(a, key=lambda v: abs(v))
+
+    match (a.strand[a_idx], b.strand[b_idx]):
+        case ("+", "+"):
+            return b.query_start[b_idx] - a.query_end[a_idx] - 1
+        case ("-", "-"):
+            return a.query_end[a_idx] - b.query_start[b_idx] - 1
+        case ("-", "+"):
+            return best(
+                a.query_start[a_idx] - b.query_start[b_idx] - 1,
+                b.query_end[b_idx] - a.query_end[a_idx] - 1,
+            )
+        case ("+", "-"):
+            return best(
+                b.query_start[b_idx] - a.query_start[a_idx] - 1,
+                a.query_end[a_idx] - b.query_end[b_idx] - 1,
+            )
+        case _:
+            raise ValueError("Unknown stand configuration.")
+
+
+def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int):
+    assert b.target_start[bi] - a.target_end[ai] - 1 >= 0
+    return b.target_start[bi] - a.target_end[ai] - 1
+
+
+stats_to_compute = {
+    "Consensus Distance": consensus_dist,
+    "Target Distance": _target_distance,
+    "Divergence Change": lambda a, b, ai, bi: b.kimera80[bi] - a.kimera80[ai],
+}
+
+
+class Distribution:
+    def __init__(self, dist, defaults, exclude_location=True):
+        self._dist = dist
+        self.DEFAULTS = list(defaults)
+        self._excl_loc = exclude_location
+        self.NAME = getattr(
+            dist, "__name__", getattr(type(dist), "__qualname__", repr(dist))
+        )
+
+    def pdf(self, x, *args):
+        # print(*args)
+        if self._excl_loc:
+            return self._dist.pdf(x, *args[:-1], 0.0, args[-1])
+        else:
+            return self._dist.pdf(x, *args)
+
+    def cdf(self, x, *args):
+        if self._excl_loc:
+            return self._dist.cdf(x, *args[:-1], 0.0, args[-1])
+        else:
+            return self._dist.cdf(x, *args)
+
+    def logcdf(self, x, *args):
+        print(*args)
+        if self._excl_loc:
+            return self._dist.logcdf(x, *args[:-1], 0.0, args[-1])
+        else:
+            return self._dist.logcdf(x, *args)
+
+
+estimator = {
+    "Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False),
+    "Target Distance": Distribution(
+        weibull_min, (1.0, 10000.0)
+    ),  # Distribution(genpareto, (0.0, 1.0)),
+    "Divergence Change": Distribution(norm, (0.0, 1.0), False),
+}
+
+
+def fit_dist(data, dist):
+    emp_cdf = ecdf(data).cdf
+
+    return curve_fit(
+        dist.cdf,
+        emp_cdf.quantiles,
+        emp_cdf.probabilities,
+        dist.DEFAULTS,
+        full_output=True,
+    )[0]
+
+
+for name in stats_to_compute:
+    join_stats[name] = {}
+    random_stats[name] = {}
+
+for k, annots in sorted(
+    joined_annots.items(), key=lambda k: min(min(v.target_start) for v in k[1])
+):
+    for ann in annots[:1]:
+        for i in range(len(ann.query)):
+            name = ann.query[i]
+            (pann, j) = prior_vals.get(name, (None, None))
+            if pann is not None:
+                for stat_name, values_per_query in random_stats.items():
+                    (ann1, idx1), (ann2, idx2) = sorted(
+                        [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]]
+                    )
+
+                    if name not in values_per_query:
+                        values_per_query[name] = []
+                    values_per_query[name].append(
+                        stats_to_compute[stat_name](ann1, ann2, idx1, idx2)
+                    )
+
+            seq_size[name] = max(
+                seq_size.get(name, 1),
+                ann.query_start[i],
+                ann.query_end[i],
+            )
+            prior_vals[name] = (ann, i)
+
+    if len(annots) <= 1:
+        continue
+
+    for a, b in zip(annots[:-1], annots[1:]):
+        for i in range(len(a.query)):
+            name = a.query[i]
+            for stat_name, values_per_query in join_stats.items():
+                if name not in values_per_query:
+                    values_per_query[name] = []
+                values_per_query[name].append(stats_to_compute[stat_name](a, b, i, i))
+
+
+for query_name, _ in sorted(
+    join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1])
+):
+    fig, axs = plt.subplots(3, len(stats_to_compute))
+    axs = axs.T
+
+    fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})")
+
+    for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs):
+        est = estimator[name]
+
+        sx = np.sort(join_stats[name][query_name])
+        fit = fit_dist(sx, est)
+        ax1.set_title(f"Join {name}")
+        ax1.hist(
+            join_stats[name][query_name],
+            50,
+            density=True,
+            label=f"Mean: {np.mean(join_stats[name][query_name]):.02f}\nSTD: {np.std(join_stats[name][query_name]):.02f}",
+        )
+        ax1.plot(
+            sx, est.pdf(sx, *fit), label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}"
+        )
+        ax1.legend()
+
+        sx2 = np.sort(random_stats[name][query_name])
+        fit2 = fit_dist(sx2, est)
+        ax2.set_title(f"All {name}")
+        ax2.hist(
+            random_stats[name][query_name],
+            50,
+            density=True,
+            label=f"Mean: {np.mean(random_stats[name][query_name]):.02f}\nSTD: {np.std(random_stats[name][query_name]):.02f}",
+        )
+        ax2.plot(
+            sx2,
+            est.pdf(sx2, *fit2),
+            label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}",
+        )
+        ax2.legend()
+
+        ax3.set_title("CDFs")
+        ax3.ecdf(join_stats[name][query_name], label="Joins CDF")
+        ax3.ecdf(random_stats[name][query_name], label="All CDF")
+        ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF")
+        ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF")
+        ax3.legend()
+
+    fig.set_size_inches(12, 8)
+    fig.tight_layout()
+    plt.show()
diff --git a/scripts/plot_distance_distribution.py b/scripts/plot_target_distance_distribution.py
similarity index 100%
rename from scripts/plot_distance_distribution.py
rename to scripts/plot_target_distance_distribution.py

From b3fc5e06bfe8b4840244436cf6a117533d631be5 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 5 May 2026 17:31:05 -0600
Subject: [PATCH 15/39] Further exploration of join distributions.

---
 .../plot_consensus_distance_repeatmasker.py   | 141 +++++++++----
 scripts/plot_distributions_aurora.py          | 190 +++++++++++++++---
 2 files changed, 257 insertions(+), 74 deletions(-)

diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py
index 5390df2..2bd07bd 100644
--- a/scripts/plot_consensus_distance_repeatmasker.py
+++ b/scripts/plot_consensus_distance_repeatmasker.py
@@ -9,12 +9,39 @@
 
 gap_info = {}
 length_estimate = {}
-prior = {}
+sequences = []
+
+
+def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True):
+    try:
+        if is_pos and ppos:
+            if check_valid:
+                assert pstart <= qstart <= qend and pstart <= pend <= qend
+            gap = qstart - pend
+        elif not is_pos and not ppos:
+            if check_valid:
+                assert qstart <= qend <= pend and qstart <= pstart <= pend
+            gap = pstart - qend
+        else:
+            gap = None
+    except AssertionError:
+        if is_pos and ppos:
+            print(f"Violation: {pstart} => {pend} => {qstart} => {qend}")
+        else:
+            print(f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}")
+
+        print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}")
+        print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}")
+        gap = None
+
+    return gap
+
 
 with open(caf_file, "r") as f:
     for line in f:
         shared_name = line.strip().split("\t")[3]  # .split("#")[-1]
         seqs = line.strip().split("\t")[-1].split(",")
+        sequences.extend(seqs)
 
         if len(seqs) <= 1:
             continue
@@ -29,6 +56,7 @@
             tokens = seq.split(" ")
             name = f"{tokens[9]}#{tokens[10]}"
             is_pos = tokens[8] == "+"
+            join_id = int(tokens[14])
 
             if is_pos:
                 qstart = int(tokens[11])
@@ -45,36 +73,13 @@
                 if pname != name:
                     gap = None
                 else:
-                    try:
-                        if is_pos and ppos:
-                            assert pstart <= qstart <= qend and pstart <= pend <= qend
-                            gap = qstart - pend
-                        elif not is_pos and not ppos:
-                            assert qstart <= qend <= pend and qstart <= pstart <= pend
-                            gap = pstart - qend
-                        else:
-                            gap = None
-                    except AssertionError:
-                        if is_pos and ppos:
-                            print(
-                                f"Violation: {pstart} => {pend} => {qstart} => {qend}"
-                            )
-                        else:
-                            print(
-                                f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}"
-                            )
-
-                        print(
-                            f"Offending annotation: {' '.join(line.split('\t')[:-1])}"
-                        )
-                        print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}")
-                        gap = None
-
-                    if gap is not None:
-                        if shared_name not in gap_info:
-                            gap_info[shared_name] = []
-
-                        gap_info[shared_name].append(gap)
+                    gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos)
+
+                if gap is not None:
+                    if shared_name not in gap_info:
+                        gap_info[shared_name] = []
+
+                    gap_info[shared_name].append(gap)
 
             prior = (name, is_pos, qstart, qend)
 
@@ -86,27 +91,81 @@
         )
 
 
-for gap_name, gap_vals in sorted(
+sequences.sort(key=lambda seq: int(seq.split(" ")[5]))
+gap_nojoin_info = {}
+other_priors = {}
+target_nojoin_info = {}
+
+for seq in sequences:
+    tokens = seq.split(" ")
+    name = f"{tokens[9]}#{tokens[10]}"
+    is_pos = tokens[8] == "+"
+    join_id = int(tokens[14])
+    tstart = int(tokens[5])
+
+    if is_pos:
+        qstart = int(tokens[11])
+        qend = int(tokens[12])
+    else:
+        qstart = int(tokens[13])
+        qend = int(tokens[12])
+
+    random_prior = other_priors.get(name, None)
+    if random_prior is not None:
+        ppos, pstart, pend, pjoin_id, p_tstart = random_prior
+        if pjoin_id == join_id:
+            gap = None
+        else:
+            gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos, False)
+
+        target_gap = tstart - p_tstart
+
+        if gap is not None:
+            if name not in target_nojoin_info:
+                target_nojoin_info[name] = []
+            target_nojoin_info[name].append(target_gap)
+
+            if name not in gap_nojoin_info:
+                gap_nojoin_info[name] = []
+            gap_nojoin_info[name].append(gap)
+
+    other_priors[name] = (is_pos, qstart, qend, join_id, tstart)
+
+
+for gap_name, gap_join_vals in sorted(
     gap_info.items(), key=lambda k: len(k[1]), reverse=True
 ):
-    gap_vals = np.array(gap_vals)
+    gap_join_vals = np.array(gap_join_vals)
     # gap_vals = gap_vals[np.abs(gap_vals) > 1]
 
     avg_len = length_estimate[gap_name][0] / length_estimate[gap_name][1]
     max_len = length_estimate[gap_name][2]
 
-    plt.title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})")
-
-    avg_pos_d = np.mean(gap_vals[gap_vals >= 0])
-    avg_neg_d = np.mean(gap_vals[gap_vals <= 0])
+    avg_pos_d = np.mean(gap_join_vals[gap_join_vals >= 0])
+    avg_neg_d = np.mean(gap_join_vals[gap_join_vals <= 0])
 
-    info = f"Positive Avg: {avg_pos_d:.02f}, Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})"
+    info = f"Positive Avg: {avg_pos_d:.02f},\n Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})"
 
-    range = (0, np.max(gap_vals))
+    range = (0, np.max(gap_join_vals))
     x = np.arange(*range)
     lamb = 1 / avg_pos_d
 
-    plt.hist(gap_vals, 100, label=info)
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, squeeze=True)
+    ax1.set_title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})")
+
+    ax1.hist(gap_join_vals, 50, label=info)
     # plt.plot(x, lamb * np.exp(-lamb * x))
-    plt.legend()
+    ax1.legend()
+
+    ax2.set_title(f"{gap_name} Non-Join Consensus Gaps")
+    ax2.hist(gap_nojoin_info[gap_name], 50)
+
+    ax3.set_title(f"{gap_name} Scatterplot")
+    ax3.scatter(target_nojoin_info[gap_name], gap_nojoin_info[gap_name])
+    ax3.set_xlabel("Target Distance")
+    ax3.set_ylabel("Consensus Distance")
+
+    w, h = fig.get_size_inches()
+    fig.set_size_inches(w * 3, h)
+    fig.tight_layout()
     plt.show()
diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py
index b0f368d..4d12d43 100644
--- a/scripts/plot_distributions_aurora.py
+++ b/scripts/plot_distributions_aurora.py
@@ -62,6 +62,25 @@ def from_line(cls, line: str) -> typing.Self:
 
         return cls(*parsed)
 
+    def get_key(self) -> tuple[int, int]:
+        return (self.region, self.join_id)
+
+    def select(self, idx: int) -> typing.Self:
+        return type(self)(
+            self.annotation_count,
+            self.target,
+            [self.target_start[idx]],
+            [self.target_end[idx]],
+            [self.query[idx]],
+            [self.query_start[idx]],
+            [self.query_end[idx]],
+            [self.strand[idx]],
+            self.score,
+            [self.kimera80[idx]],
+            self.join_id,
+            self.region,
+        )
+
 
 aurora_file = sys.argv[1]
 
@@ -70,7 +89,7 @@ def from_line(cls, line: str) -> typing.Self:
 with open(aurora_file, "r") as f:
     for line in f:
         entry = AuroraEntry.from_line(line)
-        key = (entry.region, entry.join_id)
+        key = entry.get_key()
         if key not in joined_annots:
             joined_annots[key] = []
         joined_annots[key].append(entry)
@@ -95,11 +114,13 @@ def best(*a):
         case ("-", "-"):
             return a.query_end[a_idx] - b.query_start[b_idx] - 1
         case ("-", "+"):
+            return None
             return best(
                 a.query_start[a_idx] - b.query_start[b_idx] - 1,
                 b.query_end[b_idx] - a.query_end[a_idx] - 1,
             )
         case ("+", "-"):
+            return None
             return best(
                 b.query_start[b_idx] - a.query_start[a_idx] - 1,
                 a.query_end[a_idx] - b.query_end[b_idx] - 1,
@@ -113,10 +134,34 @@ def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int):
     return b.target_start[bi] - a.target_end[ai] - 1
 
 
+def _kimura_dist(a, b, ai, bi):
+    return b.kimera80[bi] - a.kimera80[ai]
+
+
+def _relative_consensus_dist(a, b, ai, bi):
+    d = consensus_dist(a, b, ai, bi)
+    min_seq_len = min(
+        [
+            abs(v)
+            for v in (
+                a.query_end[ai] - a.query_start[ai],
+                b.query_end[bi] - b.query_start[bi],
+            )
+        ]
+    )
+    if d is None:
+        return None
+    try:
+        return d / min_seq_len
+    except ZeroDivisionError:
+        return None
+
+
 stats_to_compute = {
     "Consensus Distance": consensus_dist,
     "Target Distance": _target_distance,
-    "Divergence Change": lambda a, b, ai, bi: b.kimera80[bi] - a.kimera80[ai],
+    "Divergence Change": _kimura_dist,
+    "Relative Consensus Distance": _relative_consensus_dist,
 }
 
 
@@ -151,10 +196,11 @@ def logcdf(self, x, *args):
 
 
 estimator = {
+    "Relative Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False),
     "Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False),
     "Target Distance": Distribution(
-        weibull_min, (1.0, 10000.0)
-    ),  # Distribution(genpareto, (0.0, 1.0)),
+        genpareto, (0.0, 1.0)
+    ),  # Distribution(expon, (1.0,)),  # Distribution(genpareto, (0.0, 1.0)),
     "Divergence Change": Distribution(norm, (0.0, 1.0), False),
 }
 
@@ -171,51 +217,85 @@ def fit_dist(data, dist):
     )[0]
 
 
+random_idx_reference = {}
+random_is_join = {}
+
 for name in stats_to_compute:
     join_stats[name] = {}
     random_stats[name] = {}
 
-for k, annots in sorted(
-    joined_annots.items(), key=lambda k: min(min(v.target_start) for v in k[1])
-):
-    for ann in annots[:1]:
-        for i in range(len(ann.query)):
-            name = ann.query[i]
-            (pann, j) = prior_vals.get(name, (None, None))
-            if pann is not None:
-                for stat_name, values_per_query in random_stats.items():
-                    (ann1, idx1), (ann2, idx2) = sorted(
-                        [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]]
-                    )
+all_anots_flat = [ann for annots in joined_annots.values() for ann in annots]
+all_anots_flat.sort(key=lambda v: min(v.target_start))
+
+
+for ann_i, ann in enumerate(all_anots_flat):
+    for i in range(len(ann.query)):
+        name = ann.query[i]
+        (pann, j, pann_i) = prior_vals.get(name, (None, None, None))
+        if pann is not None:
+            # if pann.region == ann.region and pann.join_id == ann.join_id:
+            #    continue
+
+            (ann1, idx1), (ann2, idx2) = sorted(
+                [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]]
+            )
+
+            stats = {
+                stat_name: stats_to_compute[stat_name](ann1, ann2, idx1, idx2)
+                for stat_name in stats_to_compute
+            }
+
+            if all([v is not None for v in stats.values()]):
+                for stat_name, value in stats.items():
+                    values_per_query = random_stats[stat_name]
 
                     if name not in values_per_query:
                         values_per_query[name] = []
-                    values_per_query[name].append(
-                        stats_to_compute[stat_name](ann1, ann2, idx1, idx2)
-                    )
-
-            seq_size[name] = max(
-                seq_size.get(name, 1),
-                ann.query_start[i],
-                ann.query_end[i],
-            )
-            prior_vals[name] = (ann, i)
+                    values_per_query[name].append(value)
+
+                if name not in random_idx_reference:
+                    random_idx_reference[name] = []
+                    random_is_join[name] = []
+
+                random_idx_reference[name].append((ann_i, i, pann_i, j))
+                random_is_join[name].append(
+                    ann in joined_annots.get(pann.get_key(), [])
+                )
+
+        seq_size[name] = max(
+            seq_size.get(name, 1),
+            ann.query_start[i],
+            ann.query_end[i],
+        )
+        prior_vals[name] = (ann, i, ann_i)
 
+for k, annots in joined_annots.items():
     if len(annots) <= 1:
         continue
 
     for a, b in zip(annots[:-1], annots[1:]):
         for i in range(len(a.query)):
             name = a.query[i]
-            for stat_name, values_per_query in join_stats.items():
-                if name not in values_per_query:
-                    values_per_query[name] = []
-                values_per_query[name].append(stats_to_compute[stat_name](a, b, i, i))
+
+            stats = {
+                stat_name: stats_to_compute[stat_name](a, b, i, i)
+                for stat_name in stats_to_compute
+            }
+
+            if all([v is not None for v in stats.values()]):
+                for stat_name, value in stats.items():
+                    values_per_query = join_stats[stat_name]
+
+                    if name not in values_per_query:
+                        values_per_query[name] = []
+                    values_per_query[name].append(value)
 
 
 for query_name, _ in sorted(
     join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1])
 ):
+    # if not query_name.startswith("alu"):
+    #     continue
     fig, axs = plt.subplots(3, len(stats_to_compute))
     axs = axs.T
 
@@ -229,7 +309,7 @@ def fit_dist(data, dist):
         ax1.set_title(f"Join {name}")
         ax1.hist(
             join_stats[name][query_name],
-            50,
+            100,
             density=True,
             label=f"Mean: {np.mean(join_stats[name][query_name]):.02f}\nSTD: {np.std(join_stats[name][query_name]):.02f}",
         )
@@ -243,7 +323,7 @@ def fit_dist(data, dist):
         ax2.set_title(f"All {name}")
         ax2.hist(
             random_stats[name][query_name],
-            50,
+            100,
             density=True,
             label=f"Mean: {np.mean(random_stats[name][query_name]):.02f}\nSTD: {np.std(random_stats[name][query_name]):.02f}",
         )
@@ -261,6 +341,50 @@ def fit_dist(data, dist):
         ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF")
         ax3.legend()
 
-    fig.set_size_inches(12, 8)
+    fig.set_size_inches(16, 8)
     fig.tight_layout()
     plt.show()
+
+    plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})")
+
+    join_indexes = np.flatnonzero(random_is_join[query_name])
+    not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name]))
+
+    join_art = plt.plot(
+        np.array(random_stats["Target Distance"][query_name])[join_indexes],
+        np.array(random_stats["Consensus Distance"][query_name])[join_indexes],
+        "ro",
+        picker=5,
+        label="Joins",
+    )
+    no_join_art = plt.plot(
+        np.array(random_stats["Target Distance"][query_name])[not_join_indexes],
+        np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes],
+        "bo",
+        picker=5,
+        label="Not Joins",
+    )
+    plt.xlabel("Target Distance")
+    plt.ylabel("Consensus Distance")
+    plt.legend()
+    fig = plt.gcf()
+
+    def on_pick(evt):
+        mask = join_indexes if evt.artist == join_art else not_join_indexes
+
+        for idx in evt.ind:
+            idx = mask[idx]
+            annot_idx, sub_i, pann_idx, p_sub_i = random_idx_reference[query_name][idx]
+            print(f"Index: {annot_idx}, Sub-Index: {sub_i}")
+            print(
+                f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}"
+            )
+            print(
+                f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}"
+            )
+            print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}")
+            print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}")
+            print(f"\tIs Joined: {random_is_join[query_name][idx]}")
+
+    fig.canvas.mpl_connect("pick_event", on_pick)
+    plt.show()

From 0d2bba685cafa9493e0578e0bffcb1bcf9ec64f2 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 6 May 2026 17:49:50 -0600
Subject: [PATCH 16/39] Join estimation WIP

---
 Cargo.toml                                   |   1 +
 scripts/plot_distributions_aurora.py         | 205 +++++++++++--------
 scripts/plot_target_distance_distribution.py |  33 ---
 src/join_estimation.rs                       |  34 +++
 src/main.rs                                  |   1 +
 src/statistics.rs                            |  34 +++
 6 files changed, 188 insertions(+), 120 deletions(-)
 create mode 100644 src/join_estimation.rs

diff --git a/Cargo.toml b/Cargo.toml
index 24d0d77..c5b43b6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ serde_json = "1.0.93"
 itertools = "0.11.0"
 rayon = "1.8.0"
 base64 = "0.22.1"
+puruspe = "0.4.4"
 
 [target.'cfg(not(target_env = "msvc"))'.dependencies]
 tikv-jemallocator = "0.5"
diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py
index 4d12d43..31609d0 100644
--- a/scripts/plot_distributions_aurora.py
+++ b/scripts/plot_distributions_aurora.py
@@ -1,6 +1,7 @@
 import sys
 import typing
 from dataclasses import dataclass, fields
+from pathlib import Path
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -82,7 +83,22 @@ def select(self, idx: int) -> typing.Self:
         )
 
 
+if len(sys.argv) not in [2, 3]:
+    print("Usage:")
+    print(f"\t{Path(sys.argv[0]).name} AURORA_OUTPUT_FILE [dist|scatter]")
+    sys.exit(1)
+
 aurora_file = sys.argv[1]
+mode = sys.argv[2] if len(sys.argv) > 2 else "dist"
+
+if mode not in ["scatter", "dist"]:
+    print("Second argument (the mode) must be 'scatter' or 'dist'!")
+    sys.exit(1)
+
+if mode == "dist":
+    print("Generating distributions plots...")
+else:
+    print("Generating scatter plots...")
 
 joined_annots: dict[tuple[int, int], list[AuroraEntry]] = {}
 
@@ -140,7 +156,7 @@ def _kimura_dist(a, b, ai, bi):
 
 def _relative_consensus_dist(a, b, ai, bi):
     d = consensus_dist(a, b, ai, bi)
-    min_seq_len = min(
+    sum_seq_len = sum(
         [
             abs(v)
             for v in (
@@ -152,7 +168,7 @@ def _relative_consensus_dist(a, b, ai, bi):
     if d is None:
         return None
     try:
-        return d / min_seq_len
+        return d / sum_seq_len
     except ZeroDivisionError:
         return None
 
@@ -296,95 +312,110 @@ def fit_dist(data, dist):
 ):
     # if not query_name.startswith("alu"):
     #     continue
-    fig, axs = plt.subplots(3, len(stats_to_compute))
-    axs = axs.T
-
-    fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})")
-
-    for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs):
-        est = estimator[name]
-
-        sx = np.sort(join_stats[name][query_name])
-        fit = fit_dist(sx, est)
-        ax1.set_title(f"Join {name}")
-        ax1.hist(
-            join_stats[name][query_name],
-            100,
-            density=True,
-            label=f"Mean: {np.mean(join_stats[name][query_name]):.02f}\nSTD: {np.std(join_stats[name][query_name]):.02f}",
-        )
-        ax1.plot(
-            sx, est.pdf(sx, *fit), label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}"
-        )
-        ax1.legend()
-
-        sx2 = np.sort(random_stats[name][query_name])
-        fit2 = fit_dist(sx2, est)
-        ax2.set_title(f"All {name}")
-        ax2.hist(
-            random_stats[name][query_name],
-            100,
-            density=True,
-            label=f"Mean: {np.mean(random_stats[name][query_name]):.02f}\nSTD: {np.std(random_stats[name][query_name]):.02f}",
-        )
-        ax2.plot(
-            sx2,
-            est.pdf(sx2, *fit2),
-            label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}",
-        )
-        ax2.legend()
+    join_indexes = np.flatnonzero(random_is_join[query_name])
+    not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name]))
 
-        ax3.set_title("CDFs")
-        ax3.ecdf(join_stats[name][query_name], label="Joins CDF")
-        ax3.ecdf(random_stats[name][query_name], label="All CDF")
-        ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF")
-        ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF")
-        ax3.legend()
+    if mode == "dist":
+        fig, axs = plt.subplots(3, len(stats_to_compute))
+        axs = axs.T
 
-    fig.set_size_inches(16, 8)
-    fig.tight_layout()
-    plt.show()
+        fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})")
 
-    plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})")
+        for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs):
+            est = estimator[name]
 
-    join_indexes = np.flatnonzero(random_is_join[query_name])
-    not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name]))
-
-    join_art = plt.plot(
-        np.array(random_stats["Target Distance"][query_name])[join_indexes],
-        np.array(random_stats["Consensus Distance"][query_name])[join_indexes],
-        "ro",
-        picker=5,
-        label="Joins",
-    )
-    no_join_art = plt.plot(
-        np.array(random_stats["Target Distance"][query_name])[not_join_indexes],
-        np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes],
-        "bo",
-        picker=5,
-        label="Not Joins",
-    )
-    plt.xlabel("Target Distance")
-    plt.ylabel("Consensus Distance")
-    plt.legend()
-    fig = plt.gcf()
-
-    def on_pick(evt):
-        mask = join_indexes if evt.artist == join_art else not_join_indexes
-
-        for idx in evt.ind:
-            idx = mask[idx]
-            annot_idx, sub_i, pann_idx, p_sub_i = random_idx_reference[query_name][idx]
-            print(f"Index: {annot_idx}, Sub-Index: {sub_i}")
-            print(
-                f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}"
+            join_samples = np.array(join_stats[name][query_name])
+            sx = np.linspace(join_samples.min(), join_samples.max(), 1000)
+            fit = fit_dist(join_samples, est)
+            ax1.set_title(f"Join {name}")
+            ax1.hist(
+                join_samples,
+                100,
+                density=True,
+                label=f"Mean: {np.mean(join_samples):.02f}\nSTD: {np.std(join_samples):.02f}",
+            )
+            ax1.plot(
+                sx,
+                est.pdf(sx, *fit),
+                label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}",
+            )
+            ax1.legend()
+
+            random_samples = np.array(random_stats[name][query_name])
+            sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000)
+            fit2 = fit_dist(random_samples, est)
+            ax2.set_title(f"All {name}")
+            ax2.plot(
+                [0],
+                [0],
+                color="black",
+                visible=False,
+                label=f"Mean: {np.mean(random_samples):.02f}\nSTD: {np.std(random_samples):.02f}",
             )
-            print(
-                f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}"
+            ax2.hist(
+                [random_samples[not_join_indexes], random_samples[join_indexes]],
+                100,
+                label=["Not Joined", "Joined"],
+                density=True,
+                stacked=True,
             )
-            print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}")
-            print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}")
-            print(f"\tIs Joined: {random_is_join[query_name][idx]}")
+            ax2.plot(
+                sx2,
+                est.pdf(sx2, *fit2),
+                label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}",
+            )
+            ax2.legend()
+
+            ax3.set_title("CDFs")
+            ax3.ecdf(join_stats[name][query_name], label="Joins CDF")
+            ax3.ecdf(random_stats[name][query_name], label="All CDF")
+            ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF")
+            ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF")
+            ax3.legend()
+
+        fig.set_size_inches(16, 8)
+        fig.tight_layout()
+        plt.show()
+    else:
+        plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})")
+
+        join_art = plt.plot(
+            np.array(random_stats["Target Distance"][query_name])[join_indexes],
+            np.array(random_stats["Consensus Distance"][query_name])[join_indexes],
+            "ro",
+            picker=5,
+            label="Joins",
+        )
+        no_join_art = plt.plot(
+            np.array(random_stats["Target Distance"][query_name])[not_join_indexes],
+            np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes],
+            "bo",
+            picker=5,
+            label="Not Joins",
+        )
+        plt.xlabel("Target Distance")
+        plt.ylabel("Consensus Distance")
+        plt.legend()
+        fig = plt.gcf()
+
+        def on_pick(evt):
+            """Print the annotation pair behind each picked scatter point."""
+            # plt.plot returns a list of Line2D, so test membership, not equality.
+            mask = join_indexes if evt.artist in join_art else not_join_indexes
+            for idx in evt.ind:
+                idx = mask[idx]  # position in picked series -> per-query list index
+                ref = random_idx_reference[query_name][idx]
+                annot_idx, sub_i, pann_idx, p_sub_i = ref
+                print(f"Index: {annot_idx}, Sub-Index: {sub_i}")
+                print(
+                    f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}"
+                )
+                print(
+                    f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}"
+                )
+                print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}")
+                print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}")
+                print(f"\tIs Joined: {random_is_join[query_name][idx]}")
 
-    fig.canvas.mpl_connect("pick_event", on_pick)
-    plt.show()
+        fig.canvas.mpl_connect("pick_event", on_pick)
+        plt.show()
diff --git a/scripts/plot_target_distance_distribution.py b/scripts/plot_target_distance_distribution.py
index 3e64a73..c107e36 100755
--- a/scripts/plot_target_distance_distribution.py
+++ b/scripts/plot_target_distance_distribution.py
@@ -12,27 +12,6 @@
 # For now, an exponential distribution seems to provide a good enough approximation for use in aurora. It also is easy to fit well.
 # Here's an interesting paper on the topic that seems to have landed in the same space I've been in: https://www.columbia.edu/~ww2040/FittingMixturesPerfEval98.pdf
 
-import sys
-from inspect import signature
-
-import matplotlib.pyplot as plt
-import numpy as np
-from scipy.optimize import curve_fit
-from scipy.stats import (
-    betaprime,
-    burr12,
-    ecdf,
-    expon,
-    fisk,
-    genextreme,
-    genpareto,
-    invweibull,
-    linregress,
-    lognorm,
-    lomax,
-    weibull_min,
-)
-
 bed_file = sys.argv[1]
 
 seq_info = {}
@@ -126,18 +105,6 @@ def logcdf(self, x, *args):
     beta = np.mean(dists)  # np.median(dists) / np.log(2)
 
     # Get fit for weibull dist... (and CDF)...
-    cdf = ecdf(dists).cdf
-
-    y_transform = lambda y: np.log(-np.log(1 - y))
-
-    with np.errstate(divide="ignore"):
-        wcdfx = np.log(cdf.quantiles)
-        wcdfy = y_transform(cdf.probabilities)
-        # Estimate the error that the transform adds to the line. This makes linear fit better fit a CDF...
-        wcdfy_p1 = y_transform(
-            cdf.probabilities - np.sign(cdf.probabilities - 0.5) * 0.01
-        )
-        wcdfy_err = np.maximum(np.abs((wcdfy_p1 - wcdfy) / 0.01), 1e-8)
 
     valid_values = np.isfinite(wcdfx) & np.isfinite(wcdfy)
     fit_line = curve_fit(
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
new file mode 100644
index 0000000..abef24f
--- /dev/null
+++ b/src/join_estimation.rs
@@ -0,0 +1,34 @@
+use crate::{
+    segments::Block,
+    statistics::{ExponentialEstimator, StudentsT},
+};
+
+trait JoinEstimator {
+    fn predict(&self, first_block: &Block, second_block: &Block) -> f64;
+}
+
+trait JoinStatistics<T: JoinEstimator> {
+    fn new() -> Self;
+    fn combine(&self, other: &Self) -> Self;
+    fn add(&self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
+    fn to_estimator(&self) -> T;
+}
+
+struct BayesianJoinEstimator {
+    target_distance_join: ExponentialEstimator,
+    target_distance_background: ExponentialEstimator,
+    divergence_join: StudentsT,
+    divergence_background: StudentsT,
+}
+
+struct BayesianJoinStatistics {
+    joinable_target_distance_sum: usize,
+    all_target_distance_sum: usize,
+    divergence_sum: f64,
+    divergence_square_sum: f64,
+    join_divergence_sum: f64,
+    join_divergence_square_sum: f64,
+    divergence_offset: f64,
+    joinable_count: usize,
+    all_count: usize,
+}
diff --git a/src/main.rs b/src/main.rs
index 2f1e11a..927fbf9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,6 +6,7 @@ mod balanced_tree;
 mod chunks;
 mod confidence;
 mod history_tracing;
+mod join_estimation;
 mod matrix;
 mod pipeline;
 mod score_params;
diff --git a/src/statistics.rs b/src/statistics.rs
index 309c7ff..0b48788 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -1,3 +1,4 @@
+use puruspe::{beta, betai};
 use std::fmt::Debug;
 
 #[allow(dead_code)]
@@ -142,3 +143,36 @@ impl Distribution for ExponentialEstimator {
         (0.0, f64::INFINITY)
     }
 }
+
+#[derive(Debug, Clone)]
+pub struct StudentsT {
+    mean: f64,
+    standard_deviation: f64,
+    degrees_of_freedom: usize,
+}
+
+impl Distribution for StudentsT {
+    fn unit() -> Self {
+        Self {
+            mean: 0.0,
+            standard_deviation: 1.0,
+            degrees_of_freedom: 1,
+        }
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        let (v, s) = (self.degrees_of_freedom as f64, self.standard_deviation);
+        let z = (x - self.mean) / s; // standardize; the density of z must then be divided by s
+        (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0)) / (s * v.sqrt() * beta(0.5, 0.5 * v))
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        let v = self.degrees_of_freedom as f64;
+        let z = (x - self.mean) / self.standard_deviation;
+        if z >= 0.0 {
+            1.0 - 0.5 * betai(0.5 * v, 0.5, v / (z * z + v))
+        } else {
+            0.5 * betai(0.5 * v, 0.5, v / (z * z + v))
+        }
+    }
+}

From 0601c8bccc302da7e627dd3cd38833942802fb09 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 8 May 2026 00:03:21 -0600
Subject: [PATCH 17/39] WIP Half t distribution.

---
 scripts/plot_distributions_aurora.py |  42 ++++++----
 src/join_estimation.rs               | 110 ++++++++++++++++++++++++---
 src/statistics.rs                    |  99 ++++++++++++++++++++++--
 3 files changed, 221 insertions(+), 30 deletions(-)

diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py
index 31609d0..d76eb97 100644
--- a/scripts/plot_distributions_aurora.py
+++ b/scripts/plot_distributions_aurora.py
@@ -6,7 +6,17 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from scipy.optimize import curve_fit
-from scipy.stats import ecdf, expon, genpareto, gumbel_r, invweibull, norm, weibull_min
+from scipy.stats import (
+    ecdf,
+    expon,
+    genpareto,
+    gumbel_r,
+    halfnorm,
+    invweibull,
+    laplace_asymmetric,
+    norm,
+    weibull_min,
+)
 
 
 @dataclass
@@ -151,12 +161,12 @@ def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int):
 
 
 def _kimura_dist(a, b, ai, bi):
-    return b.kimera80[bi] - a.kimera80[ai]
+    return abs(b.kimera80[bi] - a.kimera80[ai])
 
 
 def _relative_consensus_dist(a, b, ai, bi):
     d = consensus_dist(a, b, ai, bi)
-    sum_seq_len = sum(
+    sum_seq_len = max(
         [
             abs(v)
             for v in (
@@ -212,12 +222,16 @@ def logcdf(self, x, *args):
 
 
 estimator = {
-    "Relative Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False),
-    "Consensus Distance": Distribution(invweibull, (1.0, 0.0, 1.0), False),
+    "Relative Consensus Distance": Distribution(
+        laplace_asymmetric, (1.0, 0.0, 1.0), False
+    ),  # Distribution(invweibull, (1.0, 0.0, 1.0), False),
+    "Consensus Distance": Distribution(
+        laplace_asymmetric, (1.0, 0.0, 1.0), False
+    ),  # Distribution(invweibull, (1.0, 0.0, 1.0), False),
     "Target Distance": Distribution(
-        genpareto, (0.0, 1.0)
+        weibull_min, (1.0, 10000)
     ),  # Distribution(expon, (1.0,)),  # Distribution(genpareto, (0.0, 1.0)),
-    "Divergence Change": Distribution(norm, (0.0, 1.0), False),
+    "Divergence Change": Distribution(halfnorm, (1.0,)),
 }
 
 
@@ -310,8 +324,8 @@ def fit_dist(data, dist):
 for query_name, _ in sorted(
     join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1])
 ):
-    # if not query_name.startswith("alu"):
-    #     continue
+    # if not query_name.startswith("sin"):
+    #    continue
     join_indexes = np.flatnonzero(random_is_join[query_name])
     not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name]))
 
@@ -330,7 +344,7 @@ def fit_dist(data, dist):
             ax1.set_title(f"Join {name}")
             ax1.hist(
                 join_samples,
-                100,
+                200,
                 density=True,
                 label=f"Mean: {np.mean(join_samples):.02f}\nSTD: {np.std(join_samples):.02f}",
             )
@@ -339,7 +353,7 @@ def fit_dist(data, dist):
                 est.pdf(sx, *fit),
                 label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}",
             )
-            ax1.legend()
+            ax1.legend(fontsize="xx-small")
 
             random_samples = np.array(random_stats[name][query_name])
             sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000)
@@ -354,7 +368,7 @@ def fit_dist(data, dist):
             )
             ax2.hist(
                 [random_samples[not_join_indexes], random_samples[join_indexes]],
-                100,
+                200,
                 label=["Not Joined", "Joined"],
                 density=True,
                 stacked=True,
@@ -364,14 +378,14 @@ def fit_dist(data, dist):
                 est.pdf(sx2, *fit2),
                 label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}",
             )
-            ax2.legend()
+            ax2.legend(fontsize="xx-small")
 
             ax3.set_title("CDFs")
             ax3.ecdf(join_stats[name][query_name], label="Joins CDF")
             ax3.ecdf(random_stats[name][query_name], label="All CDF")
             ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF")
             ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF")
-            ax3.legend()
+            ax3.legend(fontsize="xx-small")
 
         fig.set_size_inches(16, 8)
         fig.tight_layout()
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index abef24f..44a7295 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,34 +1,126 @@
 use crate::{
+    assembly::block_target_distance,
     segments::Block,
-    statistics::{ExponentialEstimator, StudentsT},
+    statistics::{Distribution, ExponentialEstimator, StudentsT},
 };
 
-trait JoinEstimator {
-    fn predict(&self, first_block: &Block, second_block: &Block) -> f64;
+pub trait JoinEstimator<T: JoinStatistics> {
+    fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64;
+    fn from_statistics(statistics: T) -> Self;
 }
 
-trait JoinStatistics<T: JoinEstimator> {
+pub trait JoinStatistics {
     fn new() -> Self;
     fn combine(&self, other: &Self) -> Self;
-    fn add(&self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
-    fn to_estimator(&self) -> T;
+    fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
 }
 
-struct BayesianJoinEstimator {
+pub struct BayesianJoinEstimator {
     target_distance_join: ExponentialEstimator,
     target_distance_background: ExponentialEstimator,
     divergence_join: StudentsT,
     divergence_background: StudentsT,
 }
 
-struct BayesianJoinStatistics {
+impl JoinEstimator<BayesianJoinStatistics> for BayesianJoinEstimator {
+    fn from_statistics(statistics: BayesianJoinStatistics) -> Self {
+        let join_td_mean =
+            statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64;
+        let all_td_mean = statistics.all_target_distance_sum as f64 / statistics.all_count as f64;
+
+        // Divergence distributions should have a mean of 0, so we assume that...
+        let join_div_std = statistics.join_divergence_square_sum / statistics.joinable_count as f64;
+        let all_div_std = statistics.divergence_square_sum / statistics.all_count as f64;
+
+        Self {
+            target_distance_join: ExponentialEstimator::new(
+                join_td_mean,
+                statistics.joinable_count,
+            ),
+            target_distance_background: ExponentialEstimator::new(
+                all_td_mean,
+                statistics.all_count,
+            ),
+            divergence_join: StudentsT::new(0.0, join_div_std, statistics.joinable_count),
+            divergence_background: StudentsT::new(0.0, all_div_std, statistics.all_count),
+        }
+    }
+
+    fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64 {
+        let prior_acc: f64 = 0.95; // Accuracy of the prior estimator of joins...
+        let target_dist = block_target_distance(first_block, second_block) as f64;
+        // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0...
+        let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
+
+        let target_likelihood = self.target_distance_join.logccdf(target_dist)
+            - self.target_distance_background.logccdf(target_dist);
+        let diverg_likelihood = self.divergence_join.logccdf(divergence_diff)
+            - self.divergence_background.logccdf(divergence_diff);
+
+        let score = target_likelihood + diverg_likelihood + prior_acc.ln();
+
+        if log_space {
+            score
+        } else {
+            score.exp()
+        }
+    }
+}
+
+pub struct BayesianJoinStatistics {
     joinable_target_distance_sum: usize,
     all_target_distance_sum: usize,
     divergence_sum: f64,
     divergence_square_sum: f64,
     join_divergence_sum: f64,
     join_divergence_square_sum: f64,
-    divergence_offset: f64,
     joinable_count: usize,
     all_count: usize,
 }
+
+impl JoinStatistics for BayesianJoinStatistics {
+    fn new() -> Self {
+        Self {
+            joinable_target_distance_sum: 0,
+            all_target_distance_sum: 0,
+            divergence_sum: 0.0,
+            divergence_square_sum: 0.0,
+            join_divergence_sum: 0.0,
+            join_divergence_square_sum: 0.0,
+            joinable_count: 0,
+            all_count: 0,
+        }
+    }
+
+    fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool) {
+        let target_dist = block_target_distance(first_block, second_block).abs() as usize;
+        let divergence_diff = second_block.kimura80 - first_block.kimura80;
+
+        if joinable {
+            self.joinable_target_distance_sum += target_dist;
+            self.join_divergence_sum += divergence_diff;
+            self.join_divergence_square_sum += divergence_diff * divergence_diff;
+            self.joinable_count += 1;
+        }
+
+        self.all_target_distance_sum += target_dist;
+        self.divergence_sum += divergence_diff;
+        self.divergence_square_sum += divergence_diff * divergence_diff;
+        self.all_count += 1;
+    }
+
+    fn combine(&self, other: &Self) -> Self {
+        Self {
+            joinable_target_distance_sum: self.joinable_target_distance_sum
+                + other.joinable_target_distance_sum,
+            all_target_distance_sum: self.all_target_distance_sum + other.all_target_distance_sum,
+            divergence_sum: self.divergence_sum + other.divergence_sum,
+            divergence_square_sum: self.divergence_square_sum + other.divergence_square_sum,
+            join_divergence_sum: self.join_divergence_sum + other.join_divergence_sum,
+            join_divergence_square_sum: self.join_divergence_square_sum
+                + other.join_divergence_square_sum,
+            joinable_count: self.joinable_count + other.joinable_count,
+            all_count: self.all_count + other.all_count,
+        }
+    }
+}
diff --git a/src/statistics.rs b/src/statistics.rs
index 0b48788..875a75a 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -1,5 +1,5 @@
-use puruspe::{beta, betai};
-use std::fmt::Debug;
+use puruspe::{beta, betai, invbetai};
+use std::{f64, fmt::Debug};
 
 #[allow(dead_code)]
 pub trait Distribution: Clone + Debug {
@@ -151,6 +151,16 @@ pub struct StudentsT {
     degrees_of_freedom: usize,
 }
 
+impl StudentsT {
+    pub fn new(mean: f64, standard_deviation: f64, degrees_of_freedom: usize) -> Self {
+        Self {
+            mean,
+            standard_deviation,
+            degrees_of_freedom,
+        }
+    }
+}
+
 impl Distribution for StudentsT {
     fn unit() -> Self {
         Self {
@@ -162,17 +172,92 @@ impl Distribution for StudentsT {
 
     fn pdf(&self, x: f64) -> f64 {
         let v = self.degrees_of_freedom as f64;
-        let z = (x - self.mean) / self.standard_deviation;
-        1.0 / (v.sqrt() * beta(0.5, 0.5 * v)) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0))
+        let s = self.standard_deviation;
+        let z = (x - self.mean) / s;
+        1.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0))
     }
 
     fn cdf(&self, x: f64) -> f64 {
         let v = self.degrees_of_freedom as f64;
         let z = (x - self.mean) / self.standard_deviation;
-        if z >= 0.0 {
-            1.0 - 0.5 * betai(0.5 * v, 0.5, v / (z * z + v))
+        let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v));
+        if z > 0.0 {
+            1.0 - 0.5 * beta_comp
         } else {
-            0.5 * betai(0.5 * v, 0.5, v / (z * z + v))
+            0.5 * beta_comp
+        }
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let v = self.degrees_of_freedom as f64;
+        let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) };
+        let inv_out = invbetai(p_in, 0.5 * v, 0.5);
+        let x_unit = (v / inv_out - v).sqrt();
+        x_unit * self.standard_deviation + self.mean
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (f64::NEG_INFINITY, f64::INFINITY)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct HalfT {
+    standard_deviation: f64,
+    degrees_of_freedom: usize,
+}
+
+impl HalfT {
+    pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self {
+        Self {
+            standard_deviation,
+            degrees_of_freedom,
+        }
+    }
+
+    pub fn from_sample_mean(mean: f64, degrees_of_freedom: usize) -> Self {
+        Self {
+            standard_deviation: mean * (2.0 / f64::consts::PI).sqrt(),
+            degrees_of_freedom,
         }
     }
 }
+
+impl Distribution for HalfT {
+    fn unit() -> Self {
+        Self {
+            standard_deviation: 1.0,
+            degrees_of_freedom: 1,
+        }
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        let v = self.degrees_of_freedom as f64;
+        let s = self.standard_deviation;
+        let z = x / s;
+        2.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0))
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        let v = self.degrees_of_freedom as f64;
+        let z = x / self.standard_deviation;
+        let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v));
+        if z > 0.0 {
+            1.0 - 0.5 * beta_comp
+        } else {
+            0.5 * beta_comp
+        }
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let v = self.degrees_of_freedom as f64;
+        let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) };
+        let inv_out = invbetai(p_in, 0.5 * v, 0.5);
+        let x_unit = (v / inv_out - v).sqrt();
+        x_unit * self.standard_deviation
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (0.0, f64::INFINITY)
+    }
+}

From 597a2606c3331c13936e8f3988fe8f3060593342 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 8 May 2026 01:23:35 -0600
Subject: [PATCH 18/39] Init impl of HalfT, needs testing...

---
 src/statistics.rs | 108 +++++++++++++++-------------------------------
 1 file changed, 35 insertions(+), 73 deletions(-)

diff --git a/src/statistics.rs b/src/statistics.rs
index 875a75a..db2497b 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -1,5 +1,6 @@
-use puruspe::{beta, betai, invbetai};
-use std::{f64, fmt::Debug};
+use core::f64;
+use puruspe::{beta, betai, erf, invbetai};
+use std::fmt::Debug;
 
 #[allow(dead_code)]
 pub trait Distribution: Clone + Debug {
@@ -8,19 +9,10 @@ pub trait Distribution: Clone + Debug {
     fn cdf(&self, x: f64) -> f64;
     fn ppf(&self, p: f64) -> f64;
     fn support(&self) -> (f64, f64);
-
-    fn ccdf(&self, x: f64) -> f64 {
-        1.0 - self.cdf(x)
-    }
-    fn logpdf(&self, x: f64) -> f64 {
-        self.pdf(x).ln()
-    }
-    fn logcdf(&self, x: f64) -> f64 {
-        self.cdf(x).ln()
-    }
-    fn logccdf(&self, x: f64) -> f64 {
-        self.ccdf(x).ln()
-    }
+    fn ccdf(&self, x: f64) -> f64;
+    fn logpdf(&self, x: f64) -> f64;
+    fn logcdf(&self, x: f64) -> f64;
+    fn logccdf(&self, x: f64) -> f64;
 }
 
 #[derive(Clone, Debug)]
@@ -151,56 +143,6 @@ pub struct StudentsT {
     degrees_of_freedom: usize,
 }
 
-impl StudentsT {
-    pub fn new(mean: f64, standard_deviation: f64, degrees_of_freedom: usize) -> Self {
-        Self {
-            mean,
-            standard_deviation,
-            degrees_of_freedom,
-        }
-    }
-}
-
-impl Distribution for StudentsT {
-    fn unit() -> Self {
-        Self {
-            mean: 0.0,
-            standard_deviation: 1.0,
-            degrees_of_freedom: 1,
-        }
-    }
-
-    fn pdf(&self, x: f64) -> f64 {
-        let v = self.degrees_of_freedom as f64;
-        let s = self.standard_deviation;
-        let z = (x - self.mean) / s;
-        1.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0))
-    }
-
-    fn cdf(&self, x: f64) -> f64 {
-        let v = self.degrees_of_freedom as f64;
-        let z = (x - self.mean) / self.standard_deviation;
-        let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v));
-        if z > 0.0 {
-            1.0 - 0.5 * beta_comp
-        } else {
-            0.5 * beta_comp
-        }
-    }
-
-    fn ppf(&self, p: f64) -> f64 {
-        let v = self.degrees_of_freedom as f64;
-        let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) };
-        let inv_out = invbetai(p_in, 0.5 * v, 0.5);
-        let x_unit = (v / inv_out - v).sqrt();
-        x_unit * self.standard_deviation + self.mean
-    }
-
-    fn support(&self) -> (f64, f64) {
-        (f64::NEG_INFINITY, f64::INFINITY)
-    }
-}
-
 #[derive(Debug, Clone)]
 pub struct HalfT {
     standard_deviation: f64,
@@ -231,33 +173,53 @@ impl Distribution for HalfT {
         }
     }
 
-    fn pdf(&self, x: f64) -> f64 {
+    fn logpdf(&self, x: f64) -> f64 {
         let v = self.degrees_of_freedom as f64;
         let s = self.standard_deviation;
         let z = x / s;
-        2.0 / (v.sqrt() * beta(0.5, 0.5 * v) * s) * (1.0 + (z * z) / v).powf(-0.5 * (v + 1.0))
+        if z >= 0.0 {
+            let norm = (2.0_f64).ln() - (0.5 * v.ln() + beta(0.5, 0.5 * v).ln() + s.ln());
+            norm - 0.5 * (v + 1.0) * ((z * z) / v).ln_1p()
+        } else {
+            0.0
+        }
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.logpdf(x).exp()
     }
 
     fn cdf(&self, x: f64) -> f64 {
         let v = self.degrees_of_freedom as f64;
         let z = x / self.standard_deviation;
-        let beta_comp = betai(0.5 * v, 0.5, v / (z * z + v));
-        if z > 0.0 {
-            1.0 - 0.5 * beta_comp
+        if z >= 0.0 {
+            1.0 - betai(0.5 * v, 0.5, v / (z * z + v))
         } else {
-            0.5 * beta_comp
+            0.0
         }
     }
 
+    fn logcdf(&self, x: f64) -> f64 {
+        self.cdf(x).ln()
+    }
+
     fn ppf(&self, p: f64) -> f64 {
         let v = self.degrees_of_freedom as f64;
-        let p_in = if p <= 0.5 { 2.0 * p } else { 2.0 * (1.0 - p) };
-        let inv_out = invbetai(p_in, 0.5 * v, 0.5);
+        let inv_out = invbetai(1.0 - p, 0.5 * v, 0.5);
         let x_unit = (v / inv_out - v).sqrt();
         x_unit * self.standard_deviation
     }
 
+    fn ccdf(&self, x: f64) -> f64 {
+        1.0 - self.cdf(x)
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        self.ccdf(x).ln()
+    }
+
     fn support(&self) -> (f64, f64) {
         (0.0, f64::INFINITY)
     }
 }
+

From daa195359e15e0c7480899a1b3f8715aab143db6 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 8 May 2026 16:18:36 -0600
Subject: [PATCH 19/39] Start adding tests for distributions.

---
 .zed/settings.json     |  16 ++++++
 src/join_estimation.rs |  10 ++--
 src/statistics.rs      | 107 ++++++++++++++++++++++++++++++++++++++---
 3 files changed, 120 insertions(+), 13 deletions(-)
 create mode 100644 .zed/settings.json

diff --git a/.zed/settings.json b/.zed/settings.json
new file mode 100644
index 0000000..d229d6c
--- /dev/null
+++ b/.zed/settings.json
@@ -0,0 +1,16 @@
+// Folder-specific settings
+//
+// For a full list of overridable settings, and general information on folder-specific settings,
+// see the documentation: https://zed.dev/docs/configuring-zed#settings-files
+{
+  "terminal": {
+    "detect_venv": {
+      "on": {
+        "directories": [
+          ".venv"
+        ],
+        "activate_script": "default"
+      }
+    }
+  }
+}
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 44a7295..7dd9d56 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,7 +1,7 @@
 use crate::{
     assembly::block_target_distance,
     segments::Block,
-    statistics::{Distribution, ExponentialEstimator, StudentsT},
+    statistics::{Distribution, ExponentialEstimator, HalfT},
 };
 
 pub trait JoinEstimator<T: JoinStatistics> {
@@ -18,8 +18,8 @@ pub trait JoinStatistics {
 pub struct BayesianJoinEstimator {
     target_distance_join: ExponentialEstimator,
     target_distance_background: ExponentialEstimator,
-    divergence_join: StudentsT,
-    divergence_background: StudentsT,
+    divergence_join: HalfT,
+    divergence_background: HalfT,
 }
 
 impl JoinEstimator<BayesianJoinStatistics> for BayesianJoinEstimator {
@@ -41,8 +41,8 @@ impl JoinEstimator<BayesianJoinStatistics> for BayesianJoinEstimator {
                 all_td_mean,
                 statistics.all_count,
             ),
-            divergence_join: StudentsT::new(0.0, join_div_std, statistics.joinable_count),
-            divergence_background: StudentsT::new(0.0, all_div_std, statistics.all_count),
+            divergence_join: HalfT::new(join_div_std, statistics.joinable_count),
+            divergence_background: HalfT::new(all_div_std, statistics.all_count),
         }
     }
 
diff --git a/src/statistics.rs b/src/statistics.rs
index db2497b..3a02ce8 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -1,5 +1,5 @@
 use core::f64;
-use puruspe::{beta, betai, erf, invbetai};
+use puruspe::{beta, betai, invbetai};
 use std::fmt::Debug;
 
 #[allow(dead_code)]
@@ -136,13 +136,6 @@ impl Distribution for ExponentialEstimator {
     }
 }
 
-#[derive(Debug, Clone)]
-pub struct StudentsT {
-    mean: f64,
-    standard_deviation: f64,
-    degrees_of_freedom: usize,
-}
-
 #[derive(Debug, Clone)]
 pub struct HalfT {
     standard_deviation: f64,
@@ -223,3 +216,101 @@ impl Distribution for HalfT {
     }
 }
 
+#[cfg(test)]
+mod test {
+    use crate::statistics::{ExponentialEstimator, HalfT};
+    use std::fmt::Debug;
+
+    pub trait TestDistribution: Debug {
+        fn tpdf(&self, x: f64) -> f64;
+        fn tcdf(&self, x: f64) -> f64;
+        fn tppf(&self, p: f64) -> f64;
+        fn tsupport(&self) -> (f64, f64);
+        fn tccdf(&self, x: f64) -> f64;
+        fn tlogpdf(&self, x: f64) -> f64;
+        fn tlogcdf(&self, x: f64) -> f64;
+        fn tlogccdf(&self, x: f64) -> f64;
+    }
+
+    impl<T: Distribution> TestDistribution for T {
+        fn tpdf(&self, x: f64) -> f64 {
+            self.pdf(x)
+        }
+        fn tcdf(&self, x: f64) -> f64 {
+            self.cdf(x)
+        }
+        fn tppf(&self, p: f64) -> f64 {
+            self.ppf(p)
+        }
+        fn tsupport(&self) -> (f64, f64) {
+            self.support()
+        }
+        fn tccdf(&self, x: f64) -> f64 {
+            self.ccdf(x)
+        }
+        fn tlogpdf(&self, x: f64) -> f64 {
+            self.logpdf(x)
+        }
+        fn tlogcdf(&self, x: f64) -> f64 {
+            self.logcdf(x)
+        }
+        fn tlogccdf(&self, x: f64) -> f64 {
+            self.logccdf(x)
+        }
+    }
+
+    fn as_box<T: Distribution + 'static>(d: T) -> Box<dyn TestDistribution> {
+        Box::new(d)
+    }
+
+    use super::{Distribution, Exponential};
+
+    fn get_dists() -> [Box<dyn TestDistribution>; 3] {
+        [
+            as_box(Exponential::unit()),
+            as_box(ExponentialEstimator::unit()),
+            as_box(HalfT::unit()),
+        ]
+    }
+
+    fn is_close(a: f64, b: f64) -> bool {
+        let rel_tol = 1e-9;
+        let abs_tol = 0.0;
+        (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol)
+    }
+
+    fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64> {
+        (0..steps)
+            .map(move |n| n as f64 / (steps as f64 - 1.0))
+            .map(move |n| start * (1.0 - n) + stop * n)
+    }
+
+    #[test]
+    fn basic_distribution_propery_checks() {
+        for dist in get_dists() {
+            println!("Testing distribution: {:?}", dist);
+            let (mut low, mut high) = dist.tsupport();
+
+            if high == f64::INFINITY {
+                high = 5.0;
+            }
+            if low == f64::INFINITY {
+                low = -5.0;
+            }
+
+            for x in linspace(low, high, 100) {
+                // Basic properties...
+                assert!(is_close(dist.tpdf(x), dist.tlogpdf(x).exp()));
+                assert!(is_close(dist.tcdf(x), dist.tlogcdf(x).exp()));
+                assert!(is_close(dist.tccdf(x), dist.tlogccdf(x).exp()));
+                assert!(is_close(dist.tccdf(x), 1.0 - dist.tcdf(x)));
+                assert!(is_close(dist.tppf(dist.tcdf(x)), x));
+            }
+        }
+    }
+
+    #[test]
+    fn test_exponential_distribution() {
+        let dist = Exponential::unit();
+    }
+}

From a0236ac30e188cc0a74240aa0851c4bbf887f431 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 12 May 2026 03:24:17 -0600
Subject: [PATCH 20/39] WIP: P2 estimator for streaming quantile estimation.

---
 Cargo.toml                           |  1 +
 scripts/plot_distributions_aurora.py | 23 +++++-----
 src/main.rs                          |  1 +
 src/p2estimator.rs                   | 67 ++++++++++++++++++++++++++++
 src/statistics.rs                    |  1 +
 5 files changed, 81 insertions(+), 12 deletions(-)
 create mode 100644 src/p2estimator.rs

diff --git a/Cargo.toml b/Cargo.toml
index c5b43b6..9055fa9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ itertools = "0.11.0"
 rayon = "1.8.0"
 base64 = "0.22.1"
 puruspe = "0.4.4"
+num-traits = "0.2.19"
 
 [target.'cfg(not(target_env = "msvc"))'.dependencies]
 tikv-jemallocator = "0.5"
diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py
index d76eb97..26fa437 100644
--- a/scripts/plot_distributions_aurora.py
+++ b/scripts/plot_distributions_aurora.py
@@ -5,7 +5,8 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-from scipy.optimize import curve_fit
+from matplotlib.colors import to_rgba
+from scipy.optimize import curve_fit, minimize
 from scipy.stats import (
     ecdf,
     expon,
@@ -184,7 +185,6 @@ def _relative_consensus_dist(a, b, ai, bi):
 
 
 stats_to_compute = {
-    "Consensus Distance": consensus_dist,
     "Target Distance": _target_distance,
     "Divergence Change": _kimura_dist,
     "Relative Consensus Distance": _relative_consensus_dist,
@@ -225,12 +225,9 @@ def logcdf(self, x, *args):
     "Relative Consensus Distance": Distribution(
         laplace_asymmetric, (1.0, 0.0, 1.0), False
     ),  # Distribution(invweibull, (1.0, 0.0, 1.0), False),
-    "Consensus Distance": Distribution(
-        laplace_asymmetric, (1.0, 0.0, 1.0), False
-    ),  # Distribution(invweibull, (1.0, 0.0, 1.0), False),
     "Target Distance": Distribution(
-        weibull_min, (1.0, 10000)
-    ),  # Distribution(expon, (1.0,)),  # Distribution(genpareto, (0.0, 1.0)),
+        genpareto, (0.0, 1.0)
+    ),  # Distribution(expon, (1.0,)),  # Distribution(genpareto, (0.0, 1.0)), Distribution(weibull_min, (1.0, 10000)
     "Divergence Change": Distribution(halfnorm, (1.0,)),
 }
 
@@ -322,7 +319,7 @@ def fit_dist(data, dist):
 
 
 for query_name, _ in sorted(
-    join_stats["Consensus Distance"].items(), key=lambda k: -len(k[1])
+    next(iter(join_stats.values())).items(), key=lambda k: -len(k[1])
 ):
     # if not query_name.startswith("sin"):
     #    continue
@@ -357,7 +354,7 @@ def fit_dist(data, dist):
 
             random_samples = np.array(random_stats[name][query_name])
             sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000)
-            fit2 = fit_dist(random_samples, est)
+            fit2 = fit_dist(random_samples[not_join_indexes], est)
             ax2.set_title(f"All {name}")
             ax2.plot(
                 [0],
@@ -372,19 +369,21 @@ def fit_dist(data, dist):
                 label=["Not Joined", "Joined"],
                 density=True,
                 stacked=True,
+                color=[to_rgba(c) for c in ["tab:orange", "tab:blue"]],
             )
             ax2.plot(
                 sx2,
                 est.pdf(sx2, *fit2),
+                "black",
                 label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}",
             )
             ax2.legend(fontsize="xx-small")
 
             ax3.set_title("CDFs")
-            ax3.ecdf(join_stats[name][query_name], label="Joins CDF")
-            ax3.ecdf(random_stats[name][query_name], label="All CDF")
+            ax3.ecdf(join_samples, label="Joins CDF")
+            ax3.ecdf(random_samples[not_join_indexes], label="No Joins CDF")
             ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF")
-            ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. All CDF")
+            ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. No Join CDF")
             ax3.legend(fontsize="xx-small")
 
         fig.set_size_inches(16, 8)
diff --git a/src/main.rs b/src/main.rs
index 927fbf9..c0ba526 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,6 +8,7 @@ mod confidence;
 mod history_tracing;
 mod join_estimation;
 mod matrix;
+mod p2estimator;
 mod pipeline;
 mod score_params;
 mod segment_groups;
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
new file mode 100644
index 0000000..8bdf897
--- /dev/null
+++ b/src/p2estimator.rs
@@ -0,0 +1,67 @@
+/// Implementation of P2 estimator.
+/// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
+/// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
+use num_traits::{AsPrimitive, Float, Unsigned};
+
+struct P2HistogramPoint<F: Float, I: Unsigned> {
+    value: F,
+    rank: I,
+}
+
+fn linear_prediction<F: Float + From<isize>, I: Unsigned + Copy + Ord + Into<F>>(
+    points: &[P2HistogramPoint<F, I>; 3],
+    d: isize,
+) -> F {
+    let n: [F; 3] = points.each_ref().map(|v| v.rank.into());
+    let q: [F; 3] = points.each_ref().map(|v| v.value);
+    let d_f: F = d.into();
+    let d_off = 1 + d as usize;
+
+    q[1] + d_f * ((q[d_off] - q[1]) / (n[d_off] - n[1]))
+}
+
+fn parabolic_prediction<F: Float + From<isize>, I: Unsigned + Copy + Ord + Into<F>>(
+    points: &[P2HistogramPoint<F, I>; 3],
+    d: isize,
+) -> F {
+    let n: [F; 3] = points.each_ref().map(|v| v.rank.into());
+    let q: [F; 3] = points.each_ref().map(|v| v.value);
+    let d = d.into();
+
+    let left = (n[1] - n[0] + d) * ((q[2] - q[1]) / (n[2] - n[1]));
+    let right = (n[2] - n[1] - d) * ((q[1] - q[0]) / (n[1] - n[0]));
+    q[1] + (d / (n[2] - n[0])) * (left + right)
+}
+
+fn p2update<F: Float + AsPrimitive<isize>, I: Unsigned + Ord + Into<F>>(
+    points: &[P2HistogramPoint<F, I>],
+    center_index: I,
+    observations: I,
+    total_points: I,
+    proposal: F,
+) -> (F, I) {
+    // Actual rank desired for the given quantile...
+    let rank_proposal: F =
+        (center_index * (observations - I::one())).into() / (total_points - I::one()).into();
+    let d: F = rank_proposal - points[1].rank.into();
+
+    if d >= F::one() && (points[2].rank - points[1].rank) > I::one()
+        || (d <= -F::one()) && points[1].rank - points[0].rank > I::one()
+    {
+        let d: isize = if d >= F::zero() { 1 } else { -1 };
+        let n: [F; 3];
+        for i in 0..3 {
+            n[i] = points[i].rank.into();
+        }
+
+        let est = points[1].value + (d / (points[2].rank - points[0].rank).into());
+        est
+    } else {
+        (points[1].value, I::zero())
+    }
+}
+
+struct P2HistogramData<'a, F: Float, I: Unsigned + Into<F>> {
+    observations: I,
+    points: &'a mut [P2HistogramPoint<F, I>],
+}
diff --git a/src/statistics.rs b/src/statistics.rs
index 3a02ce8..078bcbb 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -2,6 +2,7 @@ use core::f64;
 use puruspe::{beta, betai, invbetai};
 use std::fmt::Debug;
 
+// TODO: Support for generic floating types...
 #[allow(dead_code)]
 pub trait Distribution: Clone + Debug {
     fn unit() -> Self;

From aba5104aa3550bee7f99db488e79057c92c695f0 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 12 May 2026 17:32:19 -0600
Subject: [PATCH 21/39] Initial P2 estimator implementation done.

---
 src/p2estimator.rs | 136 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 115 insertions(+), 21 deletions(-)

diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 8bdf897..1bfe387 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -1,67 +1,161 @@
+use std::{cmp::Ordering, ops::Neg};
+
 /// Implementation of P2 estimator.
 /// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
 /// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
-use num_traits::{AsPrimitive, Float, Unsigned};
+use num_traits::{float::TotalOrder, Float, Num, Unsigned};
 
 struct P2HistogramPoint<F: Float, I: Unsigned> {
     value: F,
     rank: I,
 }
 
-fn linear_prediction<F: Float + From<isize>, I: Unsigned + Copy + Ord + Into<F>>(
+fn get_sign<A: Num + PartialOrd + Neg<Output = A>, B: Num + PartialOrd + Neg<Output = B>>(
+    val: A,
+) -> B {
+    if val >= A::zero() {
+        B::one()
+    } else {
+        -B::one()
+    }
+}
+
+fn inc_or_dec<A: Num + PartialOrd, B: Num + PartialOrd + Neg<Output = B>>(val: A, delta: B) -> A {
+    match delta.partial_cmp(&B::zero()) {
+        Some(Ordering::Less) => val - A::one(),
+        Some(Ordering::Greater) => val + A::one(),
+        _ => val,
+    }
+}
+
+fn linear_prediction<F: Float, I: Unsigned + Copy + Ord + Into<F>>(
     points: &[P2HistogramPoint<F, I>; 3],
     d: isize,
 ) -> F {
     let n: [F; 3] = points.each_ref().map(|v| v.rank.into());
     let q: [F; 3] = points.each_ref().map(|v| v.value);
-    let d_f: F = d.into();
-    let d_off = 1 + d as usize;
+    let d_f: F = get_sign(d);
+    let d_off = (1 + d) as usize;
 
     q[1] + d_f * ((q[d_off] - q[1]) / (n[d_off] - n[1]))
 }
 
-fn parabolic_prediction<F: Float + From<isize>, I: Unsigned + Copy + Ord + Into<F>>(
+fn parabolic_prediction<F: Float, I: Unsigned + Copy + Ord + Into<F>>(
     points: &[P2HistogramPoint<F, I>; 3],
     d: isize,
 ) -> F {
     let n: [F; 3] = points.each_ref().map(|v| v.rank.into());
     let q: [F; 3] = points.each_ref().map(|v| v.value);
-    let d = d.into();
+    let d: F = get_sign(d);
 
     let left = (n[1] - n[0] + d) * ((q[2] - q[1]) / (n[2] - n[1]));
     let right = (n[2] - n[1] - d) * ((q[1] - q[0]) / (n[1] - n[0]));
     q[1] + (d / (n[2] - n[0])) * (left + right)
 }
 
-fn p2update<F: Float + AsPrimitive<isize>, I: Unsigned + Ord + Into<F>>(
-    points: &[P2HistogramPoint<F, I>],
-    center_index: I,
+fn _p2update<F: Float, I: Unsigned + Copy + Ord + Into<F> + From<usize>>(
+    points: &mut [P2HistogramPoint<F, I>],
+    center_index: usize,
     observations: I,
     total_points: I,
-    proposal: F,
-) -> (F, I) {
+) {
     // Actual rank desired for the given quantile...
+    let ci: I = center_index.into();
     let rank_proposal: F =
-        (center_index * (observations - I::one())).into() / (total_points - I::one()).into();
+        (ci * (observations - I::one())).into() / (total_points - I::one()).into();
     let d: F = rank_proposal - points[1].rank.into();
 
     if d >= F::one() && (points[2].rank - points[1].rank) > I::one()
         || (d <= -F::one()) && points[1].rank - points[0].rank > I::one()
     {
-        let d: isize = if d >= F::zero() { 1 } else { -1 };
-        let n: [F; 3];
-        for i in 0..3 {
-            n[i] = points[i].rank.into();
+        let d: isize = get_sign(d);
+        let mut p_est = parabolic_prediction(
+            (&points[center_index - 1..center_index + 1])
+                .as_array()
+                .unwrap(),
+            d,
+        );
+        if p_est <= points[center_index - 1].value || p_est >= points[center_index + 1].value {
+            p_est = linear_prediction(
+                (&points[center_index - 1..center_index + 1])
+                    .as_array()
+                    .unwrap(),
+                d,
+            );
         }
 
-        let est = points[1].value + (d / (points[2].rank - points[0].rank).into());
-        est
-    } else {
-        (points[1].value, I::zero())
+        points[center_index].value = p_est;
+        points[center_index].rank = inc_or_dec(points[center_index].rank, d);
     }
 }
 
-struct P2HistogramData<'a, F: Float, I: Unsigned + Into<F>> {
+struct P2HistogramData<'a, F: Float, I: Unsigned + Copy + Ord + Into<F>> {
     observations: I,
     points: &'a mut [P2HistogramPoint<F, I>],
 }
+
+impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into<F> + From<usize> + Into<usize>>
+    P2HistogramData<'a, F, I>
+{
+    fn _standard_update(&mut self, sample: F) {
+        // Find where sample falls within distribution...
+        let p = self.points.partition_point(|v| v.value <= sample);
+        let bound_p = p.min(self.points.len() - 1);
+
+        // Update extremes...
+        if bound_p == 0 {
+            self.points[bound_p].value = self.points[bound_p].value.min(sample);
+        } else if bound_p == (self.points.len() - 1) {
+            self.points[bound_p].value = self.points[bound_p].value.max(sample);
+        }
+
+        // Increment ranks of markers above newly inserted sample...
+        for i in (bound_p + 1)..self.points.len() {
+            self.points[i].rank = self.points[i].rank + I::one();
+        }
+
+        // Adjust inner markers to within 1 of their target quantile using p2 formula...
+        for i in 1..(self.points.len() - 1) {
+            _p2update(self.points, i, self.observations, self.points.len().into());
+        }
+
+        self.observations = self.observations + I::one();
+    }
+
+    fn _pre_init_update(&mut self, sample: F) {
+        let nxt_idx: usize = self.observations.into();
+        self.points[nxt_idx].value = sample;
+        self.observations = self.observations + I::one();
+    }
+
+    fn _initialize(&mut self) {
+        self.points.sort_by(|a, b| a.value.total_cmp(&b.value));
+        self.points.iter_mut().enumerate().for_each(|(i, p)| {
+            p.rank = i.into();
+        });
+    }
+
+    pub fn update(&mut self, sample: F) {
+        let obs: usize = self.observations.into();
+        match (obs + 1).cmp(&self.points.len()) {
+            Ordering::Less => self._pre_init_update(sample),
+            Ordering::Equal => {
+                self._pre_init_update(sample);
+                self._initialize();
+            }
+            Ordering::Greater => {
+                self._standard_update(sample);
+            }
+        }
+    }
+
+    pub fn is_initialized(&self) -> bool {
+        let obs: usize = self.observations.into();
+        obs >= self.points.len()
+    }
+
+    fn combine(&mut self, other: &P2HistogramData<F, I>) {
+        // TODO: Need to think about how to do this efficiently while maintaining accuracy...
+        panic!("Not implemented!")
+    }
+}

From 88c0a9f354d8213dd574b0a65b21d805e18ed332 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 14 May 2026 01:48:41 -0600
Subject: [PATCH 22/39] First refactor with new scoring done...

---
 src/assembly.rs         | 225 ++++++++++++++++++++++++++++------------
 src/join_estimation.rs  | 142 +++++++++++++------------
 src/main.rs             |  19 ++--
 src/p2estimator.rs      |   2 +-
 src/pipeline.rs         |  18 ++--
 src/segments.rs         |  39 ++++---
 src/statistics.rs       |  10 +-
 src/trace_statistics.rs |  48 +++++----
 8 files changed, 316 insertions(+), 187 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index 97e49a7..fa4b6e2 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -4,9 +4,9 @@ use itertools::Itertools;
 
 use crate::{
     alignment::{Alignment, Strand},
+    join_estimation::{JoinEstimator, JoinStatisticsCollector},
     score_params::ScoreParams,
-    segments::{Block, SegmentedMatrix},
-    statistics::Distribution,
+    segments::{Block, SegmentedMatrix, SegmentedMatrixView},
     trace_statistics::{QueryStatistics, RegionStatistics},
     AnnotationArgs,
 };
@@ -100,9 +100,8 @@ fn piecewise_linear_cost(
 fn get_link_cost(
     annotation_args: &AnnotationArgs,
     score_params: &ScoreParams,
-    target_gap_distribution: &impl Distribution,
-    consensus_gap: f64,
-    target_gap: f64,
+    consensus_gap: isize,
+    join_prob: f64,
 ) -> f64 {
     // Minimum cost (a query loop)
     let min_value = score_params.query_loop_score;
@@ -123,11 +122,9 @@ fn get_link_cost(
         -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs();
     let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs();
 
-    // Compute target gap penalty.
     // Doing this as the expected value over the transition scores...
-    let target_random_prob = target_gap_distribution.cdf(target_gap);
-    let target_expected_score = target_random_prob * score_params.query_jump_score
-        + (1.0 - target_random_prob) * score_params.query_loop_score;
+    let expected_score = join_prob * score_params.query_loop_score
+        + (1.0 - join_prob) * score_params.query_jump_score;
 
     // Cost = linear consensus cost + linear target gap cost...
     min_value
@@ -136,9 +133,9 @@ fn get_link_cost(
             (annotation_args.free_join_consensus_gap as f64).abs(),
             alpha,
             beta,
-            consensus_gap,
+            consensus_gap as f64,
         )
-        + target_expected_score
+        + expected_score
 }
 
 pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize {
@@ -187,12 +184,126 @@ pub fn block_consensus_distance(first_block: &Block, second_block: &Block) -> (i
     }
 }
 
-fn link_assemblies<T: Distribution>(
+pub fn block_length_on_query(b: &Block) -> usize {
+    b.query_end.abs_diff(b.query_start) + 1
+}
+
+fn is_joinable(
+    target_distance: isize,
+    consensus_distance: isize,
+    link_type: LinkType,
+    min_block_length: usize,
+    args: &AnnotationArgs,
+) -> bool {
+    let within_target_distance_threshold =
+        target_distance < args.target_join_distance as isize && target_distance >= 0;
+
+    let consensus_is_colinear = if link_type.is_inversion() {
+        consensus_distance.abs() < args.inversion_distance
+    } else {
+        consensus_distance > -args.consensus_join_overlap
+            && consensus_distance < args.consensus_join_distance
+    };
+
+    // TODO: Hardcoded, change later...
+    let is_significant =
+        min_block_length >= 10 && -consensus_distance <= ((min_block_length / 2) as isize);
+
+    within_target_distance_threshold && consensus_is_colinear && is_significant
+}
+
+fn new_alignment_to_blocks_map(
+    segments: SegmentedMatrixView,
+    alignments: &[Alignment],
+) -> Vec<Vec<SegmentAndDenseRow>> {
+    let mut alignment_block_map = vec![Vec::<SegmentAndDenseRow>::new(); alignments.len()];
+
+    for (s_idx, segment) in segments.iter().enumerate() {
+        for (b_idx, block) in segment.blocks.iter().enumerate() {
+            if block.row_idx > 0 && block.row_idx <= alignments.len() {
+                alignment_block_map[block.row_idx - 1].push((s_idx, b_idx));
+            }
+        }
+    }
+
+    alignment_block_map
+}
+
+pub fn gather_join_statistics<T: JoinStatisticsCollector>(
+    alignments: &[Alignment],
+    annotation_args: &AnnotationArgs,
+) -> Vec<(usize, T)> {
+    let mut query_ids: Vec<usize> = alignments.iter().map(|a| a.query_id).unique().collect();
+    query_ids.sort();
+
+    let mut query_stats: Vec<(usize, T)> = Vec::with_capacity(query_ids.len());
+
+    query_ids
+        .iter()
+        // grab the alignments for this ID
+        .map(|id| {
+            (
+                *id,
+                alignments
+                    .iter()
+                    .enumerate()
+                    .filter(|&(_, a)| a.query_id == *id)
+                    .map(|(i, a)| Block::from_alignment(a, i, 0.0, 0.0)),
+            )
+        })
+        .for_each(|(id, compat_alignments)| {
+            let mut new_stats = T::new();
+
+            gather_join_statistics_single_family(
+                compat_alignments,
+                annotation_args,
+                &mut new_stats,
+            );
+
+            query_stats.push((id, new_stats));
+        });
+
+    query_stats
+}
+
+fn gather_join_statistics_single_family<'a>(
+    compatable_alignments: impl Iterator<Item = Block>,
+    args: &AnnotationArgs,
+    join_stats: &mut impl JoinStatisticsCollector,
+) {
+    let compatable_blocks = compatable_alignments
+        .sorted_by_key(|a| a.col_start)
+        .collect_vec();
+
+    compatable_blocks
+        .iter()
+        .enumerate()
+        .for_each(|(idx, a_block)| {
+            compatable_blocks[idx + 1..]
+                .iter()
+                .enumerate()
+                .for_each(|(idx2, b_block)| {
+                    let (consensus_distance, link_type) =
+                        block_consensus_distance(a_block, b_block);
+                    let joinable = is_joinable(
+                        block_target_distance(a_block, b_block),
+                        consensus_distance,
+                        link_type,
+                        block_length_on_query(a_block).min(block_length_on_query(b_block)),
+                        args,
+                    );
+
+                    join_stats.add(a_block, b_block, idx + 1 == idx2, joinable);
+                })
+        })
+}
+
+fn link_assemblies<T: JoinEstimator>(
     graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>,
     compatable_blocks: impl Iterator<Item = (usize, usize)>,
     segments: &SegmentedMatrix,
     query_statistics: &QueryStatistics<T>,
-    region_statistics: &RegionStatistics,
+    _region_statistics: &RegionStatistics,
     score_params: &ScoreParams,
     args: &AnnotationArgs,
 ) {
@@ -210,54 +321,39 @@ fn link_assemblies<T: Distribution>(
             let b_block = &segments[b.0].blocks[b.1];
 
             let target_distance = block_target_distance(a_block, b_block);
-
-            let a_length = a_block.query_end.abs_diff(a_block.query_start) + 1;
-            let b_length = b_block.query_end.abs_diff(b_block.query_start) + 1;
-            let min_length = a_length.min(b_length);
-
-            // Query bounds are reversed for reverse sequences, so the start is actually greater than the end (Ex. start: 1510 -> end: 105)
+            let min_block_length =
+                block_length_on_query(a_block).min(block_length_on_query(b_block));
 
             let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block);
 
-            // Within target distance???
-            let within_target_distance_threshold = (target_distance
-                < args.target_join_distance as isize)
-                && (query_statistics.distribution.ccdf(target_distance as f64)
-                    >= args.target_distance_likelihood_threshold);
-
-            let consensus_is_colinear = if link_type.is_inversion() {
-                consensus_distance.abs() < args.inversion_distance
-            } else {
-                consensus_distance > -args.consensus_join_overlap
-                    && consensus_distance < args.consensus_join_distance
-            };
-
-            // TODO: Hardcoded, change later...
-            let is_significant =
-                min_length >= 10 && -consensus_distance <= ((min_length / 2) as isize);
-
-            let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
-                score_params.query_loop_score
-            } else {
-                get_link_cost(
-                    args,
-                    score_params,
-                    &query_statistics.distribution,
-                    consensus_distance as f64,
-                    target_distance as f64,
-                )
-            };
-
-            if within_target_distance_threshold && consensus_is_colinear && is_significant {
-                graph.insert(
-                    ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
-                    Edge {
-                        weight,
-                        first_sparse_row: a.1,
-                        second_sparse_row: b.1,
-                        link_type,
-                    },
-                );
+            if is_joinable(
+                target_distance,
+                consensus_distance,
+                link_type,
+                min_block_length,
+                args,
+            ) {
+                if let Some(estimator) = &query_statistics.estimator {
+                    let join_prob = estimator.predict(a_block, b_block, false);
+
+                    if join_prob >= args.join_likelihood_threshold {
+                        let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
+                            score_params.query_loop_score
+                        } else {
+                            get_link_cost(args, score_params, consensus_distance, join_prob)
+                        };
+
+                        graph.insert(
+                            ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
+                            Edge {
+                                weight,
+                                first_sparse_row: a.1,
+                                second_sparse_row: b.1,
+                                link_type,
+                            },
+                        );
+                    }
+                }
             }
         });
     });
@@ -274,7 +370,7 @@ pub struct SegmentAssemblyGraph {
 }
 
 impl SegmentAssemblyGraph {
-    pub fn new<T: Distribution>(
+    pub fn new<T: JoinEstimator>(
         alignments: &[Alignment],
         segments: &SegmentedMatrix,
         region_statistics: &RegionStatistics,
@@ -282,16 +378,7 @@ impl SegmentAssemblyGraph {
         score_params: &ScoreParams,
         annotation_args: &AnnotationArgs,
     ) -> Self {
-        let mut alignment_block_map = vec![Vec::<SegmentAndDenseRow>::new(); alignments.len()];
-
-        for (s_idx, segment) in segments.iter().enumerate() {
-            for (b_idx, block) in segment.blocks.iter().enumerate() {
-                if block.row_idx > 0 && block.row_idx <= alignments.len() {
-                    alignment_block_map[block.row_idx - 1].push((s_idx, b_idx));
-                }
-            }
-        }
-
+        let alignment_block_map = new_alignment_to_blocks_map(segments, alignments);
         let mut query_ids: Vec<usize> = alignments.iter().map(|a| a.query_id).unique().collect();
 
         query_ids.sort();
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 7dd9d56..7b0cd3e 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,63 +1,43 @@
 use crate::{
     assembly::block_target_distance,
     segments::Block,
-    statistics::{Distribution, ExponentialEstimator, HalfT},
+    statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT},
 };
 
-pub trait JoinEstimator<T: JoinStatistics> {
+pub trait JoinEstimator: Clone {
     fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64;
-    fn from_statistics(statistics: T) -> Self;
 }
 
-pub trait JoinStatistics {
+pub trait JoinStatisticsCollector: Clone {
     fn new() -> Self;
     fn combine(&self, other: &Self) -> Self;
     fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
 }
 
+#[derive(Debug, Clone)]
 pub struct BayesianJoinEstimator {
     target_distance_join: ExponentialEstimator,
-    target_distance_background: ExponentialEstimator,
+    target_distance_nojoin: ExponentialEstimator,
     divergence_join: HalfT,
-    divergence_background: HalfT,
+    divergence_nojoin: HalfT,
+    join_prior: f64,
 }
 
-impl JoinEstimator<BayesianJoinStatistics> for BayesianJoinEstimator {
-    fn from_statistics(statistics: BayesianJoinStatistics) -> Self {
-        let join_td_mean =
-            statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64;
-        let all_td_mean = statistics.all_target_distance_sum as f64 / statistics.all_count as f64;
-
-        // Divergence distributions should have a mean of 0, so we assume that...
-        let join_div_std = statistics.join_divergence_square_sum / statistics.joinable_count as f64;
-        let all_div_std = statistics.divergence_square_sum / statistics.all_count as f64;
-
-        Self {
-            target_distance_join: ExponentialEstimator::new(
-                join_td_mean,
-                statistics.joinable_count,
-            ),
-            target_distance_background: ExponentialEstimator::new(
-                all_td_mean,
-                statistics.all_count,
-            ),
-            divergence_join: HalfT::new(join_div_std, statistics.joinable_count),
-            divergence_background: HalfT::new(all_div_std, statistics.all_count),
-        }
-    }
-
+impl JoinEstimator for BayesianJoinEstimator {
     fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64 {
-        let prior_acc: f64 = 0.95; // Accuracy of the prior estimator of joins...
         let target_dist = block_target_distance(first_block, second_block) as f64;
         // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0...
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
 
-        let target_likelihood = self.target_distance_join.logccdf(target_dist)
-            - self.target_distance_background.logccdf(target_dist);
-        let diverg_likelihood = self.divergence_join.logccdf(divergence_diff)
-            - self.divergence_background.logccdf(divergence_diff);
+        let join_score = self.join_prior.ln()
+            + self.target_distance_join.logpdf(target_dist)
+            + self.divergence_join.logpdf(divergence_diff);
+        let nojoin_score = (-self.join_prior).ln_1p()
+            + self.target_distance_nojoin.logpdf(target_dist)
+            + self.divergence_nojoin.logpdf(target_dist);
 
-        let score = target_likelihood + diverg_likelihood + prior_acc.ln();
+        let score_norm = ln_add_exp(join_score, nojoin_score);
+        let score = join_score - score_norm;
 
         if log_space {
             score
@@ -67,60 +47,92 @@ impl JoinEstimator<BayesianJoinStatistics> for BayesianJoinEstimator {
     }
 }
 
+impl From<BayesianJoinStatistics> for BayesianJoinEstimator {
+    fn from(value: BayesianJoinStatistics) -> Self {
+        Self::from(&value)
+    }
+}
+
+impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
+    fn from(statistics: &BayesianJoinStatistics) -> Self {
+        let join_td_mean =
+            statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64;
+        let nojoin_td_mean =
+            statistics.unjoinable_target_distance_sum as f64 / statistics.unjoinable_count as f64;
+
+        // Divergence distributions should have a mean of 0, so we assume that...
+        let join_div_mean = statistics.joinable_divergence_sum / statistics.joinable_count as f64;
+        let nojoin_div_mean =
+            statistics.unjoinable_divergence_sum / statistics.joinable_count as f64;
+
+        Self {
+            target_distance_join: ExponentialEstimator::new(
+                join_td_mean,
+                statistics.joinable_count,
+            ),
+            target_distance_nojoin: ExponentialEstimator::new(
+                nojoin_td_mean,
+                statistics.unjoinable_count,
+            ),
+            divergence_join: HalfT::from_sample_mean(join_div_mean, statistics.joinable_count),
+            divergence_nojoin: HalfT::from_sample_mean(
+                nojoin_div_mean,
+                statistics.unjoinable_count,
+            ),
+            join_prior: statistics.joinable_count as f64
+                / (statistics.joinable_count + statistics.unjoinable_count) as f64,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
 pub struct BayesianJoinStatistics {
     joinable_target_distance_sum: usize,
-    all_target_distance_sum: usize,
-    divergence_sum: f64,
-    divergence_square_sum: f64,
-    join_divergence_sum: f64,
-    join_divergence_square_sum: f64,
+    unjoinable_target_distance_sum: usize,
+    joinable_divergence_sum: f64,
+    unjoinable_divergence_sum: f64,
     joinable_count: usize,
-    all_count: usize,
+    unjoinable_count: usize,
 }
 
-impl JoinStatistics for BayesianJoinStatistics {
+impl JoinStatisticsCollector for BayesianJoinStatistics {
     fn new() -> Self {
         Self {
             joinable_target_distance_sum: 0,
-            all_target_distance_sum: 0,
-            divergence_sum: 0.0,
-            divergence_square_sum: 0.0,
-            join_divergence_sum: 0.0,
-            join_divergence_square_sum: 0.0,
+            unjoinable_target_distance_sum: 0,
+            joinable_divergence_sum: 0.0,
+            unjoinable_divergence_sum: 0.0,
             joinable_count: 0,
-            all_count: 0,
+            unjoinable_count: 0,
         }
     }
 
-    fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool) {
+    fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) {
         let target_dist = block_target_distance(first_block, second_block).abs() as usize;
-        let divergence_diff = second_block.kimura80 - first_block.kimura80;
+        let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
 
         if joinable {
             self.joinable_target_distance_sum += target_dist;
-            self.join_divergence_sum += divergence_diff;
-            self.join_divergence_square_sum += divergence_diff * divergence_diff;
+            self.joinable_divergence_sum += divergence_diff;
             self.joinable_count += 1;
+        } else {
+            self.unjoinable_target_distance_sum += target_dist;
+            self.unjoinable_divergence_sum += divergence_diff;
+            self.unjoinable_count += 1;
         }
-
-        self.all_target_distance_sum += target_dist;
-        self.divergence_sum += divergence_diff;
-        self.divergence_square_sum += divergence_diff * divergence_diff;
-        self.all_count += 1;
     }
 
     fn combine(&self, other: &Self) -> Self {
         Self {
             joinable_target_distance_sum: self.joinable_target_distance_sum
                 + other.joinable_target_distance_sum,
-            all_target_distance_sum: self.all_target_distance_sum + other.all_target_distance_sum,
-            divergence_sum: self.divergence_sum + other.divergence_sum,
-            divergence_square_sum: self.divergence_square_sum + other.divergence_square_sum,
-            join_divergence_sum: self.join_divergence_sum + other.join_divergence_sum,
-            join_divergence_square_sum: self.join_divergence_square_sum
-                + other.join_divergence_square_sum,
+            unjoinable_target_distance_sum: self.unjoinable_target_distance_sum
+                + other.unjoinable_target_distance_sum,
+            joinable_divergence_sum: self.joinable_divergence_sum + other.joinable_divergence_sum,
+            unjoinable_divergence_sum: self.unjoinable_divergence_sum
+                + other.unjoinable_divergence_sum,
             joinable_count: self.joinable_count + other.joinable_count,
-            all_count: self.all_count + other.all_count,
+            unjoinable_count: self.unjoinable_count + other.unjoinable_count,
         }
     }
 }
diff --git a/src/main.rs b/src/main.rs
index c0ba526..0063815 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -46,8 +46,9 @@ use viz::VizConstraint;
 use crate::{
     annotation::AmbiguousAnnotation,
     chunks::validate_groups,
+    join_estimation::{BayesianJoinEstimator, BayesianJoinStatistics},
     pipeline::{run_history_trace, run_naive_trace, NaiveTraceResults},
-    trace_statistics::{trace_statistics, OccuranceCountingMode},
+    trace_statistics::{trace_statistics, OccuranceCountingMode, TraceStatistics},
     viz::{
         stats::{write_family_statistics, write_inversion_statistics},
         write_index_file, ICON_SVG,
@@ -138,16 +139,14 @@ pub struct AnnotationArgs {
     )]
     pub target_join_distance: usize,
 
-    /// Removes joins across positions
-    /// in the target (genome) at which a join is
-    /// less than this likely to not be generated
-    /// at random.
+    /// Removes joins that fall below this threshold of occuring.
+    /// Value can be set between 0 and 1.
     #[arg(
-        long = "target-join-likelihood-threshold",
-        default_value = "0.5",
+        long = "join-likelihood-threshold",
+        default_value = "0.25",
         value_name = "f"
     )]
-    pub target_distance_likelihood_threshold: f64,
+    pub join_likelihood_threshold: f64,
 
     /// The maximum overlap in the consensus at which
     /// a join is considered between compatible alignments.
@@ -459,10 +458,10 @@ fn main() -> Result<()> {
         .panic_fuse()
         .enumerate()
         .map(|(region_idx, group)| run_naive_trace(group, &alignment_data, region_idx, &args))
-        .collect::<Vec<NaiveTraceResults>>();
+        .collect::<Vec<NaiveTraceResults<BayesianJoinStatistics>>>();
     naive_results.sort_by_key(|v| v.region_index);
 
-    let trace_stats = trace_statistics(
+    let trace_stats: TraceStatistics<BayesianJoinEstimator> = trace_statistics(
         &naive_results,
         &alignment_data,
         OccuranceCountingMode::Segments,
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 1bfe387..fbcdeb2 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -154,7 +154,7 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into<F> + From<usize>
         obs >= self.points.len()
     }
 
-    fn combine(&mut self, other: &P2HistogramData<F, I>) {
+    fn combine(&mut self, _other: &P2HistogramData<F, I>) {
         // TODO: Need to think about how to do this efficiently while maintaining accuracy...
         panic!("Not implemented!")
     }
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 377266f..5ca4f2a 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -5,15 +5,16 @@ use itertools::Itertools;
 use crate::{
     alignment::{AlignmentData, Strand},
     annotation::{AmbiguousAnnotation, SimpleAnnotation},
+    assembly::gather_join_statistics,
     chunks::ProximityGroup,
     confidence::confidence,
     history_tracing::{
         backtrace_histories, history_viterbi_on_segments, History, RefinedTraceSegment,
     },
+    join_estimation::{JoinEstimator, JoinStatisticsCollector},
     matrix::{Matrix, MatrixDef},
     score_params::{approximate_ideal_skip_state_score, ScoreParams},
     segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments},
-    statistics::Distribution,
     support::windowed_confidence,
     trace_statistics::TraceStatistics,
     viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment},
@@ -141,7 +142,7 @@ fn get_active_columns<T: Copy + Default + Display>(matrix: &Matrix<T>) -> Vec<(u
     active_cols
 }
 
-pub struct NaiveTraceResults {
+pub struct NaiveTraceResults<T: JoinStatisticsCollector> {
     pub target_start: usize,
     pub target_end: usize,
     pub trace_segments: Vec<TraceSegment>,
@@ -149,16 +150,17 @@ pub struct NaiveTraceResults {
     pub score_params: ScoreParams,
     pub alignment_confidences: Vec<f64>,
     pub active_columns: Vec<(usize, usize)>,
+    pub query_join_statistics: Vec<(usize, T)>,
     pub viz_writer: AdjudicationSodaWriter,
     pub region_index: usize,
 }
 
-pub fn run_naive_trace(
+pub fn run_naive_trace<T: JoinStatisticsCollector>(
     proximity_group: &ProximityGroup,
     alignment_data: &AlignmentData,
     region_idx: usize,
     args: &AuroraArgs,
-) -> NaiveTraceResults {
+) -> NaiveTraceResults<T> {
     let annot_args = &args.annotation_args;
 
     let score_params = ScoreParams::new(
@@ -250,6 +252,9 @@ pub fn run_naive_trace(
             .expect("Unable to write confidences!!!");
     }
 
+    let query_join_statistics =
+        gather_join_statistics(proximity_group.alignments, &args.annotation_args);
+
     NaiveTraceResults {
         target_start: proximity_group.target_start,
         target_end: proximity_group.target_end,
@@ -258,16 +263,17 @@ pub fn run_naive_trace(
         score_params,
         alignment_confidences: confidence_by_row,
         active_columns: get_active_columns(&confidence_matrix),
+        query_join_statistics,
         viz_writer,
         region_index: region_idx,
     }
 }
 
-pub fn run_history_trace<T: Distribution>(
+pub fn run_history_trace<T: JoinEstimator, S: JoinStatisticsCollector>(
     proximity_group: &ProximityGroup,
     alignment_data: &AlignmentData,
     trace_statistics: &TraceStatistics<T>,
-    naive_trace: &mut NaiveTraceResults,
+    naive_trace: &mut NaiveTraceResults<S>,
     args: &AuroraArgs,
 ) -> Vec<AmbiguousAnnotation> {
     let vis_args = &args.visualization_args;
diff --git a/src/segments.rs b/src/segments.rs
index a9aa282..30de252 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -2,12 +2,12 @@ use core::f64;
 use std::{cmp::Ordering, fmt::Debug, iter::Fuse};
 
 use crate::{
-    alignment::Strand,
+    alignment::{Alignment, Strand},
     assembly::SegmentAssemblyGraph,
     chunks::ProximityGroup,
+    join_estimation::JoinEstimator,
     matrix::Matrix,
     score_params::ScoreParams,
-    statistics::Distribution,
     trace_statistics::{QueryStatistics, RegionStatistics},
     viterbi::TraceSegment,
     AnnotationArgs,
@@ -99,6 +99,23 @@ impl Block {
     pub fn to_comparable(&self) -> (Option<usize>, usize) {
         (self.query_id, self.row_idx)
     }
+
+    pub fn from_alignment(alignment: &Alignment, row: usize, confidence: f64, score: f64) -> Self {
+        Self {
+            row_idx: row,
+            block_type: BlockType::Alignment,
+            strand: alignment.strand,
+            query_id: Some(alignment.query_id),
+            col_start: alignment.target_start,
+            col_end: alignment.target_end,
+            query_start: alignment.query_start,
+            query_end: alignment.query_end,
+            avg_confidence: confidence,
+            alignment_score: score,
+            kimura80: alignment.kimura80(alignment.query_start, alignment.query_end),
+            can_join_up_to: 0,
+        }
+    }
 }
 
 #[derive(Debug)]
@@ -111,6 +128,7 @@ pub struct Segment {
 }
 
 pub type SegmentedMatrix = Vec<Segment>;
+pub type SegmentedMatrixView<'a> = &'a [Segment];
 
 #[derive(Copy, Clone, Debug)]
 enum MergeEntry<T> {
@@ -213,21 +231,10 @@ pub struct InitialSegments {
     initial_trace_scores: Vec<f64>,
 }
 
-#[allow(dead_code)]
-pub struct SegmentView<'a> {
-    pub start_col: usize,
-    pub end_col: usize,
-    pub blocks: &'a [Block],
-}
-
 #[allow(dead_code)]
 impl InitialSegments {
-    pub fn iter_segments(&self) -> impl Iterator<Item = SegmentView<'_>> {
-        self.segments.iter().map(|v| SegmentView {
-            start_col: v.start_col,
-            end_col: v.end_col,
-            blocks: &v.blocks,
-        })
+    pub fn view_segments(&self) -> SegmentedMatrixView<'_> {
+        return &self.segments;
     }
 
     pub fn len(&self) -> usize {
@@ -582,7 +589,7 @@ pub fn segments_from_matrix_trace(
     }
 }
 
-pub fn assemble_and_link_segments<'a, T: Distribution>(
+pub fn assemble_and_link_segments<'a, T: JoinEstimator>(
     proximity_group: &ProximityGroup,
     initial_segments: &'a mut InitialSegments,
     trace_segments: &[TraceSegment],
diff --git a/src/statistics.rs b/src/statistics.rs
index 078bcbb..9f2722b 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -2,6 +2,13 @@ use core::f64;
 use puruspe::{beta, betai, invbetai};
 use std::fmt::Debug;
 
+pub fn ln_add_exp(a: f64, b: f64) -> f64 {
+    let max = a.max(b);
+    let min = a.min(b);
+    // TODO: Possibly use more stable ln_1p_exp at https://github.com/JuliaStats/LogExpFunctions.jl/files/8218470/log1pexp.pdf (Implemented at https://github.com/JuliaStats/LogExpFunctions.jl/blob/master/src/basicfuns.jl#L263)
+    max + (min - max).exp().ln_1p()
+}
+
 // TODO: Support for generic floating types...
 #[allow(dead_code)]
 pub trait Distribution: Clone + Debug {
@@ -144,6 +151,7 @@ pub struct HalfT {
 }
 
 impl HalfT {
+    #[allow(dead_code)]
     pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self {
         Self {
             standard_deviation,
@@ -312,6 +320,6 @@ mod test {
 
     #[test]
     fn test_exponential_distribution() {
-        let dist = Exponential::unit();
+        let _dist = Exponential::unit();
     }
 }
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index f8183fd..537ff1d 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -1,8 +1,12 @@
+use std::fmt::Debug;
+
+use itertools::izip;
+
 use crate::{
     alignment::AlignmentData,
+    join_estimation::{JoinEstimator, JoinStatisticsCollector},
     pipeline::NaiveTraceResults,
-    segments::SegmentView,
-    statistics::{Distribution, ExponentialEstimator},
+    segments::Segment,
 };
 
 #[derive(Debug)]
@@ -12,15 +16,15 @@ pub struct RegionStatistics {
 }
 
 #[derive(Debug, Clone)]
-pub struct QueryStatistics<T: Distribution> {
+pub struct QueryStatistics<T: JoinEstimator> {
     pub occurances: usize,
     pub coverage: usize,
     pub target_span: usize,
-    pub distribution: T,
+    pub estimator: Option<T>,
 }
 
 #[derive(Debug)]
-pub struct TraceStatistics<T: Distribution> {
+pub struct TraceStatistics<T: JoinEstimator> {
     #[allow(dead_code)]
     pub total_bases: usize,
     pub query_statistics: Vec<QueryStatistics<T>>,
@@ -33,11 +37,11 @@ pub enum OccuranceCountingMode {
     Trace,
 }
 
-pub fn trace_statistics(
-    naive_traces: &[NaiveTraceResults],
+pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEstimator>(
+    naive_traces: &[NaiveTraceResults<S>],
     alignment_data: &AlignmentData,
     count_mode: OccuranceCountingMode,
-) -> TraceStatistics<ExponentialEstimator> {
+) -> TraceStatistics<E> {
     // Asumption... All regions are sorted, no gaps. At least 1 region expected...
     debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0));
     debug_assert!(naive_traces
@@ -50,12 +54,12 @@ pub fn trace_statistics(
         .zip(naive_traces.iter().skip(1))
         .all(|(v1, v2)| v1.region_index + 1 == v2.region_index && v1.target_end < v2.target_start));
 
-    let mut query_stats = vec![
+    let mut query_stats: Vec<QueryStatistics<E>> = vec![
         QueryStatistics {
             occurances: 0,
             coverage: 0,
             target_span: 0,
-            distribution: ExponentialEstimator::unit(),
+            estimator: None,
         };
         alignment_data.query_name_map.size()
     ];
@@ -64,11 +68,19 @@ pub fn trace_statistics(
         vec![None; alignment_data.query_name_map.size()];
 
     let mut all_region_stats: Vec<RegionStatistics> = Vec::with_capacity(naive_traces.len());
+    let mut all_join_stats: Vec<Option<S>> = vec![None; alignment_data.query_name_map.size()];
 
     for trace_results in naive_traces.iter() {
+        for (query_id, stats) in trace_results.query_join_statistics.iter() {
+            all_join_stats[*query_id] = match &all_join_stats[*query_id] {
+                None => Some(stats.clone()),
+                Some(other_stats) => Some(other_stats.combine(stats)),
+            };
+        }
+
         match count_mode {
             OccuranceCountingMode::Segments => {
-                for seg in trace_results.segments.iter_segments() {
+                for seg in trace_results.segments.view_segments().iter() {
                     for blk in seg.blocks.iter() {
                         if let Some(query_id) = blk.query_id {
                             query_stats[query_id].occurances += 1;
@@ -113,9 +125,9 @@ pub fn trace_statistics(
         };
 
         let mut unexplained_bases_up_to: usize = 0;
-        let mut prior_segment: Option<SegmentView> = None;
+        let mut prior_segment: Option<&Segment> = None;
 
-        for seg in trace_results.segments.iter_segments() {
+        for seg in trace_results.segments.view_segments() {
             if let Some(prior_segment) = prior_segment {
                 // If a skip block was the prior block, add it's bases as unexplained.
                 if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 {
@@ -133,15 +145,13 @@ pub fn trace_statistics(
         all_region_stats.push(region_stat);
     }
 
-    for (query_info, query_span) in query_stats.iter_mut().zip(query_span.iter()) {
+    for (query_info, query_span, join_stat) in
+        izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats)
+    {
         if let Some((start, end)) = query_span {
             query_info.target_span = end - start + 1;
-            // We subtract 1 because were looking at distances between each occurance as a sample value.
-            query_info.distribution = ExponentialEstimator::new(
-                query_info.target_span as f64 / query_info.occurances.saturating_sub(1) as f64,
-                query_info.occurances.saturating_sub(1),
-            );
         }
+        query_info.estimator = join_stat.map(|v| v.into());
     }
 
     TraceStatistics {

From a8e923c4bd11f7a3d09a8e1db57fd66a9898ac47 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 14 May 2026 15:38:28 -0600
Subject: [PATCH 23/39] Target distance and divergence Bayesian scoring
 working; a very clear bug remains in history backtrace.

---
 src/assembly.rs         | 82 +++++++++++++++++++++++++++++++----------
 src/history_tracing.rs  |  1 +
 src/join_estimation.rs  | 67 +++++++++++++++++++++------------
 src/statistics.rs       | 25 +++++++++----
 src/trace_statistics.rs | 27 +++++++++-----
 5 files changed, 142 insertions(+), 60 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index fa4b6e2..df96601 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -326,6 +326,47 @@ fn link_assemblies<T: JoinEstimator>(
 
             let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block);
 
+            if b_block.row_idx == 583 {
+                println!("Block: {}", a_block.row_idx);
+                println!(
+                    "Score: {}",
+                    query_statistics.estimator.predict(a_block, b_block, false)
+                );
+
+                println!(
+                    "Is Joinable: {}",
+                    is_joinable(
+                        target_distance,
+                        consensus_distance,
+                        link_type,
+                        min_block_length,
+                        args,
+                    )
+                );
+
+                println!(
+                    "Weight: {}",
+                    if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
+                        score_params.query_loop_score
+                    } else {
+                        get_link_cost(
+                            args,
+                            score_params,
+                            consensus_distance,
+                            query_statistics.estimator.predict(a_block, b_block, false),
+                        )
+                    }
+                );
+
+                println!("Estimator: {:#?}", query_statistics.estimator);
+                println!(
+                    "Target Dist: {}, Div: {}, Cons Dist: {}",
+                    target_distance,
+                    (a_block.kimura80 - b_block.kimura80).abs(),
+                    consensus_distance
+                )
+            }
+
             if is_joinable(
                 target_distance,
                 consensus_distance,
@@ -333,26 +374,29 @@ fn link_assemblies<T: JoinEstimator>(
                 min_block_length,
                 args,
             ) {
-                if let Some(estimator) = &query_statistics.estimator {
-                    let join_prob = estimator.predict(a_block, b_block, false);
-
-                    if join_prob >= args.join_likelihood_threshold {
-                        let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
-                            score_params.query_loop_score
-                        } else {
-                            get_link_cost(args, score_params, consensus_distance, join_prob)
-                        };
-
-                        graph.insert(
-                            ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
-                            Edge {
-                                weight,
-                                first_sparse_row: a.1,
-                                second_sparse_row: b.1,
-                                link_type,
-                            },
-                        );
+                let join_prob = query_statistics.estimator.predict(a_block, b_block, false);
+
+                if join_prob >= args.join_likelihood_threshold {
+                    let mut weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
+                        score_params.query_loop_score
+                    } else {
+                        get_link_cost(args, score_params, consensus_distance, join_prob)
+                    };
+
+                    if b_block.query_id == Some(196) {
+                        println!("Setting Weight to 1 for {}", b_block.row_idx);
+                        weight = score_params.query_loop_score
                     }
+
+                    graph.insert(
+                        ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
+                        Edge {
+                            weight,
+                            first_sparse_row: a.1,
+                            second_sparse_row: b.1,
+                            link_type,
+                        },
+                    );
                 }
             }
         });
diff --git a/src/history_tracing.rs b/src/history_tracing.rs
index 8f39011..436b330 100644
--- a/src/history_tracing.rs
+++ b/src/history_tracing.rs
@@ -1357,6 +1357,7 @@ pub fn backtrace_histories(
     let mut current_entry = &history.entries[current_idx];
 
     while let HistoryEntry::Join(entry_info) | HistoryEntry::Append(entry_info) = current_entry {
+        println!("{:#?}", current_entry);
         // Append current entry to segment stack...
         let blocks = history.segment_groups[entry_info.segment]
             .get_group(entry_info.group_index)
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 7b0cd3e..975536c 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,20 +1,23 @@
+use std::fmt::Debug;
+
 use crate::{
     assembly::block_target_distance,
     segments::Block,
     statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT},
 };
 
-pub trait JoinEstimator: Clone {
+pub trait JoinEstimator: Clone + Default + Debug {
     fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64;
 }
 
-pub trait JoinStatisticsCollector: Clone {
+pub trait JoinStatisticsCollector: Clone + Debug {
     fn new() -> Self;
+    fn new_from_prior(bayesian_prior: &Self) -> Self;
     fn combine(&self, other: &Self) -> Self;
     fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct BayesianJoinEstimator {
     target_distance_join: ExponentialEstimator,
     target_distance_nojoin: ExponentialEstimator,
@@ -34,7 +37,7 @@ impl JoinEstimator for BayesianJoinEstimator {
             + self.divergence_join.logpdf(divergence_diff);
         let nojoin_score = (-self.join_prior).ln_1p()
             + self.target_distance_nojoin.logpdf(target_dist)
-            + self.divergence_nojoin.logpdf(target_dist);
+            + self.divergence_nojoin.logpdf(divergence_diff);
 
         let score_norm = ln_add_exp(join_score, nojoin_score);
         let score = join_score - score_norm;
@@ -55,32 +58,30 @@ impl From<BayesianJoinStatistics> for BayesianJoinEstimator {
 
 impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
     fn from(statistics: &BayesianJoinStatistics) -> Self {
+        let join_psuedo_count = statistics.joinable_count.max(1);
+        let nojoin_psuedo_count = statistics.unjoinable_count.max(1);
+
         let join_td_mean =
-            statistics.joinable_target_distance_sum as f64 / statistics.joinable_count as f64;
-        let nojoin_td_mean =
-            statistics.unjoinable_target_distance_sum as f64 / statistics.unjoinable_count as f64;
+            (statistics.joinable_target_distance_sum as f64 / join_psuedo_count as f64).max(1.0);
+        let nojoin_td_mean = (statistics.unjoinable_target_distance_sum as f64
+            / nojoin_psuedo_count as f64)
+            .max(join_td_mean);
 
         // Divergence distributions should have a mean of 0, so we assume that...
-        let join_div_mean = statistics.joinable_divergence_sum / statistics.joinable_count as f64;
+        let join_div_mean =
+            (statistics.joinable_divergence_sum / join_psuedo_count as f64).max(1.0);
         let nojoin_div_mean =
-            statistics.unjoinable_divergence_sum / statistics.joinable_count as f64;
+            (statistics.unjoinable_divergence_sum / nojoin_psuedo_count as f64).max(join_div_mean);
 
         Self {
-            target_distance_join: ExponentialEstimator::new(
-                join_td_mean,
-                statistics.joinable_count,
-            ),
-            target_distance_nojoin: ExponentialEstimator::new(
-                nojoin_td_mean,
-                statistics.unjoinable_count,
-            ),
-            divergence_join: HalfT::from_sample_mean(join_div_mean, statistics.joinable_count),
-            divergence_nojoin: HalfT::from_sample_mean(
-                nojoin_div_mean,
-                statistics.unjoinable_count,
-            ),
-            join_prior: statistics.joinable_count as f64
-                / (statistics.joinable_count + statistics.unjoinable_count) as f64,
+            target_distance_join: ExponentialEstimator::new(join_td_mean, join_psuedo_count),
+            target_distance_nojoin: ExponentialEstimator::new(nojoin_td_mean, nojoin_psuedo_count),
+            divergence_join: HalfT::from_sample_mean(join_div_mean, join_psuedo_count),
+            divergence_nojoin: HalfT::from_sample_mean(nojoin_div_mean, nojoin_psuedo_count),
+            // We take sqrt since we count all pairs, not just neighbors.
+            join_prior: (join_psuedo_count as f64
+                / (nojoin_psuedo_count + join_psuedo_count) as f64)
+                .sqrt(),
         }
     }
 }
@@ -107,6 +108,24 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
         }
     }
 
+    fn new_from_prior(bayesian_prior: &Self) -> Self {
+        let join_psuedo_count = bayesian_prior.joinable_count.max(1);
+        let nojoin_psuedo_count = bayesian_prior.unjoinable_count.max(1);
+
+        Self {
+            joinable_target_distance_sum: bayesian_prior.joinable_target_distance_sum
+                / join_psuedo_count,
+            unjoinable_target_distance_sum: bayesian_prior.unjoinable_target_distance_sum
+                / nojoin_psuedo_count,
+            joinable_divergence_sum: bayesian_prior.joinable_divergence_sum
+                / join_psuedo_count as f64,
+            unjoinable_divergence_sum: bayesian_prior.unjoinable_divergence_sum
+                / nojoin_psuedo_count as f64,
+            joinable_count: 1,
+            unjoinable_count: 1,
+        }
+    }
+
     fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) {
         let target_dist = block_target_distance(first_block, second_block).abs() as usize;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
diff --git a/src/statistics.rs b/src/statistics.rs
index 9f2722b..77e5ae6 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -11,8 +11,7 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 {
 
 // TODO: Support for generic floating types...
 #[allow(dead_code)]
-pub trait Distribution: Clone + Debug {
-    fn unit() -> Self;
+pub trait Distribution: Clone + Debug + Default {
     fn pdf(&self, x: f64) -> f64;
     fn cdf(&self, x: f64) -> f64;
     fn ppf(&self, p: f64) -> f64;
@@ -21,6 +20,10 @@ pub trait Distribution: Clone + Debug {
     fn logpdf(&self, x: f64) -> f64;
     fn logcdf(&self, x: f64) -> f64;
     fn logccdf(&self, x: f64) -> f64;
+
+    fn unit() -> Self {
+        Self::default()
+    }
 }
 
 #[derive(Clone, Debug)]
@@ -38,11 +41,13 @@ impl Exponential {
     }
 }
 
-impl Distribution for Exponential {
-    fn unit() -> Self {
+impl Default for Exponential {
+    fn default() -> Self {
         Self::new(1.0)
     }
+}
 
+impl Distribution for Exponential {
     fn pdf(&self, x: f64) -> f64 {
         self.lambda * (-self.lambda * x).exp()
     }
@@ -97,14 +102,16 @@ impl From<ExponentialEstimator> for Exponential {
     }
 }
 
-impl Distribution for ExponentialEstimator {
-    fn unit() -> Self {
+impl Default for ExponentialEstimator {
+    fn default() -> Self {
         Self {
             sample_mean: 1.0,
             degrees_of_freedom: 1,
         }
     }
+}
 
+impl Distribution for ExponentialEstimator {
     fn logpdf(&self, x: f64) -> f64 {
         let n = self.degrees_of_freedom as f64;
         let sm = self.sample_mean;
@@ -167,14 +174,16 @@ impl HalfT {
     }
 }
 
-impl Distribution for HalfT {
-    fn unit() -> Self {
+impl Default for HalfT {
+    fn default() -> Self {
         Self {
             standard_deviation: 1.0,
             degrees_of_freedom: 1,
         }
     }
+}
 
+impl Distribution for HalfT {
     fn logpdf(&self, x: f64) -> f64 {
         let v = self.degrees_of_freedom as f64;
         let s = self.standard_deviation;
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index 537ff1d..705f761 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -20,7 +20,7 @@ pub struct QueryStatistics<T: JoinEstimator> {
     pub occurances: usize,
     pub coverage: usize,
     pub target_span: usize,
-    pub estimator: Option<T>,
+    pub estimator: T,
 }
 
 #[derive(Debug)]
@@ -59,7 +59,7 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
             occurances: 0,
             coverage: 0,
             target_span: 0,
-            estimator: None,
+            estimator: E::default(),
         };
         alignment_data.query_name_map.size()
     ];
@@ -68,14 +68,12 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
         vec![None; alignment_data.query_name_map.size()];
 
     let mut all_region_stats: Vec<RegionStatistics> = Vec::with_capacity(naive_traces.len());
-    let mut all_join_stats: Vec<Option<S>> = vec![None; alignment_data.query_name_map.size()];
+    // We combine stats for all families to use as a prior (psuedo-count, single sample) for all stats...
+    let mut all_family_stats: S = S::new();
 
     for trace_results in naive_traces.iter() {
-        for (query_id, stats) in trace_results.query_join_statistics.iter() {
-            all_join_stats[*query_id] = match &all_join_stats[*query_id] {
-                None => Some(stats.clone()),
-                Some(other_stats) => Some(other_stats.combine(stats)),
-            };
+        for (_query_id, stats) in trace_results.query_join_statistics.iter() {
+            all_family_stats = all_family_stats.combine(stats);
         }
 
         match count_mode {
@@ -145,13 +143,24 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
         all_region_stats.push(region_stat);
     }
 
+    // Calculate join statistics for all families using combined prior as a starting point...
+    let mut all_join_stats: Vec<S> = vec![S::new(); alignment_data.query_name_map.size()];
+
+    for trace_results in naive_traces.iter() {
+        for (query_id, stats) in trace_results.query_join_statistics.iter() {
+            all_join_stats[*query_id] = all_join_stats[*query_id].combine(stats);
+        }
+    }
+
+    println!("{:#?}", all_join_stats);
+
     for (query_info, query_span, join_stat) in
         izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats)
     {
         if let Some((start, end)) = query_span {
             query_info.target_span = end - start + 1;
         }
-        query_info.estimator = join_stat.map(|v| v.into());
+        query_info.estimator = join_stat.into();
     }
 
     TraceStatistics {

From 88dabb8a82312712546d0f4faee45f5f0778a9a8 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Mon, 18 May 2026 18:52:52 -0600
Subject: [PATCH 24/39] Bug fix: properly adjust history when join is made.

---
 src/assembly.rs         | 48 +----------------------------------------
 src/history_tracing.rs  | 40 +++++++++++++++++++++++-----------
 src/trace_statistics.rs |  5 ++---
 3 files changed, 31 insertions(+), 62 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index df96601..a58875a 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -326,47 +326,6 @@ fn link_assemblies<T: JoinEstimator>(
 
             let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block);
 
-            if b_block.row_idx == 583 {
-                println!("Block: {}", a_block.row_idx);
-                println!(
-                    "Score: {}",
-                    query_statistics.estimator.predict(a_block, b_block, false)
-                );
-
-                println!(
-                    "Is Joinable: {}",
-                    is_joinable(
-                        target_distance,
-                        consensus_distance,
-                        link_type,
-                        min_block_length,
-                        args,
-                    )
-                );
-
-                println!(
-                    "Weight: {}",
-                    if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
-                        score_params.query_loop_score
-                    } else {
-                        get_link_cost(
-                            args,
-                            score_params,
-                            consensus_distance,
-                            query_statistics.estimator.predict(a_block, b_block, false),
-                        )
-                    }
-                );
-
-                println!("Estimator: {:#?}", query_statistics.estimator);
-                println!(
-                    "Target Dist: {}, Div: {}, Cons Dist: {}",
-                    target_distance,
-                    (a_block.kimura80 - b_block.kimura80).abs(),
-                    consensus_distance
-                )
-            }
-
             if is_joinable(
                 target_distance,
                 consensus_distance,
@@ -377,17 +336,12 @@ fn link_assemblies<T: JoinEstimator>(
                 let join_prob = query_statistics.estimator.predict(a_block, b_block, false);
 
                 if join_prob >= args.join_likelihood_threshold {
-                    let mut weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
+                    let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
                         score_params.query_loop_score
                     } else {
                         get_link_cost(args, score_params, consensus_distance, join_prob)
                     };
 
-                    if b_block.query_id == Some(196) {
-                        println!("Setting Weight to 1 for {}", b_block.row_idx);
-                        weight = score_params.query_loop_score
-                    }
-
                     graph.insert(
                         ((a.0, a_block.row_idx), (b.0, b_block.row_idx)),
                         Edge {
diff --git a/src/history_tracing.rs b/src/history_tracing.rs
index 436b330..5348aa1 100644
--- a/src/history_tracing.rs
+++ b/src/history_tracing.rs
@@ -659,6 +659,15 @@ fn get_join_endpoints_from_links(
     (left_side, right_side)
 }
 
+fn prior_history_index(entry: &HistoryEntry) -> usize {
+    if let HistoryEntry::Append(info) | HistoryEntry::Join(info) = entry {
+        info.prior_history
+    } else {
+        // 0 for the root of the histories...
+        0
+    }
+}
+
 fn add_single_join(
     histories: &mut Vec<HistoryEntry>,
     segments: &[Segment],
@@ -679,11 +688,13 @@ fn add_single_join(
     let new_group_index =
         segment_groups[segment_idx].add_group(&segments[segment_idx], join_blocks);
 
-    let join_prior_index = match (left_join_link, right_join_link) {
-        (Some(lv), Some(rv)) => lv.origin_history.min(rv.origin_history),
-        (Some(v), None) | (None, Some(v)) => v.origin_history,
-        _ => prior_hist_idx,
-    };
+    let join_prior_index = prior_history_index(
+        &histories[match (left_join_link, right_join_link) {
+            (Some(lv), Some(rv)) => lv.origin_history.min(rv.origin_history),
+            (Some(v), None) | (None, Some(v)) => v.origin_history,
+            _ => panic!("Unreachable branch here, something went really wrong..."),
+        }],
+    );
 
     // Clean expired history entries from the join path....
     let simplified_join_index = remove_expired_history_entries(
@@ -1158,6 +1169,7 @@ fn get_joinable_extensions<'a>(
         .collect_vec()
 }
 
+#[derive(Debug)]
 struct JoinStackEntry {
     joined_history_offset: usize,
     trace_segment_offset: usize,
@@ -1170,6 +1182,7 @@ struct AddedBlockInfo {
     join_index: usize,
 }
 
+#[derive(Debug)]
 struct JoinStack {
     pub stack: Vec<JoinStackEntry>,
     pub next_join_index: usize,
@@ -1193,7 +1206,7 @@ impl JoinStack {
     /// Try adding one or two joins to the join stack if this block is a join and has linked edges.
     fn try_push(&mut self, entry: &HistoryEntry, added_block_info: Option<&AddedBlockInfo>) {
         if let (HistoryEntry::Join(val), Some(info)) = (entry, added_block_info) {
-            let mut top_stack_entries = 0;
+            let top_offset = self.stack.len();
 
             if val.join_left_block.caused_by_history != info.history_index {
                 self.stack.push(JoinStackEntry {
@@ -1201,7 +1214,6 @@ impl JoinStack {
                     trace_segment_offset: info.trace_stack_index,
                     join_index: info.join_index,
                 });
-                top_stack_entries += 1;
             }
 
             if val.join_right_block.caused_by_history != info.history_index {
@@ -1210,11 +1222,9 @@ impl JoinStack {
                     trace_segment_offset: info.trace_stack_index,
                     join_index: info.join_index,
                 });
-                top_stack_entries += 1;
             }
 
-            let top_vals_offset = self.stack.len() - top_stack_entries;
-            self.stack[top_vals_offset..].sort_unstable_by_key(|v| v.joined_history_offset);
+            self.stack[top_offset..].sort_unstable_by_key(|v| v.joined_history_offset);
         }
     }
 
@@ -1278,7 +1288,7 @@ fn history_backtrace_append_block(
         .any(|&b| matches!(b.block_type, BlockType::Alignment | BlockType::TandemRepeat))
     {
         match joiner.check_for_join(current_history_index) {
-            // Case 2: Involved in a join, add new block, but don't
+            // Case 2: Involved in a join, add new block.
             BlockAction::Join(join_index, stack_pos) => {
                 let joins = get_joinable_extensions(
                     blocks.iter().copied(),
@@ -1357,7 +1367,6 @@ pub fn backtrace_histories(
     let mut current_entry = &history.entries[current_idx];
 
     while let HistoryEntry::Join(entry_info) | HistoryEntry::Append(entry_info) = current_entry {
-        println!("{:#?}", current_entry);
         // Append current entry to segment stack...
         let blocks = history.segment_groups[entry_info.segment]
             .get_group(entry_info.group_index)
@@ -1384,6 +1393,13 @@ pub fn backtrace_histories(
         current_entry = &history.entries[current_idx];
     }
 
+    if joiner.stack.len() != 0 {
+        panic!(
+            "Backtrace not done properly, there are {} leftover values on the join stack!",
+            joiner.stack.len()
+        );
+    }
+
     // Reverse so trace segments go from start to end instead of end to start.
     refined_segments.reverse();
     refined_segments
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index 705f761..20e66f1 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -144,7 +144,8 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
     }
 
     // Calculate join statistics for all families using combined prior as a starting point...
-    let mut all_join_stats: Vec<S> = vec![S::new(); alignment_data.query_name_map.size()];
+    let mut all_join_stats: Vec<S> =
+        vec![S::new_from_prior(&all_family_stats); alignment_data.query_name_map.size()];
 
     for trace_results in naive_traces.iter() {
         for (query_id, stats) in trace_results.query_join_statistics.iter() {
@@ -152,8 +153,6 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
         }
     }
 
-    println!("{:#?}", all_join_stats);
-
     for (query_info, query_span, join_stat) in
         izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats)
     {

From 8a3e2680cce325f0b36422deb279296cade44088 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 21 May 2026 11:45:53 -0600
Subject: [PATCH 25/39] Tested estimation for Frechet; it performs poorly, so
 temporarily disabled.

---
 scripts/plot_distributions_aurora.py |   4 +-
 src/join_estimation.rs               | 257 ++++++++++++++++++++-------
 src/p2estimator.rs                   |   2 +-
 src/statistics.rs                    | 174 +++++++++++++++++-
 src/trace_statistics.rs              |   2 +-
 5 files changed, 364 insertions(+), 75 deletions(-)

diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py
index 26fa437..dfe8961 100644
--- a/scripts/plot_distributions_aurora.py
+++ b/scripts/plot_distributions_aurora.py
@@ -223,8 +223,8 @@ def logcdf(self, x, *args):
 
 estimator = {
     "Relative Consensus Distance": Distribution(
-        laplace_asymmetric, (1.0, 0.0, 1.0), False
-    ),  # Distribution(invweibull, (1.0, 0.0, 1.0), False),
+        invweibull, (1.0, 0.0, 1.0), False
+    ),  # Distribution(invweibull, (1.0, 0.0, 1.0), False), Distribution(laplace_asymmetric, (1.0, 0.0, 1.0), False)
     "Target Distance": Distribution(
         genpareto, (0.0, 1.0)
     ),  # Distribution(expon, (1.0,)),  # Distribution(genpareto, (0.0, 1.0)), Distribution(weibull_min, (1.0, 10000)
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 975536c..b688ed9 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,9 +1,9 @@
-use std::fmt::Debug;
+use std::{fmt::Debug, ops};
 
 use crate::{
-    assembly::block_target_distance,
+    assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType},
     segments::Block,
-    statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT},
+    statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, HalfT, Laplace},
 };
 
 pub trait JoinEstimator: Clone + Default + Debug {
@@ -12,7 +12,7 @@ pub trait JoinEstimator: Clone + Default + Debug {
 
 pub trait JoinStatisticsCollector: Clone + Debug {
     fn new() -> Self;
-    fn new_from_prior(bayesian_prior: &Self) -> Self;
+    fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self;
     fn combine(&self, other: &Self) -> Self;
     fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
 }
@@ -23,6 +23,8 @@ pub struct BayesianJoinEstimator {
     target_distance_nojoin: ExponentialEstimator,
     divergence_join: HalfT,
     divergence_nojoin: HalfT,
+    consensus_distance_join: Frechet,
+    consensus_distance_nojoin: Laplace,
     join_prior: f64,
 }
 
@@ -31,13 +33,39 @@ impl JoinEstimator for BayesianJoinEstimator {
         let target_dist = block_target_distance(first_block, second_block) as f64;
         // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0...
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
+        let (consensus_dist, _join_type) = block_consensus_distance(first_block, second_block);
+        let rel_con_dist = consensus_dist as f64
+            / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64;
+
+        /*
+        println!("{:#?}", self);
+        println!(
+            "{} {} {}",
+            rel_con_dist,
+            self.consensus_distance_join.pdf(rel_con_dist),
+            self.consensus_distance_nojoin.pdf(rel_con_dist)
+        );
+        println!(
+            "{} {} {}",
+            target_dist,
+            self.target_distance_join.pdf(target_dist),
+            self.target_distance_nojoin.pdf(target_dist)
+        );
+        println!(
+            "{} {} {}",
+            divergence_diff,
+            self.divergence_join.pdf(divergence_diff),
+            self.divergence_nojoin.pdf(divergence_diff)
+        );*/
 
         let join_score = self.join_prior.ln()
             + self.target_distance_join.logpdf(target_dist)
             + self.divergence_join.logpdf(divergence_diff);
+        //+ self.consensus_distance_join.logpdf(rel_con_dist);
         let nojoin_score = (-self.join_prior).ln_1p()
             + self.target_distance_nojoin.logpdf(target_dist)
             + self.divergence_nojoin.logpdf(divergence_diff);
+        //+ self.consensus_distance_nojoin.logpdf(rel_con_dist);
 
         let score_norm = ln_add_exp(join_score, nojoin_score);
         let score = join_score - score_norm;
@@ -56,102 +84,195 @@ impl From<BayesianJoinStatistics> for BayesianJoinEstimator {
     }
 }
 
-impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
-    fn from(statistics: &BayesianJoinStatistics) -> Self {
-        let join_psuedo_count = statistics.joinable_count.max(1);
-        let nojoin_psuedo_count = statistics.unjoinable_count.max(1);
+#[derive(Debug, Clone, Copy)]
+struct MomentEstimator {
+    sum_square: f64,
+    sum: f64,
+    samples: usize,
+}
+
+impl MomentEstimator {
+    fn new() -> Self {
+        Self {
+            sum_square: 0.0,
+            sum: 0.0,
+            samples: 0,
+        }
+    }
+
+    fn to_psuedo_count(&self, count: usize) -> Self {
+        Self {
+            sum_square: (self.sum_square / self.samples.max(1) as f64) * count as f64,
+            sum: (self.sum / self.samples.max(1) as f64) * count as f64,
+            samples: count,
+        }
+    }
+
+    fn mean(&self) -> f64 {
+        self.sum / self.samples.max(1) as f64
+    }
+
+    fn variance(&self) -> f64 {
+        // TODO: Use shifted data alg for more accuracy...
+        (self.sum_square - (self.sum * self.sum) / self.samples.max(1) as f64)
+            / (self.samples.max(2) as f64 - 1.0)
+    }
+
+    fn standard_deviation(&self) -> f64 {
+        self.variance().sqrt()
+    }
+
+    fn samples(&self) -> usize {
+        self.samples
+    }
+}
+
+impl Default for MomentEstimator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
 
-        let join_td_mean =
-            (statistics.joinable_target_distance_sum as f64 / join_psuedo_count as f64).max(1.0);
-        let nojoin_td_mean = (statistics.unjoinable_target_distance_sum as f64
-            / nojoin_psuedo_count as f64)
-            .max(join_td_mean);
+impl ops::Add<MomentEstimator> for MomentEstimator {
+    type Output = MomentEstimator;
 
-        // Divergence distributions should have a mean of 0, so we assume that...
-        let join_div_mean =
-            (statistics.joinable_divergence_sum / join_psuedo_count as f64).max(1.0);
-        let nojoin_div_mean =
-            (statistics.unjoinable_divergence_sum / nojoin_psuedo_count as f64).max(join_div_mean);
+    fn add(self, rhs: MomentEstimator) -> Self::Output {
+        Self {
+            sum_square: self.sum_square + rhs.sum_square,
+            sum: self.sum + rhs.sum,
+            samples: self.samples + rhs.samples,
+        }
+    }
+}
+
+impl ops::AddAssign<MomentEstimator> for MomentEstimator {
+    fn add_assign(&mut self, rhs: MomentEstimator) {
+        self.sum_square += rhs.sum_square;
+        self.sum += rhs.sum;
+        self.samples += rhs.samples;
+    }
+}
+
+impl ops::AddAssign<f64> for MomentEstimator {
+    fn add_assign(&mut self, rhs: f64) {
+        self.sum_square += rhs * rhs;
+        self.sum += rhs;
+        self.samples += 1;
+    }
+}
+
+impl From<MomentEstimator> for ExponentialEstimator {
+    fn from(value: MomentEstimator) -> Self {
+        Self::new(value.mean(), value.samples().max(1))
+    }
+}
 
+impl From<MomentEstimator> for HalfT {
+    fn from(value: MomentEstimator) -> Self {
+        Self::from_sample_mean(value.mean(), value.samples().max(1))
+    }
+}
+
+impl From<MomentEstimator> for Laplace {
+    fn from(value: MomentEstimator) -> Self {
+        Self::from_moments(value.mean(), value.standard_deviation())
+    }
+}
+
+impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
+    fn from(statistics: &BayesianJoinStatistics) -> Self {
         Self {
-            target_distance_join: ExponentialEstimator::new(join_td_mean, join_psuedo_count),
-            target_distance_nojoin: ExponentialEstimator::new(nojoin_td_mean, nojoin_psuedo_count),
-            divergence_join: HalfT::from_sample_mean(join_div_mean, join_psuedo_count),
-            divergence_nojoin: HalfT::from_sample_mean(nojoin_div_mean, nojoin_psuedo_count),
+            target_distance_join: statistics.joinable_target_distance.into(),
+            target_distance_nojoin: statistics.unjoinable_target_distance.into(),
+            divergence_join: statistics.joinable_divergence.into(),
+            divergence_nojoin: statistics.unjoinable_divergence.into(),
+            consensus_distance_join: Frechet::from_log_moments(
+                statistics.joinable_consensus_log.mean(),
+                statistics.joinable_consensus_log.standard_deviation(),
+                -0.05,
+            ),
+            consensus_distance_nojoin: statistics.unjoinable_consensus.into(),
             // We take sqrt since we count all pairs, not just neighbors.
-            join_prior: (join_psuedo_count as f64
-                / (nojoin_psuedo_count + join_psuedo_count) as f64)
+            join_prior: (statistics.joinable_target_distance.samples() as f64
+                / (statistics.joinable_target_distance.samples()
+                    + statistics.unjoinable_target_distance.samples())
+                .max(1) as f64)
                 .sqrt(),
         }
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct BayesianJoinStatistics {
-    joinable_target_distance_sum: usize,
-    unjoinable_target_distance_sum: usize,
-    joinable_divergence_sum: f64,
-    unjoinable_divergence_sum: f64,
-    joinable_count: usize,
-    unjoinable_count: usize,
+    joinable_target_distance: MomentEstimator,
+    unjoinable_target_distance: MomentEstimator,
+    joinable_divergence: MomentEstimator,
+    unjoinable_divergence: MomentEstimator,
+    joinable_consensus_log: MomentEstimator,
+    unjoinable_consensus: MomentEstimator,
 }
 
 impl JoinStatisticsCollector for BayesianJoinStatistics {
     fn new() -> Self {
-        Self {
-            joinable_target_distance_sum: 0,
-            unjoinable_target_distance_sum: 0,
-            joinable_divergence_sum: 0.0,
-            unjoinable_divergence_sum: 0.0,
-            joinable_count: 0,
-            unjoinable_count: 0,
-        }
+        Self::default()
     }
 
-    fn new_from_prior(bayesian_prior: &Self) -> Self {
-        let join_psuedo_count = bayesian_prior.joinable_count.max(1);
-        let nojoin_psuedo_count = bayesian_prior.unjoinable_count.max(1);
-
+    fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self {
         Self {
-            joinable_target_distance_sum: bayesian_prior.joinable_target_distance_sum
-                / join_psuedo_count,
-            unjoinable_target_distance_sum: bayesian_prior.unjoinable_target_distance_sum
-                / nojoin_psuedo_count,
-            joinable_divergence_sum: bayesian_prior.joinable_divergence_sum
-                / join_psuedo_count as f64,
-            unjoinable_divergence_sum: bayesian_prior.unjoinable_divergence_sum
-                / nojoin_psuedo_count as f64,
-            joinable_count: 1,
-            unjoinable_count: 1,
+            joinable_target_distance: bayesian_prior
+                .joinable_target_distance
+                .to_psuedo_count(pseudo_count),
+            unjoinable_target_distance: bayesian_prior
+                .unjoinable_target_distance
+                .to_psuedo_count(pseudo_count),
+            joinable_divergence: bayesian_prior
+                .joinable_divergence
+                .to_psuedo_count(pseudo_count),
+            unjoinable_divergence: bayesian_prior
+                .unjoinable_divergence
+                .to_psuedo_count(pseudo_count),
+            joinable_consensus_log: bayesian_prior
+                .joinable_consensus_log
+                .to_psuedo_count(pseudo_count),
+            unjoinable_consensus: bayesian_prior
+                .unjoinable_consensus
+                .to_psuedo_count(pseudo_count),
         }
     }
 
     fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) {
         let target_dist = block_target_distance(first_block, second_block).abs() as usize;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
+        let (consensus_dist, join_type) = block_consensus_distance(first_block, second_block);
+        let rel_con_dist = consensus_dist as f64
+            / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64;
 
         if joinable {
-            self.joinable_target_distance_sum += target_dist;
-            self.joinable_divergence_sum += divergence_diff;
-            self.joinable_count += 1;
+            self.joinable_target_distance += target_dist as f64;
+            self.joinable_divergence += divergence_diff;
+            if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
+                //println!("CDist: {}", rel_con_dist);
+                self.joinable_consensus_log += (rel_con_dist + 0.05).max(1e-50).ln();
+            }
         } else {
-            self.unjoinable_target_distance_sum += target_dist;
-            self.unjoinable_divergence_sum += divergence_diff;
-            self.unjoinable_count += 1;
+            self.unjoinable_target_distance += target_dist as f64;
+            self.unjoinable_divergence += divergence_diff;
+            if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
+                self.unjoinable_consensus += rel_con_dist;
+            }
         }
     }
 
     fn combine(&self, other: &Self) -> Self {
         Self {
-            joinable_target_distance_sum: self.joinable_target_distance_sum
-                + other.joinable_target_distance_sum,
-            unjoinable_target_distance_sum: self.unjoinable_target_distance_sum
-                + other.unjoinable_target_distance_sum,
-            joinable_divergence_sum: self.joinable_divergence_sum + other.joinable_divergence_sum,
-            unjoinable_divergence_sum: self.unjoinable_divergence_sum
-                + other.unjoinable_divergence_sum,
-            joinable_count: self.joinable_count + other.joinable_count,
-            unjoinable_count: self.unjoinable_count + other.unjoinable_count,
+            joinable_target_distance: self.joinable_target_distance
+                + other.joinable_target_distance,
+            unjoinable_target_distance: self.unjoinable_target_distance
+                + other.unjoinable_target_distance,
+            joinable_divergence: self.joinable_divergence + other.joinable_divergence,
+            unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence,
+            joinable_consensus_log: self.joinable_consensus_log + other.joinable_consensus_log,
+            unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus,
         }
     }
 }
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index fbcdeb2..1bfe387 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -154,7 +154,7 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into<F> + From<usize>
         obs >= self.points.len()
     }
 
-    fn combine(&mut self, _other: &P2HistogramData<F, I>) {
+    fn combine(&mut self, other: &P2HistogramData<F, I>) {
         // TODO: Need to think about how to do this efficiently while maintaining accuracy...
         panic!("Not implemented!")
     }
diff --git a/src/statistics.rs b/src/statistics.rs
index 77e5ae6..7670c93 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -234,9 +234,173 @@ impl Distribution for HalfT {
     }
 }
 
+#[derive(Debug, Clone)]
+pub struct Frechet {
+    alpha: f64,
+    scale: f64,
+    minimum: f64,
+}
+
+impl Frechet {
+    pub fn new(alpha: f64, scale: f64, minimum: f64) -> Self {
+        Self {
+            alpha,
+            scale,
+            minimum,
+        }
+    }
+
+    pub fn from_log_moments(log_mean: f64, log_std: f64, minimum: f64) -> Self {
+        let alpha = f64::consts::PI / (6.0_f64.sqrt() * log_std); // sd[ln X] = pi/(sqrt(6)*alpha)
+        let lambda = (alpha * log_mean - f64::consts::EULER_GAMMA).exp();
+        let scale = lambda.powf(1.0 / alpha); // = exp(log_mean - gamma / alpha), as E[ln X] requires
+        Self {
+            alpha,
+            scale,
+            minimum,
+        }
+    }
+}
+
+impl Default for Frechet {
+    fn default() -> Self {
+        Self {
+            alpha: 1.0,
+            scale: 1.0,
+            minimum: 0.0,
+        }
+    }
+}
+
+impl Distribution for Frechet {
+    fn logpdf(&self, x: f64) -> f64 {
+        let a = self.alpha;
+        let s = self.scale;
+        let m = self.minimum;
+        if x > m {
+            (a / s).ln() + -(a + 1.0) * ((x - m) / s).ln() + -((x - m) / s).powf(-a)
+        } else {
+            f64::NEG_INFINITY
+        }
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.logpdf(x).exp()
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        self.logcdf(x).exp()
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        let a = self.alpha;
+        let s = self.scale;
+        let m = self.minimum;
+        if x > m {
+            -((x - m) / s).powf(-a)
+        } else {
+            f64::NEG_INFINITY
+        }
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let a = self.alpha;
+        let s = self.scale;
+        let m = self.minimum;
+        if p >= 1.0 {
+            f64::INFINITY
+        } else if p <= 0.0 {
+            m
+        } else {
+            m + s * (-p.min(1.0).ln()).powf(1.0 / -a)
+        }
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        -self.logcdf(x).exp_m1() // 1 - exp(logcdf), without cancellation in the upper tail
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        self.ccdf(x).ln()
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (self.minimum, f64::INFINITY)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct Laplace {
+    mean: f64,
+    scale: f64,
+}
+
+impl Laplace {
+    pub fn new(mean: f64, scale: f64) -> Self {
+        Self { mean, scale }
+    }
+
+    pub fn from_moments(mean: f64, standard_deviation: f64) -> Self {
+        Self {
+            mean,
+            scale: standard_deviation / f64::consts::SQRT_2,
+        }
+    }
+}
+
+impl Default for Laplace {
+    fn default() -> Self {
+        Self {
+            mean: 0.0,
+            scale: 1.0,
+        }
+    }
+}
+
+impl Distribution for Laplace {
+    fn logpdf(&self, x: f64) -> f64 {
+        let mu = self.mean;
+        let b = self.scale;
+        (0.5 / b).ln() + -((x - mu).abs() / b)
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.logpdf(x).exp()
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        let mu = self.mean;
+        let b = self.scale;
+        0.5 + 0.5 * (x - mu).signum() * (1.0 - (-(x - mu).abs() / b).exp())
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        self.cdf(x).ln()
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        1.0 - self.cdf(x)
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        self.ccdf(x).ln()
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let mu = self.mean;
+        let b = self.scale;
+        let p = p.clamp(0.0, 1.0);
+        mu - b * (p - 0.5).signum() * (1.0 - 2.0 * (p - 0.5).abs()).ln()
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (f64::NEG_INFINITY, f64::INFINITY)
+    }
+}
+
 #[cfg(test)]
 mod test {
-    use crate::statistics::{ExponentialEstimator, HalfT};
+    use crate::statistics::{ExponentialEstimator, Frechet, HalfT, Laplace};
     use std::fmt::Debug;
 
     pub trait TestDistribution: Debug {
@@ -283,11 +447,13 @@ mod test {
 
     use super::{Distribution, Exponential};
 
-    fn get_dists() -> [Box<dyn TestDistribution>; 3] {
+    fn get_dists() -> [Box<dyn TestDistribution>; 5] {
         [
             as_box(Exponential::unit()),
             as_box(ExponentialEstimator::unit()),
             as_box(HalfT::unit()),
+            as_box(Frechet::unit()),
+            as_box(Laplace::unit()),
         ]
     }
 
@@ -312,16 +478,18 @@ mod test {
             if high == f64::INFINITY {
                 high = 5.0;
             }
-            if low == f64::INFINITY {
+            if low == f64::NEG_INFINITY {
                 low = -5.0;
             }
 
             for x in linspace(low, high, 100) {
                 // Basic properties...
+                // println!("{x} -> {} vs {}", dist.tpdf(x), dist.tlogpdf(x).exp());
                 assert!(is_close(dist.tpdf(x), dist.tlogpdf(x).exp()));
                 assert!(is_close(dist.tcdf(x), dist.tlogcdf(x).exp()));
                 assert!(is_close(dist.tccdf(x), dist.tlogccdf(x).exp()));
                 assert!(is_close(dist.tccdf(x), 1.0 - dist.tcdf(x)));
+                // println!("{x} -> {}", dist.tppf(dist.tcdf(x)));
                 assert!(is_close(dist.tppf(dist.tcdf(x)), x));
             }
         }
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index 20e66f1..cb49630 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -145,7 +145,7 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
 
     // Calculate join statistics for all families using combined prior as a starting point...
     let mut all_join_stats: Vec<S> =
-        vec![S::new_from_prior(&all_family_stats); alignment_data.query_name_map.size()];
+        vec![S::new_from_prior(&all_family_stats, 1); alignment_data.query_name_map.size()];
 
     for trace_results in naive_traces.iter() {
         for (query_id, stats) in trace_results.query_join_statistics.iter() {

From c065af2515a2181d6471d5349067689e19c1a9f3 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 21 May 2026 18:05:04 -0600
Subject: [PATCH 26/39] WIP on quantile estimator.

---
 src/p2estimator.rs | 162 +++++++++++++++++++++++++++++----------------
 1 file changed, 106 insertions(+), 56 deletions(-)

diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 1bfe387..74873ba 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -3,11 +3,14 @@ use std::{cmp::Ordering, ops::Neg};
 /// Implementation of P2 estimator.
 /// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
 /// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
-use num_traits::{float::TotalOrder, Float, Num, Unsigned};
+///
+/// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021)
+use num_traits::{float::TotalOrder, Float, FromPrimitive, Num, Unsigned};
 
 struct P2HistogramPoint<F: Float, I: Unsigned> {
     value: F,
     rank: I,
+    target: F,
 }
 
 fn get_sign<A: Num + PartialOrd + Neg<Output = A>, B: Num + PartialOrd + Neg<Output = B>>(
@@ -28,74 +31,88 @@ fn inc_or_dec<A: Num + PartialOrd, B: Num + PartialOrd + Neg<Output = B>>(val: A
     }
 }
 
-fn linear_prediction<F: Float, I: Unsigned + Copy + Ord + Into<F>>(
-    points: &[P2HistogramPoint<F, I>; 3],
-    d: isize,
+fn cubic_hermite_spline<F: Float + FromPrimitive>(
+    x0: F,
+    y0: F,
+    x1: F,
+    y1: F,
+    m0: F,
+    m1: F,
+    x: F,
 ) -> F {
-    let n: [F; 3] = points.each_ref().map(|v| v.rank.into());
-    let q: [F; 3] = points.each_ref().map(|v| v.value);
-    let d_f: F = get_sign(d);
-    let d_off = (1 + d) as usize;
+    let t = (x - x0) / (x1 - x0);
+    let ms0 = (x1 - x0) * m0;
+    let ms1 = (x1 - x0) * m1;
 
-    q[1] + d_f * ((q[d_off] - q[1]) / (n[d_off] - n[1]))
-}
+    let _1 = F::one();
+    let _2 = F::from_i32(2).unwrap();
+    let _3 = F::from_i32(3).unwrap();
 
-fn parabolic_prediction<F: Float, I: Unsigned + Copy + Ord + Into<F>>(
-    points: &[P2HistogramPoint<F, I>; 3],
-    d: isize,
-) -> F {
-    let n: [F; 3] = points.each_ref().map(|v| v.rank.into());
-    let q: [F; 3] = points.each_ref().map(|v| v.value);
-    let d: F = get_sign(d);
+    let h0: F = (_2 * t - _3) * t * t + _1;
+    let h1 = ((t - _1) * t + _1) * t;
+    let h2 = (_2 * t + _3) * t * t;
+    let h3 = (t - _1) * t * t;
 
-    let left = (n[1] - n[0] + d) * ((q[2] - q[1]) / (n[2] - n[1]));
-    let right = (n[2] - n[1] - d) * ((q[1] - q[0]) / (n[1] - n[0]));
-    q[1] + (d / (n[2] - n[0])) * (left + right)
+    h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1
 }
 
-fn _p2update<F: Float, I: Unsigned + Copy + Ord + Into<F> + From<usize>>(
-    points: &mut [P2HistogramPoint<F, I>],
-    center_index: usize,
-    observations: I,
-    total_points: I,
-) {
-    // Actual rank desired for the given quantile...
-    let ci: I = center_index.into();
-    let rank_proposal: F =
-        (ci * (observations - I::one())).into() / (total_points - I::one()).into();
-    let d: F = rank_proposal - points[1].rank.into();
-
-    if d >= F::one() && (points[2].rank - points[1].rank) > I::one()
-        || (d <= -F::one()) && points[1].rank - points[0].rank > I::one()
-    {
-        let d: isize = get_sign(d);
-        let mut p_est = parabolic_prediction(
-            (&points[center_index - 1..center_index + 1])
-                .as_array()
-                .unwrap(),
-            d,
-        );
-        if p_est <= points[center_index - 1].value || p_est >= points[center_index + 1].value {
-            p_est = linear_prediction(
-                (&points[center_index - 1..center_index + 1])
-                    .as_array()
-                    .unwrap(),
-                d,
-            );
-        }
+fn pchip_point_derivative<F: Float + FromPrimitive>(dx0: F, dy0: F, dx1: F, dy1: F) -> F {
+    if dy0 * dy1 > F::zero() {
+        let _1 = F::one();
+        let one_third = _1 / F::from_i32(3).unwrap();
+        let alpha = one_third * (_1 + dx1 / (dx0 + dx1));
+        dy0 * dy1 / (alpha * dy1 + (_1 - alpha) * dy0)
+    } else {
+        F::zero()
+    }
+}
 
-        points[center_index].value = p_est;
-        points[center_index].rank = inc_or_dec(points[center_index].rank, d);
+fn secant_diff<F: Float, I: Unsigned + Copy + Ord + Into<F>>(
+    point0: Option<&P2HistogramPoint<F, I>>,
+    point1: Option<&P2HistogramPoint<F, I>>,
+) -> (F, F) {
+    if let (Some(p0), Some(p1)) = (point0, point1) {
+        ((p1.rank - p0.rank).into(), p1.value - p0.value)
+    } else {
+        // Assume slope at endpoints of CDF is 0...
+        (F::zero(), F::zero())
     }
 }
 
-struct P2HistogramData<'a, F: Float, I: Unsigned + Copy + Ord + Into<F>> {
+fn pchip_prediction<F: Float + FromPrimitive, I: Unsigned + Copy + Ord + Into<F>>(
+    point0: Option<&P2HistogramPoint<F, I>>,
+    point1: &P2HistogramPoint<F, I>,
+    point2: &P2HistogramPoint<F, I>,
+    point3: Option<&P2HistogramPoint<F, I>>,
+    x: F,
+) -> F {
+    let s0 = secant_diff(point0, Some(point1));
+    let s1 = secant_diff(Some(point1), Some(point2));
+    let s2 = secant_diff(Some(point2), point3);
+    let m0 = pchip_point_derivative(s0.0, s0.1, s1.0, s1.1);
+    let m1 = pchip_point_derivative(s1.0, s1.1, s2.0, s2.1);
+
+    cubic_hermite_spline(
+        point1.rank.into(),
+        point1.value,
+        point2.rank.into(),
+        point2.value,
+        m0,
+        m1,
+        x,
+    )
+}
+
+struct QuantileEstimator<'a, F: Float, I: Unsigned + Copy + Ord + Into<F>> {
     observations: I,
     points: &'a mut [P2HistogramPoint<F, I>],
 }
 
-impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into<F> + From<usize> + Into<usize>>
-    P2HistogramData<'a, F, I>
+impl<
+        'a,
+        F: Float + TotalOrder + Into<usize> + FromPrimitive,
+        I: Unsigned + Copy + Ord + Into<F> + From<usize> + Into<usize>,
+    > QuantileEstimator<'a, F, I>
 {
     fn _standard_update(&mut self, sample: F) {
         // Find where sample falls within distribution...
@@ -116,7 +133,38 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into<F> + From<usize>
 
         // Adjust inner markers to within 1 of their target quantile using p2 formula...
         for i in 1..(self.points.len() - 1) {
-            _p2update(self.points, i, self.observations, self.points.len().into());
+            let target_rank: F = (self.points[i].target * self.observations.into()).floor();
+            let true_rank: F = self.points[i].rank.into();
+            let true_rank_int: usize = self.points[i].rank.into();
+            if (true_rank - target_rank).abs() > F::one() {
+                let target_rank_int: usize = target_rank.into();
+                let lower_rank: usize = self.points[i - 1].rank.into();
+                let upper_rank: usize = self.points[i + 1].rank.into();
+                let new_rank: usize = target_rank_int
+                    .clamp(lower_rank.saturating_add(1), upper_rank.saturating_sub(1));
+                if new_rank == true_rank_int {
+                    continue;
+                }
+
+                let shift = if new_rank > target_rank_int { 1 } else { 0 };
+
+                self.points[i].rank = new_rank.into();
+                self.points[i].value = pchip_prediction(
+                    if i + shift > 2 {
+                        Some(&self.points[i + shift - 2])
+                    } else {
+                        None
+                    },
+                    &self.points[i - shift - 1],
+                    &self.points[i + shift],
+                    if i + shift + 1 < self.points.len() {
+                        Some(&self.points[i + shift + 1])
+                    } else {
+                        None
+                    },
+                    F::from_usize(new_rank).unwrap(),
+                );
+            }
         }
 
         self.observations = self.observations + I::one();
@@ -129,6 +177,8 @@ impl<'a, F: Float + TotalOrder, I: Unsigned + Copy + Ord + Into<F> + From<usize>
     }
 
     fn _initialize(&mut self) {
+        panic!("Fix!");
+        // TODO: Fix this...
         self.points.sort_by(|a, b| a.value.total_cmp(&b.value));
         self.points.iter_mut().enumerate().for_each(|(i, p)| {
             p.rank = i.into();

From 9c261d38303e253572222f3bcfd960c3cec9514e Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 22 May 2026 19:50:57 -0600
Subject: [PATCH 27/39] New quantile estimator updates done.

---
 Cargo.toml         |   1 -
 src/p2estimator.rs | 497 ++++++++++++++++++++++++++++++---------------
 src/segments.rs    |  10 +-
 3 files changed, 340 insertions(+), 168 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9055fa9..c5b43b6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,6 @@ itertools = "0.11.0"
 rayon = "1.8.0"
 base64 = "0.22.1"
 puruspe = "0.4.4"
-num-traits = "0.2.19"
 
 [target.'cfg(not(target_env = "msvc"))'.dependencies]
 tikv-jemallocator = "0.5"
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 74873ba..5be3edb 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -1,211 +1,378 @@
-use std::{cmp::Ordering, ops::Neg};
-
-/// Implementation of P2 estimator.
-/// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
-/// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
-///
-/// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021)
-use num_traits::{float::TotalOrder, Float, FromPrimitive, Num, Unsigned};
-
-struct P2HistogramPoint<F: Float, I: Unsigned> {
-    value: F,
-    rank: I,
-    target: F,
-}
+use std::cmp::Ordering;
 
-fn get_sign<A: Num + PartialOrd + Neg<Output = A>, B: Num + PartialOrd + Neg<Output = B>>(
-    val: A,
-) -> B {
-    if val >= A::zero() {
-        B::one()
-    } else {
-        -B::one()
-    }
+use crate::segments::MergeIterator;
+use itertools::izip;
+// Implementation of P2 estimator.
+// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
+// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
+//
+// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021)
+
+struct QuantileEstimatorData<'a> {
+    ranks: &'a [usize],
+    values: &'a [f64],
+    targets: &'a [f64],
+    observations: &'a usize,
 }
 
-fn inc_or_dec<A: Num + PartialOrd, B: Num + PartialOrd + Neg<Output = B>>(val: A, delta: B) -> A {
-    match delta.partial_cmp(&B::zero()) {
-        Some(Ordering::Less) => val - A::one(),
-        Some(Ordering::Greater) => val + A::one(),
-        _ => val,
-    }
+struct MutableQuantileEstimatorData<'a> {
+    ranks: &'a mut [usize],
+    values: &'a mut [f64],
+    targets: &'a [f64],
+    observations: &'a mut usize,
 }
 
-fn cubic_hermite_spline<F: Float + FromPrimitive>(
-    x0: F,
-    y0: F,
-    x1: F,
-    y1: F,
-    m0: F,
-    m1: F,
-    x: F,
-) -> F {
+fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x: f64) -> f64 {
     let t = (x - x0) / (x1 - x0);
     let ms0 = (x1 - x0) * m0;
     let ms1 = (x1 - x0) * m1;
 
-    let _1 = F::one();
-    let _2 = F::from_i32(2).unwrap();
-    let _3 = F::from_i32(3).unwrap();
-
-    let h0: F = (_2 * t - _3) * t * t + _1;
-    let h1 = ((t - _1) * t + _1) * t;
-    let h2 = (_2 * t + _3) * t * t;
-    let h3 = (t - _1) * t * t;
+    let h0 = (2.0 * t - 3.0) * t * t + 1.0;
+    let h1 = ((t - 1.0) * t + 1.0) * t;
+    let h2 = (2.0 * t + 3.0) * t * t;
+    let h3 = (t - 1.0) * t * t;
 
     h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1
 }
 
-fn pchip_point_derivative<F: Float + FromPrimitive>(dx0: F, dy0: F, dx1: F, dy1: F) -> F {
-    if dy0 * dy1 > F::zero() {
-        let _1 = F::one();
-        let one_third = _1 / F::from_i32(3).unwrap();
-        let alpha = one_third * (_1 + dx1 / (dx0 + dx1));
-        dy0 * dy1 / (alpha * dy1 + (_1 - alpha) * dy0)
+fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 {
+    if dy0 * dy1 > 0.0 {
+        let alpha = (1.0 / 3.0) * (1.0 + dx1 / (dx0 + dx1));
+        dy0 * dy1 / (alpha * dy1 + (1.0 - alpha) * dy0)
     } else {
-        F::zero()
+        0.0
     }
 }
 
-fn secant_diff<F: Float, I: Unsigned + Copy + Ord + Into<F>>(
-    point0: Option<&P2HistogramPoint<F, I>>,
-    point1: Option<&P2HistogramPoint<F, I>>,
-) -> (F, F) {
-    if let (Some(p0), Some(p1)) = (point0, point1) {
-        ((p1.rank - p0.rank).into(), p1.value - p0.value)
-    } else {
-        // Assume slope at endpoints of CDF is 0...
-        (F::zero(), F::zero())
-    }
+fn pchip_prediction(ranks: &[f64; 4], values: &[f64; 4], x: f64) -> f64 {
+    let m0 = pchip_point_derivative(
+        ranks[1] - ranks[0],
+        values[1] - values[0],
+        ranks[2] - ranks[1],
+        values[2] - values[1],
+    );
+    let m1 = pchip_point_derivative(
+        ranks[2] - ranks[1],
+        values[2] - values[1],
+        ranks[3] - ranks[2],
+        values[3] - values[2],
+    );
+
+    cubic_hermite_spline(ranks[1], values[1], ranks[2], values[2], m0, m1, x)
 }
 
-fn pchip_prediction<F: Float + FromPrimitive, I: Unsigned + Copy + Ord + Into<F>>(
-    point0: Option<&P2HistogramPoint<F, I>>,
-    point1: &P2HistogramPoint<F, I>,
-    point2: &P2HistogramPoint<F, I>,
-    point3: Option<&P2HistogramPoint<F, I>>,
-    x: F,
-) -> F {
-    let s0 = secant_diff(point0, Some(point1));
-    let s1 = secant_diff(Some(point1), Some(point2));
-    let s2 = secant_diff(Some(point2), point3);
-    let m0 = pchip_point_derivative(s0.0, s0.1, s1.0, s1.1);
-    let m1 = pchip_point_derivative(s1.0, s1.1, s2.0, s2.1);
-
-    cubic_hermite_spline(
-        point1.rank.into(),
-        point1.value,
-        point2.rank.into(),
-        point2.value,
-        m0,
-        m1,
-        x,
-    )
+fn debug_check_valid_estimator(
+    ranks: &[usize],
+    values: &[f64],
+    targets: &[f64],
+    observations: usize,
+) {
+    debug_assert!(ranks.len() > 2);
+    debug_assert!(ranks.len() == values.len() && ranks.len() == targets.len());
+    debug_assert!(values.is_sorted() && ranks.is_sorted() && targets.is_sorted());
+    debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0));
+    debug_assert!(observations >= ranks.len());
+    debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&observations));
 }
 
-struct QuantileEstimator<'a, F: Float, I: Unsigned + Copy + Ord + Into<F>> {
-    observations: I,
-    points: &'a mut [P2HistogramPoint<F, I>],
+fn debug_check_uninitialized_estimator(ranks: &[usize], values: &[f64], targets: &[f64]) {
+    debug_assert!(ranks.len() > 2);
+    debug_assert!(ranks.len() == values.len() && ranks.len() == targets.len());
+    debug_assert!(targets.is_sorted());
+    debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0));
 }
 
-impl<
-        'a,
-        F: Float + TotalOrder + Into<usize> + FromPrimitive,
-        I: Unsigned + Copy + Ord + Into<F> + From<usize> + Into<usize>,
-    > QuantileEstimator<'a, F, I>
-{
-    fn _standard_update(&mut self, sample: F) {
-        // Find where sample falls within distribution...
-        let p = self.points.partition_point(|v| v.value <= sample);
-        let bound_p = p.min(self.points.len() - 1);
-
-        // Update extremes...
-        if bound_p == 0 {
-            self.points[bound_p].value = self.points[bound_p].value.min(sample);
-        } else if bound_p == (self.points.len() - 1) {
-            self.points[bound_p].value = self.points[bound_p].value.max(sample);
+fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) {
+    let MutableQuantileEstimatorData {
+        ranks,
+        values,
+        targets,
+        observations,
+    } = data;
+    debug_check_valid_estimator(ranks, values, targets, *observations);
+    // Find where sample falls within distribution...
+    let p = values.partition_point(|&v| v <= sample);
+    let bound_p = p.min(values.len() - 1);
+
+    // Update extremes...
+    if bound_p == 0 {
+        values[bound_p] = values[bound_p].min(sample);
+    } else if bound_p == (values.len() - 1) {
+        values[bound_p] = values[bound_p].max(sample);
+    }
+
+    // Increment ranks of markers above newly inserted sample...
+    for i in (bound_p + 1)..ranks.len() {
+        ranks[i] = ranks[1] + 1;
+    }
+
+    // Adjust inner markers to within 1 of their target quantile using p2 formula...
+    for i in 1..(values.len() - 1) {
+        let target_rank = (targets[i] * (*observations) as f64) as usize;
+        let true_rank = ranks[i];
+        if true_rank.abs_diff(target_rank) > 1 {
+            let new_rank: usize = target_rank.clamp(
+                ranks[i - 1].saturating_add(1),
+                ranks[i + 1].saturating_sub(1),
+            );
+            if new_rank == true_rank {
+                continue;
+            }
+
+            let idx_shift = if new_rank > target_rank { 1 } else { 0 };
+            let indexes = [
+                (i + idx_shift).saturating_sub(2),
+                (i + idx_shift).saturating_sub(1),
+                (i + idx_shift),
+                (i + idx_shift).saturating_add(1).min(ranks.len() - 1),
+            ];
+
+            ranks[i] = new_rank;
+            values[i] = pchip_prediction(
+                &indexes.map(|i| ranks[i] as f64),
+                &indexes.map(|i| values[i]),
+                new_rank as f64,
+            );
         }
+    }
+
+    *observations += 1;
+}
+
+fn _merge_estimators(
+    q1: QuantileEstimatorData,
+    q2: QuantileEstimatorData,
+    mut new_estimator: MutableQuantileEstimatorData,
+) {
+    debug_check_valid_estimator(q1.ranks, q1.values, q1.targets, *q1.observations);
+    debug_check_valid_estimator(q2.ranks, q2.values, q2.targets, *q2.observations);
+    debug_check_uninitialized_estimator(
+        new_estimator.ranks,
+        new_estimator.values,
+        new_estimator.targets,
+    );
+
+    assert!(new_estimator.targets.len() <= (*q1.observations + *q2.observations));
+
+    fn get_at(a: &QuantileEstimatorData, i: usize) -> (usize, f64) {
+        (a.ranks[i], a.values[i])
+    }
+
+    fn set_at(a: &mut MutableQuantileEstimatorData, i: usize, data: (usize, f64)) {
+        a.ranks[i] = data.0;
+        a.values[i] = data.1;
+    }
+
+    // Initialize the min/max quantiles...
+    if q1.values[0] <= q2.values[0] {
+        set_at(&mut new_estimator, 0, get_at(&q1, 0));
+    } else {
+        set_at(&mut new_estimator, 0, get_at(&q2, 0));
+    }
+
+    let new_est_len = new_estimator.ranks.len();
+    if q1.values[q1.values.len() - 1] >= q2.values[q2.values.len() - 1] {
+        set_at(
+            &mut new_estimator,
+            new_est_len - 1,
+            get_at(&q1, q1.ranks.len() - 1),
+        );
+    } else {
+        set_at(
+            &mut new_estimator,
+            new_est_len - 1,
+            get_at(&q2, q2.ranks.len() - 1),
+        );
+    }
+
+    // May eventually replace with algorithm that doesn't use extra memory...
+    // Calculate a "merged" quantiles by linearly iterpolating ranks based on the values we see...
+    let mut dual_est_quants: Vec<(f64, f64)> = Vec::with_capacity(q1.ranks.len() + q2.ranks.len());
+
+    let mut q1_prior: Option<(usize, f64)> = None;
+    let mut q2_prior: Option<(usize, f64)> = None;
 
-        // Increment ranks of markers above newly inserted sample...
-        for i in (bound_p + 1)..self.points.len() {
-            self.points[i].rank = self.points[i].rank + I::one();
+    let mut q1_idx = 0;
+    let mut q2_idx = 0;
+
+    loop {
+        let q1_past_end = q1_idx >= q1.values.len();
+        let q2_past_end = q2_idx >= q2.values.len();
+
+        if q1_past_end && q2_past_end {
+            break;
+        } else if q1_past_end {
+            let next = get_at(&q2, q2_idx);
+            dual_est_quants.push(((next.0 + q1_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1));
+            q2_prior = Some(next);
+            q2_idx += 1;
+        } else if q2_past_end {
+            let next = get_at(&q1, q1_idx);
+            dual_est_quants.push(((next.0 + q2_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1));
+            q1_prior = Some(next);
+            q1_idx += 1;
+        } else if q1.values[q1_idx] <= q2.values[q2_idx] {
+            let other_next = get_at(&q2, q2_idx);
+            let next = get_at(&q1, q1_idx);
+            let w = q2_prior
+                .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1))
+                .unwrap_or(0.0);
+            let other_rank_est = q2_prior
+                .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w)
+                .unwrap_or(0.0);
+            dual_est_quants.push((next.0 as f64 + other_rank_est, next.1));
+            q1_prior = Some(next);
+            q1_idx += 1;
+        } else {
+            let other_next = get_at(&q1, q1_idx);
+            let next = get_at(&q2, q2_idx);
+            let w = q1_prior
+                .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1))
+                .unwrap_or(0.0);
+            let other_rank_est = q1_prior
+                .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w)
+                .unwrap_or(0.0);
+            dual_est_quants.push((next.0 as f64 + other_rank_est, next.1));
+            q2_prior = Some(next);
+            q2_idx += 1;
         }
+    }
 
-        // Adjust inner markers to within 1 of their target quantile using p2 formula...
-        for i in 1..(self.points.len() - 1) {
-            let target_rank: F = (self.points[i].target * self.observations.into()).floor();
-            let true_rank: F = self.points[i].rank.into();
-            let true_rank_int: usize = self.points[i].rank.into();
-            if (true_rank - target_rank).abs() > F::one() {
-                let target_rank_int: usize = target_rank.into();
-                let lower_rank: usize = self.points[i - 1].rank.into();
-                let upper_rank: usize = self.points[i + 1].rank.into();
-                let new_rank: usize = target_rank_int
-                    .clamp(lower_rank.saturating_add(1), upper_rank.saturating_sub(1));
-                if new_rank == true_rank_int {
-                    continue;
-                }
+    // New number of observations is the sum of both...
+    *(new_estimator.observations) = *(q1.observations) + *(q2.observations);
 
-                let shift = if new_rank > target_rank_int { 1 } else { 0 };
-
-                self.points[i].rank = new_rank.into();
-                self.points[i].value = pchip_prediction(
-                    if i + shift > 2 {
-                        Some(&self.points[i + shift - 2])
-                    } else {
-                        None
-                    },
-                    &self.points[i - shift - 1],
-                    &self.points[i + shift],
-                    if i + shift + 1 < self.points.len() {
-                        Some(&self.points[i + shift + 1])
-                    } else {
-                        None
-                    },
-                    F::from_usize(new_rank).unwrap(),
-                );
-            }
+    // Solve all inner quantiles using traditional interpolation...
+    let mut index_between = 0;
+
+    for ti in 1..new_estimator.targets.len() - 1 {
+        // Calculate new rank...
+        let target = new_estimator.targets[ti];
+        let approx_obs_rank = ((target * *(new_estimator.observations) as f64) as usize).clamp(
+            1 + ti,
+            *(new_estimator.observations) - (new_estimator.targets.len() - (ti + 1)),
+        );
+
+        // Find where it lands in cdf...
+        while index_between < dual_est_quants.len()
+            && (approx_obs_rank as f64) < dual_est_quants[index_between].0
+        {
+            index_between += 1;
         }
 
-        self.observations = self.observations + I::one();
+        // Get pchip estimate for the value...
+        let indexes = [
+            index_between.saturating_sub(2),
+            index_between.saturating_sub(1),
+            index_between,
+            index_between
+                .saturating_add(1)
+                .min(dual_est_quants.len() - 1),
+        ];
+
+        new_estimator.ranks[ti] = approx_obs_rank;
+        new_estimator.values[ti] = pchip_prediction(
+            &indexes.map(|i| dual_est_quants[i].0),
+            &indexes.map(|i| dual_est_quants[i].1),
+            approx_obs_rank as f64,
+        )
     }
+}
 
-    fn _pre_init_update(&mut self, sample: F) {
-        let nxt_idx: usize = self.observations.into();
-        self.points[nxt_idx].value = sample;
-        self.observations = self.observations + I::one();
+pub trait QuantileEstimator {
+    fn update(&mut self, sample: f64);
+    fn update_all(&mut self, samples: &[f64]) {
+        for &s in samples.iter() {
+            self.update(s);
+        }
     }
+    fn combine(&self, other: &Self) -> Self;
+}
+
+#[derive(Clone)]
+struct FixedSizeQuantileEstimator<const N: usize> {
+    values: [f64; N],
+    ranks: [usize; N],
+    targets: [f64; N],
+    observations: usize,
+}
 
-    fn _initialize(&mut self) {
-        panic!("Fix!");
-        // TODO: Fix this...
-        self.points.sort_by(|a, b| a.value.total_cmp(&b.value));
-        self.points.iter_mut().enumerate().for_each(|(i, p)| {
-            p.rank = i.into();
-        });
+impl<const N: usize> FixedSizeQuantileEstimator<N> {
+    pub fn new(targets: &[f64; N]) -> Self {
+        Self {
+            values: [0.0; N],
+            ranks: [0; N],
+            targets: targets.clone(),
+            observations: 0,
+        }
     }
 
-    pub fn update(&mut self, sample: F) {
-        let obs: usize = self.observations.into();
-        match (obs + 1).cmp(&self.points.len()) {
-            Ordering::Less => self._pre_init_update(sample),
+    fn _as_data(&self) -> QuantileEstimatorData<'_> {
+        QuantileEstimatorData {
+            ranks: &self.ranks,
+            values: &self.values,
+            targets: &self.targets,
+            observations: &self.observations,
+        }
+    }
+
+    fn _as_mut_data(&mut self) -> MutableQuantileEstimatorData<'_> {
+        MutableQuantileEstimatorData {
+            ranks: &mut self.ranks,
+            values: &mut self.values,
+            targets: &self.targets,
+            observations: &mut self.observations,
+        }
+    }
+
+    fn _is_initialized(&self) -> bool {
+        self.observations >= N
+    }
+}
+
+impl<const N: usize> QuantileEstimator for FixedSizeQuantileEstimator<N> {
+    fn update(&mut self, sample: f64) {
+        match (self.observations + 1).cmp(&self.values.len()) {
+            Ordering::Less => {
+                self.values[self.observations] = sample;
+                self.observations += 1;
+            }
             Ordering::Equal => {
-                self._pre_init_update(sample);
-                self._initialize();
+                self.values[self.observations] = sample;
+                self.values.sort_by(|a, b| a.total_cmp(b));
+                for i in 0..self.ranks.len() {
+                    self.ranks[i] = i / (self.ranks.len() - 1);
+                }
+                self.observations += 1;
             }
             Ordering::Greater => {
-                self._standard_update(sample);
+                _add_sample_to_estimator(self._as_mut_data(), sample);
             }
         }
     }
 
-    pub fn is_initialized(&self) -> bool {
-        let obs: usize = self.observations.into();
-        obs >= self.points.len()
-    }
+    fn combine(&self, other: &Self) -> Self {
+        match (self._is_initialized(), other._is_initialized()) {
+            (true, true) => {
+                let mut new_quant_est = Self::new(&self.targets);
+
+                _merge_estimators(
+                    self._as_data(),
+                    other._as_data(),
+                    new_quant_est._as_mut_data(),
+                );
 
-    fn combine(&mut self, other: &P2HistogramData<F, I>) {
-        // TODO: Need to think about how to do this efficiently while maintaining accuracy...
-        panic!("Not implemented!")
+                new_quant_est
+            }
+            (true, false) | (false, false) => {
+                let mut new_quant_est = self.clone();
+                new_quant_est.update_all(&other.values[..other.observations]);
+                new_quant_est
+            }
+            (false, true) => {
+                let mut new_quant_est = other.clone();
+                new_quant_est.update_all(&self.values[..self.observations]);
+                new_quant_est
+            }
+        }
     }
 }
diff --git a/src/segments.rs b/src/segments.rs
index 30de252..81cdb3e 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -199,6 +199,7 @@ pub struct MergeIterator<
     val1: MergeEntry<I::Item>,
     val2: MergeEntry<I::Item>,
     prior_val: MergeEntry<I::Item>,
+    only_unique: bool,
     comparator: F,
 }
 
@@ -207,13 +208,14 @@ impl<I: Iterator, J: Iterator<Item = I::Item>, F: Fn(&I::Item, &I::Item) -> Orde
 where
     I::Item: Copy,
 {
-    pub fn new(iter1: I, iter2: J, comparator: F) -> Self {
+    pub fn new(iter1: I, iter2: J, comparator: F, only_unique: bool) -> Self {
         Self {
             iter1: iter1.fuse(),
             iter2: iter2.fuse(),
             val1: MergeEntry::Start,
             val2: MergeEntry::Start,
             prior_val: MergeEntry::Start,
+            only_unique,
             comparator,
         }
     }
@@ -269,6 +271,10 @@ where
                 next_val = self.val2;
                 self.val2 = self.iter2.next().into();
             }
+
+            if !self.only_unique {
+                break;
+            }
         }
 
         self.prior_val = next_val;
@@ -295,7 +301,7 @@ pub fn unique_merging_iterator<I: Iterator, J: Iterator<Item = I::Item>>(
 where
     I::Item: Copy + Ord,
 {
-    MergeIterator::new(list1, list2, |a, b| a.cmp(b))
+    MergeIterator::new(list1, list2, |a, b| a.cmp(b), true)
 }
 
 #[derive(Debug)]

From 958b3da138b65f04b9d10e2c9745d5317f47629f Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Sat, 23 May 2026 01:49:18 -0600
Subject: [PATCH 28/39] More work on quantile estimation.

---
 src/p2estimator.rs | 138 +++++++++++++++++++++++++++++++++++++--------
 src/statistics.rs  |  20 +++++--
 2 files changed, 132 insertions(+), 26 deletions(-)

diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 5be3edb..3a2455e 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;
 
-use crate::segments::MergeIterator;
+use crate::{segments::MergeIterator, statistics::Distribution};
 use itertools::izip;
 // Implementation of P2 estimator.
 // See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
@@ -36,29 +36,39 @@ fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x:
 }
 
 fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 {
-    if dy0 * dy1 > 0.0 {
+    let s0 = if dx0 != 0.0 { dy0 / dx0 } else { 0.0 };
+    let s1 = if dx1 != 0.0 { dy1 / dx1 } else { 0.0 };
+    if s0 * s1 > 0.0 {
         let alpha = (1.0 / 3.0) * (1.0 + dx1 / (dx0 + dx1));
-        dy0 * dy1 / (alpha * dy1 + (1.0 - alpha) * dy0)
+        s0 * s1 / (alpha * s1 + (1.0 - alpha) * s0)
     } else {
         0.0
     }
 }
 
-fn pchip_prediction(ranks: &[f64; 4], values: &[f64; 4], x: f64) -> f64 {
+fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 {
     let m0 = pchip_point_derivative(
-        ranks[1] - ranks[0],
-        values[1] - values[0],
-        ranks[2] - ranks[1],
-        values[2] - values[1],
+        x_points[1] - x_points[0],
+        y_points[1] - y_points[0],
+        x_points[2] - x_points[1],
+        y_points[2] - y_points[1],
     );
     let m1 = pchip_point_derivative(
-        ranks[2] - ranks[1],
-        values[2] - values[1],
-        ranks[3] - ranks[2],
-        values[3] - values[2],
+        x_points[2] - x_points[1],
+        y_points[2] - y_points[1],
+        x_points[3] - x_points[2],
+        y_points[3] - y_points[2],
     );
 
-    cubic_hermite_spline(ranks[1], values[1], ranks[2], values[2], m0, m1, x)
+    cubic_hermite_spline(
+        x_points[1],
+        y_points[1],
+        x_points[2],
+        x_points[2],
+        m0,
+        m1,
+        x,
+    )
 }
 
 fn debug_check_valid_estimator(
@@ -72,7 +82,7 @@ fn debug_check_valid_estimator(
     debug_assert!(values.is_sorted() && ranks.is_sorted() && targets.is_sorted());
     debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0));
     debug_assert!(observations >= ranks.len());
-    debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&observations));
+    debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&(observations - 1)));
 }
 
 fn debug_check_uninitialized_estimator(ranks: &[usize], values: &[f64], targets: &[f64]) {
@@ -91,7 +101,7 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) {
     } = data;
     debug_check_valid_estimator(ranks, values, targets, *observations);
     // Find where sample falls within distribution...
-    let p = values.partition_point(|&v| v <= sample);
+    let p = values.partition_point(|&v| v < sample);
     let bound_p = p.min(values.len() - 1);
 
     // Update extremes...
@@ -102,12 +112,13 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) {
     }
 
     // Increment ranks of markers above newly inserted sample...
-    for i in (bound_p + 1)..ranks.len() {
+    for i in bound_p.max(1)..ranks.len() {
         ranks[i] = ranks[1] + 1;
     }
 
     // Adjust inner markers to within 1 of their target quantile using p2 formula...
     for i in 1..(values.len() - 1) {
+        // Observations hasn't been incremented yet, don't need to subtract 1...
         let target_rank = (targets[i] * (*observations) as f64) as usize;
         let true_rank = ranks[i];
         if true_rank.abs_diff(target_rank) > 1 {
@@ -240,6 +251,7 @@ fn _merge_estimators(
 
     // New number of observations is the sum of both...
     *(new_estimator.observations) = *(q1.observations) + *(q2.observations);
+    let rank_range = *(new_estimator.observations) - 1;
 
     // Solve all inner quantiles using traditional interpolation...
     let mut index_between = 0;
@@ -247,10 +259,8 @@ fn _merge_estimators(
     for ti in 1..new_estimator.targets.len() - 1 {
         // Calculate new rank...
         let target = new_estimator.targets[ti];
-        let approx_obs_rank = ((target * *(new_estimator.observations) as f64) as usize).clamp(
-            1 + ti,
-            *(new_estimator.observations) - (new_estimator.targets.len() - (ti + 1)),
-        );
+        let approx_obs_rank = ((target * rank_range as f64) as usize)
+            .clamp(ti, rank_range - (new_estimator.targets.len() - (ti + 1)));
 
         // Find where it lands in cdf...
         while index_between < dual_est_quants.len()
@@ -278,7 +288,7 @@ fn _merge_estimators(
     }
 }
 
-pub trait QuantileEstimator {
+pub trait QuantileEstimator: Distribution {
     fn update(&mut self, sample: f64);
     fn update_all(&mut self, samples: &[f64]) {
         for &s in samples.iter() {
@@ -329,6 +339,90 @@ impl<const N: usize> FixedSizeQuantileEstimator<N> {
     }
 }
 
+impl<const N: usize> Distribution for FixedSizeQuantileEstimator<N> {
+    fn cdf(&self, x: f64) -> f64 {
+        let upper_p = self.values.partition_point(|&v| v < x);
+        if upper_p > self.values.len() {
+            1.0
+        } else if upper_p == 0 {
+            0.0
+        } else {
+            let indexes = [
+                upper_p.saturating_sub(2),
+                upper_p.saturating_sub(1),
+                upper_p,
+                upper_p.saturating_add(1).min(self.values.len()),
+            ];
+
+            pchip_prediction(
+                &indexes.map(|i| self.values[i]),
+                &indexes.map(|i| self.ranks[i] as f64),
+                x,
+            ) / (self.observations - 1) as f64
+        }
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        self.cdf(x).ln()
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        1.0 - self.cdf(x)
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        (-self.cdf(x)).ln_1p()
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let est_rank = p.clamp(0.0, 1.0) * (self.observations - 1) as f64;
+        let upper_p = self.ranks.partition_point(|&r| (r as f64) < est_rank);
+        if upper_p > self.values.len() {
+            self.values[self.values.len() - 1]
+        } else if upper_p == 0 {
+            self.values[0]
+        } else {
+            let indexes = [
+                upper_p.saturating_sub(2),
+                upper_p.saturating_sub(1),
+                upper_p,
+                upper_p.saturating_add(1).min(self.values.len()),
+            ];
+
+            pchip_prediction(
+                &indexes.map(|i| self.ranks[i] as f64),
+                &indexes.map(|i| self.values[i]),
+                est_rank,
+            )
+        }
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)...
+        // May be worth splitting out into different class to allow pre-processing this stuff...
+        panic!("TODO!");
+        let upper_p = self.values.partition_point(|&v| v < x);
+        if upper_p > self.values.len() {
+            0.0
+        } else if upper_p == 0 {
+            0.0
+        } else {
+            0.0
+        }
+    }
+
+    fn logpdf(&self, x: f64) -> f64 {
+        self.pdf(x).ln()
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (
+            *self.values.first().unwrap_or(&f64::NEG_INFINITY),
+            *self.values.last().unwrap_or(&f64::INFINITY),
+        )
+    }
+}
+
 impl<const N: usize> QuantileEstimator for FixedSizeQuantileEstimator<N> {
     fn update(&mut self, sample: f64) {
         match (self.observations + 1).cmp(&self.values.len()) {
@@ -340,7 +434,7 @@ impl<const N: usize> QuantileEstimator for FixedSizeQuantileEstimator<N> {
                 self.values[self.observations] = sample;
                 self.values.sort_by(|a, b| a.total_cmp(b));
                 for i in 0..self.ranks.len() {
-                    self.ranks[i] = i / (self.ranks.len() - 1);
+                    self.ranks[i] = i;
                 }
                 self.observations += 1;
             }
diff --git a/src/statistics.rs b/src/statistics.rs
index 7670c93..6fc6e40 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -11,7 +11,7 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 {
 
 // TODO: Support for generic floating types...
 #[allow(dead_code)]
-pub trait Distribution: Clone + Debug + Default {
+pub trait Distribution: Clone {
     fn pdf(&self, x: f64) -> f64;
     fn cdf(&self, x: f64) -> f64;
     fn ppf(&self, p: f64) -> f64;
@@ -20,7 +20,9 @@ pub trait Distribution: Clone + Debug + Default {
     fn logpdf(&self, x: f64) -> f64;
     fn logcdf(&self, x: f64) -> f64;
     fn logccdf(&self, x: f64) -> f64;
+}
 
+pub trait ParameterizedDistribution: Distribution + Debug + Default {
     fn unit() -> Self {
         Self::default()
     }
@@ -31,6 +33,8 @@ pub struct Exponential {
     lambda: f64,
 }
 
+impl ParameterizedDistribution for Exponential {}
+
 impl Exponential {
     pub fn new(lambda: f64) -> Self {
         Self { lambda }
@@ -87,6 +91,8 @@ pub struct ExponentialEstimator {
     degrees_of_freedom: usize,
 }
 
+impl ParameterizedDistribution for ExponentialEstimator {}
+
 impl ExponentialEstimator {
     pub fn new(sample_mean: f64, sample_size: usize) -> Self {
         Self {
@@ -157,6 +163,8 @@ pub struct HalfT {
     degrees_of_freedom: usize,
 }
 
+impl ParameterizedDistribution for HalfT {}
+
 impl HalfT {
     #[allow(dead_code)]
     pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self {
@@ -241,6 +249,8 @@ pub struct Frechet {
     minimum: f64,
 }
 
+impl ParameterizedDistribution for Frechet {}
+
 impl Frechet {
     pub fn new(alpha: f64, scale: f64, minimum: f64) -> Self {
         Self {
@@ -335,6 +345,8 @@ pub struct Laplace {
     scale: f64,
 }
 
+impl ParameterizedDistribution for Laplace {}
+
 impl Laplace {
     pub fn new(mean: f64, scale: f64) -> Self {
         Self { mean, scale }
@@ -414,7 +426,7 @@ mod test {
         fn tlogccdf(&self, x: f64) -> f64;
     }
 
-    impl<T: Distribution> TestDistribution for T {
+    impl<T: ParameterizedDistribution> TestDistribution for T {
         fn tpdf(&self, x: f64) -> f64 {
             self.pdf(x)
         }
@@ -441,11 +453,11 @@ mod test {
         }
     }
 
-    fn as_box<T: Distribution + 'static>(d: T) -> Box<dyn TestDistribution> {
+    fn as_box<T: ParameterizedDistribution + 'static>(d: T) -> Box<dyn TestDistribution> {
         Box::new(d)
     }
 
-    use super::{Distribution, Exponential};
+    use super::{Exponential, ParameterizedDistribution};
 
     fn get_dists() -> [Box<dyn TestDistribution>; 5] {
         [

From 44fde429d13a23f36c41ff955b3dbb315581b955 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 26 May 2026 18:10:14 -0600
Subject: [PATCH 29/39] Quantile est working, need to fix merging.

---
 Cargo.toml         |   3 +
 src/p2estimator.rs | 509 +++++++++++++++++++++++++++++++--------------
 src/statistics.rs  |  14 +-
 3 files changed, 367 insertions(+), 159 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index c5b43b6..1a636d7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -29,3 +29,6 @@ opt-level = 3
 lto = "thin"
 codegen-units = 1
 debug = false
+
+[dev-dependencies]
+rand = "0.10.1"
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 3a2455e..7e3b702 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -1,7 +1,8 @@
 use std::cmp::Ordering;
 
 use crate::{segments::MergeIterator, statistics::Distribution};
-use itertools::izip;
+use itertools::{izip, Itertools};
+use rayon::iter::Interleave;
 // Implementation of P2 estimator.
 // See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
 // at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
@@ -28,8 +29,8 @@ fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x:
     let ms1 = (x1 - x0) * m1;
 
     let h0 = (2.0 * t - 3.0) * t * t + 1.0;
-    let h1 = ((t - 1.0) * t + 1.0) * t;
-    let h2 = (2.0 * t + 3.0) * t * t;
+    let h1 = ((t - 2.0) * t + 1.0) * t;
+    let h2 = (-2.0 * t + 3.0) * t * t;
     let h3 = (t - 1.0) * t * t;
 
     h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1
@@ -47,6 +48,8 @@ fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 {
 }
 
 fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 {
+    println!("{x_points:?}, {y_points:?}, {x}");
+    debug_assert!(x_points.is_sorted() && x <= x_points[2] && x >= x_points[1]);
     let m0 = pchip_point_derivative(
         x_points[1] - x_points[0],
         y_points[1] - y_points[0],
@@ -64,7 +67,7 @@ fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 {
         x_points[1],
         y_points[1],
         x_points[2],
-        x_points[2],
+        y_points[2],
         m0,
         m1,
         x,
@@ -113,24 +116,26 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) {
 
     // Increment ranks of markers above newly inserted sample...
     for i in bound_p.max(1)..ranks.len() {
-        ranks[i] = ranks[1] + 1;
+        ranks[i] = ranks[i] + 1;
     }
 
     // Adjust inner markers to within 1 of their target quantile using p2 formula...
     for i in 1..(values.len() - 1) {
         // Observations hasn't been incremented yet, don't need to subtract 1...
         let target_rank = (targets[i] * (*observations) as f64) as usize;
-        let true_rank = ranks[i];
-        if true_rank.abs_diff(target_rank) > 1 {
+        let current_rank = ranks[i];
+        if current_rank.abs_diff(target_rank) > 1 {
+            //println!("{:?}, {}, {}", ranks, i, ranks[i]);
             let new_rank: usize = target_rank.clamp(
                 ranks[i - 1].saturating_add(1),
                 ranks[i + 1].saturating_sub(1),
             );
-            if new_rank == true_rank {
+            if new_rank == current_rank {
                 continue;
             }
 
-            let idx_shift = if new_rank > target_rank { 1 } else { 0 };
+            let idx_shift = if new_rank > current_rank { 1 } else { 0 };
+
             let indexes = [
                 (i + idx_shift).saturating_sub(2),
                 (i + idx_shift).saturating_sub(1),
@@ -138,12 +143,12 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) {
                 (i + idx_shift).saturating_add(1).min(ranks.len() - 1),
             ];
 
-            ranks[i] = new_rank;
             values[i] = pchip_prediction(
                 &indexes.map(|i| ranks[i] as f64),
                 &indexes.map(|i| values[i]),
                 new_rank as f64,
             );
+            ranks[i] = new_rank;
         }
     }
 
@@ -169,33 +174,6 @@ fn _merge_estimators(
         (a.ranks[i], a.values[i])
     }
 
-    fn set_at(a: &mut MutableQuantileEstimatorData, i: usize, data: (usize, f64)) {
-        a.ranks[i] = data.0;
-        a.values[i] = data.1;
-    }
-
-    // Initialize the min/max quantiles...
-    if q1.values[0] <= q2.values[0] {
-        set_at(&mut new_estimator, 0, get_at(&q1, 0));
-    } else {
-        set_at(&mut new_estimator, 0, get_at(&q2, 0));
-    }
-
-    let new_est_len = new_estimator.ranks.len();
-    if q1.values[q1.values.len() - 1] >= q2.values[q2.values.len() - 1] {
-        set_at(
-            &mut new_estimator,
-            new_est_len - 1,
-            get_at(&q1, q1.ranks.len() - 1),
-        );
-    } else {
-        set_at(
-            &mut new_estimator,
-            new_est_len - 1,
-            get_at(&q2, q2.ranks.len() - 1),
-        );
-    }
-
     // May eventually replace with algorithm that doesn't use extra memory...
     // Calculate a "merged" quantiles by linearly iterpolating ranks based on the values we see...
     let mut dual_est_quants: Vec<(f64, f64)> = Vec::with_capacity(q1.ranks.len() + q2.ranks.len());
@@ -214,12 +192,18 @@ fn _merge_estimators(
             break;
         } else if q1_past_end {
             let next = get_at(&q2, q2_idx);
-            dual_est_quants.push(((next.0 + q1_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1));
+            dual_est_quants.push((
+                (next.0 + q1_prior.map(|v| v.0 + 1).unwrap_or(0)) as f64,
+                next.1,
+            ));
             q2_prior = Some(next);
             q2_idx += 1;
         } else if q2_past_end {
             let next = get_at(&q1, q1_idx);
-            dual_est_quants.push(((next.0 + q2_prior.map(|v| v.0).unwrap_or(0)) as f64, next.1));
+            dual_est_quants.push((
+                (next.0 + q2_prior.map(|v| v.0 + 1).unwrap_or(0)) as f64,
+                next.1,
+            ));
             q1_prior = Some(next);
             q1_idx += 1;
         } else if q1.values[q1_idx] <= q2.values[q2_idx] {
@@ -252,10 +236,13 @@ fn _merge_estimators(
     // New number of observations is the sum of both...
     *(new_estimator.observations) = *(q1.observations) + *(q2.observations);
     let rank_range = *(new_estimator.observations) - 1;
+    println!("{rank_range}");
 
     // Solve all inner quantiles using traditional interpolation...
     let mut index_between = 0;
 
+    println!("{dual_est_quants:?}");
+
     for ti in 1..new_estimator.targets.len() - 1 {
         // Calculate new rank...
         let target = new_estimator.targets[ti];
@@ -279,11 +266,78 @@ fn _merge_estimators(
                 .min(dual_est_quants.len() - 1),
         ];
 
-        new_estimator.ranks[ti] = approx_obs_rank;
         new_estimator.values[ti] = pchip_prediction(
             &indexes.map(|i| dual_est_quants[i].0),
             &indexes.map(|i| dual_est_quants[i].1),
             approx_obs_rank as f64,
+        );
+        new_estimator.ranks[ti] = approx_obs_rank;
+    }
+}
+
+/// Conversion of a primitive number into `T`, as an `as` cast would do.
+trait PrimativeCast<T> {
+    fn as_(&self) -> T;
+}
+
+impl PrimativeCast<f64> for f64 {
+    #[inline]
+    fn as_(&self) -> f64 {
+        *self
+    }
+}
+
+impl PrimativeCast<f64> for usize {
+    #[inline]
+    fn as_(&self) -> f64 {
+        *self as f64
+    }
+}
+
+/// Interpolate the knots `(xs[i], ys[i])` at `x` using `pchip_prediction`.
+///
+/// `xs` must be sorted and the same length as `ys`. Returns `lower_val` when `x` is
+/// at or below the first knot, `upper_val` when `x` is above the last knot, and
+/// `not_enough_data_value` when there are no knots at all.
+fn _interpolated_value_prediction<
+    I: PartialOrd + PrimativeCast<f64> + Copy,
+    O: PartialOrd + PrimativeCast<f64> + Copy,
+>(
+    xs: &[I],
+    ys: &[O],
+    x: f64,
+    lower_val: f64,
+    upper_val: f64,
+    not_enough_data_value: f64,
+) -> f64 {
+    debug_assert!(xs.is_sorted());
+    debug_assert!(xs.len() == ys.len());
+
+    if xs.is_empty() {
+        return not_enough_data_value;
+    }
+
+    // First knot that is not below `x`; equals `xs.len()` when `x` is past every knot.
+    let idx = xs.partition_point(|&v| v.as_() < x);
+    if idx >= xs.len() {
+        // `partition_point` never exceeds `len`, so `>=` is required to catch this
+        // case before `xs[idx]` is indexed below.
+        upper_val
+    } else if idx == 0 {
+        lower_val
+    } else {
+        // Two knots on either side of `x`, clamped at the ends of the slice.
+        let indexes = [
+            idx.saturating_sub(2),
+            idx.saturating_sub(1),
+            idx,
+            idx.saturating_add(1).min(xs.len() - 1),
+        ];
+
+        pchip_prediction(
+            &indexes.map(|i| xs[i].as_()),
+            &indexes.map(|i| ys[i].as_()),
+            x,
         )
     }
 }
@@ -298,7 +342,169 @@ pub trait QuantileEstimator: Distribution {
     fn combine(&self, other: &Self) -> Self;
 }
 
-#[derive(Clone)]
+trait SimpleQuantileEstimatorRepresentation: Clone {
+    fn new_like(other: &Self) -> Self;
+    fn _data(&self) -> QuantileEstimatorData;
+    fn _mut_data(&mut self) -> MutableQuantileEstimatorData;
+    fn _is_initialized(&self) -> bool {
+        let data = self._data();
+        *data.observations >= data.ranks.len()
+    }
+}
+
+impl<Q: SimpleQuantileEstimatorRepresentation> QuantileEstimator for Q {
+    fn update(&mut self, sample: f64) {
+        let data = self._mut_data();
+
+        match (*data.observations + 1).cmp(&data.values.len()) {
+            Ordering::Less => {
+                data.values[*data.observations] = sample;
+                *data.observations += 1;
+            }
+            Ordering::Equal => {
+                data.values[*data.observations] = sample;
+                data.values.sort_by(|a, b| a.total_cmp(b));
+                for i in 0..data.ranks.len() {
+                    data.ranks[i] = i;
+                }
+                *data.observations += 1;
+            }
+            Ordering::Greater => {
+                _add_sample_to_estimator(data, sample);
+            }
+        }
+    }
+
+    fn combine(&self, other: &Self) -> Self {
+        match (self._is_initialized(), other._is_initialized()) {
+            (true, true) => {
+                let mut new_quant_est = Self::new_like(&self);
+
+                _merge_estimators(self._data(), other._data(), new_quant_est._mut_data());
+
+                new_quant_est
+            }
+            (true, false) | (false, false) => {
+                let other_data = other._data();
+                let mut new_quant_est = self.clone();
+                new_quant_est.update_all(&other_data.values[..*other_data.observations]);
+                new_quant_est
+            }
+            (false, true) => {
+                let self_data = self._data();
+                let mut new_quant_est = other.clone();
+                new_quant_est.update_all(&self_data.values[..*self_data.observations]);
+                new_quant_est
+            }
+        }
+    }
+}
+
+/// Empirical distribution view of any estimator exposing marker ranks and values.
+impl<Q: SimpleQuantileEstimatorRepresentation> Distribution for Q {
+    /// Estimated cumulative probability at `x`.
+    ///
+    /// Interpolates marker rank against marker value and divides by the largest
+    /// rank. Until every marker is filled, the raw samples seen so far are sorted
+    /// and used as the knots instead.
+    fn cdf(&self, x: f64) -> f64 {
+        let data = self._data();
+        // `saturating_sub` keeps an estimator with no observations from underflowing.
+        let max_rank = (*data.observations).saturating_sub(1);
+        let rank = if self._is_initialized() {
+            _interpolated_value_prediction(data.values, data.ranks, x, 0.0, max_rank as f64, 0.0)
+        } else {
+            let xs_sorted = data.values[..*data.observations]
+                .iter()
+                .copied()
+                .sorted_by(|a, b| a.total_cmp(b))
+                .collect_vec();
+            let ys = (0..*data.observations).collect_vec();
+            _interpolated_value_prediction(&xs_sorted, &ys, x, 0.0, max_rank as f64, 0.0)
+        };
+        // Both branches yield a rank; normalise it so the result is a probability.
+        rank / max_rank.max(1) as f64
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        self.cdf(x).ln()
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        1.0 - self.cdf(x)
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        // ln(1 - cdf(x)), computed with `ln_1p`.
+        (-self.cdf(x)).ln_1p()
+    }
+
+    /// Estimated value at quantile `p`; `p` is clamped to `[0, 1]`.
+    ///
+    /// Interpolates marker value against marker rank, or sorted raw sample against
+    /// its index until every marker is filled.
+    fn ppf(&self, p: f64) -> f64 {
+        let data = self._data();
+        // `saturating_sub` keeps an estimator with no observations from underflowing.
+        let est_rank = p.clamp(0.0, 1.0) * (*data.observations).saturating_sub(1) as f64;
+        let (min_val, max_val) = self.support();
+        // Returned only when there are no samples to interpolate between.
+        let fallback = 0.0_f64.clamp(min_val, max_val);
+
+        if self._is_initialized() {
+            _interpolated_value_prediction(
+                data.ranks,
+                data.values,
+                est_rank,
+                min_val,
+                max_val,
+                fallback,
+            )
+        } else {
+            let ys_sorted = data.values[..*data.observations]
+                .iter()
+                .copied()
+                .sorted_by(|a, b| a.total_cmp(b))
+                .collect_vec();
+            let xs = (0..*data.observations).collect_vec();
+            _interpolated_value_prediction(&xs, &ys_sorted, est_rank, min_val, max_val, fallback)
+        }
+    }
+
+    /// Not implemented yet: always panics.
+    fn pdf(&self, _x: f64) -> f64 {
+        // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)...
+        // May be worth splitting out into different class to allow pre-processing this stuff...
+        // TODO: Would prefer quintic splines for this... Allows us to avoid normalization...
+        panic!("Currently not supported!");
+    }
+
+    fn logpdf(&self, x: f64) -> f64 {
+        self.pdf(x).ln()
+    }
+
+    /// First and last marker values once initialised; before that, the min and max of
+    /// the samples stored so far, or the whole real line when there are none.
+    fn support(&self) -> (f64, f64) {
+        let data = self._data();
+
+        if self._is_initialized() {
+            (
+                *data.values.first().unwrap_or(&f64::NEG_INFINITY),
+                *data.values.last().unwrap_or(&f64::INFINITY),
+            )
+        } else {
+            data.values[..*data.observations]
+                .iter()
+                .copied()
+                .minmax()
+                .into_option()
+                .unwrap_or((f64::NEG_INFINITY, f64::INFINITY))
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
 struct FixedSizeQuantileEstimator<const N: usize> {
     values: [f64; N],
     ranks: [usize; N],
@@ -308,6 +516,9 @@ struct FixedSizeQuantileEstimator<const N: usize> {
 
 impl<const N: usize> FixedSizeQuantileEstimator<N> {
     pub fn new(targets: &[f64; N]) -> Self {
+        assert!(
+            targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0)
+        );
         Self {
             values: [0.0; N],
             ranks: [0; N],
@@ -315,8 +526,14 @@ impl<const N: usize> FixedSizeQuantileEstimator<N> {
             observations: 0,
         }
     }
+}
+
+impl<const N: usize> SimpleQuantileEstimatorRepresentation for FixedSizeQuantileEstimator<N> {
+    fn new_like(other: &Self) -> Self {
+        Self::new(&other.targets)
+    }
 
-    fn _as_data(&self) -> QuantileEstimatorData<'_> {
+    fn _data(&self) -> QuantileEstimatorData<'_> {
         QuantileEstimatorData {
             ranks: &self.ranks,
             values: &self.values,
@@ -325,7 +542,7 @@ impl<const N: usize> FixedSizeQuantileEstimator<N> {
         }
     }
 
-    fn _as_mut_data(&mut self) -> MutableQuantileEstimatorData<'_> {
+    fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> {
         MutableQuantileEstimatorData {
             ranks: &mut self.ranks,
             values: &mut self.values,
@@ -333,140 +550,128 @@ impl<const N: usize> FixedSizeQuantileEstimator<N> {
             observations: &mut self.observations,
         }
     }
-
-    fn _is_initialized(&self) -> bool {
-        self.observations >= N
-    }
 }
 
-impl<const N: usize> Distribution for FixedSizeQuantileEstimator<N> {
-    fn cdf(&self, x: f64) -> f64 {
-        let upper_p = self.values.partition_point(|&v| v < x);
-        if upper_p > self.values.len() {
-            1.0
-        } else if upper_p == 0 {
-            0.0
-        } else {
-            let indexes = [
-                upper_p.saturating_sub(2),
-                upper_p.saturating_sub(1),
-                upper_p,
-                upper_p.saturating_add(1).min(self.values.len()),
-            ];
+#[derive(Clone, Debug)]
+struct VectorQuantileEstimator {
+    values: Vec<f64>,
+    ranks: Vec<usize>,
+    targets: Vec<f64>,
+    observations: usize,
+}
 
-            pchip_prediction(
-                &indexes.map(|i| self.values[i]),
-                &indexes.map(|i| self.ranks[i] as f64),
-                x,
-            ) / (self.observations - 1) as f64
+impl VectorQuantileEstimator {
+    fn new(targets: &[f64]) -> Self {
+        assert!(
+            targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0)
+        );
+        Self {
+            values: vec![0.0; targets.len()],
+            ranks: (0..targets.len()).collect_vec(),
+            targets: Vec::from(targets),
+            observations: 0,
         }
     }
+}
 
-    fn logcdf(&self, x: f64) -> f64 {
-        self.cdf(x).ln()
-    }
-
-    fn ccdf(&self, x: f64) -> f64 {
-        1.0 - self.cdf(x)
+impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator {
+    fn new_like(other: &Self) -> Self {
+        Self::new(&other.targets)
     }
 
-    fn logccdf(&self, x: f64) -> f64 {
-        (-self.cdf(x)).ln_1p()
+    fn _data(&self) -> QuantileEstimatorData<'_> {
+        QuantileEstimatorData {
+            ranks: &self.ranks,
+            values: &self.values,
+            targets: &self.targets,
+            observations: &self.observations,
+        }
     }
 
-    fn ppf(&self, p: f64) -> f64 {
-        let est_rank = p.clamp(0.0, 1.0) * (self.observations - 1) as f64;
-        let upper_p = self.ranks.partition_point(|&r| (r as f64) < est_rank);
-        if upper_p > self.values.len() {
-            self.values[self.values.len() - 1]
-        } else if upper_p == 0 {
-            self.values[0]
-        } else {
-            let indexes = [
-                upper_p.saturating_sub(2),
-                upper_p.saturating_sub(1),
-                upper_p,
-                upper_p.saturating_add(1).min(self.values.len()),
-            ];
-
-            pchip_prediction(
-                &indexes.map(|i| self.ranks[i] as f64),
-                &indexes.map(|i| self.values[i]),
-                est_rank,
-            )
+    fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> {
+        MutableQuantileEstimatorData {
+            ranks: &mut self.ranks,
+            values: &mut self.values,
+            targets: &self.targets,
+            observations: &mut self.observations,
         }
     }
+}
 
-    fn pdf(&self, x: f64) -> f64 {
-        // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)...
-        // May be worth splitting out into different class to allow pre-processing this stuff...
-        panic!("TODO!");
-        let upper_p = self.values.partition_point(|&v| v < x);
-        if upper_p > self.values.len() {
-            0.0
-        } else if upper_p == 0 {
-            0.0
+#[cfg(test)]
+mod test {
+    use crate::{
+        p2estimator::{FixedSizeQuantileEstimator, QuantileEstimator, VectorQuantileEstimator},
+        statistics::{linspace, Distribution, Exponential},
+    };
+    use itertools::Itertools;
+    use rand::{rngs::Xoshiro256PlusPlus, RngExt, SeedableRng};
+
+    fn is_close(a: f64, b: f64) -> bool {
+        let rel_tol = 1e-9;
+        let abs_tol = 0.0;
+        if a == b {
+            true
         } else {
-            0.0
+            (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol)
         }
     }
 
-    fn logpdf(&self, x: f64) -> f64 {
-        self.pdf(x).ln()
-    }
+    #[test]
+    fn quantiles_on_exponential_dist() {
+        let expon = Exponential::new(1.0);
+        let mut estimator =
+            FixedSizeQuantileEstimator::new(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]);
 
-    fn support(&self) -> (f64, f64) {
-        (
-            *self.values.first().unwrap_or(&f64::NEG_INFINITY),
-            *self.values.last().unwrap_or(&f64::INFINITY),
-        )
-    }
-}
+        let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321);
 
-impl<const N: usize> QuantileEstimator for FixedSizeQuantileEstimator<N> {
-    fn update(&mut self, sample: f64) {
-        match (self.observations + 1).cmp(&self.values.len()) {
-            Ordering::Less => {
-                self.values[self.observations] = sample;
-                self.observations += 1;
-            }
-            Ordering::Equal => {
-                self.values[self.observations] = sample;
-                self.values.sort_by(|a, b| a.total_cmp(b));
-                for i in 0..self.ranks.len() {
-                    self.ranks[i] = i;
-                }
-                self.observations += 1;
-            }
-            Ordering::Greater => {
-                _add_sample_to_estimator(self._as_mut_data(), sample);
-            }
+        for _ in 0..10_000 {
+            let sample = expon.ppf(rng.random());
+
+            estimator.update(sample);
+        }
+
+        for val in linspace(0.0, 0.90, 90) {
+            assert!((estimator.ppf(val) - expon.ppf(val)).abs() <= 0.04);
+            let dist_val = expon.ppf(val);
+            assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.2);
+
+            // Basic probability distribution checks...
+            assert!(is_close(
+                estimator.ccdf(dist_val),
+                1.0 - estimator.cdf(dist_val)
+            ));
+
+            assert!(is_close(
+                estimator.logcdf(dist_val),
+                estimator.cdf(dist_val).ln()
+            ));
+            assert!(is_close(
+                estimator.logccdf(dist_val),
+                estimator.ccdf(dist_val).ln()
+            ));
+
+            assert!((estimator.cdf(estimator.ppf(val)) - val).abs() <= 0.06);
         }
     }
 
-    fn combine(&self, other: &Self) -> Self {
-        match (self._is_initialized(), other._is_initialized()) {
-            (true, true) => {
-                let mut new_quant_est = Self::new(&self.targets);
+    #[test]
+    fn test_quantile_merging() {
+        let expon = Exponential::new(1.0);
+        let mut merged_estimator =
+            VectorQuantileEstimator::new(&linspace(0.0, 1.0, 10).collect_vec());
 
-                _merge_estimators(
-                    self._as_data(),
-                    other._as_data(),
-                    new_quant_est._as_mut_data(),
-                );
+        let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321);
 
-                new_quant_est
-            }
-            (true, false) | (false, false) => {
-                let mut new_quant_est = self.clone();
-                new_quant_est.update_all(&other.values[..other.observations]);
-                new_quant_est
-            }
-            (false, true) => {
-                let mut new_quant_est = other.clone();
-                new_quant_est.update_all(&self.values[..self.observations]);
-                new_quant_est
+        for _ in 0..100 {
+            let targets: Vec<f64> = linspace(0.0, 1.0, rng.random_range(5..15)).collect();
+            let mut estimator = VectorQuantileEstimator::new(&targets);
+
+            for _ in 0..100 {
+                estimator.update(expon.ppf(rng.random()));
             }
+
+            merged_estimator = merged_estimator.combine(&estimator);
         }
     }
 }
diff --git a/src/statistics.rs b/src/statistics.rs
index 6fc6e40..909bb21 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -410,9 +410,20 @@ impl Distribution for Laplace {
     }
 }
 
+/// Yield `steps` evenly spaced values from `start` to `stop`, both ends included.
+///
+/// `steps == 0` yields nothing and `steps == 1` yields just `start`.
+pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64> {
+    // With a single step the naive `n / (steps - 1)` is `0 / 0`, i.e. NaN.
+    let intervals = steps.saturating_sub(1).max(1) as f64;
+    (0..steps)
+        .map(move |n| n as f64 / intervals)
+        .map(move |n| start * (1.0 - n) + stop * n)
+}
+
 #[cfg(test)]
 mod test {
-    use crate::statistics::{ExponentialEstimator, Frechet, HalfT, Laplace};
+    use crate::statistics::{linspace, ExponentialEstimator, Frechet, HalfT, Laplace};
     use std::fmt::Debug;
 
     pub trait TestDistribution: Debug {
@@ -475,12 +481,6 @@ mod test {
         (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol)
     }
 
-    fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64> {
-        (0..steps)
-            .map(move |n| n as f64 / (steps as f64 - 1.0))
-            .map(move |n| start * (1.0 - n) + stop * n)
-    }
-
     #[test]
     fn basic_distribution_propery_checks() {
         for dist in get_dists() {

From 117fcfc31ed06eb296c85201b42e28aaba716679 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 27 May 2026 01:29:24 -0600
Subject: [PATCH 30/39] Final touches on quantile estimator...

---
 src/p2estimator.rs | 78 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 7e3b702..aa19e29 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -1,8 +1,7 @@
 use std::cmp::Ordering;
 
-use crate::{segments::MergeIterator, statistics::Distribution};
-use itertools::{izip, Itertools};
-use rayon::iter::Interleave;
+use crate::statistics::Distribution;
+use itertools::Itertools;
 // Implementation of P2 estimator.
 // See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations"
 // at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
@@ -48,7 +47,6 @@ fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 {
 }
 
 fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 {
-    println!("{x_points:?}, {y_points:?}, {x}");
     debug_assert!(x_points.is_sorted() && x <= x_points[2] && x >= x_points[1]);
     let m0 = pchip_point_derivative(
         x_points[1] - x_points[0],
@@ -158,7 +156,7 @@ fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) {
 fn _merge_estimators(
     q1: QuantileEstimatorData,
     q2: QuantileEstimatorData,
-    mut new_estimator: MutableQuantileEstimatorData,
+    new_estimator: MutableQuantileEstimatorData,
 ) {
     debug_check_valid_estimator(q1.ranks, q1.values, q1.targets, *q1.observations);
     debug_check_valid_estimator(q2.ranks, q2.values, q2.targets, *q2.observations);
@@ -213,7 +211,9 @@ fn _merge_estimators(
                 .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1))
                 .unwrap_or(0.0);
             let other_rank_est = q2_prior
-                .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w)
+                .map(|other_prior| {
+                    (other_prior.0 + 1) as f64 * (1.0 - w) + (other_next.0 + 1) as f64 * w
+                })
                 .unwrap_or(0.0);
             dual_est_quants.push((next.0 as f64 + other_rank_est, next.1));
             q1_prior = Some(next);
@@ -225,7 +225,9 @@ fn _merge_estimators(
                 .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1))
                 .unwrap_or(0.0);
             let other_rank_est = q1_prior
-                .map(|other_prior| other_prior.0 as f64 * (1.0 - w) + other_next.0 as f64 * w)
+                .map(|other_prior| {
+                    (other_prior.0 + 1) as f64 * (1.0 - w) + (other_next.0 + 1) as f64 * w
+                })
                 .unwrap_or(0.0);
             dual_est_quants.push((next.0 as f64 + other_rank_est, next.1));
             q2_prior = Some(next);
@@ -236,12 +238,20 @@ fn _merge_estimators(
     // New number of observations is the sum of both...
     *(new_estimator.observations) = *(q1.observations) + *(q2.observations);
     let rank_range = *(new_estimator.observations) - 1;
-    println!("{rank_range}");
 
     // Solve all inner quantiles using traditional interpolation...
     let mut index_between = 0;
 
-    println!("{dual_est_quants:?}");
+    new_estimator.ranks.first_mut().map(|r| *r = 0);
+    new_estimator.ranks.last_mut().map(|r| *r = rank_range);
+    new_estimator
+        .values
+        .first_mut()
+        .map(|v| *v = dual_est_quants[0].1);
+    new_estimator
+        .values
+        .last_mut()
+        .map(|v| *v = dual_est_quants[dual_est_quants.len() - 1].1);
 
     for ti in 1..new_estimator.targets.len() - 1 {
         // Calculate new rank...
@@ -251,7 +261,7 @@ fn _merge_estimators(
 
         // Find where it lands in cdf...
         while index_between < dual_est_quants.len()
-            && (approx_obs_rank as f64) < dual_est_quants[index_between].0
+            && (approx_obs_rank as f64) > dual_est_quants[index_between].0
         {
             index_between += 1;
         }
@@ -260,7 +270,7 @@ fn _merge_estimators(
         let indexes = [
             index_between.saturating_sub(2),
             index_between.saturating_sub(1),
-            index_between,
+            index_between.min(dual_est_quants.len() - 1),
             index_between
                 .saturating_add(1)
                 .min(dual_est_quants.len() - 1),
@@ -333,6 +343,7 @@ fn _interpolated_value_prediction<
 }
 
 pub trait QuantileEstimator: Distribution {
+    fn from_prior(prior: &Self, count: usize) -> Self;
     fn update(&mut self, sample: f64);
     fn update_all(&mut self, samples: &[f64]) {
         for &s in samples.iter() {
@@ -340,12 +351,13 @@ pub trait QuantileEstimator: Distribution {
         }
     }
     fn combine(&self, other: &Self) -> Self;
+    fn samples(&self) -> usize;
 }
 
 trait SimpleQuantileEstimatorRepresentation: Clone {
     fn new_like(other: &Self) -> Self;
-    fn _data(&self) -> QuantileEstimatorData;
-    fn _mut_data(&mut self) -> MutableQuantileEstimatorData;
+    fn _data(&self) -> QuantileEstimatorData<'_>;
+    fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>;
     fn _is_initialized(&self) -> bool {
         let data = self._data();
         *data.observations >= data.ranks.len()
@@ -353,6 +365,42 @@ trait SimpleQuantileEstimatorRepresentation: Clone {
 }
 
 impl<Q: SimpleQuantileEstimatorRepresentation> QuantileEstimator for Q {
+    /// Number of observations folded into this estimator so far.
+    fn samples(&self) -> usize {
+        *self._data().observations
+    }
+
+    /// Seed a new estimator from `prior`: it reports `count_per_entry` observations
+    /// per marker, places every marker at its target rank and takes the marker
+    /// value from `prior.ppf`.
+    ///
+    /// A `count_per_entry` of zero carries no weight, so an empty estimator with
+    /// the same targets is returned.
+    fn from_prior(prior: &Self, count_per_entry: usize) -> Self {
+        let mut new_self = Self::new_like(prior);
+        let marker_count = prior._data().ranks.len();
+        let new_observations = count_per_entry * marker_count;
+        if new_observations == 0 {
+            // Guard: `new_observations - 1` below would underflow.
+            return new_self;
+        }
+        let max_rank = new_observations - 1;
+
+        let new_data = new_self._mut_data();
+        for i in 0..new_data.targets.len() {
+            // Truncate to the target rank, but keep ranks strictly increasing by
+            // leaving room for the markers on either side.
+            let closest_rank = ((new_data.targets[i] * max_rank as f64) as usize)
+                .clamp(i, max_rank - (new_data.targets.len() - (i + 1)));
+
+            new_data.ranks[i] = closest_rank;
+            new_data.values[i] = prior.ppf(closest_rank as f64 / max_rank as f64);
+        }
+        *new_data.observations = new_observations;
+
+        new_self
+    }
+
     fn update(&mut self, sample: f64) {
         let data = self._mut_data();
 
@@ -631,10 +669,12 @@ mod test {
             estimator.update(sample);
         }
 
+        assert!(estimator.samples() == 10_000);
+
         for val in linspace(0.0, 0.90, 90) {
             assert!((estimator.ppf(val) - expon.ppf(val)).abs() <= 0.04);
             let dist_val = expon.ppf(val);
-            assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.2);
+            assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.04);
 
             // Basic probability distribution checks...
             assert!(is_close(
@@ -650,8 +690,6 @@ mod test {
                 estimator.logccdf(dist_val),
                 estimator.ccdf(dist_val).ln()
             ));
-
-            assert!((estimator.cdf(estimator.ppf(val)) - val).abs() <= 0.06);
         }
     }
 
@@ -673,5 +711,13 @@ mod test {
 
             merged_estimator = merged_estimator.combine(&estimator);
         }
+
+        assert!(merged_estimator.samples() == 10_000);
+
+        for val in linspace(0.0, 0.75, 75) {
+            assert!((merged_estimator.ppf(val) - expon.ppf(val)).abs() <= 0.1);
+            let dist_val = expon.ppf(val);
+            assert!((merged_estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.04)
+        }
     }
 }

From 1a3b1657a56d6e96452c8fdd724a0fc0ceed00b6 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 27 May 2026 18:08:40 -0600
Subject: [PATCH 31/39] Better Frechet fitting, need to test...

---
 src/join_estimation.rs | 64 ++++++++++++++++++++++++----------
 src/p2estimator.rs     | 78 +++++++++++++++++++++++++++++++++++++++---
 src/statistics.rs      | 11 ------
 3 files changed, 120 insertions(+), 33 deletions(-)

diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index b688ed9..2d9f970 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,7 +1,12 @@
-use std::{fmt::Debug, ops};
+use std::{
+    f64::{self, consts::E},
+    fmt::Debug,
+    ops,
+};
 
 use crate::{
     assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType},
+    p2estimator::{custom_quantile_estimator::FrechetQuant, QuantileEstimator},
     segments::Block,
     statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, HalfT, Laplace},
 };
@@ -37,7 +42,6 @@ impl JoinEstimator for BayesianJoinEstimator {
         let rel_con_dist = consensus_dist as f64
             / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64;
 
-        /*
         println!("{:#?}", self);
         println!(
             "{} {} {}",
@@ -56,16 +60,16 @@ impl JoinEstimator for BayesianJoinEstimator {
             divergence_diff,
             self.divergence_join.pdf(divergence_diff),
             self.divergence_nojoin.pdf(divergence_diff)
-        );*/
+        );
 
         let join_score = self.join_prior.ln()
             + self.target_distance_join.logpdf(target_dist)
-            + self.divergence_join.logpdf(divergence_diff);
-        //+ self.consensus_distance_join.logpdf(rel_con_dist);
+            + self.divergence_join.logpdf(divergence_diff)
+            + self.consensus_distance_join.logpdf(rel_con_dist);
         let nojoin_score = (-self.join_prior).ln_1p()
             + self.target_distance_nojoin.logpdf(target_dist)
-            + self.divergence_nojoin.logpdf(divergence_diff);
-        //+ self.consensus_distance_nojoin.logpdf(rel_con_dist);
+            + self.divergence_nojoin.logpdf(divergence_diff)
+            + self.consensus_distance_nojoin.logpdf(rel_con_dist);
 
         let score_norm = ln_add_exp(join_score, nojoin_score);
         let score = join_score - score_norm;
@@ -179,6 +183,36 @@ impl From<MomentEstimator> for Laplace {
     }
 }
 
+impl From<&FrechetQuant> for Frechet {
+    fn from(value: &FrechetQuant) -> Self {
+        // Technique developed in notebooks, should write down...
+        fn unscaled_fretchet_ppf(x: f64, a: f64) -> f64 {
+            (-(x.ln())).powf(-1.0 / a)
+        }
+
+        // Chosen so p2 (middle quantile) is close to median...
+        let power_scale = 1.5;
+
+        let p1 = 1.0 / E; // -ln(1/e)...
+        let p2 = (1.0 / E).powf(1.0 / power_scale);
+        let p3 = (1.0 / E).powf(1.0 / (power_scale * power_scale));
+
+        let q1 = value.ppf(p1);
+        let q2 = value.ppf(p2);
+        let q3 = value.ppf(p3);
+
+        let relative_q = (q2 - q1) / (q3 - q1);
+
+        // Because we carefully chose quantiles... Solution for a simplifies to below...
+        let a = power_scale.ln() / (1.0 / relative_q + 1.0).ln();
+        let s = (q2 - q1) / (unscaled_fretchet_ppf(p2, a) - unscaled_fretchet_ppf(p1, a));
+        // Rather than directly estimating m, we assume the mode of the distribution is at 0...
+        let m = -(a / (1.0 + a)).powf(1.0 / a);
+
+        Frechet::new(a, s, m)
+    }
+}
+
 impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
     fn from(statistics: &BayesianJoinStatistics) -> Self {
         Self {
@@ -186,11 +220,7 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
             target_distance_nojoin: statistics.unjoinable_target_distance.into(),
             divergence_join: statistics.joinable_divergence.into(),
             divergence_nojoin: statistics.unjoinable_divergence.into(),
-            consensus_distance_join: Frechet::from_log_moments(
-                statistics.joinable_consensus_log.mean(),
-                statistics.joinable_consensus_log.standard_deviation(),
-                -0.05,
-            ),
+            consensus_distance_join: (&statistics.joinable_consensus).into(),
             consensus_distance_nojoin: statistics.unjoinable_consensus.into(),
             // We take sqrt since we count all pairs, not just neighbors.
             join_prior: (statistics.joinable_target_distance.samples() as f64
@@ -208,7 +238,7 @@ pub struct BayesianJoinStatistics {
     unjoinable_target_distance: MomentEstimator,
     joinable_divergence: MomentEstimator,
     unjoinable_divergence: MomentEstimator,
-    joinable_consensus_log: MomentEstimator,
+    joinable_consensus: FrechetQuant,
     unjoinable_consensus: MomentEstimator,
 }
 
@@ -231,9 +261,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
             unjoinable_divergence: bayesian_prior
                 .unjoinable_divergence
                 .to_psuedo_count(pseudo_count),
-            joinable_consensus_log: bayesian_prior
-                .joinable_consensus_log
-                .to_psuedo_count(pseudo_count),
+            joinable_consensus: FrechetQuant::from_prior(&bayesian_prior.joinable_consensus, 1),
             unjoinable_consensus: bayesian_prior
                 .unjoinable_consensus
                 .to_psuedo_count(pseudo_count),
@@ -252,7 +280,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
             self.joinable_divergence += divergence_diff;
             if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
                 //println!("CDist: {}", rel_con_dist);
-                self.joinable_consensus_log += (rel_con_dist + 0.05).max(1e-50).ln();
+                self.joinable_consensus.update(rel_con_dist);
             }
         } else {
             self.unjoinable_target_distance += target_dist as f64;
@@ -271,7 +299,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
                 + other.unjoinable_target_distance,
             joinable_divergence: self.joinable_divergence + other.joinable_divergence,
             unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence,
-            joinable_consensus_log: self.joinable_consensus_log + other.joinable_consensus_log,
+            joinable_consensus: self.joinable_consensus.combine(&other.joinable_consensus),
             unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus,
         }
     }
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index aa19e29..2a8c89c 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -351,6 +351,7 @@ pub trait QuantileEstimator: Distribution {
         }
     }
     fn combine(&self, other: &Self) -> Self;
+    #[allow(dead_code)]
     fn samples(&self) -> usize;
 }
 
@@ -374,7 +375,7 @@ impl<Q: SimpleQuantileEstimatorRepresentation> QuantileEstimator for Q {
         let mut new_self = Self::new_like(prior);
         let new_data = new_self._mut_data();
 
-        let new_observations = count_per_entry * prior_data.ranks.len();
+        let new_observations = count_per_entry.max(1) * prior_data.ranks.len();
 
         for i in 0..new_data.targets.len() {
             let closest_rank = ((new_data.targets[i] * (new_observations - 1) as f64) as usize)
@@ -545,7 +546,7 @@ impl<Q: SimpleQuantileEstimatorRepresentation> Distribution for Q {
 }
 
 #[derive(Clone, Debug)]
-struct FixedSizeQuantileEstimator<const N: usize> {
+pub struct FixedSizeQuantileEstimator<const N: usize> {
     values: [f64; N],
     ranks: [usize; N],
     targets: [f64; N],
@@ -591,7 +592,7 @@ impl<const N: usize> SimpleQuantileEstimatorRepresentation for FixedSizeQuantile
 }
 
 #[derive(Clone, Debug)]
-struct VectorQuantileEstimator {
+pub struct VectorQuantileEstimator {
     values: Vec<f64>,
     ranks: Vec<usize>,
     targets: Vec<f64>,
@@ -599,7 +600,7 @@ struct VectorQuantileEstimator {
 }
 
 impl VectorQuantileEstimator {
-    fn new(targets: &[f64]) -> Self {
+    pub fn new(targets: &[f64]) -> Self {
         assert!(
             targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0)
         );
@@ -636,6 +637,75 @@ impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator {
     }
 }
 
+pub mod custom_quantile_estimator {
+    use super::*;
+    use std::f64::consts::E;
+
+    macro_rules! replace_expr {
+        ($_t:tt,$sub:expr) => {
+            $sub
+        };
+    }
+
+    macro_rules! count_exprs {
+        ($($val:expr),+) => {<[()]>::len(&[$(replace_expr!($val,())),+])};
+    }
+
+    macro_rules! implement_fixed_quantile_estimator {
+        ($name:ident[$($val:expr),+]) => {
+            #[derive(Clone, Debug)]
+            pub struct $name {
+                values: [f64; Self::COUNT],
+                ranks: [usize; Self::COUNT],
+                observations: usize,
+            }
+
+            impl $name {
+                const TARGETS: [f64; count_exprs!($($val),+) + 2] = [0.0, $($val),+, 1.0];
+                const COUNT: usize = Self::TARGETS.len();
+
+                pub fn new() -> Self {
+                    Self {
+                        values: [0.0; _],
+                        ranks: [0; _],
+                        observations: 0
+                    }
+                }
+            }
+
+            impl Default for $name {
+                fn default() -> Self {
+                    Self::new()
+                }
+            }
+
+            impl SimpleQuantileEstimatorRepresentation for $name {
+                fn new_like(_other: &Self) -> Self {
+                    Self::default()
+                }
+                fn _data(&self) -> QuantileEstimatorData<'_> {
+                    QuantileEstimatorData {
+                        ranks: &self.ranks,
+                        values: &self.values,
+                        targets: &Self::TARGETS,
+                        observations: &self.observations,
+                    }
+                }
+                fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> {
+                    MutableQuantileEstimatorData {
+                        ranks: &mut self.ranks,
+                        values: &mut self.values,
+                        targets: &Self::TARGETS,
+                        observations: &mut self.observations,
+                    }
+                }
+            }
+        };
+    }
+
+    implement_fixed_quantile_estimator!(FrechetQuant[0.5 / E, 0.25, 1.0 / E, 0.5, 0.5 + 1.0 / (2.0 * E), 0.75]); // ascending quantile targets in (0, 1)
+}
+
 #[cfg(test)]
 mod test {
     use crate::{
diff --git a/src/statistics.rs b/src/statistics.rs
index 909bb21..1a8701e 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -259,17 +259,6 @@ impl Frechet {
             minimum,
         }
     }
-
-    pub fn from_log_moments(log_mean: f64, log_std: f64, minimum: f64) -> Self {
-        let alpha = f64::consts::PI / (6.0 * log_std);
-        let lambda = (alpha * log_mean - f64::consts::EULER_GAMMA).exp();
-        let scale = lambda.powf(1.0 / alpha);
-        Self {
-            alpha,
-            scale,
-            minimum,
-        }
-    }
 }
 
 impl Default for Frechet {

From 2f7c9f569b21bf8a4c81a5daa0689b3f73b52030 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 28 May 2026 00:57:21 -0600
Subject: [PATCH 32/39] Test gumbel...

---
 src/assembly.rs        |  4 +--
 src/join_estimation.rs | 32 ++++++++++++--------
 src/statistics.rs      | 67 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index a58875a..4cbaf6b 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -128,13 +128,13 @@ fn get_link_cost(
 
     // Cost = linear consensus cost + linear target gap cost...
     min_value
-        + piecewise_linear_cost(
+        /*+ piecewise_linear_cost(
             -(annotation_args.free_join_consensus_overlap as f64).abs(),
             (annotation_args.free_join_consensus_gap as f64).abs(),
             alpha,
             beta,
             consensus_gap as f64,
-        )
+        )*/
         + expected_score
 }
 
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 2d9f970..3bbd204 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -8,7 +8,7 @@ use crate::{
     assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType},
     p2estimator::{custom_quantile_estimator::FrechetQuant, QuantileEstimator},
     segments::Block,
-    statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, HalfT, Laplace},
+    statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, Gumbel, HalfT, Laplace},
 };
 
 pub trait JoinEstimator: Clone + Default + Debug {
@@ -28,7 +28,7 @@ pub struct BayesianJoinEstimator {
     target_distance_nojoin: ExponentialEstimator,
     divergence_join: HalfT,
     divergence_nojoin: HalfT,
-    consensus_distance_join: Frechet,
+    consensus_distance_join: Gumbel,
     consensus_distance_nojoin: Laplace,
     join_prior: f64,
 }
@@ -64,12 +64,12 @@ impl JoinEstimator for BayesianJoinEstimator {
 
         let join_score = self.join_prior.ln()
             + self.target_distance_join.logpdf(target_dist)
-            + self.divergence_join.logpdf(divergence_diff)
-            + self.consensus_distance_join.logpdf(rel_con_dist);
+            + self.divergence_join.logpdf(divergence_diff);
+        //+ self.consensus_distance_join.logpdf(rel_con_dist);
         let nojoin_score = (-self.join_prior).ln_1p()
             + self.target_distance_nojoin.logpdf(target_dist)
-            + self.divergence_nojoin.logpdf(divergence_diff)
-            + self.consensus_distance_nojoin.logpdf(rel_con_dist);
+            + self.divergence_nojoin.logpdf(divergence_diff);
+        //+ self.consensus_distance_nojoin.logpdf(rel_con_dist);
 
         let score_norm = ln_add_exp(join_score, nojoin_score);
         let score = join_score - score_norm;
@@ -191,7 +191,7 @@ impl From<&FrechetQuant> for Frechet {
         }
 
         // Chosen so p2 (middle quantile) is close to median...
-        let power_scale = 1.5;
+        let power_scale = 2.0;
 
         let p1 = 1.0 / E; // -ln(1/e)...
         let p2 = (1.0 / E).powf(1.0 / power_scale);
@@ -207,7 +207,7 @@ impl From<&FrechetQuant> for Frechet {
         let a = power_scale.ln() / (1.0 / relative_q + 1.0).ln();
         let s = (q2 - q1) / (unscaled_fretchet_ppf(p2, a) - unscaled_fretchet_ppf(p1, a));
         // Rather than directly estimating m, we assume the mode of the distribution is at 0...
-        let m = -(a / (1.0 + a)).powf(1.0 / a);
+        let m = -s * (a / (1.0 + a)).powf(1.0 / a);
 
         Frechet::new(a, s, m)
     }
@@ -220,7 +220,11 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
             target_distance_nojoin: statistics.unjoinable_target_distance.into(),
             divergence_join: statistics.joinable_divergence.into(),
             divergence_nojoin: statistics.unjoinable_divergence.into(),
-            consensus_distance_join: (&statistics.joinable_consensus).into(),
+            consensus_distance_join: Gumbel::new(
+                0.0,
+                6.0_f64.sqrt() * statistics.joinable_consensus.standard_deviation()
+                    / f64::consts::PI,
+            ),
             consensus_distance_nojoin: statistics.unjoinable_consensus.into(),
             // We take sqrt since we count all pairs, not just neighbors.
             join_prior: (statistics.joinable_target_distance.samples() as f64
@@ -238,7 +242,7 @@ pub struct BayesianJoinStatistics {
     unjoinable_target_distance: MomentEstimator,
     joinable_divergence: MomentEstimator,
     unjoinable_divergence: MomentEstimator,
-    joinable_consensus: FrechetQuant,
+    joinable_consensus: MomentEstimator,
     unjoinable_consensus: MomentEstimator,
 }
 
@@ -261,7 +265,9 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
             unjoinable_divergence: bayesian_prior
                 .unjoinable_divergence
                 .to_psuedo_count(pseudo_count),
-            joinable_consensus: FrechetQuant::from_prior(&bayesian_prior.joinable_consensus, 1),
+            joinable_consensus: bayesian_prior
+                .joinable_consensus
+                .to_psuedo_count(pseudo_count),
             unjoinable_consensus: bayesian_prior
                 .unjoinable_consensus
                 .to_psuedo_count(pseudo_count),
@@ -280,7 +286,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
             self.joinable_divergence += divergence_diff;
             if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
                 //println!("CDist: {}", rel_con_dist);
-                self.joinable_consensus.update(rel_con_dist);
+                self.joinable_consensus += rel_con_dist;
             }
         } else {
             self.unjoinable_target_distance += target_dist as f64;
@@ -299,7 +305,7 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
                 + other.unjoinable_target_distance,
             joinable_divergence: self.joinable_divergence + other.joinable_divergence,
             unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence,
-            joinable_consensus: self.joinable_consensus.combine(&other.joinable_consensus),
+            joinable_consensus: self.joinable_consensus + other.joinable_consensus,
             unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus,
         }
     }
diff --git a/src/statistics.rs b/src/statistics.rs
index 1a8701e..dc0e728 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -328,6 +328,68 @@ impl Distribution for Frechet {
     }
 }
 
+#[derive(Debug, Clone)]
+pub struct Gumbel {
+    location: f64,
+    scale: f64,
+}
+
+impl Gumbel {
+    pub fn new(location: f64, scale: f64) -> Self {
+        Self { location, scale }
+    }
+}
+
+impl Default for Gumbel {
+    fn default() -> Self {
+        Self::new(0.0, 1.0)
+    }
+}
+
+impl ParameterizedDistribution for Gumbel {}
+
+impl Distribution for Gumbel {
+    fn logpdf(&self, x: f64) -> f64 {
+        let mu = self.location;
+        let beta = self.scale;
+        let z = (x - mu) / beta;
+        (1.0 / beta).ln() - (z + (-z).exp())
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.logpdf(x).exp()
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        self.logcdf(x).exp()
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        let mu = self.location;
+        let beta = self.scale;
+        let z = (x - mu) / beta;
+        -((-z).exp())
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let mu = self.location;
+        let beta = self.scale;
+        mu - beta * (-p.ln()).ln()
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        -self.logcdf(x).exp_m1() // 1 - exp(logcdf); exp_m1 keeps upper-tail precision
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        self.ccdf(x).ln()
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (f64::NEG_INFINITY, f64::INFINITY)
+    }
+}
+
 #[derive(Debug, Clone)]
 pub struct Laplace {
     mean: f64,
@@ -407,7 +469,7 @@ pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64
 
 #[cfg(test)]
 mod test {
-    use crate::statistics::{linspace, ExponentialEstimator, Frechet, HalfT, Laplace};
+    use super::*;
     use std::fmt::Debug;
 
     pub trait TestDistribution: Debug {
@@ -454,13 +516,14 @@ mod test {
 
     use super::{Exponential, ParameterizedDistribution};
 
-    fn get_dists() -> [Box<dyn TestDistribution>; 5] {
+    fn get_dists() -> [Box<dyn TestDistribution>; 6] {
         [
             as_box(Exponential::unit()),
             as_box(ExponentialEstimator::unit()),
             as_box(HalfT::unit()),
             as_box(Frechet::unit()),
             as_box(Laplace::unit()),
+            as_box(Gumbel::unit()),
         ]
     }
 

From 52dc778ee52ed9bf757197ef4d9de8fc74fb6f12 Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Fri, 29 May 2026 10:01:01 -0600
Subject: [PATCH 33/39] Normalize by consensus.

---
 .../plot_consensus_distance_repeatmasker.py   |  26 ++-
 src/assembly.rs                               |  51 +++++-
 src/join_estimation.rs                        | 165 ++++++++++++------
 src/p2estimator.rs                            |  14 +-
 src/pipeline.rs                               |   8 +-
 src/segments.rs                               |   4 +-
 src/statistics.rs                             |  73 +++++++-
 7 files changed, 268 insertions(+), 73 deletions(-)

diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py
index 2bd07bd..23dab81 100644
--- a/scripts/plot_consensus_distance_repeatmasker.py
+++ b/scripts/plot_consensus_distance_repeatmasker.py
@@ -12,16 +12,28 @@
 sequences = []
 
 
-def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True):
+def get_gap(
+    pstart,
+    pend,
+    qstart,
+    qend,
+    qremaining,
+    ppos,
+    is_pos,
+    check_valid: bool = True,
+):
+    c_len = max(qremaining + qend, qremaining + qstart)
     try:
         if is_pos and ppos:
             if check_valid:
                 assert pstart <= qstart <= qend and pstart <= pend <= qend
             gap = qstart - pend
+            gap /= c_len
         elif not is_pos and not ppos:
             if check_valid:
                 assert qstart <= qend <= pend and qstart <= pstart <= pend
             gap = pstart - qend
+            gap /= c_len
         else:
             gap = None
     except AssertionError:
@@ -61,9 +73,11 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True):
             if is_pos:
                 qstart = int(tokens[11])
                 qend = int(tokens[12])
+                qremaining = int(tokens[13].strip("()"))
             else:
                 qstart = int(tokens[13])
                 qend = int(tokens[12])
+                qremaining = int(tokens[11].strip("()"))
 
             assert qend >= qstart
             length += qend - qstart
@@ -73,12 +87,16 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True):
                 if pname != name:
                     gap = None
                 else:
-                    gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos)
+                    gap = get_gap(pstart, pend, qstart, qend, qremaining, ppos, is_pos)
 
                 if gap is not None:
                     if shared_name not in gap_info:
                         gap_info[shared_name] = []
 
+                    if gap > 2:
+                        print(
+                            f"Gap > 2, Gap Value {gap}: {seq}\n\t {'\n\t'.join(seqs)}"
+                        )
                     gap_info[shared_name].append(gap)
 
             prior = (name, is_pos, qstart, qend)
@@ -106,9 +124,11 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True):
     if is_pos:
         qstart = int(tokens[11])
         qend = int(tokens[12])
+        qremaining = int(tokens[13].strip("()"))
     else:
         qstart = int(tokens[13])
         qend = int(tokens[12])
+        qremaining = int(tokens[11].strip("()"))
 
     random_prior = other_priors.get(name, None)
     if random_prior is not None:
@@ -116,7 +136,7 @@ def get_gap(pstart, pend, qstart, qend, ppos, is_pos, check_valid: bool = True):
         if pjoin_id == join_id:
             gap = None
         else:
-            gap = get_gap(pstart, pend, qstart, qend, ppos, is_pos, False)
+            gap = get_gap(pstart, pend, qstart, qend, qremaining, ppos, is_pos, False)
 
         target_gap = tstart - p_tstart
 
diff --git a/src/assembly.rs b/src/assembly.rs
index 4cbaf6b..5f49087 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -188,6 +188,34 @@ pub fn block_length_on_query(b: &Block) -> usize {
     b.query_end.abs_diff(b.query_start) + 1
 }
 
+pub enum ConsensusDistanceNormalization {
+    Max,
+    Min,
+    Sum,
+    WithLength(usize),
+}
+
+pub fn relative_consensus_distance(
+    first_block: &Block,
+    second_block: &Block,
+    mode: ConsensusDistanceNormalization,
+) -> (f64, LinkType) {
+    let (dist, link_type) = block_consensus_distance(first_block, second_block);
+    let div = match mode {
+        ConsensusDistanceNormalization::Sum => {
+            block_length_on_query(first_block) + block_length_on_query(second_block)
+        }
+        ConsensusDistanceNormalization::Max => {
+            block_length_on_query(first_block).max(block_length_on_query(second_block))
+        }
+        ConsensusDistanceNormalization::Min => {
+            block_length_on_query(first_block).min(block_length_on_query(second_block))
+        }
+        ConsensusDistanceNormalization::WithLength(length) => length,
+    };
+    (dist as f64 / div as f64, link_type)
+}
+
 fn is_joinable(
     target_distance: isize,
     consensus_distance: isize,
@@ -231,6 +259,7 @@ fn new_alignment_to_blocks_map(
 
 pub fn gather_join_statistics<T: JoinStatisticsCollector>(
     alignments: &[Alignment],
+    query_lengths: &HashMap<usize, usize>,
     annotation_args: &AnnotationArgs,
 ) -> Vec<(usize, T)> {
     let mut query_ids: Vec<usize> = alignments.iter().map(|a| a.query_id).unique().collect();
@@ -256,6 +285,9 @@ pub fn gather_join_statistics<T: JoinStatisticsCollector>(
 
             gather_join_statistics_single_family(
                 compat_alignments,
+                *query_lengths
+                    .get(&id)
+                    .expect("Query length missing for alignment!"),
                 annotation_args,
                 &mut new_stats,
             );
@@ -268,6 +300,7 @@ pub fn gather_join_statistics<T: JoinStatisticsCollector>(
 
 fn gather_join_statistics_single_family<'a>(
     compatable_alignments: impl Iterator<Item = Block>,
+    consensus_length: usize,
     args: &AnnotationArgs,
     join_stats: &mut impl JoinStatisticsCollector,
 ) {
@@ -293,7 +326,13 @@ fn gather_join_statistics_single_family<'a>(
                         args,
                     );
 
-                    join_stats.add(a_block, b_block, idx + 1 == idx2, joinable);
+                    join_stats.add(
+                        a_block,
+                        b_block,
+                        consensus_length,
+                        idx + 1 == idx2,
+                        joinable,
+                    );
                 })
         })
 }
@@ -301,6 +340,7 @@ fn gather_join_statistics_single_family<'a>(
 fn link_assemblies<T: JoinEstimator>(
     graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>,
     compatable_blocks: impl Iterator<Item = (usize, usize)>,
+    consensus_length: usize,
     segments: &SegmentedMatrix,
     query_statistics: &QueryStatistics<T>,
     _region_statistics: &RegionStatistics,
@@ -333,7 +373,10 @@ fn link_assemblies<T: JoinEstimator>(
                 min_block_length,
                 args,
             ) {
-                let join_prob = query_statistics.estimator.predict(a_block, b_block, false);
+                let join_prob =
+                    query_statistics
+                        .estimator
+                        .predict(a_block, b_block, consensus_length, false);
 
                 if join_prob >= args.join_likelihood_threshold {
                     let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
@@ -370,6 +413,7 @@ pub struct SegmentAssemblyGraph {
 impl SegmentAssemblyGraph {
     pub fn new<T: JoinEstimator>(
         alignments: &[Alignment],
+        query_lengths: &HashMap<usize, usize>,
         segments: &SegmentedMatrix,
         region_statistics: &RegionStatistics,
         query_statistics: &[QueryStatistics<T>],
@@ -400,6 +444,9 @@ impl SegmentAssemblyGraph {
                 link_assemblies(
                     &mut link_graph,
                     compat_blocks,
+                    *query_lengths
+                        .get(&id)
+                        .expect("Unable to find query length for alignment!"),
                     segments,
                     &query_statistics[id],
                     region_statistics,
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 3bbd204..827f98b 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -1,25 +1,44 @@
 use std::{
-    f64::{self, consts::E},
+    f64::{self},
     fmt::Debug,
     ops,
 };
 
 use crate::{
-    assembly::{block_consensus_distance, block_length_on_query, block_target_distance, LinkType},
-    p2estimator::{custom_quantile_estimator::FrechetQuant, QuantileEstimator},
+    assembly::{
+        block_target_distance, relative_consensus_distance, ConsensusDistanceNormalization,
+        LinkType,
+    },
+    p2estimator::{
+        custom_quantile_estimator::{LomaxQuant, MedianEstimator},
+        QuantileEstimator,
+    },
     segments::Block,
-    statistics::{ln_add_exp, Distribution, ExponentialEstimator, Frechet, Gumbel, HalfT, Laplace},
+    statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT, Laplace, Lomax},
 };
 
 pub trait JoinEstimator: Clone + Default + Debug {
-    fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64;
+    fn predict(
+        &self,
+        first_block: &Block,
+        second_block: &Block,
+        consensus_length: usize,
+        log_space: bool,
+    ) -> f64;
 }
 
 pub trait JoinStatisticsCollector: Clone + Debug {
     fn new() -> Self;
     fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self;
     fn combine(&self, other: &Self) -> Self;
-    fn add(&mut self, first_block: &Block, second_block: &Block, neighbors: bool, joinable: bool);
+    fn add(
+        &mut self,
+        first_block: &Block,
+        second_block: &Block,
+        consenus_length: usize,
+        neighbors: bool,
+        joinable: bool,
+    );
 }
 
 #[derive(Debug, Clone, Default)]
@@ -28,25 +47,44 @@ pub struct BayesianJoinEstimator {
     target_distance_nojoin: ExponentialEstimator,
     divergence_join: HalfT,
     divergence_nojoin: HalfT,
-    consensus_distance_join: Gumbel,
+    consensus_distance_join_pos: ExponentialEstimator,
+    consensus_distance_join_neg: ExponentialEstimator,
+    consensus_norm_pos: f64,
+    consensus_norm_neg: f64,
     consensus_distance_nojoin: Laplace,
     join_prior: f64,
 }
 
 impl JoinEstimator for BayesianJoinEstimator {
-    fn predict(&self, first_block: &Block, second_block: &Block, log_space: bool) -> f64 {
+    fn predict(
+        &self,
+        first_block: &Block,
+        second_block: &Block,
+        consensus_distance: usize,
+        log_space: bool,
+    ) -> f64 {
         let target_dist = block_target_distance(first_block, second_block) as f64;
         // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0...
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
-        let (consensus_dist, _join_type) = block_consensus_distance(first_block, second_block);
-        let rel_con_dist = consensus_dist as f64
-            / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64;
+        let (rel_con_dist, _join_type) = relative_consensus_distance(
+            first_block,
+            second_block,
+            ConsensusDistanceNormalization::WithLength(consensus_distance),
+        );
+
+        let consensus_join_dist_logpdf = |x: f64| {
+            if x < 0.0 {
+                self.consensus_distance_join_neg.logpdf(x.abs()) + self.consensus_norm_neg.ln()
+            } else {
+                self.consensus_distance_join_pos.logpdf(x.abs()) + self.consensus_norm_pos.ln()
+            }
+        };
 
         println!("{:#?}", self);
         println!(
             "{} {} {}",
             rel_con_dist,
-            self.consensus_distance_join.pdf(rel_con_dist),
+            consensus_join_dist_logpdf(rel_con_dist).exp(),
             self.consensus_distance_nojoin.pdf(rel_con_dist)
         );
         println!(
@@ -183,48 +221,45 @@ impl From<MomentEstimator> for Laplace {
     }
 }
 
-impl From<&FrechetQuant> for Frechet {
-    fn from(value: &FrechetQuant) -> Self {
-        // Technique developed in notebooks, should write down...
-        fn unscaled_fretchet_ppf(x: f64, a: f64) -> f64 {
-            (-(x.ln())).powf(-1.0 / a)
-        }
-
-        // Chosen so p2 (middle quantile) is close to median...
-        let power_scale = 2.0;
-
-        let p1 = 1.0 / E; // -ln(1/e)...
-        let p2 = (1.0 / E).powf(1.0 / power_scale);
-        let p3 = (1.0 / E).powf(1.0 / (power_scale * power_scale));
+impl From<&LomaxQuant> for Lomax {
+    fn from(value: &LomaxQuant) -> Self {
+        // Using quantile selection trick originally developed for frechet... (quant ratio formula)
+        // Choose first quantile p, then second such that p2 = 1 - (1 - p)^2 and you get this nice closed form for alpha...
+        let p1 = LomaxQuant::PROB1;
+        let p2 = LomaxQuant::PROB2;
 
-        let q1 = value.ppf(p1);
-        let q2 = value.ppf(p2);
-        let q3 = value.ppf(p3);
+        let v1 = value.ppf(p1);
+        let v2 = value.ppf(p2);
 
-        let relative_q = (q2 - q1) / (q3 - q1);
-
-        // Because we carefully chose quantiles... Solution for a simplifies to below...
-        let a = power_scale.ln() / (1.0 / relative_q + 1.0).ln();
-        let s = (q2 - q1) / (unscaled_fretchet_ppf(p2, a) - unscaled_fretchet_ppf(p1, a));
-        // Rather than directly estimating m, we assume the mode of the distribution is at 0...
-        let m = -s * (a / (1.0 + a)).powf(1.0 / a);
+        let a = -(1.0 - p1).ln() / (v2 / v1 - 1.0).ln();
+        let y = v1 / ((1.0 - p1).powf(-1.0 / a) - 1.0);
+        Lomax::new(a, y)
+    }
+}
 
-        Frechet::new(a, s, m)
+impl From<&MedianEstimator> for ExponentialEstimator {
+    fn from(value: &MedianEstimator) -> Self {
+        Self::new(value.ppf(0.5) / 2.0_f64.ln(), value.samples())
     }
 }
 
 impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
     fn from(statistics: &BayesianJoinStatistics) -> Self {
+        let cons_total = (statistics.joinable_consensus_pos.samples()
+            + statistics.joinable_consensus_neg.samples())
+        .max(1);
+        let cons_pos_perc = statistics.joinable_consensus_pos.samples() as f64 / cons_total as f64;
+        let cons_neg_perc = statistics.joinable_consensus_neg.samples() as f64 / cons_total as f64;
+
         Self {
             target_distance_join: statistics.joinable_target_distance.into(),
             target_distance_nojoin: statistics.unjoinable_target_distance.into(),
             divergence_join: statistics.joinable_divergence.into(),
             divergence_nojoin: statistics.unjoinable_divergence.into(),
-            consensus_distance_join: Gumbel::new(
-                0.0,
-                6.0_f64.sqrt() * statistics.joinable_consensus.standard_deviation()
-                    / f64::consts::PI,
-            ),
+            consensus_distance_join_pos: (&statistics.joinable_consensus_pos).into(),
+            consensus_distance_join_neg: (&statistics.joinable_consensus_neg).into(),
+            consensus_norm_pos: 0.5 * cons_pos_perc,
+            consensus_norm_neg: 0.5 * cons_neg_perc,
             consensus_distance_nojoin: statistics.unjoinable_consensus.into(),
             // We take sqrt since we count all pairs, not just neighbors.
             join_prior: (statistics.joinable_target_distance.samples() as f64
@@ -242,7 +277,8 @@ pub struct BayesianJoinStatistics {
     unjoinable_target_distance: MomentEstimator,
     joinable_divergence: MomentEstimator,
     unjoinable_divergence: MomentEstimator,
-    joinable_consensus: MomentEstimator,
+    joinable_consensus_pos: MedianEstimator,
+    joinable_consensus_neg: MedianEstimator,
     unjoinable_consensus: MomentEstimator,
 }
 
@@ -265,33 +301,51 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
             unjoinable_divergence: bayesian_prior
                 .unjoinable_divergence
                 .to_psuedo_count(pseudo_count),
-            joinable_consensus: bayesian_prior
-                .joinable_consensus
-                .to_psuedo_count(pseudo_count),
+            joinable_consensus_pos: MedianEstimator::from_prior(
+                &bayesian_prior.joinable_consensus_pos,
+                pseudo_count,
+            ),
+            joinable_consensus_neg: MedianEstimator::from_prior(
+                &bayesian_prior.joinable_consensus_neg,
+                pseudo_count,
+            ),
             unjoinable_consensus: bayesian_prior
                 .unjoinable_consensus
                 .to_psuedo_count(pseudo_count),
         }
     }
 
-    fn add(&mut self, first_block: &Block, second_block: &Block, _neighbors: bool, joinable: bool) {
+    fn add(
+        &mut self,
+        first_block: &Block,
+        second_block: &Block,
+        consensus_length: usize,
+        neighbors: bool,
+        joinable: bool,
+    ) {
         let target_dist = block_target_distance(first_block, second_block).abs() as usize;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
-        let (consensus_dist, join_type) = block_consensus_distance(first_block, second_block);
-        let rel_con_dist = consensus_dist as f64
-            / block_length_on_query(first_block).max(block_length_on_query(second_block)) as f64;
+        let (rel_con_dist, join_type) = relative_consensus_distance(
+            first_block,
+            second_block,
+            ConsensusDistanceNormalization::WithLength(consensus_length),
+        );
 
         if joinable {
             self.joinable_target_distance += target_dist as f64;
             self.joinable_divergence += divergence_diff;
-            if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
+            if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) {
                 //println!("CDist: {}", rel_con_dist);
-                self.joinable_consensus += rel_con_dist;
+                if rel_con_dist >= 0.0 {
+                    self.joinable_consensus_pos.update(rel_con_dist.abs());
+                } else {
+                    self.joinable_consensus_neg.update(rel_con_dist.abs());
+                }
             }
         } else {
             self.unjoinable_target_distance += target_dist as f64;
             self.unjoinable_divergence += divergence_diff;
-            if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
+            if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) {
                 self.unjoinable_consensus += rel_con_dist;
             }
         }
@@ -305,7 +359,12 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
                 + other.unjoinable_target_distance,
             joinable_divergence: self.joinable_divergence + other.joinable_divergence,
             unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence,
-            joinable_consensus: self.joinable_consensus + other.joinable_consensus,
+            joinable_consensus_pos: self
+                .joinable_consensus_pos
+                .combine(&other.joinable_consensus_pos),
+            joinable_consensus_neg: self
+                .joinable_consensus_neg
+                .combine(&other.joinable_consensus_neg),
             unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus,
         }
     }
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 2a8c89c..0616cbc 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -322,7 +322,7 @@ fn _interpolated_value_prediction<
     }
 
     let idx = xs.partition_point(|&v| v.as_() < x);
-    if idx > xs.len() {
+    if idx >= xs.len() {
         upper_val
     } else if idx == 0 {
         lower_val
@@ -639,7 +639,6 @@ impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator {
 
 pub mod custom_quantile_estimator {
     use super::*;
-    use std::f64::consts::E;
 
     macro_rules! replace_expr {
         ($_t:tt,$sub:expr) => {
@@ -666,8 +665,8 @@ pub mod custom_quantile_estimator {
 
                 pub fn new() -> Self {
                     Self {
-                        values: [0.0; _],
-                        ranks: [0; _],
+                        values: [0.0; Self::COUNT],
+                        ranks: [0; Self::COUNT],
                         observations: 0
                     }
                 }
@@ -703,7 +702,12 @@ pub mod custom_quantile_estimator {
         };
     }
 
-    implement_fixed_quantile_estimator!(FrechetQuant[0.5 / E, 0.25, 1.0 / E, 0.5, 0.5 + 1.0 / 2.0 * E, 0.75]);
+    implement_fixed_quantile_estimator!(LomaxQuant[0.18, 0.36, 0.4752, 0.5904, 0.7952]);
+    impl LomaxQuant {
+        pub const PROB1: f64 = 0.36;
+        pub const PROB2: f64 = 0.59;
+    }
+    implement_fixed_quantile_estimator!(MedianEstimator[0.25, 0.5, 0.75]);
 }
 
 #[cfg(test)]
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 5ca4f2a..c3ef770 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -252,8 +252,11 @@ pub fn run_naive_trace<T: JoinStatisticsCollector>(
             .expect("Unable to write confidences!!!");
     }
 
-    let query_join_statistics =
-        gather_join_statistics(proximity_group.alignments, &args.annotation_args);
+    let query_join_statistics = gather_join_statistics(
+        proximity_group.alignments,
+        &alignment_data.query_lengths,
+        &args.annotation_args,
+    );
 
     NaiveTraceResults {
         target_start: proximity_group.target_start,
@@ -286,6 +289,7 @@ pub fn run_history_trace<T: JoinEstimator, S: JoinStatisticsCollector>(
         &trace_statistics.query_statistics,
         &naive_trace.score_params,
         &args.annotation_args,
+        &alignment_data.query_lengths,
     );
 
     let history = history_viterbi_on_segments(
diff --git a/src/segments.rs b/src/segments.rs
index 81cdb3e..74e987b 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -1,5 +1,5 @@
 use core::f64;
-use std::{cmp::Ordering, fmt::Debug, iter::Fuse};
+use std::{cmp::Ordering, collections::HashMap, fmt::Debug, iter::Fuse};
 
 use crate::{
     alignment::{Alignment, Strand},
@@ -603,9 +603,11 @@ pub fn assemble_and_link_segments<'a, T: JoinEstimator>(
     query_statistics: &[QueryStatistics<T>],
     score_params: &ScoreParams,
     annotation_args: &AnnotationArgs,
+    query_lengths: &HashMap<usize, usize>,
 ) -> (&'a SegmentedMatrix, SegmentAssemblyGraph) {
     let assembly_graph = SegmentAssemblyGraph::new(
         proximity_group.alignments,
+        query_lengths,
         &initial_segments.segments,
         region_statistics,
         query_statistics,
diff --git a/src/statistics.rs b/src/statistics.rs
index dc0e728..3271f27 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -135,7 +135,7 @@ impl Distribution for ExponentialEstimator {
     }
 
     fn logcdf(&self, x: f64) -> f64 {
-        self.cdf(x).ln()
+        (-self.logccdf(x).exp()).ln_1p()
     }
 
     fn cdf(&self, x: f64) -> f64 {
@@ -390,6 +390,66 @@ impl Distribution for Gumbel {
     }
 }
 
+#[derive(Debug, Clone)]
+pub struct Lomax {
+    alpha: f64,
+    lambda: f64,
+}
+
+impl Lomax {
+    pub fn new(alpha: f64, lambda: f64) -> Self {
+        Self { alpha, lambda }
+    }
+}
+
+impl ParameterizedDistribution for Lomax {}
+
+impl Default for Lomax {
+    fn default() -> Self {
+        Self::new(1.0, 1.0)
+    }
+}
+
+impl Distribution for Lomax {
+    fn logpdf(&self, x: f64) -> f64 {
+        let a = self.alpha;
+        let y = self.lambda;
+        (a / y).ln() - (a + 1.0) * (1.0 + x / y).ln()
+    }
+
+    fn pdf(&self, x: f64) -> f64 {
+        self.logpdf(x).exp()
+    }
+
+    fn logccdf(&self, x: f64) -> f64 {
+        let a = self.alpha;
+        let y = self.lambda;
+        -a * (1.0 + x / y).ln()
+    }
+
+    fn ccdf(&self, x: f64) -> f64 {
+        self.logccdf(x).exp()
+    }
+
+    fn cdf(&self, x: f64) -> f64 {
+        -(self.logccdf(x).exp_m1())
+    }
+
+    fn logcdf(&self, x: f64) -> f64 {
+        (-self.ccdf(x)).ln_1p()
+    }
+
+    fn ppf(&self, p: f64) -> f64 {
+        let a = self.alpha;
+        let y = self.lambda;
+        y * ((1.0 - p).powf(-1.0 / a) - 1.0)
+    }
+
+    fn support(&self) -> (f64, f64) {
+        (0.0, f64::INFINITY)
+    }
+}
+
 #[derive(Debug, Clone)]
 pub struct Laplace {
     mean: f64,
@@ -516,7 +576,7 @@ mod test {
 
     use super::{Exponential, ParameterizedDistribution};
 
-    fn get_dists() -> [Box<dyn TestDistribution>; 6] {
+    fn get_dists() -> [Box<dyn TestDistribution>; 7] {
         [
             as_box(Exponential::unit()),
             as_box(ExponentialEstimator::unit()),
@@ -524,6 +584,7 @@ mod test {
             as_box(Frechet::unit()),
             as_box(Laplace::unit()),
             as_box(Gumbel::unit()),
+            as_box(Lomax::unit()),
         ]
     }
 
@@ -539,6 +600,9 @@ mod test {
             println!("Testing distribution: {:?}", dist);
             let (mut low, mut high) = dist.tsupport();
 
+            assert!(dist.tcdf(low) == 0.0);
+            assert!(dist.tcdf(high) == 1.0);
+
             if high == f64::INFINITY {
                 high = 5.0;
             }
@@ -558,9 +622,4 @@ mod test {
             }
         }
     }
-
-    #[test]
-    fn test_exponential_distribution() {
-        let _dist = Exponential::unit();
-    }
 }

From bce495e891878a8ab2c33e32622e2cd4f1a2254b Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 2 Jun 2026 18:59:31 -0600
Subject: [PATCH 34/39] New version working...

---
 src/assembly.rs         | 202 +++++++++++++++++++++-------------------
 src/join_estimation.rs  | 159 +++++++++++--------------------
 src/main.rs             |  32 +------
 src/pipeline.rs         |   3 +-
 src/segments.rs         |  12 ++-
 src/statistics.rs       | 141 +++++++++++++---------------
 src/trace_statistics.rs |  54 ++++++-----
 7 files changed, 269 insertions(+), 334 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index 5f49087..35eb626 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -4,10 +4,11 @@ use itertools::Itertools;
 
 use crate::{
     alignment::{Alignment, Strand},
-    join_estimation::{JoinEstimator, JoinStatisticsCollector},
+    chunks::ProximityGroup,
+    join_estimation::{JoinEstimator, JoinStatisticsCollector, LinkInfo},
     score_params::ScoreParams,
-    segments::{Block, SegmentedMatrix, SegmentedMatrixView},
-    trace_statistics::{QueryStatistics, RegionStatistics},
+    segments::{Block, InitialSegments, SegmentedMatrix, SegmentedMatrixView},
+    trace_statistics::{calculate_region_statistics, QueryStatistics, RegionStatistics},
     AnnotationArgs,
 };
 
@@ -81,61 +82,13 @@ pub struct Edge {
     pub link_type: LinkType,
 }
 
-fn piecewise_linear_cost(
-    neg_start: f64,
-    pos_start: f64,
-    neg_slope: f64,
-    pos_slope: f64,
-    value: f64,
-) -> f64 {
-    if value < neg_start {
-        (value - neg_start).abs() * neg_slope
-    } else if value > pos_slope {
-        (value - pos_start).abs() * pos_slope
-    } else {
-        0.0
-    }
-}
-
-fn get_link_cost(
-    annotation_args: &AnnotationArgs,
-    score_params: &ScoreParams,
-    consensus_gap: isize,
-    join_prob: f64,
-) -> f64 {
-    // Minimum cost (a query loop)
-    let min_value = score_params.query_loop_score;
-    let value_range = (score_params.query_loop_score - score_params.query_jump_score).abs();
-
-    // Get overlap and gap ranges with free areas incorperated in, otherwise math is not quite right.
-    let overlap_range = ((annotation_args.consensus_join_overlap as f64)
-        - (annotation_args.free_join_consensus_overlap as f64))
-        .abs()
-        .max(1.0);
-    let gap_range = ((annotation_args.consensus_join_distance as f64)
-        - (annotation_args.free_join_consensus_gap as f64))
-        .abs()
-        .max(1.0);
-
-    // Compute slopes....
-    let alpha =
-        -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs();
-    let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs();
-
+fn get_link_cost(score_params: &ScoreParams, join_prob: f64) -> f64 {
     // Doing this as the expected value over the transition scores...
     let expected_score = join_prob * score_params.query_loop_score
         + (1.0 - join_prob) * score_params.query_jump_score;
 
     // Cost = linear consensus cost + linear target gap cost...
-    min_value
-        /*+ piecewise_linear_cost(
-            -(annotation_args.free_join_consensus_overlap as f64).abs(),
-            (annotation_args.free_join_consensus_gap as f64).abs(),
-            alpha,
-            beta,
-            consensus_gap as f64,
-        )*/
-        + expected_score
+    expected_score
 }
 
 pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize {
@@ -188,6 +141,7 @@ pub fn block_length_on_query(b: &Block) -> usize {
     b.query_end.abs_diff(b.query_start) + 1
 }
 
+#[allow(dead_code)]
 pub enum ConsensusDistanceNormalization {
     Max,
     Min,
@@ -257,14 +211,31 @@ fn new_alignment_to_blocks_map(
     alignment_block_map
 }
 
+fn calculate_unexplained_bases(
+    segments: SegmentedMatrixView,
+    region_statistics: &RegionStatistics,
+    first_block_segment: usize,
+    second_block_segment: usize,
+    second_block_target_start: usize,
+) -> usize {
+    let ub = region_statistics.unexplained_bases[second_block_segment]
+        .abs_diff(region_statistics.unexplained_bases[first_block_segment])
+        + (second_block_target_start - segments[second_block_segment].start_col);
+    ub
+}
+
 pub fn gather_join_statistics<T: JoinStatisticsCollector>(
-    alignments: &[Alignment],
+    group: &ProximityGroup,
+    initial_segments: &InitialSegments,
     query_lengths: &HashMap<usize, usize>,
     annotation_args: &AnnotationArgs,
 ) -> Vec<(usize, T)> {
+    let alignments = group.alignments;
+
     let mut query_ids: Vec<usize> = alignments.iter().map(|a| a.query_id).unique().collect();
     query_ids.sort();
 
+    let region_stats = calculate_region_statistics(initial_segments);
     let mut query_stats: Vec<(usize, T)> = Vec::with_capacity(query_ids.len());
 
     query_ids
@@ -277,7 +248,14 @@ pub fn gather_join_statistics<T: JoinStatisticsCollector>(
                     .iter()
                     .enumerate()
                     .filter(|&(_, a)| a.query_id == *id)
-                    .map(|(i, a)| Block::from_alignment(a, i, 0.0, 0.0)),
+                    .map(|(i, a)| {
+                        let b = Block::from_alignment(a, group.target_start, i, 0.0, 0.0);
+                        let seg_i = initial_segments
+                            .view_segments()
+                            .partition_point(|v| v.start_col <= b.col_start)
+                            .saturating_sub(1);
+                        (seg_i, b)
+                    }),
             )
         })
         .for_each(|(id, compat_alignments)| {
@@ -288,6 +266,8 @@ pub fn gather_join_statistics<T: JoinStatisticsCollector>(
                 *query_lengths
                     .get(&id)
                     .expect("Query length missing for alignment!"),
+                initial_segments.view_segments(),
+                &region_stats,
                 annotation_args,
                 &mut new_stats,
             );
@@ -298,42 +278,72 @@ pub fn gather_join_statistics<T: JoinStatisticsCollector>(
     query_stats
 }
 
+fn link_info(
+    first_block: &Block,
+    second_block: &Block,
+    annotation_args: &AnnotationArgs,
+    unexplained_bases: usize,
+    consensus_length: usize,
+    neighbors: bool,
+) -> LinkInfo {
+    let (consensus_distance, link_type) = block_consensus_distance(first_block, second_block);
+    let joinable = is_joinable(
+        block_target_distance(first_block, second_block),
+        consensus_distance,
+        link_type,
+        block_length_on_query(first_block).min(block_length_on_query(second_block)),
+        annotation_args,
+    );
+
+    LinkInfo {
+        target_distance: block_target_distance(first_block, second_block),
+        consensus_distance,
+        link_type,
+        consensus_length,
+        unexplained_bases,
+        neighbors,
+        joinable,
+    }
+}
+
 fn gather_join_statistics_single_family<'a>(
-    compatable_alignments: impl Iterator<Item = Block>,
+    compatable_alignments: impl Iterator<Item = (usize, Block)>,
     consensus_length: usize,
+    segments: SegmentedMatrixView,
+    region_statistics: &RegionStatistics,
     args: &AnnotationArgs,
     join_stats: &mut impl JoinStatisticsCollector,
 ) {
     let compatable_blocks = compatable_alignments
-        .sorted_by_key(|a| a.col_start)
+        .sorted_by_key(|(_u_b, a)| a.col_start)
         .collect_vec();
 
     compatable_blocks
         .iter()
         .enumerate()
-        .for_each(|(idx, a_block)| {
-            compatable_blocks[idx + 1..]
-                .iter()
-                .enumerate()
-                .for_each(|(idx2, b_block)| {
-                    let (consensus_distance, link_type) =
-                        block_consensus_distance(a_block, b_block);
-                    let joinable = is_joinable(
-                        block_target_distance(a_block, b_block),
-                        consensus_distance,
-                        link_type,
-                        block_length_on_query(a_block).min(block_length_on_query(b_block)),
-                        args,
-                    );
-
+        .for_each(|(idx, (a_segment_idx, a_block))| {
+            compatable_blocks[idx + 1..].iter().enumerate().for_each(
+                |(idx2, (b_segment_idx, b_block))| {
                     join_stats.add(
                         a_block,
                         b_block,
-                        consensus_length,
-                        idx + 1 == idx2,
-                        joinable,
+                        &link_info(
+                            a_block,
+                            b_block,
+                            args,
+                            calculate_unexplained_bases(
+                                segments,
+                                region_statistics,
+                                *a_segment_idx,
+                                *b_segment_idx,
+                                b_block.col_start,
+                            ),
+                            consensus_length,
+                            idx + 1 == idx2,
+                        ),
                     );
-                })
+                },
+            )
         })
 }
 
@@ -343,7 +353,7 @@ fn link_assemblies<T: JoinEstimator>(
     consensus_length: usize,
     segments: &SegmentedMatrix,
     query_statistics: &QueryStatistics<T>,
-    _region_statistics: &RegionStatistics,
+    region_statistics: &RegionStatistics,
     score_params: &ScoreParams,
     args: &AnnotationArgs,
 ) {
@@ -360,29 +370,31 @@ fn link_assemblies<T: JoinEstimator>(
             let a_block = &segments[a.0].blocks[a.1];
             let b_block = &segments[b.0].blocks[b.1];
 
-            let target_distance = block_target_distance(a_block, b_block);
-            let min_block_length =
-                block_length_on_query(a_block).min(block_length_on_query(b_block));
-
-            let (consensus_distance, link_type) = block_consensus_distance(a_block, b_block);
-
-            if is_joinable(
-                target_distance,
-                consensus_distance,
-                link_type,
-                min_block_length,
+            let link = link_info(
+                a_block,
+                b_block,
                 args,
-            ) {
-                let join_prob =
-                    query_statistics
-                        .estimator
-                        .predict(a_block, b_block, consensus_length, false);
+                calculate_unexplained_bases(
+                    segments,
+                    region_statistics,
+                    a.0,
+                    b.0,
+                    b_block.col_start,
+                ),
+                consensus_length,
+                a.0 + 1 == b.0,
+            );
+
+            if link.joinable {
+                let join_prob = query_statistics
+                    .estimator
+                    .predict(a_block, b_block, &link, false);
 
                 if join_prob >= args.join_likelihood_threshold {
                     let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
                         score_params.query_loop_score
                     } else {
-                        get_link_cost(args, score_params, consensus_distance, join_prob)
+                        get_link_cost(score_params, join_prob)
                     };
 
                     graph.insert(
@@ -391,7 +403,7 @@ fn link_assemblies<T: JoinEstimator>(
                             weight,
                             first_sparse_row: a.1,
                             second_sparse_row: b.1,
-                            link_type,
+                            link_type: link.link_type,
                         },
                     );
                 }
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 827f98b..f28fc62 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -5,16 +5,13 @@ use std::{
 };
 
 use crate::{
-    assembly::{
-        block_target_distance, relative_consensus_distance, ConsensusDistanceNormalization,
-        LinkType,
-    },
+    assembly::{relative_consensus_distance, ConsensusDistanceNormalization, LinkType},
     p2estimator::{
         custom_quantile_estimator::{LomaxQuant, MedianEstimator},
         QuantileEstimator,
     },
     segments::Block,
-    statistics::{ln_add_exp, Distribution, ExponentialEstimator, HalfT, Laplace, Lomax},
+    statistics::{ln_add_exp, AssymetricLaplace, Distribution, ExponentialEstimator, HalfT, Lomax},
 };
 
 pub trait JoinEstimator: Clone + Default + Debug {
@@ -22,23 +19,28 @@ pub trait JoinEstimator: Clone + Default + Debug {
         &self,
         first_block: &Block,
         second_block: &Block,
-        consensus_length: usize,
+        link_info: &LinkInfo,
         log_space: bool,
     ) -> f64;
 }
 
+pub struct LinkInfo {
+    #[allow(dead_code)]
+    pub target_distance: isize,
+    #[allow(dead_code)]
+    pub consensus_distance: isize,
+    pub link_type: LinkType,
+    pub consensus_length: usize,
+    pub unexplained_bases: usize,
+    pub neighbors: bool,
+    pub joinable: bool,
+}
+
 pub trait JoinStatisticsCollector: Clone + Debug {
     fn new() -> Self;
     fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self;
     fn combine(&self, other: &Self) -> Self;
-    fn add(
-        &mut self,
-        first_block: &Block,
-        second_block: &Block,
-        consenus_length: usize,
-        neighbors: bool,
-        joinable: bool,
-    );
+    fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo);
 }
 
 #[derive(Debug, Clone, Default)]
@@ -47,11 +49,8 @@ pub struct BayesianJoinEstimator {
     target_distance_nojoin: ExponentialEstimator,
     divergence_join: HalfT,
     divergence_nojoin: HalfT,
-    consensus_distance_join_pos: ExponentialEstimator,
-    consensus_distance_join_neg: ExponentialEstimator,
-    consensus_norm_pos: f64,
-    consensus_norm_neg: f64,
-    consensus_distance_nojoin: Laplace,
+    consensus_distance_join: AssymetricLaplace,
+    consensus_distance_nojoin: AssymetricLaplace,
     join_prior: f64,
 }
 
@@ -60,54 +59,26 @@ impl JoinEstimator for BayesianJoinEstimator {
         &self,
         first_block: &Block,
         second_block: &Block,
-        consensus_distance: usize,
+        link_info: &LinkInfo,
         log_space: bool,
     ) -> f64 {
-        let target_dist = block_target_distance(first_block, second_block) as f64;
+        let target_dist = link_info.unexplained_bases as f64;
         // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0...
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
         let (rel_con_dist, _join_type) = relative_consensus_distance(
             first_block,
             second_block,
-            ConsensusDistanceNormalization::WithLength(consensus_distance),
-        );
-
-        let consensus_join_dist_logpdf = |x: f64| {
-            if x < 0.0 {
-                self.consensus_distance_join_neg.logpdf(x.abs()) + self.consensus_norm_neg.ln()
-            } else {
-                self.consensus_distance_join_pos.logpdf(x.abs()) + self.consensus_norm_pos.ln()
-            }
-        };
-
-        println!("{:#?}", self);
-        println!(
-            "{} {} {}",
-            rel_con_dist,
-            consensus_join_dist_logpdf(rel_con_dist).exp(),
-            self.consensus_distance_nojoin.pdf(rel_con_dist)
-        );
-        println!(
-            "{} {} {}",
-            target_dist,
-            self.target_distance_join.pdf(target_dist),
-            self.target_distance_nojoin.pdf(target_dist)
-        );
-        println!(
-            "{} {} {}",
-            divergence_diff,
-            self.divergence_join.pdf(divergence_diff),
-            self.divergence_nojoin.pdf(divergence_diff)
+            ConsensusDistanceNormalization::WithLength(link_info.consensus_length),
         );
 
         let join_score = self.join_prior.ln()
             + self.target_distance_join.logpdf(target_dist)
-            + self.divergence_join.logpdf(divergence_diff);
-        //+ self.consensus_distance_join.logpdf(rel_con_dist);
+            + self.divergence_join.logpdf(divergence_diff)
+            + self.consensus_distance_join.logpdf(rel_con_dist);
         let nojoin_score = (-self.join_prior).ln_1p()
             + self.target_distance_nojoin.logpdf(target_dist)
-            + self.divergence_nojoin.logpdf(divergence_diff);
-        //+ self.consensus_distance_nojoin.logpdf(rel_con_dist);
+            + self.divergence_nojoin.logpdf(divergence_diff)
+            + self.consensus_distance_nojoin.logpdf(rel_con_dist);
 
         let score_norm = ln_add_exp(join_score, nojoin_score);
         let score = join_score - score_norm;
@@ -215,12 +186,6 @@ impl From<MomentEstimator> for HalfT {
     }
 }
 
-impl From<MomentEstimator> for Laplace {
-    fn from(value: MomentEstimator) -> Self {
-        Self::from_moments(value.mean(), value.standard_deviation())
-    }
-}
-
 impl From<&LomaxQuant> for Lomax {
     fn from(value: &LomaxQuant) -> Self {
         // Using quantile selection trick originally developed for frechet... (quant ratio formula)
@@ -245,22 +210,20 @@ impl From<&MedianEstimator> for ExponentialEstimator {
 
 impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
     fn from(statistics: &BayesianJoinStatistics) -> Self {
-        let cons_total = (statistics.joinable_consensus_pos.samples()
-            + statistics.joinable_consensus_neg.samples())
-        .max(1);
-        let cons_pos_perc = statistics.joinable_consensus_pos.samples() as f64 / cons_total as f64;
-        let cons_neg_perc = statistics.joinable_consensus_neg.samples() as f64 / cons_total as f64;
-
         Self {
             target_distance_join: statistics.joinable_target_distance.into(),
             target_distance_nojoin: statistics.unjoinable_target_distance.into(),
             divergence_join: statistics.joinable_divergence.into(),
             divergence_nojoin: statistics.unjoinable_divergence.into(),
-            consensus_distance_join_pos: (&statistics.joinable_consensus_pos).into(),
-            consensus_distance_join_neg: (&statistics.joinable_consensus_neg).into(),
-            consensus_norm_pos: 0.5 * cons_pos_perc,
-            consensus_norm_neg: 0.5 * cons_neg_perc,
-            consensus_distance_nojoin: statistics.unjoinable_consensus.into(),
+            consensus_distance_join: AssymetricLaplace::from_exponential_halves(
+                0.0,
+                statistics.joinable_consensus_neg.mean(),
+                statistics.joinable_consensus_pos.mean(),
+            ),
+            consensus_distance_nojoin: AssymetricLaplace::symmetric_from_moments(
+                statistics.unjoinable_consensus.mean(),
+                statistics.unjoinable_consensus.standard_deviation(),
+            ),
             // We take sqrt since we count all pairs, not just neighbors.
             join_prior: (statistics.joinable_target_distance.samples() as f64
                 / (statistics.joinable_target_distance.samples()
@@ -277,8 +240,8 @@ pub struct BayesianJoinStatistics {
     unjoinable_target_distance: MomentEstimator,
     joinable_divergence: MomentEstimator,
     unjoinable_divergence: MomentEstimator,
-    joinable_consensus_pos: MedianEstimator,
-    joinable_consensus_neg: MedianEstimator,
+    joinable_consensus_pos: MomentEstimator,
+    joinable_consensus_neg: MomentEstimator,
     unjoinable_consensus: MomentEstimator,
 }
 
@@ -301,51 +264,45 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
             unjoinable_divergence: bayesian_prior
                 .unjoinable_divergence
                 .to_psuedo_count(pseudo_count),
-            joinable_consensus_pos: MedianEstimator::from_prior(
-                &bayesian_prior.joinable_consensus_pos,
-                pseudo_count,
-            ),
-            joinable_consensus_neg: MedianEstimator::from_prior(
-                &bayesian_prior.joinable_consensus_neg,
-                pseudo_count,
-            ),
+            joinable_consensus_pos: bayesian_prior
+                .joinable_consensus_pos
+                .to_psuedo_count(pseudo_count),
+            joinable_consensus_neg: bayesian_prior
+                .joinable_consensus_neg
+                .to_psuedo_count(pseudo_count),
             unjoinable_consensus: bayesian_prior
                 .unjoinable_consensus
                 .to_psuedo_count(pseudo_count),
         }
     }
 
-    fn add(
-        &mut self,
-        first_block: &Block,
-        second_block: &Block,
-        consensus_length: usize,
-        neighbors: bool,
-        joinable: bool,
-    ) {
-        let target_dist = block_target_distance(first_block, second_block).abs() as usize;
+    fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) {
+        if !link_info.neighbors {
+            return;
+        }
+
+        let target_dist = link_info.unexplained_bases;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
         let (rel_con_dist, join_type) = relative_consensus_distance(
             first_block,
             second_block,
-            ConsensusDistanceNormalization::WithLength(consensus_length),
+            ConsensusDistanceNormalization::WithLength(link_info.consensus_length),
         );
 
-        if joinable {
+        if link_info.joinable {
             self.joinable_target_distance += target_dist as f64;
             self.joinable_divergence += divergence_diff;
-            if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) {
-                //println!("CDist: {}", rel_con_dist);
+            if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
                 if rel_con_dist >= 0.0 {
-                    self.joinable_consensus_pos.update(rel_con_dist.abs());
+                    self.joinable_consensus_pos += rel_con_dist.abs();
                 } else {
-                    self.joinable_consensus_neg.update(rel_con_dist.abs());
+                    self.joinable_consensus_neg += rel_con_dist.abs();
                 }
             }
         } else {
             self.unjoinable_target_distance += target_dist as f64;
             self.unjoinable_divergence += divergence_diff;
-            if neighbors && matches!(join_type, LinkType::Forward | LinkType::Reverse) {
+            if matches!(join_type, LinkType::Forward | LinkType::Reverse) {
                 self.unjoinable_consensus += rel_con_dist;
             }
         }
@@ -359,12 +316,8 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
                 + other.unjoinable_target_distance,
             joinable_divergence: self.joinable_divergence + other.joinable_divergence,
             unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence,
-            joinable_consensus_pos: self
-                .joinable_consensus_pos
-                .combine(&other.joinable_consensus_pos),
-            joinable_consensus_neg: self
-                .joinable_consensus_neg
-                .combine(&other.joinable_consensus_neg),
+            joinable_consensus_pos: self.joinable_consensus_pos + other.joinable_consensus_pos,
+            joinable_consensus_neg: self.joinable_consensus_neg + other.joinable_consensus_neg,
             unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus,
         }
     }
diff --git a/src/main.rs b/src/main.rs
index 0063815..5b01b6d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -170,7 +170,7 @@ pub struct AnnotationArgs {
 
     /// The maximum seperation or overlap in nucleotides on both target and consensus
     /// for a join to be allowed between inverted alignments.
-    #[arg(long = "inversion-distance", default_value = "50", value_name = "n")]
+    #[arg(long = "inversion-distance", default_value = "200", value_name = "n")]
     pub inversion_distance: isize,
 
     /// The size of the window looked at to determine a single alignment score in nucleotides.
@@ -223,36 +223,6 @@ pub struct AnnotationArgs {
     /// Set to 0 or greater to disable.
     #[arg(long = "min-history-score", default_value = "-500.0", value_name = "f")]
     pub min_relative_history_score: f64,
-
-    /// The amount of overlap between two joinable sequences in the consensus
-    /// before a penalty starts being applied to the join.
-    #[arg(long = "free-join-overlap", default_value = "4", value_name = "n")]
-    pub free_join_consensus_overlap: usize,
-
-    /// The amount of gap between two joinable sequences
-    /// before a penalty starts being applied to the join.
-    #[arg(long = "free-join-gap", default_value = "10", value_name = "n")]
-    pub free_join_consensus_gap: usize,
-
-    /// The amount of penalty to apply to a join at the maximum allowed consensus overlap
-    /// A value of 1 means to apply a penalty equal to a query jump.
-    /// The cost grows linearly to this value as the overlap increases.
-    #[arg(
-        long = "consensus-overlap-penalty",
-        default_value = "1.0",
-        value_name = "f"
-    )]
-    pub join_consensus_overlap_penalty: f64,
-
-    /// The amount of penalty to apply to a join at the maximum allowed consensus gap
-    /// A value of 1 means to apply a penalty equal to a query jump.
-    /// The cost grows linearly to this value as the gap increases.
-    #[arg(
-        long = "consensus-gap-penalty",
-        default_value = "0.5",
-        value_name = "f"
-    )]
-    pub join_consensus_gap_penalty: f64,
 }
 
 #[derive(Args, Debug, Clone, Default)]
diff --git a/src/pipeline.rs b/src/pipeline.rs
index c3ef770..7200f78 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -253,7 +253,8 @@ pub fn run_naive_trace<T: JoinStatisticsCollector>(
     }
 
     let query_join_statistics = gather_join_statistics(
-        proximity_group.alignments,
+        proximity_group,
+        &segments,
         &alignment_data.query_lengths,
         &args.annotation_args,
     );
diff --git a/src/segments.rs b/src/segments.rs
index 74e987b..b65964f 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -100,14 +100,20 @@ impl Block {
         (self.query_id, self.row_idx)
     }
 
-    pub fn from_alignment(alignment: &Alignment, row: usize, confidence: f64, score: f64) -> Self {
+    pub fn from_alignment(
+        alignment: &Alignment,
+        group_start: usize,
+        row: usize,
+        confidence: f64,
+        score: f64,
+    ) -> Self {
         Self {
             row_idx: row,
             block_type: BlockType::Alignment,
             strand: alignment.strand,
             query_id: Some(alignment.query_id),
-            col_start: alignment.target_start,
-            col_end: alignment.target_end,
+            col_start: alignment.target_start.saturating_sub(group_start),
+            col_end: alignment.target_end.saturating_sub(group_start),
             query_start: alignment.query_start,
             query_end: alignment.query_end,
             avg_confidence: confidence,
diff --git a/src/statistics.rs b/src/statistics.rs
index 3271f27..448c118 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -11,7 +11,7 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 {
 
 // TODO: Support for generic floating types...
 #[allow(dead_code)]
-pub trait Distribution: Clone {
+pub trait Distribution {
     fn pdf(&self, x: f64) -> f64;
     fn cdf(&self, x: f64) -> f64;
     fn ppf(&self, p: f64) -> f64;
@@ -22,7 +22,7 @@ pub trait Distribution: Clone {
     fn logccdf(&self, x: f64) -> f64;
 }
 
-pub trait ParameterizedDistribution: Distribution + Debug + Default {
+pub trait ParameterizedDistribution: Distribution + Debug + Default + Clone {
     fn unit() -> Self {
         Self::default()
     }
@@ -451,40 +451,54 @@ impl Distribution for Lomax {
 }
 
 #[derive(Debug, Clone)]
-pub struct Laplace {
-    mean: f64,
+pub struct AssymetricLaplace {
+    mode: f64,
     scale: f64,
+    mode_quantile: f64,
 }
 
-impl ParameterizedDistribution for Laplace {}
+impl ParameterizedDistribution for AssymetricLaplace {}
 
-impl Laplace {
-    pub fn new(mean: f64, scale: f64) -> Self {
-        Self { mean, scale }
-    }
-
-    pub fn from_moments(mean: f64, standard_deviation: f64) -> Self {
+impl AssymetricLaplace {
+    pub fn new(mode: f64, scale: f64, mode_quantile: f64) -> Self {
         Self {
-            mean,
-            scale: standard_deviation / f64::consts::SQRT_2,
+            mode,
+            scale,
+            mode_quantile,
         }
     }
+
+    pub fn from_exponential_halves(mode: f64, negative_mean: f64, positive_mean: f64) -> Self {
+        Self::new(
+            mode,
+            (negative_mean * positive_mean) / (negative_mean + positive_mean),
+            1.0 / (positive_mean / negative_mean + 1.0),
+        )
+    }
+
+    pub fn symmetric_from_moments(mean: f64, standard_deviation: f64) -> Self {
+        Self::new(mean, standard_deviation / (8.0_f64.sqrt()), 0.5)
+    }
 }
 
-impl Default for Laplace {
+impl Default for AssymetricLaplace {
     fn default() -> Self {
-        Self {
-            mean: 0.0,
-            scale: 1.0,
-        }
+        Self::symmetric_from_moments(0.0, 1.0)
     }
 }
 
-impl Distribution for Laplace {
+impl Distribution for AssymetricLaplace {
     fn logpdf(&self, x: f64) -> f64 {
-        let mu = self.mean;
-        let b = self.scale;
-        (0.5 / b).ln() + -((x - mu).abs() / b)
+        let m = self.mode;
+        let l = self.scale;
+        let p = self.mode_quantile;
+        let exp_comp = if x <= m {
+            ((1.0 - p) / l) * (x - m)
+        } else {
+            -(p / l) * (x - m)
+        };
+
+        ((p * (1.0 - p)) / l).ln() + exp_comp
     }
 
     fn pdf(&self, x: f64) -> f64 {
@@ -492,9 +506,14 @@ impl Distribution for Laplace {
     }
 
     fn cdf(&self, x: f64) -> f64 {
-        let mu = self.mean;
-        let b = self.scale;
-        0.5 + 0.5 * (x - mu).signum() * (1.0 - (-(x - mu).abs() / b).exp())
+        let m = self.mode;
+        let l = self.scale;
+        let p = self.mode_quantile;
+        if x <= m {
+            p * (((1.0 - p) / l) * (x - m)).exp()
+        } else {
+            1.0 - (1.0 - p) * (-(p / l) * (x - m)).exp()
+        }
     }
 
     fn logcdf(&self, x: f64) -> f64 {
@@ -510,10 +529,14 @@ impl Distribution for Laplace {
     }
 
     fn ppf(&self, p: f64) -> f64 {
-        let mu = self.mean;
-        let b = self.scale;
-        let p = p.clamp(0.0, 1.0);
-        mu - b * (p - 0.5).signum() * (1.0 - 2.0 * (p - 0.5).abs()).ln()
+        let m = self.mode;
+        let l = self.scale;
+        let pm = self.mode_quantile;
+        if p <= pm {
+            m + (l / (1.0 - pm)) * (p / pm).ln()
+        } else {
+            m - (l / pm) * ((1.0 - p) / (1.0 - pm)).ln()
+        }
     }
 
     fn support(&self) -> (f64, f64) {
@@ -532,45 +555,11 @@ mod test {
     use super::*;
     use std::fmt::Debug;
 
-    pub trait TestDistribution: Debug {
-        fn tpdf(&self, x: f64) -> f64;
-        fn tcdf(&self, x: f64) -> f64;
-        fn tppf(&self, p: f64) -> f64;
-        fn tsupport(&self) -> (f64, f64);
-        fn tccdf(&self, x: f64) -> f64;
-        fn tlogpdf(&self, x: f64) -> f64;
-        fn tlogcdf(&self, x: f64) -> f64;
-        fn tlogccdf(&self, x: f64) -> f64;
-    }
-
-    impl<T: ParameterizedDistribution> TestDistribution for T {
-        fn tpdf(&self, x: f64) -> f64 {
-            self.pdf(x)
-        }
-        fn tcdf(&self, x: f64) -> f64 {
-            self.cdf(x)
-        }
-        fn tppf(&self, p: f64) -> f64 {
-            self.ppf(p)
-        }
-        fn tsupport(&self) -> (f64, f64) {
-            self.support()
-        }
-        fn tccdf(&self, x: f64) -> f64 {
-            self.ccdf(x)
-        }
-        fn tlogpdf(&self, x: f64) -> f64 {
-            self.logpdf(x)
-        }
-        fn tlogcdf(&self, x: f64) -> f64 {
-            self.logcdf(x)
-        }
-        fn tlogccdf(&self, x: f64) -> f64 {
-            self.logccdf(x)
-        }
-    }
+    // Require `Debug` alongside `Distribution` so tests can print the distribution under test.
+    pub trait TestDistribution: Distribution + Debug {}
+    impl<T: Distribution + Debug> TestDistribution for T {}
 
-    fn as_box<T: ParameterizedDistribution + 'static>(d: T) -> Box<dyn TestDistribution> {
+    fn as_box<T: TestDistribution + 'static>(d: T) -> Box<dyn TestDistribution> {
         Box::new(d)
     }
 
@@ -582,7 +571,7 @@ mod test {
             as_box(ExponentialEstimator::unit()),
             as_box(HalfT::unit()),
             as_box(Frechet::unit()),
-            as_box(Laplace::unit()),
+            as_box(AssymetricLaplace::unit()),
             as_box(Gumbel::unit()),
             as_box(Lomax::unit()),
         ]
@@ -598,10 +587,10 @@ mod test {
     fn basic_distribution_propery_checks() {
         for dist in get_dists() {
             println!("Testing distribution: {:?}", dist);
-            let (mut low, mut high) = dist.tsupport();
+            let (mut low, mut high) = dist.support();
 
-            assert!(dist.tcdf(low) == 0.0);
-            assert!(dist.tcdf(high) == 1.0);
+            assert!(dist.cdf(low) == 0.0);
+            assert!(dist.cdf(high) == 1.0);
 
             if high == f64::INFINITY {
                 high = 5.0;
@@ -613,12 +602,12 @@ mod test {
             for x in linspace(low, high, 100) {
                 // Basic properties...
                 // println!("{x} -> {} vs {}", dist.tpdf(x), dist.tlogpdf(x).exp());
-                assert!(is_close(dist.tpdf(x), dist.tlogpdf(x).exp()));
-                assert!(is_close(dist.tcdf(x), dist.tlogcdf(x).exp()));
-                assert!(is_close(dist.tccdf(x), dist.tlogccdf(x).exp()));
-                assert!(is_close(dist.tccdf(x), 1.0 - dist.tcdf(x)));
+                assert!(is_close(dist.pdf(x), dist.logpdf(x).exp()));
+                assert!(is_close(dist.cdf(x), dist.logcdf(x).exp()));
+                assert!(is_close(dist.ccdf(x), dist.logccdf(x).exp()));
+                assert!(is_close(dist.ccdf(x), 1.0 - dist.cdf(x)));
                 // println!("{x} -> {}", dist.tppf(dist.tcdf(x)));
-                assert!(is_close(dist.tppf(dist.tcdf(x)), x));
+                assert!(is_close(dist.ppf(dist.cdf(x)), x));
             }
         }
     }
diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs
index cb49630..f71fa39 100644
--- a/src/trace_statistics.rs
+++ b/src/trace_statistics.rs
@@ -6,7 +6,7 @@ use crate::{
     alignment::AlignmentData,
     join_estimation::{JoinEstimator, JoinStatisticsCollector},
     pipeline::NaiveTraceResults,
-    segments::Segment,
+    segments::{InitialSegments, Segment},
 };
 
 #[derive(Debug)]
@@ -37,6 +37,33 @@ pub enum OccuranceCountingMode {
     Trace,
 }
 
+pub fn calculate_region_statistics(segments: &InitialSegments) -> RegionStatistics {
+    let mut region_stat = RegionStatistics {
+        total_bases: 0,
+        unexplained_bases: Vec::with_capacity(segments.len()),
+    };
+
+    let mut unexplained_bases_up_to: usize = 0;
+    let mut prior_segment: Option<&Segment> = None;
+
+    for seg in segments.view_segments() {
+        if let Some(prior_segment) = prior_segment {
+            // If a skip block was the prior block, add its bases as unexplained.
+            if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 {
+                unexplained_bases_up_to += seg.end_col - seg.start_col + 1;
+            }
+            unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1;
+            region_stat.total_bases += seg.start_col - prior_segment.end_col - 1;
+        }
+        region_stat.total_bases += seg.end_col - seg.start_col + 1;
+        region_stat.unexplained_bases.push(unexplained_bases_up_to);
+
+        prior_segment = Some(seg);
+    }
+
+    region_stat
+}
+
 pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEstimator>(
     naive_traces: &[NaiveTraceResults<S>],
     alignment_data: &AlignmentData,
@@ -117,30 +144,7 @@ pub fn trace_statistics<S: JoinStatisticsCollector + Debug + Into<E>, E: JoinEst
             }
         }
 
-        let mut region_stat = RegionStatistics {
-            total_bases: 0,
-            unexplained_bases: Vec::with_capacity(trace_results.segments.len()),
-        };
-
-        let mut unexplained_bases_up_to: usize = 0;
-        let mut prior_segment: Option<&Segment> = None;
-
-        for seg in trace_results.segments.view_segments() {
-            if let Some(prior_segment) = prior_segment {
-                // If a skip block was the prior block, add it's bases as unexplained.
-                if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 {
-                    unexplained_bases_up_to += seg.end_col - seg.start_col + 1;
-                }
-                unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1;
-                region_stat.total_bases += seg.start_col - prior_segment.end_col - 1;
-            }
-            region_stat.total_bases += seg.end_col - seg.start_col + 1;
-            region_stat.unexplained_bases.push(unexplained_bases_up_to);
-
-            prior_segment = Some(seg);
-        }
-
-        all_region_stats.push(region_stat);
+        all_region_stats.push(calculate_region_statistics(&trace_results.segments));
     }
 
     // Calculate join statistics for all families using combined prior as a starting point...

From 22654e4b728da32274f2d63bc6da5fcf30b8241e Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Wed, 3 Jun 2026 17:53:03 -0600
Subject: [PATCH 35/39] Overhaul table viz to work better with very large runs.

---
 fixtures/soda/table.html | 618 +++++++++++++++++++++++++++------------
 src/assembly.rs          |   1 +
 src/join_estimation.rs   |  18 +-
 src/p2estimator.rs       | 189 +++++++-----
 src/statistics.rs        |   8 +-
 src/viz/stats.rs         |  46 +--
 6 files changed, 572 insertions(+), 308 deletions(-)

diff --git a/fixtures/soda/table.html b/fixtures/soda/table.html
index 73ccfeb..db87869 100644
--- a/fixtures/soda/table.html
+++ b/fixtures/soda/table.html
@@ -2,16 +2,18 @@
 <html>
     <head>
         <title>PAGE_TITLE</title>
-        <link rel="icon" type="image/x-icon" href="icon.svg"/>
+        <link rel="icon" type="image/x-icon" href="icon.svg" />
         <style>
             table {
-                border-collapse: collapse; 
+                border-collapse: collapse;
                 border: 2px solid rgb(140 140 140);
-                font-family: sans-serif; font-size: 0.8rem; 
-                letter-spacing: 1px; 
+                font-family: sans-serif;
+                font-size: 0.8rem;
+                letter-spacing: 1px;
             }
-            th, td { 
-                border: 1px solid rgb(160 160 160); 
+            th,
+            td {
+                border: 1px solid rgb(160 160 160);
                 padding: 8px 10px;
             }
 
@@ -35,210 +37,454 @@
                 margin: 0;
             }
         </style>
+        <script id="table-data" type="text/tab-separated-values">
+            TSV_TARGET
+        </script>
         <script type="module">
             SODA_TARGET;
-            
+
+            let ELEMS_PER_PAGE = 50;
+
             let d3 = soda.internalD3;
 
-            d3.selectAll("table").each(function(d, i) {
-                let header = d3.select(this).select("thead");
-                let body = d3.select(this).select("tbody");
+            let tsv = d3.tsvParse(
+                d3.select("#table-data").text().trim(),
+                (d) => {
+                    let new_obj = {};
+                    for (let [k, v] of Object.entries(d)) {
+                        let k_parts = k.split("_");
+                        let k_type = k_parts[k_parts.length - 1];
+                        let new_val;
+                        switch (k_type) {
+                            case "boxplot":
+                            case "violin":
+                                new_val = v
+                                    .split(":")
+                                    .map((v) => parseFloat(v));
+                                break;
+                            case "float":
+                                new_val = parseFloat(v);
+                                break;
+                            case "region":
+                            case "int":
+                                new_val = parseInt(v);
+                                break;
+                            case "string":
+                            default:
+                                new_val = v;
+                        }
+
+                        new_obj[k] = new_val;
+                    }
+
+                    return new_obj;
+                },
+            );
+
+            function getCurrentOrder(fallback = [null, "desc"]) {
+                let url = new URL(window.location.href);
+                let order = url.searchParams.get("sortby");
 
-                function to_value(v) {
-                    let vf = parseFloat(v);
-                    return (vf != vf)? v: vf;
+                if (order) {
+                    let [col, ord] = order.split(":");
+                    if (!fallback[0] || col == fallback[0]) {
+                        return [col, ord == "asc" ? "asc" : "desc"];
+                    }
                 }
+                return fallback;
+            }
 
-                header.selectAll("th")
-                    .on("click", function(d, i) {
-                        header.selectAll("th")
-                            .attr("class", function(d2, i2) {
-                                return (i2 != i)? "": ({
-                                    "": "desc",
-                                    "desc": "asc",
-                                    "asc": "desc"
-                                }[this.className] ?? "desc");
-                            });
+            function getFilter() {
+                let url = new URL(window.location.href);
+                let filter = url.searchParams.get("filter");
+
+                if (filter) {
+                    let new_obj = {};
+                    for (let item of filter.split(",")) {
+                        let [k, v] = item.split(":");
+                        new_obj[k] = v;
+                    }
+                    return new_obj;
+                }
+
+                return {};
+            }
+
+            function encodeFilter(filter) {
+                return Object.entries(filter)
+                    .map(([k, v]) => `${k}:${v}`)
+                    .join(",");
+            }
+
+            function updateUrl(data) {
+                let url = new URL(window.location.href);
+                let params = url.searchParams;
+                for (let [k, v] of Object.entries(data)) {
+                    params.set(k, v);
+                }
+                window.history.pushState(null, "", url);
+            }
 
-                        let order = this.className;
+            function renderHeader() {
+                let header = d3.select("#table-target").select("thead");
+                let row = header.select("tr");
+                let full_filter = getFilter();
 
-                        let ordered_elements = body.selectAll("tr")
-                            .nodes()
-                            .sort((a, b) => {
-                                let a_val = to_value(d3.select(a).selectAll("td").nodes()[i].innerText);
-                                let b_val = to_value(d3.select(b).selectAll("td").nodes()[i].innerText);
-                                return (order === "asc")? d3.ascending(a_val, b_val): d3.descending(a_val, b_val);
+                row.selectAll("th")
+                    .data(tsv.columns)
+                    .enter()
+                    .append("th")
+                    .each(function (d) {
+                        let this_sel = d3.select(this);
+                        let k_parts = d.split("_");
+                        let k_type = k_parts[k_parts.length - 1];
+                        let k_name = k_parts.slice(0, -1).join(" ");
+                        this_sel.text(k_name);
+
+                        if (!["boxplot", "violin"].includes(k_type)) {
+                            let [col, direction] = getCurrentOrder();
+
+                            if (col == d) {
+                                this_sel.attr("class", direction);
+                            }
+
+                            this_sel.on("click", function () {
+                                let [col, ord] = getCurrentOrder([d, "asc"]);
+                                let new_ord = ord == "asc" ? "desc" : "asc";
+                                row.selectAll("th").attr(
+                                    "class",
+                                    function (other_d) {
+                                        return other_d == d ? new_ord : "";
+                                    },
+                                );
+                                updateUrl({
+                                    sortby: `${d}:${new_ord}`,
+                                    page: 0,
+                                });
+                                renderData();
                             });
 
-                        for(let elm of ordered_elements) {
-                            let parent = elm.parentElement;
-                            parent.removeChild(elm);
-                            parent.appendChild(elm);
-                        };
+                            this_sel.append("br");
+                            let prior_text = full_filter[d] ?? "";
+                            this_sel
+                                .append("input")
+                                .attr("type", "text")
+                                .attr("value", prior_text)
+                                .on("click", function (d) {
+                                    //d3.event.preventDefault();
+                                    d3.event.stopPropagation();
+                                    return false;
+                                })
+                                .on("input", function () {
+                                    let value = d3.event.target.value;
+                                    if (value == prior_text) {
+                                        return;
+                                    }
+                                    prior_text = value;
+                                    let filter = getFilter();
+                                    if (value) {
+                                        filter[d] = value;
+                                    } else {
+                                        delete filter[d];
+                                    }
+
+                                    updateUrl({
+                                        filter: encodeFilter(filter),
+                                        page: 0,
+                                    });
+                                    renderData();
+                                });
+                        }
                     });
+            }
 
-                header.selectAll("th").append("br");
-                header.selectAll("th")
-                    .append("input")
-                    .attr("type", "text")
-                    .on("click", function(d, i) {
-                        d3.event.stopPropagation();
-                    })
-                    .on("input", function(d, i, sel) {
-                        let filters = sel.map((v) => v.value);
-
-                        body.selectAll("tr")
-                            .style("display", function(d, i) {
-                                let vals = d3.select(this).selectAll("td").nodes().map((v) => v.innerText);
-                                let isValid = vals.every((v, i) => filters[i] == "" || v.includes(filters[i]));
-                                return (isValid)? "table-row": "none";
-                            });
+            function renderData() {
+                let filter = getFilter();
+                let [col, ord] = getCurrentOrder();
+
+                let inner_tsv = tsv;
+
+                if (Object.keys(filter).length > 0) {
+                    inner_tsv = inner_tsv.filter((row) => {
+                        return Object.entries(filter).every(([k, v]) => {
+                            return row[k].toString().includes(v.toString());
+                        });
                     });
-            });
-
-            d3.selectAll("figure.boxplot")
-                .each(function() {
-                    let width = 250;
-                    let height = 70;
-                    let margin = {
-                        top: 10, right: 0, bottom: 30, left: 0
-                    };
-                    let inner_width = width - margin.left - margin.right;
-                    let inner_height = height - margin.bottom - margin.top; 
-
-                    let svg = d3.select(this)
-                        .append("svg")
-                        .attr("width", width)
-                        .attr("height", height)
-                        .append("g")
-                        .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
-                    
-                    let data = this.dataset.samples.split(",").map((v) => parseFloat(v)).sort(d3.ascending);
-
-                    let q1 = d3.quantile(data, 0.25)
-                    let median = d3.quantile(data, 0.5)
-                    let q3 = d3.quantile(data, 0.75)
-                    let min = d3.min(data);
-                    let max = d3.max(data);
-                    let range = max - min;
-                    let offset = Math.max(range * 0.1, 1);
-
-                    let x = d3.scaleLinear()
-                        .domain([min - offset, max + offset])
-                        .range([0, inner_width]);
-                    svg.append("g")
-                        .attr("transform", "translate(0," + inner_height + ")")
-                        .call(d3.axisBottom(x).ticks(5));
-
-                    let y = d3.scaleLinear()
-                        .domain([0, 1])
-                        .range([inner_height, 0]);
-
-                    let center = y(0.5);
-                    let bar_thickness = y(0) - y(0.5)
-                    
-                    svg.append("line")
-                        .attr("x1", x(min))
-                        .attr("x2", x(max))
-                        .attr("y1", center)
-                        .attr("y2", center)
-                        .attr("stroke", "black");
-
-                    svg.append("rect")
-                        .attr("x", x(q1))
-                        .attr("y", center - bar_thickness / 2)
-                        .attr("width", x(q3) - x(q1))
-                        .attr("height", bar_thickness)
-                        .attr("stroke", "black")
-                        .style("fill", "#69b3a2");
-
-                    svg.selectAll("toto")
-                        .data([min, median, max])
-                        .enter()
-                        .append("line")
-                        .attr("x1", (d) => x(d))
-                        .attr("x2", (d) => x(d))
-                        .attr("y1", center - bar_thickness / 2)
-                        .attr("y2", center + bar_thickness / 2)
-                        .attr("stroke", "black")
-                });
+                }
 
-            d3.selectAll("figure.violin")
-                .each(function() {
-                    function kernelDensityEstimator(kernel, X) {
-                        return (V) => {
-                            return X.map((x) => {
-                                return [x, d3.mean(V, (v) => { return kernel(x - v); })];
-                            });
-                        };
-                    }
-                    function kernelEpanechnikov(k) {
-                        return (v) => Math.abs(v /= k) <= 1 ? 0.75 * (1 - v * v) / k : 0;
-                    }
-                    function gaussianKernel(k) {
-                        return (v) => (1 / Math.sqrt(2 * Math.PI)) * Math.exp(-0.5 * (v / k) * (v / k)) / k;
+                inner_tsv.columns = tsv.columns;
+
+                if (inner_tsv.columns.includes(col)) {
+                    if (ord == "asc") {
+                        inner_tsv = inner_tsv.sort((a, b) =>
+                            d3.ascending(a[col], b[col]),
+                        );
+                    } else {
+                        inner_tsv = inner_tsv.sort((a, b) =>
+                            d3.descending(a[col], b[col]),
+                        );
                     }
+                }
 
-                    let width = 250;
-                    let height = 70;
-                    let margin = {
-                        top: 10, right: 0, bottom: 30, left: 0
+                inner_tsv.columns = tsv.columns;
+
+                let url = new URL(window.location.href);
+                let maxPage = Math.ceil(inner_tsv.length / ELEMS_PER_PAGE);
+                let page = Math.min(
+                    maxPage - 1,
+                    Math.max(
+                        0,
+                        Math.floor(parseInt(url.searchParams.get("page") ?? 0)),
+                    ),
+                );
+                let offset = ELEMS_PER_PAGE * page;
+
+                d3.select("#page-indicator").text(
+                    `Page: ${page + 1}/${maxPage}`,
+                );
+                d3.select("#prior-page").on("click", () => {
+                    updateUrl({ page: Math.max(0, page - 1) });
+                    renderData();
+                });
+                d3.select("#next-page").on("click", () => {
+                    updateUrl({ page: Math.min(maxPage - 1, page + 1) });
+                    renderData();
+                });
+
+                d3.select("#table-target")
+                    .select("tbody")
+                    .selectAll("tr")
+                    .remove();
+
+                let rows = d3
+                    .select("#table-target")
+                    .select("tbody")
+                    .selectAll("tr")
+                    .data(inner_tsv.slice(offset, offset + ELEMS_PER_PAGE))
+                    .enter()
+                    .append("tr");
+
+                let cells = rows
+                    .selectAll("td")
+                    .data((row) => {
+                        return inner_tsv.columns.map((col) => {
+                            return { column: col, value: row[col] };
+                        });
+                    })
+                    .enter()
+                    .append("td")
+                    .each(function (d, i) {
+                        let k_parts = d.column.split("_");
+                        let k_type = k_parts[k_parts.length - 1];
+
+                        switch (k_type) {
+                            case "region":
+                                d3.select(this)
+                                    .append("a")
+                                    .attr("href", `${d.value}/index.html`)
+                                    .text(d.value);
+                                break;
+                            case "boxplot":
+                                renderBoxplot(d3.select(this), d.value);
+                                break;
+                            case "violin":
+                                renderViolin(d3.select(this), d.value);
+                                break;
+                            case "float":
+                            case "int":
+                            case "string":
+                            default:
+                                d3.select(this).text(d.value);
+                        }
+                    });
+            }
+
+            function renderBoxplot(selection, data) {
+                let width = 250;
+                let height = 70;
+                let margin = {
+                    top: 10,
+                    right: 0,
+                    bottom: 30,
+                    left: 0,
+                };
+                let inner_width = width - margin.left - margin.right;
+                let inner_height = height - margin.bottom - margin.top;
+
+                let svg = selection
+                    .append("svg")
+                    .attr("width", width)
+                    .attr("height", height)
+                    .append("g")
+                    .attr(
+                        "transform",
+                        "translate(" + margin.left + "," + margin.top + ")",
+                    );
+
+                data = data.sort(d3.ascending);
+
+                let q1 = d3.quantile(data, 0.25);
+                let median = d3.quantile(data, 0.5);
+                let q3 = d3.quantile(data, 0.75);
+                let min = d3.min(data);
+                let max = d3.max(data);
+                let range = max - min;
+                let offset = Math.max(range * 0.1, 1);
+
+                let x = d3
+                    .scaleLinear()
+                    .domain([min - offset, max + offset])
+                    .range([0, inner_width]);
+                svg.append("g")
+                    .attr("transform", "translate(0," + inner_height + ")")
+                    .call(d3.axisBottom(x).ticks(5));
+
+                let y = d3
+                    .scaleLinear()
+                    .domain([0, 1])
+                    .range([inner_height, 0]);
+
+                let center = y(0.5);
+                let bar_thickness = y(0) - y(0.5);
+
+                svg.append("line")
+                    .attr("x1", x(min))
+                    .attr("x2", x(max))
+                    .attr("y1", center)
+                    .attr("y2", center)
+                    .attr("stroke", "black");
+
+                svg.append("rect")
+                    .attr("x", x(q1))
+                    .attr("y", center - bar_thickness / 2)
+                    .attr("width", x(q3) - x(q1))
+                    .attr("height", bar_thickness)
+                    .attr("stroke", "black")
+                    .style("fill", "#69b3a2");
+
+                svg.selectAll("toto")
+                    .data([min, median, max])
+                    .enter()
+                    .append("line")
+                    .attr("x1", (d) => x(d))
+                    .attr("x2", (d) => x(d))
+                    .attr("y1", center - bar_thickness / 2)
+                    .attr("y2", center + bar_thickness / 2)
+                    .attr("stroke", "black");
+            }
+
+            function renderViolin(selection, data) {
+                function kernelDensityEstimator(kernel, X) {
+                    return (V) => {
+                        return X.map((x) => {
+                            return [
+                                x,
+                                d3.mean(V, (v) => {
+                                    return kernel(x - v);
+                                }),
+                            ];
+                        });
                     };
-                    let inner_width = width - margin.left - margin.right;
-                    let inner_height = height - margin.bottom - margin.top; 
-
-                    let svg = d3.select(this)
-                        .append("svg")
-                        .attr("width", width)
-                        .attr("height", height)
-                        .append("g")
-                        .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
-                    
-                    let data = this.dataset.samples.split(",").map((v) => parseFloat(v)).sort(d3.ascending);
-
-                    let min = d3.min(data);
-                    let max = d3.max(data);
-                    let range = max - min;
-                    let offset = Math.max(range * 0.1, 1);
-
-                    let irq = (d3.quantile(data, 0.75) - d3.quantile(data, 0.25)) / 1.349
-                    let sigma = Math.min(d3.deviation(data), irq);
-                    let est_h = 0.5 * sigma * (data.length ** (-0.2));
-                    est_h = (est_h != est_h)? 0.2: est_h;
-
-                    let x = d3.scaleLinear()
-                        .domain([min - offset, max + offset])
-                        .range([0, inner_width]);
-                    svg.append("g")
-                        .attr("transform", "translate(0," + inner_height + ")")
-                        .call(d3.axisBottom(x).ticks(5));
-
-                    let kde = kernelDensityEstimator(gaussianKernel(est_h), x.ticks(100));
-                    let kde_result = kde(data);
-                    let kde_max = d3.max(kde_result.map((v) => v[1]));
-
-                    let y = d3.scaleLinear()
-                        .domain([0, kde_max])
-                        .range([inner_height, 0]);
-
-                    svg.append("path")
-                        .datum(kde_result)
-                        .style("stroke", "none")
-                        .style("fill","#69b3a2")
-                        .attr("d", d3.area()
+                }
+                function kernelEpanechnikov(k) {
+                    return (v) =>
+                        Math.abs((v /= k)) <= 1 ? (0.75 * (1 - v * v)) / k : 0;
+                }
+                function gaussianKernel(k) {
+                    return (v) =>
+                        ((1 / Math.sqrt(2 * Math.PI)) *
+                            Math.exp(-0.5 * (v / k) * (v / k))) /
+                        k;
+                }
+
+                let width = 250;
+                let height = 70;
+                let margin = {
+                    top: 10,
+                    right: 0,
+                    bottom: 30,
+                    left: 0,
+                };
+                let inner_width = width - margin.left - margin.right;
+                let inner_height = height - margin.bottom - margin.top;
+
+                let svg = selection
+                    .append("svg")
+                    .attr("width", width)
+                    .attr("height", height)
+                    .append("g")
+                    .attr(
+                        "transform",
+                        "translate(" + margin.left + "," + margin.top + ")",
+                    );
+
+                data = data.sort(d3.ascending);
+
+                let min = d3.min(data);
+                let max = d3.max(data);
+                let range = max - min;
+                let offset = Math.max(range * 0.1, 1);
+
+                let irq =
+                    (d3.quantile(data, 0.75) - d3.quantile(data, 0.25)) / 1.349;
+                let sigma = Math.min(d3.deviation(data), irq);
+                let est_h = 0.5 * sigma * data.length ** -0.2;
+                est_h = est_h != est_h ? 0.2 : est_h;
+
+                let x = d3
+                    .scaleLinear()
+                    .domain([min - offset, max + offset])
+                    .range([0, inner_width]);
+                svg.append("g")
+                    .attr("transform", "translate(0," + inner_height + ")")
+                    .call(d3.axisBottom(x).ticks(5));
+
+                let kde = kernelDensityEstimator(
+                    gaussianKernel(est_h),
+                    x.ticks(100),
+                );
+                let kde_result = kde(data);
+                let kde_max = d3.max(kde_result.map((v) => v[1]));
+
+                let y = d3
+                    .scaleLinear()
+                    .domain([0, kde_max])
+                    .range([inner_height, 0]);
+
+                svg.append("path")
+                    .datum(kde_result)
+                    .style("stroke", "none")
+                    .style("fill", "#69b3a2")
+                    .attr(
+                        "d",
+                        d3
+                            .area()
                             .x((d) => x(d[0]))
                             .y1((d) => y(d[1]))
                             .y0((d) => y(0))
-                            .curve(d3.curveCatmullRom)
-                        );
-                })
+                            .curve(d3.curveCatmullRom),
+                    );
+            }
+
+            renderHeader();
+            renderData();
         </script>
     </head>
     <body>
         <h1>PAGE_TITLE</h1>
-        <br>
+        <br />
         <a href="index.html">&lt Back</a>
-        <br>
-        TABLE_TARGET
+        <br />
+        <div style="display: flex; gap: 2em; padding: 2em 0.5em 0.25em 0.5em">
+            <input type="button" id="prior-page" value="Prior Page" />
+            <div id="page-indicator"></div>
+            <input type="button" id="next-page" value="Next Page" />
+        </div>
+        <table id="table-target">
+            <thead>
+                <tr></tr>
+            </thead>
+            <tbody></tbody>
+        </table>
     </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/src/assembly.rs b/src/assembly.rs
index 35eb626..9485691 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -357,6 +357,7 @@ fn link_assemblies<T: JoinEstimator>(
     score_params: &ScoreParams,
     args: &AnnotationArgs,
 ) {
+    println!("{:#?}", query_statistics);
     // this relies on the alignments being sorted by target start
     let compatable_blocks = compatable_blocks.sorted().collect_vec();
 
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index f28fc62..89f3763 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -6,10 +6,7 @@ use std::{
 
 use crate::{
     assembly::{relative_consensus_distance, ConsensusDistanceNormalization, LinkType},
-    p2estimator::{
-        custom_quantile_estimator::{LomaxQuant, MedianEstimator},
-        QuantileEstimator,
-    },
+    p2estimator::custom_quantile_estimator::{LomaxQuant, MedianEstimator},
     segments::Block,
     statistics::{ln_add_exp, AssymetricLaplace, Distribution, ExponentialEstimator, HalfT, Lomax},
 };
@@ -210,11 +207,18 @@ impl From<&MedianEstimator> for ExponentialEstimator {
 
 impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
     fn from(statistics: &BayesianJoinStatistics) -> Self {
+        println!("{:#?}", statistics);
         Self {
             target_distance_join: statistics.joinable_target_distance.into(),
             target_distance_nojoin: statistics.unjoinable_target_distance.into(),
             divergence_join: statistics.joinable_divergence.into(),
-            divergence_nojoin: statistics.unjoinable_divergence.into(),
+            divergence_nojoin: HalfT::from_sample_mean(
+                statistics
+                    .unjoinable_divergence
+                    .mean()
+                    .max(statistics.joinable_divergence.mean()),
+                statistics.unjoinable_divergence.samples(),
+            ),
             consensus_distance_join: AssymetricLaplace::from_exponential_halves(
                 0.0,
                 statistics.joinable_consensus_neg.mean(),
@@ -277,10 +281,6 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
     }
 
     fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) {
-        if !link_info.neighbors {
-            return;
-        }
-
         let target_dist = link_info.unexplained_bases;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
         let (rel_con_dist, join_type) = relative_consensus_distance(
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 0616cbc..9da5da7 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -1,4 +1,8 @@
-use std::cmp::Ordering;
+use std::{
+    cmp::Ordering,
+    ops::{Add, AddAssign},
+    usize,
+};
 
 use crate::statistics::Distribution;
 use itertools::Itertools;
@@ -8,14 +12,14 @@ use itertools::Itertools;
 //
 // We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021)
 
-struct QuantileEstimatorData<'a> {
+pub struct QuantileEstimatorData<'a> {
     ranks: &'a [usize],
     values: &'a [f64],
     targets: &'a [f64],
     observations: &'a usize,
 }
 
-struct MutableQuantileEstimatorData<'a> {
+pub struct MutableQuantileEstimatorData<'a> {
     ranks: &'a mut [usize],
     values: &'a mut [f64],
     targets: &'a [f64],
@@ -342,40 +346,20 @@ fn _interpolated_value_prediction<
     }
 }
 
-pub trait QuantileEstimator: Distribution {
-    fn from_prior(prior: &Self, count: usize) -> Self;
-    fn update(&mut self, sample: f64);
-    fn update_all(&mut self, samples: &[f64]) {
-        for &s in samples.iter() {
-            self.update(s);
-        }
-    }
-    fn combine(&self, other: &Self) -> Self;
-    #[allow(dead_code)]
-    fn samples(&self) -> usize;
-}
-
-trait SimpleQuantileEstimatorRepresentation: Clone {
-    fn new_like(other: &Self) -> Self;
-    fn _data(&self) -> QuantileEstimatorData<'_>;
-    fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>;
-    fn _is_initialized(&self) -> bool {
-        let data = self._data();
-        *data.observations >= data.ranks.len()
-    }
-}
+#[derive(Clone)]
+pub struct QuantileEstimator<T: QuantileEstimatorRepresentation>(T);
 
-impl<Q: SimpleQuantileEstimatorRepresentation> QuantileEstimator for Q {
-    fn samples(&self) -> usize {
-        *self._data().observations
+impl<T: QuantileEstimatorRepresentation> QuantileEstimator<T> {
+    pub fn samples(&self) -> usize {
+        *self.0._data().observations
     }
 
-    fn from_prior(prior: &Self, count_per_entry: usize) -> Self {
-        let prior_data = prior._data();
-        let mut new_self = Self::new_like(prior);
-        let new_data = new_self._mut_data();
+    pub fn to_psuedo_count(&self, count: usize) -> Self {
+        let prior_data = self.0._data();
+        let mut new_self = Self(T::new_like(&self.0));
+        let new_data = new_self.0._mut_data();
 
-        let new_observations = count_per_entry.max(1) * prior_data.ranks.len();
+        let new_observations = count.max(1) * prior_data.ranks.len();
 
         for i in 0..new_data.targets.len() {
             let closest_rank = ((new_data.targets[i] * (new_observations - 1) as f64) as usize)
@@ -385,15 +369,41 @@ impl<Q: SimpleQuantileEstimatorRepresentation> QuantileEstimator for Q {
                 );
 
             new_data.ranks[i] = closest_rank;
-            new_data.values[i] = prior.ppf(closest_rank as f64 / (new_observations - 1) as f64)
+            new_data.values[i] = self.ppf(closest_rank as f64 / (new_observations - 1) as f64)
         }
         *new_data.observations = new_observations;
 
         new_self
     }
+}
+
+impl QuantileEstimator<VectorQuantileRepresentation> {
+    pub fn new_from_slice(targets: &[f64]) -> Self {
+        Self(VectorQuantileRepresentation::new(targets))
+    }
+}
+
+impl<const N: usize> QuantileEstimator<ArrayQuantileRepresentation<N>> {
+    pub fn new_from_array(targets: &[f64; N]) -> Self {
+        Self(ArrayQuantileRepresentation::<N>::new(targets))
+    }
+}
+
+impl<T: QuantileEstimatorRepresentation + Default> Default for QuantileEstimator<T> {
+    fn default() -> Self {
+        Self(T::default())
+    }
+}
 
-    fn update(&mut self, sample: f64) {
-        let data = self._mut_data();
+impl<T: QuantileEstimatorRepresentation + Default> QuantileEstimator<T> {
+    fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl<T: QuantileEstimatorRepresentation> AddAssign<f64> for QuantileEstimator<T> {
+    fn add_assign(&mut self, sample: f64) {
+        let data = self.0._mut_data();
 
         match (*data.observations + 1).cmp(&data.values.len()) {
             Ordering::Less => {
@@ -413,36 +423,48 @@ impl<Q: SimpleQuantileEstimatorRepresentation> QuantileEstimator for Q {
             }
         }
     }
+}
+
+impl<T: QuantileEstimatorRepresentation> AddAssign<&[f64]> for QuantileEstimator<T> {
+    fn add_assign(&mut self, samples: &[f64]) {
+        for &s in samples.iter() {
+            *self += s;
+        }
+    }
+}
+
+impl<T: QuantileEstimatorRepresentation> Add<QuantileEstimator<T>> for QuantileEstimator<T> {
+    type Output = QuantileEstimator<T>;
 
-    fn combine(&self, other: &Self) -> Self {
-        match (self._is_initialized(), other._is_initialized()) {
+    fn add(self, rhs: QuantileEstimator<T>) -> Self::Output {
+        match (self.0._is_initialized(), rhs.0._is_initialized()) {
             (true, true) => {
-                let mut new_quant_est = Self::new_like(&self);
+                let mut new_quant_est = Self(T::new_like(&self.0));
 
-                _merge_estimators(self._data(), other._data(), new_quant_est._mut_data());
+                _merge_estimators(self.0._data(), rhs.0._data(), new_quant_est.0._mut_data());
 
                 new_quant_est
             }
             (true, false) | (false, false) => {
-                let other_data = other._data();
+                let other_data = rhs.0._data();
                 let mut new_quant_est = self.clone();
-                new_quant_est.update_all(&other_data.values[..*other_data.observations]);
+                new_quant_est += &other_data.values[..*other_data.observations];
                 new_quant_est
             }
             (false, true) => {
-                let self_data = self._data();
-                let mut new_quant_est = other.clone();
-                new_quant_est.update_all(&self_data.values[..*self_data.observations]);
+                let self_data = self.0._data();
+                let mut new_quant_est = rhs.clone();
+                new_quant_est += &self_data.values[..*self_data.observations];
                 new_quant_est
             }
         }
     }
 }
 
-impl<Q: SimpleQuantileEstimatorRepresentation> Distribution for Q {
+impl<T: QuantileEstimatorRepresentation> Distribution for QuantileEstimator<T> {
     fn cdf(&self, x: f64) -> f64 {
-        let data = self._data();
-        if self._is_initialized() {
+        let data = self.0._data();
+        if self.0._is_initialized() {
             _interpolated_value_prediction(
                 data.values,
                 data.ranks,
@@ -482,13 +504,13 @@ impl<Q: SimpleQuantileEstimatorRepresentation> Distribution for Q {
     }
 
     fn ppf(&self, p: f64) -> f64 {
-        let data = self._data();
+        let data = self.0._data();
         let est_rank = p.clamp(0.0, 1.0) * (*data.observations - 1) as f64;
 
-        let data = self._data();
+        let data = self.0._data();
         let (min_val, max_val) = self.support();
 
-        if self._is_initialized() {
+        if self.0._is_initialized() {
             _interpolated_value_prediction(
                 data.ranks,
                 data.values,
@@ -527,9 +549,9 @@ impl<Q: SimpleQuantileEstimatorRepresentation> Distribution for Q {
     }
 
     fn support(&self) -> (f64, f64) {
-        let data = self._data();
+        let data = self.0._data();
 
-        if self._is_initialized() {
+        if self.0._is_initialized() {
             (
                 *data.values.first().unwrap_or(&f64::NEG_INFINITY),
                 *data.values.last().unwrap_or(&f64::INFINITY),
@@ -545,16 +567,26 @@ impl<Q: SimpleQuantileEstimatorRepresentation> Distribution for Q {
     }
 }
 
+pub trait QuantileEstimatorRepresentation: Clone {
+    fn new_like(other: &Self) -> Self;
+    fn _data(&self) -> QuantileEstimatorData<'_>;
+    fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>;
+    fn _is_initialized(&self) -> bool {
+        let data = self._data();
+        *data.observations >= data.ranks.len()
+    }
+}
+
 #[derive(Clone, Debug)]
-pub struct FixedSizeQuantileEstimator<const N: usize> {
+pub struct ArrayQuantileRepresentation<const N: usize> {
     values: [f64; N],
     ranks: [usize; N],
     targets: [f64; N],
     observations: usize,
 }
 
-impl<const N: usize> FixedSizeQuantileEstimator<N> {
-    pub fn new(targets: &[f64; N]) -> Self {
+impl<const N: usize> ArrayQuantileRepresentation<N> {
+    fn new(targets: &[f64; N]) -> Self {
         assert!(
             targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0)
         );
@@ -567,7 +599,7 @@ impl<const N: usize> FixedSizeQuantileEstimator<N> {
     }
 }
 
-impl<const N: usize> SimpleQuantileEstimatorRepresentation for FixedSizeQuantileEstimator<N> {
+impl<const N: usize> QuantileEstimatorRepresentation for ArrayQuantileRepresentation<N> {
     fn new_like(other: &Self) -> Self {
         Self::new(&other.targets)
     }
@@ -592,15 +624,15 @@ impl<const N: usize> SimpleQuantileEstimatorRepresentation for FixedSizeQuantile
 }
 
 #[derive(Clone, Debug)]
-pub struct VectorQuantileEstimator {
+pub struct VectorQuantileRepresentation {
     values: Vec<f64>,
     ranks: Vec<usize>,
     targets: Vec<f64>,
     observations: usize,
 }
 
-impl VectorQuantileEstimator {
-    pub fn new(targets: &[f64]) -> Self {
+impl VectorQuantileRepresentation {
+    fn new(targets: &[f64]) -> Self {
         assert!(
             targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0)
         );
@@ -613,7 +645,7 @@ impl VectorQuantileEstimator {
     }
 }
 
-impl SimpleQuantileEstimatorRepresentation for VectorQuantileEstimator {
+impl QuantileEstimatorRepresentation for VectorQuantileRepresentation {
     fn new_like(other: &Self) -> Self {
         Self::new(&other.targets)
     }
@@ -651,15 +683,15 @@ pub mod custom_quantile_estimator {
     }
 
     macro_rules! implement_fixed_quantile_estimator {
-        ($name:ident[$($val:expr),+]) => {
+        ($name:ident, $repr_name:ident, [$($val:expr), +]) => {
             #[derive(Clone, Debug)]
-            pub struct $name {
+            pub struct $repr_name {
                 values: [f64; Self::COUNT],
                 ranks: [usize; Self::COUNT],
                 observations: usize,
             }
 
-            impl $name {
+            impl $repr_name {
                 const TARGETS: [f64; count_exprs!($($val),+) + 2] = [0.0, $($val),+, 1.0];
                 const COUNT: usize = Self::TARGETS.len();
 
@@ -672,13 +704,13 @@ pub mod custom_quantile_estimator {
                 }
             }
 
-            impl Default for $name {
+            impl Default for $repr_name {
                 fn default() -> Self {
                     Self::new()
                 }
             }
 
-            impl SimpleQuantileEstimatorRepresentation for $name {
+            impl QuantileEstimatorRepresentation for $repr_name {
                 fn new_like(_other: &Self) -> Self {
                     Self::default()
                 }
@@ -699,21 +731,27 @@ pub mod custom_quantile_estimator {
                     }
                 }
             }
+
+            pub type $name = QuantileEstimator<$repr_name>;
         };
     }
 
-    implement_fixed_quantile_estimator!(LomaxQuant[0.18, 0.36, 0.4752, 0.5904, 0.7952]);
+    implement_fixed_quantile_estimator!(
+        LomaxQuant,
+        LomaxQuantRepr,
+        [0.18, 0.36, 0.4752, 0.5904, 0.7952]
+    );
     impl LomaxQuant {
         pub const PROB1: f64 = 0.36;
         pub const PROB2: f64 = 0.59;
     }
-    implement_fixed_quantile_estimator!(MedianEstimator[0.25, 0.5, 0.75]);
+    implement_fixed_quantile_estimator!(MedianEstimator, MedianEstimatorRepr, [0.25, 0.5, 0.75]);
 }
 
 #[cfg(test)]
 mod test {
     use crate::{
-        p2estimator::{FixedSizeQuantileEstimator, QuantileEstimator, VectorQuantileEstimator},
+        p2estimator::QuantileEstimator,
         statistics::{linspace, Distribution, Exponential},
     };
     use itertools::Itertools;
@@ -733,14 +771,13 @@ mod test {
     fn quantiles_on_exponential_dist() {
         let expon = Exponential::new(1.0);
         let mut estimator =
-            FixedSizeQuantileEstimator::new(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]);
+            QuantileEstimator::new_from_array(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]);
 
         let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321);
 
         for _ in 0..10_000 {
             let sample = expon.ppf(rng.random());
-
-            estimator.update(sample);
+            estimator += sample;
         }
 
         assert!(estimator.samples() == 10_000);
@@ -771,19 +808,19 @@ mod test {
     fn test_quantile_merging() {
         let expon = Exponential::new(1.0);
         let mut merged_estimator =
-            VectorQuantileEstimator::new(&linspace(0.0, 1.0, 10).collect_vec());
+            QuantileEstimator::new_from_slice(&linspace(0.0, 1.0, 10).collect_vec());
 
         let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321);
 
         for _ in 0..100 {
             let targets: Vec<f64> = linspace(0.0, 1.0, rng.random_range(5..15)).collect();
-            let mut estimator = VectorQuantileEstimator::new(&targets);
+            let mut estimator = QuantileEstimator::new_from_slice(&targets);
 
             for _ in 0..100 {
-                estimator.update(expon.ppf(rng.random()));
+                estimator += expon.ppf(rng.random());
             }
 
-            merged_estimator = merged_estimator.combine(&estimator);
+            merged_estimator = merged_estimator + estimator;
         }
 
         assert!(merged_estimator.samples() == 10_000);
diff --git a/src/statistics.rs b/src/statistics.rs
index 448c118..fa0373a 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -469,11 +469,9 @@ impl AssymetricLaplace {
     }
 
     pub fn from_exponential_halves(mode: f64, negative_mean: f64, positive_mean: f64) -> Self {
-        Self::new(
-            mode,
-            (negative_mean * positive_mean) / (negative_mean + positive_mean),
-            1.0 / (positive_mean / negative_mean + 1.0),
-        )
+        let nm = negative_mean.max(1e-8);
+        let pm = positive_mean.max(1e-8);
+        Self::new(mode, (nm * pm) / (nm + pm), 1.0 / (pm / nm + 1.0))
     }
 
     pub fn symmetric_from_moments(mean: f64, standard_deviation: f64) -> Self {
diff --git a/src/viz/stats.rs b/src/viz/stats.rs
index 4b74fb1..4a3a202 100644
--- a/src/viz/stats.rs
+++ b/src/viz/stats.rs
@@ -20,29 +20,17 @@ struct InversionInfo {
     pub normal_joins: usize,
 }
 
-fn write_table<const N: usize, A: std::fmt::Display, B: std::fmt::Display>(
+fn write_tsv<const N: usize, A: std::fmt::Display, B: std::fmt::Display>(
     writer: &mut impl Write,
     header: &[A; N],
     data: &[[B; N]],
 ) -> std::io::Result<()> {
-    writeln!(writer, "<table>\n<thead>")?;
-
-    for heading in header.iter() {
-        writeln!(writer, "<th>{}</th>", heading)?;
-    }
-
-    writeln!(writer, "</thead>\n<tbody>")?;
+    writeln!(writer, "{}", header.iter().join("\t"))?;
 
     for row in data.iter() {
-        write!(writer, "<tr>")?;
-        for entry in row.iter() {
-            write!(writer, "<td>{}</td>", entry)?;
-        }
-        writeln!(writer, "</tr>")?;
+        writeln!(writer, "{}", row.iter().join("\t"))?;
     }
 
-    writeln!(writer, "</tbody>\n</table>")?;
-
     Ok(())
 }
 
@@ -54,13 +42,13 @@ fn write_statistics_table_page<const N: usize, A: std::fmt::Display, B: std::fmt
 ) -> std::io::Result<()> {
     let mut tmp_writer = Vec::<u8>::new();
 
-    write_table(&mut tmp_writer, header, data)?;
+    write_tsv(&mut tmp_writer, header, data)?;
 
     let table_page = TABLE_HTML
         .replace("PAGE_TITLE", title)
         .replace("SODA_TARGET", SODA_JS)
         .replace(
-            "TABLE_TARGET",
+            "TSV_TARGET",
             str::from_utf8(&tmp_writer).expect("UTF8 decoding failed!"),
         );
 
@@ -101,11 +89,11 @@ pub fn write_family_statistics(
         stats_writer,
         "Family Statistics",
         &[
-            "Family",
-            "Occurrences",
-            "Coverage",
-            "Kimura80 Boxplot",
-            "Kimura80 KDE",
+            "Family_string",
+            "Occurrences_int",
+            "Coverage_int",
+            "Kimura80_Boxplot_boxplot",
+            "Kimura80_KDE_violin",
         ],
         &family_stats
             .iter()
@@ -115,14 +103,8 @@ pub fn write_family_statistics(
                     k.to_string(),
                     v.occurrences.to_string(),
                     v.coverage.to_string(),
-                    format!(
-                        "<figure class=\"boxplot\" data-samples=\"{}\"></figure>",
-                        v.kimura80_values.0.iter().join(",")
-                    ),
-                    format!(
-                        "<figure class=\"violin\" data-samples=\"{}\"></figure>",
-                        v.kimura80_values.0.iter().join(",")
-                    ),
+                    v.kimura80_values.0.iter().join(":"),
+                    v.kimura80_values.0.iter().join(":"),
                 ]
             })
             .collect_vec(),
@@ -165,13 +147,13 @@ pub fn write_inversion_statistics(
     write_statistics_table_page(
         stats_writer,
         "Inversion Statistics",
-        &["Region", "Inversions", "Normal Joins"],
+        &["Region_region", "Inversions_int", "Normal_Joins_int"],
         &inversion_stats
             .iter()
             .sorted_by(|v1, v2| v2.1.cmp(v1.1))
             .map(|(k, v)| {
                 [
-                    format!("<a href=\"{}/index.html\">{}</a>", k, k),
+                    k.to_string(),
                     v.inversions.to_string(),
                     v.normal_joins.to_string(),
                 ]

From 5e409749d118b37565d1db955334d74b8a0e913a Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 4 Jun 2026 02:00:45 -0600
Subject: [PATCH 36/39] Clean up implementation...

---
 src/assembly.rs        |   1 -
 src/join_estimation.rs |  52 ++++++++-----
 src/main.rs            |   4 +
 src/p2estimator.rs     |  18 +++--
 src/statistics.rs      | 165 ++---------------------------------------
 src/viz/stats.rs       |   2 -
 6 files changed, 56 insertions(+), 186 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index 9485691..35eb626 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -357,7 +357,6 @@ fn link_assemblies<T: JoinEstimator>(
     score_params: &ScoreParams,
     args: &AnnotationArgs,
 ) {
-    println!("{:#?}", query_statistics);
     // this relies on the alignments being sorted by target start
     let compatable_blocks = compatable_blocks.sorted().collect_vec();
 
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 89f3763..2eba480 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -143,11 +143,11 @@ impl Default for MomentEstimator {
     }
 }
 
-impl ops::Add<MomentEstimator> for MomentEstimator {
+impl ops::Add<&MomentEstimator> for &MomentEstimator {
     type Output = MomentEstimator;
 
-    fn add(self, rhs: MomentEstimator) -> Self::Output {
-        Self {
+    fn add(self, rhs: &MomentEstimator) -> Self::Output {
+        MomentEstimator {
             sum_square: self.sum_square + rhs.sum_square,
             sum: self.sum + rhs.sum,
             samples: self.samples + rhs.samples,
@@ -171,14 +171,14 @@ impl ops::AddAssign<f64> for MomentEstimator {
     }
 }
 
-impl From<MomentEstimator> for ExponentialEstimator {
-    fn from(value: MomentEstimator) -> Self {
+impl From<&MomentEstimator> for ExponentialEstimator {
+    fn from(value: &MomentEstimator) -> Self {
         Self::new(value.mean(), value.samples().max(1))
     }
 }
 
-impl From<MomentEstimator> for HalfT {
-    fn from(value: MomentEstimator) -> Self {
+impl From<&MomentEstimator> for HalfT {
+    fn from(value: &MomentEstimator) -> Self {
         Self::from_sample_mean(value.mean(), value.samples().max(1))
     }
 }
@@ -205,13 +205,21 @@ impl From<&MedianEstimator> for ExponentialEstimator {
     }
 }
 
+impl From<&MedianEstimator> for HalfT {
+    fn from(value: &MedianEstimator) -> Self {
+        // Divide the observed median by the median of a HalfT built from 1.0 and the same
+        // sample count; presumably the first `new` argument is a scale -- TODO confirm.
+        let samples = value.samples();
+        Self::new(value.ppf(0.5) / Self::new(1.0, samples).ppf(0.5), samples)
+    }
+}
+
 impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
     fn from(statistics: &BayesianJoinStatistics) -> Self {
-        println!("{:#?}", statistics);
         Self {
-            target_distance_join: statistics.joinable_target_distance.into(),
-            target_distance_nojoin: statistics.unjoinable_target_distance.into(),
-            divergence_join: statistics.joinable_divergence.into(),
+            target_distance_join: (&statistics.joinable_target_distance).into(),
+            target_distance_nojoin: (&statistics.unjoinable_target_distance).into(),
+            divergence_join: (&statistics.joinable_divergence).into(),
             divergence_nojoin: HalfT::from_sample_mean(
                 statistics
                     .unjoinable_divergence
@@ -281,6 +289,10 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
     }
 
     fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) {
+        if !link_info.neighbors {
+            return;
+        }
+
         let target_dist = link_info.unexplained_bases;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
         let (rel_con_dist, join_type) = relative_consensus_distance(
@@ -310,15 +322,15 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
 
     fn combine(&self, other: &Self) -> Self {
         Self {
-            joinable_target_distance: self.joinable_target_distance
-                + other.joinable_target_distance,
-            unjoinable_target_distance: self.unjoinable_target_distance
-                + other.unjoinable_target_distance,
-            joinable_divergence: self.joinable_divergence + other.joinable_divergence,
-            unjoinable_divergence: self.unjoinable_divergence + other.unjoinable_divergence,
-            joinable_consensus_pos: self.joinable_consensus_pos + other.joinable_consensus_pos,
-            joinable_consensus_neg: self.joinable_consensus_neg + other.joinable_consensus_neg,
-            unjoinable_consensus: self.unjoinable_consensus + other.unjoinable_consensus,
+            joinable_target_distance: &self.joinable_target_distance
+                + &other.joinable_target_distance,
+            unjoinable_target_distance: &self.unjoinable_target_distance
+                + &other.unjoinable_target_distance,
+            joinable_divergence: &self.joinable_divergence + &other.joinable_divergence,
+            unjoinable_divergence: &self.unjoinable_divergence + &other.unjoinable_divergence,
+            joinable_consensus_pos: &self.joinable_consensus_pos + &other.joinable_consensus_pos,
+            joinable_consensus_neg: &self.joinable_consensus_neg + &other.joinable_consensus_neg,
+            unjoinable_consensus: &self.unjoinable_consensus + &other.unjoinable_consensus,
         }
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 5b01b6d..0243187 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,7 +8,11 @@ mod confidence;
 mod history_tracing;
 mod join_estimation;
 mod matrix;
+
+// Keeping around for enhanced parameter estimation work...
+#[allow(dead_code)]
 mod p2estimator;
+
 mod pipeline;
 mod score_params;
 mod segment_groups;
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index 9da5da7..ede85fe 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -346,7 +346,7 @@ fn _interpolated_value_prediction<
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct QuantileEstimator<T: QuantileEstimatorRepresentation>(T);
 
 impl<T: QuantileEstimatorRepresentation> QuantileEstimator<T> {
@@ -433,13 +433,13 @@ impl<T: QuantileEstimatorRepresentation> AddAssign<&[f64]> for QuantileEstimator
     }
 }
 
-impl<T: QuantileEstimatorRepresentation> Add<QuantileEstimator<T>> for QuantileEstimator<T> {
+impl<T: QuantileEstimatorRepresentation> Add<&QuantileEstimator<T>> for &QuantileEstimator<T> {
     type Output = QuantileEstimator<T>;
 
-    fn add(self, rhs: QuantileEstimator<T>) -> Self::Output {
+    fn add(self, rhs: &QuantileEstimator<T>) -> Self::Output {
         match (self.0._is_initialized(), rhs.0._is_initialized()) {
             (true, true) => {
-                let mut new_quant_est = Self(T::new_like(&self.0));
+                let mut new_quant_est = QuantileEstimator::<T>(T::new_like(&self.0));
 
                 _merge_estimators(self.0._data(), rhs.0._data(), new_quant_est.0._mut_data());
 
@@ -752,11 +752,17 @@ pub mod custom_quantile_estimator {
 mod test {
     use crate::{
         p2estimator::QuantileEstimator,
-        statistics::{linspace, Distribution, Exponential},
+        statistics::{Distribution, Exponential},
     };
     use itertools::Itertools;
     use rand::{rngs::Xoshiro256PlusPlus, RngExt, SeedableRng};
 
+    pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64> {
+        (0..steps) // `steps` evenly spaced values from `start` to `stop`; one step gives `start`
+            .map(move |n| n as f64 / (steps.max(2) as f64 - 1.0)) // max(2): no 0/0 if steps == 1
+            .map(move |n| start * (1.0 - n) + stop * n)
+    }
+
     fn is_close(a: f64, b: f64) -> bool {
         let rel_tol = 1e-9;
         let abs_tol = 0.0;
@@ -820,7 +826,7 @@ mod test {
                 estimator += expon.ppf(rng.random());
             }
 
-            merged_estimator = merged_estimator + estimator;
+            merged_estimator = &merged_estimator + &estimator;
         }
 
         assert!(merged_estimator.samples() == 10_000);
diff --git a/src/statistics.rs b/src/statistics.rs
index fa0373a..f5014b8 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -22,6 +22,7 @@ pub trait Distribution {
     fn logccdf(&self, x: f64) -> f64;
 }
 
+#[allow(dead_code)]
 pub trait ParameterizedDistribution: Distribution + Debug + Default + Clone {
     fn unit() -> Self {
         Self::default()
@@ -242,154 +243,6 @@ impl Distribution for HalfT {
     }
 }
 
-#[derive(Debug, Clone)]
-pub struct Frechet {
-    alpha: f64,
-    scale: f64,
-    minimum: f64,
-}
-
-impl ParameterizedDistribution for Frechet {}
-
-impl Frechet {
-    pub fn new(alpha: f64, scale: f64, minimum: f64) -> Self {
-        Self {
-            alpha,
-            scale,
-            minimum,
-        }
-    }
-}
-
-impl Default for Frechet {
-    fn default() -> Self {
-        Self {
-            alpha: 1.0,
-            scale: 1.0,
-            minimum: 0.0,
-        }
-    }
-}
-
-impl Distribution for Frechet {
-    fn logpdf(&self, x: f64) -> f64 {
-        let a = self.alpha;
-        let s = self.scale;
-        let m = self.minimum;
-        if x > m {
-            (a / s).ln() + -(a + 1.0) * ((x - m) / s).ln() + -((x - m) / s).powf(-a)
-        } else {
-            f64::NEG_INFINITY
-        }
-    }
-
-    fn pdf(&self, x: f64) -> f64 {
-        self.logpdf(x).exp()
-    }
-
-    fn cdf(&self, x: f64) -> f64 {
-        self.logcdf(x).exp()
-    }
-
-    fn logcdf(&self, x: f64) -> f64 {
-        let a = self.alpha;
-        let s = self.scale;
-        let m = self.minimum;
-        if x > m {
-            -((x - m) / s).powf(-a)
-        } else {
-            f64::NEG_INFINITY
-        }
-    }
-
-    fn ppf(&self, p: f64) -> f64 {
-        let a = self.alpha;
-        let s = self.scale;
-        let m = self.minimum;
-        if p >= 1.0 {
-            f64::INFINITY
-        } else if p <= 0.0 {
-            m
-        } else {
-            m + s * (-p.min(1.0).ln()).powf(1.0 / -a)
-        }
-    }
-
-    fn ccdf(&self, x: f64) -> f64 {
-        1.0 - self.cdf(x)
-    }
-
-    fn logccdf(&self, x: f64) -> f64 {
-        self.ccdf(x).ln()
-    }
-
-    fn support(&self) -> (f64, f64) {
-        (self.minimum, f64::INFINITY)
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct Gumbel {
-    location: f64,
-    scale: f64,
-}
-
-impl Gumbel {
-    pub fn new(location: f64, scale: f64) -> Self {
-        Self { location, scale }
-    }
-}
-
-impl Default for Gumbel {
-    fn default() -> Self {
-        Self::new(0.0, 1.0)
-    }
-}
-
-impl ParameterizedDistribution for Gumbel {}
-
-impl Distribution for Gumbel {
-    fn logpdf(&self, x: f64) -> f64 {
-        let mu = self.location;
-        let beta = self.scale;
-        let z = (x - mu) / beta;
-        (1.0 / beta).ln() - (z + (-z).exp())
-    }
-
-    fn pdf(&self, x: f64) -> f64 {
-        self.logpdf(x).exp()
-    }
-
-    fn cdf(&self, x: f64) -> f64 {
-        self.logcdf(x).exp()
-    }
-
-    fn logcdf(&self, x: f64) -> f64 {
-        let mu = self.location;
-        let beta = self.scale;
-        let z = (x - mu) / beta;
-        -((-z).exp())
-    }
-
-    fn ppf(&self, p: f64) -> f64 {
-        let mu = self.location;
-        let beta = self.scale;
-        mu - beta * (-p.ln()).ln()
-    }
-
-    fn ccdf(&self, x: f64) -> f64 {
-        1.0 - self.cdf(x)
-    }
-
-    fn logccdf(&self, x: f64) -> f64 {
-        self.ccdf(x).ln()
-    }
-
-    fn support(&self) -> (f64, f64) {
-        (f64::NEG_INFINITY, f64::INFINITY)
-    }
-}
-
 #[derive(Debug, Clone)]
 pub struct Lomax {
     alpha: f64,
@@ -542,17 +395,17 @@ impl Distribution for AssymetricLaplace {
     }
 }
 
-pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64> {
-    (0..steps)
-        .map(move |n| n as f64 / (steps as f64 - 1.0))
-        .map(move |n| start * (1.0 - n) + stop * n)
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
     use std::fmt::Debug;
 
+    pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator<Item = f64> {
+        (0..steps) // `steps` evenly spaced values from `start` to `stop`; one step gives `start`
+            .map(move |n| n as f64 / (steps.max(2) as f64 - 1.0)) // max(2): no 0/0 if steps == 1
+            .map(move |n| start * (1.0 - n) + stop * n)
+    }
+
     // Add debug trait to allow for printout...
     pub trait TestDistribution: Distribution + Debug {}
     impl<T: Distribution + Debug> TestDistribution for T {}
@@ -563,14 +416,12 @@ mod test {
 
     use super::{Exponential, ParameterizedDistribution};
 
-    fn get_dists() -> [Box<dyn TestDistribution>; 7] {
+    fn get_dists() -> [Box<dyn TestDistribution>; 5] {
         [
             as_box(Exponential::unit()),
             as_box(ExponentialEstimator::unit()),
             as_box(HalfT::unit()),
-            as_box(Frechet::unit()),
             as_box(AssymetricLaplace::unit()),
-            as_box(Gumbel::unit()),
             as_box(Lomax::unit()),
         ]
     }
diff --git a/src/viz/stats.rs b/src/viz/stats.rs
index 4a3a202..a9e2877 100644
--- a/src/viz/stats.rs
+++ b/src/viz/stats.rs
@@ -92,7 +92,6 @@ pub fn write_family_statistics(
             "Family_string",
             "Occurrences_int",
             "Coverage_int",
-            "Kimura80_Boxplot_boxplot",
             "Kimura80_KDE_violin",
         ],
         &family_stats
@@ -104,7 +103,6 @@ pub fn write_family_statistics(
                     v.occurrences.to_string(),
                     v.coverage.to_string(),
                     v.kimura80_values.0.iter().join(":"),
-                    v.kimura80_values.0.iter().join(":"),
                 ]
             })
             .collect_vec(),

From 06923ff8e96e920c5e4a719cf965d110dee8581f Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 4 Jun 2026 11:09:55 -0600
Subject: [PATCH 37/39] Enhance statistics displayed on family page.

---
 fixtures/soda/table.html | 17 ++++++++--
 src/main.rs              |  6 +++-
 src/segments.rs          | 24 ++++++++++++++
 src/viz/stats.rs         | 69 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/fixtures/soda/table.html b/fixtures/soda/table.html
index db87869..26ca293 100644
--- a/fixtures/soda/table.html
+++ b/fixtures/soda/table.html
@@ -36,6 +36,10 @@
             figure {
                 margin: 0;
             }
+
+            th input[type="text"] {
+                width: 6em;
+            }
         </style>
         <script id="table-data" type="text/tab-separated-values">
             TSV_TARGET
@@ -168,6 +172,7 @@
                             let prior_text = full_filter[d] ?? "";
                             this_sel
                                 .append("input")
+                                .attr("id", d)
                                 .attr("type", "text")
                                 .attr("value", prior_text)
                                 .on("click", function (d) {
@@ -238,10 +243,16 @@
                     ),
                 );
                 let offset = ELEMS_PER_PAGE * page;
-
-                d3.select("#page-indicator").text(
-                    `Page: ${page + 1}/${maxPage}`,
+                let offset_end = Math.min(
+                    ELEMS_PER_PAGE * (page + 1),
+                    inner_tsv.length,
                 );
+
+                d3.select("#page-indicator")
+                    .style("white-space", "pre")
+                    .text(
+                        `Page: ${page + 1}/${maxPage}     Families: ${offset + 1}-${offset_end}/${inner_tsv.length}`,
+                    );
                 d3.select("#prior-page").on("click", () => {
                     updateUrl({ page: Math.max(0, page - 1) });
                     renderData();
diff --git a/src/main.rs b/src/main.rs
index 0243187..2cf56cb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -477,7 +477,11 @@ fn main() -> Result<()> {
                 .viz_output_path
                 .join("family_stats.html"),
         )?;
-        write_family_statistics(&mut family_stats_writer, &results)?;
+        write_family_statistics(
+            &mut family_stats_writer,
+            &results,
+            &alignment_data.query_lengths,
+        )?;
         let mut inv_stats_writer = File::create(
             args.visualization_args
                 .viz_output_path
diff --git a/src/segments.rs b/src/segments.rs
index b65964f..f3b35c3 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -3,6 +3,7 @@ use std::{cmp::Ordering, collections::HashMap, fmt::Debug, iter::Fuse};
 
 use crate::{
     alignment::{Alignment, Strand},
+    annotation::AmbiguousAnnotation,
     assembly::SegmentAssemblyGraph,
     chunks::ProximityGroup,
     join_estimation::JoinEstimator,
@@ -122,6 +123,29 @@ impl Block {
             can_join_up_to: 0,
         }
     }
+
+    /// Builds an alignment block on row `row` from entry `selected_index` of `annotation`.
+    pub fn from_annotation(
+        annotation: &AmbiguousAnnotation,
+        selected_index: usize,
+        row: usize,
+    ) -> Self {
+        let chosen = &annotation.annotations[selected_index]; // indexing panics if out of range
+        Self {
+            block_type: BlockType::Alignment,
+            row_idx: row,
+            query_id: Some(chosen.query_id),
+            strand: chosen.strand,
+            query_start: chosen.query_start,
+            query_end: chosen.query_end,
+            col_start: chosen.target_start,
+            col_end: chosen.target_end,
+            kimura80: chosen.kimura80,
+            avg_confidence: annotation.confidence,
+            alignment_score: annotation.confidence.ln(), // log of the same confidence value
+            can_join_up_to: 0,
+        }
+    }
 }
 
 #[derive(Debug)]
diff --git a/src/viz/stats.rs b/src/viz/stats.rs
index a9e2877..82f7201 100644
--- a/src/viz/stats.rs
+++ b/src/viz/stats.rs
@@ -1,5 +1,11 @@
 use crate::{
-    alignment::Strand, annotation::AmbiguousAnnotation, segments::Unordered, viz::SODA_JS,
+    alignment::Strand,
+    annotation::AmbiguousAnnotation,
+    assembly::{
+        block_target_distance, relative_consensus_distance, ConsensusDistanceNormalization,
+    },
+    segments::{Block, Unordered},
+    viz::SODA_JS,
 };
 use core::str;
 use itertools::Itertools;
@@ -12,6 +18,12 @@ struct FamilyInfo {
     pub occurrences: usize,
     pub coverage: usize,
     pub kimura80_values: Unordered<Vec<f64>>,
+    pub join_consensus_dist: Unordered<Vec<f64>>,
+    pub nojoin_consensus_dist: Unordered<Vec<f64>>,
+    pub join_target_dist: Unordered<Vec<f64>>,
+    pub nojoin_target_dist: Unordered<Vec<f64>>,
+    pub join_kimura_dist: Unordered<Vec<f64>>,
+    pub nojoin_kimura_dist: Unordered<Vec<f64>>,
 }
 
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
@@ -60,14 +72,39 @@ fn write_statistics_table_page<const N: usize, A: std::fmt::Display, B: std::fmt
 pub fn write_family_statistics(
     stats_writer: &mut impl Write,
     region_annotations: &[(usize, Vec<AmbiguousAnnotation>)],
+    query_lengths: &HashMap<usize, usize>,
 ) -> std::io::Result<()> {
     let mut family_stats = HashMap::<&String, FamilyInfo>::new();
 
     for (_region, annotations) in region_annotations.iter() {
+        let mut prior_elem = HashMap::<usize, (&AmbiguousAnnotation, usize)>::new();
+
         for amb_annot in annotations.iter() {
-            for query in amb_annot.annotations.iter() {
+            for (query_idx, query) in amb_annot.annotations.iter().enumerate() {
                 let name = &query.query_name;
                 let target_range = query.target_end - query.target_start;
+                let (join_cons, nojoin_cons, join_target, nojoin_target, join_div, nojoin_div) =
+                    if let Some(&(prior_annot, prior_query_idx)) = prior_elem.get(&query.query_id) {
+                        let block_c = Block::from_annotation(amb_annot, query_idx, 0);
+                        let block_p = Block::from_annotation(prior_annot, prior_query_idx, 0);
+                        let q_len = *query_lengths.get(&query.query_id).unwrap_or(&1);
+
+                        let (c_dist, _join_type) = relative_consensus_distance(
+                            &block_p,
+                            &block_c,
+                            ConsensusDistanceNormalization::WithLength(q_len),
+                        );
+                        let t_dist = block_target_distance(&block_p, &block_c) as f64;
+                        let d_dist = (block_c.kimura80 - block_p.kimura80).abs();
+
+                        if prior_annot.join_id == amb_annot.join_id {
+                            (Some(c_dist), None, Some(t_dist), None, Some(d_dist), None)
+                        } else {
+                            (None, Some(c_dist), None, Some(t_dist), None, Some(d_dist))
+                        }
+                    } else {
+                        (None, None, None, None, None, None)
+                    };
 
                 family_stats
                     .entry(name)
@@ -75,12 +112,28 @@ pub fn write_family_statistics(
                         f.occurrences += 1;
                         f.coverage += target_range;
                         f.kimura80_values.0.push(query.kimura80);
+                        f.join_consensus_dist.0.extend(join_cons.iter());
+                        f.nojoin_consensus_dist.0.extend(nojoin_cons.iter());
+                        f.join_target_dist.0.extend(join_target.iter());
+                        f.nojoin_target_dist.0.extend(nojoin_target.iter());
+                        f.join_kimura_dist.0.extend(join_div.iter());
+                        f.nojoin_kimura_dist.0.extend(nojoin_div.iter());
                     })
                     .or_insert(FamilyInfo {
                         occurrences: 1,
                         coverage: target_range,
                         kimura80_values: Unordered(vec![query.kimura80]),
+                        join_consensus_dist: Unordered(join_cons.iter().copied().collect()),
+                        nojoin_consensus_dist: Unordered(nojoin_cons.iter().copied().collect()),
+                        join_target_dist: Unordered(join_target.iter().copied().collect()),
+                        nojoin_target_dist: Unordered(nojoin_target.iter().copied().collect()),
+                        join_kimura_dist: Unordered(join_div.iter().copied().collect()),
+                        nojoin_kimura_dist: Unordered(nojoin_div.iter().copied().collect()),
                     });
+
+                prior_elem
+                    .entry(query.query_id)
+                    .insert_entry((amb_annot, query_idx));
             }
         }
     }
@@ -93,6 +146,12 @@ pub fn write_family_statistics(
             "Occurrences_int",
             "Coverage_int",
             "Kimura80_KDE_violin",
+            "Joined_Consensus_Distance_violin",
+            "Unjoined_Consensus_Distance_violin",
+            "Joined_Target_Distance_violin",
+            "Unjoined_Target_Distance_violin",
+            "Joined_Kimura_Difference_violin",
+            "Unjoined_Kimura_Difference_violin",
         ],
         &family_stats
             .iter()
@@ -103,6 +162,12 @@ pub fn write_family_statistics(
                     v.occurrences.to_string(),
                     v.coverage.to_string(),
                     v.kimura80_values.0.iter().join(":"),
+                    v.join_consensus_dist.0.iter().join(":"),
+                    v.nojoin_consensus_dist.0.iter().join(":"),
+                    v.join_target_dist.0.iter().join(":"),
+                    v.nojoin_target_dist.0.iter().join(":"),
+                    v.join_kimura_dist.0.iter().join(":"),
+                    v.nojoin_kimura_dist.0.iter().join(":"),
                 ]
             })
             .collect_vec(),

From d678818fd21ea08a08bab9f7e31f8618d1d142ad Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Thu, 4 Jun 2026 18:14:05 -0600
Subject: [PATCH 38/39] Parameter tuning...

---
 src/assembly.rs        | 55 ++++++++++++++++-------------------
 src/history_tracing.rs | 23 +++++++++++----
 src/join_estimation.rs | 21 +++++++++-----
 src/main.rs            | 66 +++++++++++++++++++++++++++++++++++-------
 src/score_params.rs    | 14 +++++++++
 src/segments.rs        |  1 -
 6 files changed, 125 insertions(+), 55 deletions(-)

diff --git a/src/assembly.rs b/src/assembly.rs
index 35eb626..0c0c166 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -6,7 +6,6 @@ use crate::{
     alignment::{Alignment, Strand},
     chunks::ProximityGroup,
     join_estimation::{JoinEstimator, JoinStatisticsCollector, LinkInfo},
-    score_params::ScoreParams,
     segments::{Block, InitialSegments, SegmentedMatrix, SegmentedMatrixView},
     trace_statistics::{calculate_region_statistics, QueryStatistics, RegionStatistics},
     AnnotationArgs,
@@ -82,15 +81,6 @@ pub struct Edge {
     pub link_type: LinkType,
 }
 
-fn get_link_cost(score_params: &ScoreParams, join_prob: f64) -> f64 {
-    // Doing this as the expected value over the transition scores...
-    let expected_score = join_prob * score_params.query_loop_score
-        + (1.0 - join_prob) * score_params.query_jump_score;
-
-    // Cost = linear consensus cost + linear target gap cost...
-    expected_score
-}
-
 pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize {
     second_block.col_start as isize - first_block.col_end as isize - 1
 }
@@ -147,6 +137,7 @@ pub enum ConsensusDistanceNormalization {
     Min,
     Sum,
     WithLength(usize),
+    WithUBAndLength(usize, usize),
 }
 
 pub fn relative_consensus_distance(
@@ -154,7 +145,16 @@ pub fn relative_consensus_distance(
     second_block: &Block,
     mode: ConsensusDistanceNormalization,
 ) -> (f64, LinkType) {
-    let (dist, link_type) = block_consensus_distance(first_block, second_block);
+    let (mut dist, link_type) = block_consensus_distance(first_block, second_block);
+
+    if let ConsensusDistanceNormalization::WithUBAndLength(ub, _length) = mode {
+        dist = if dist > 0 {
+            dist.saturating_sub(ub as isize).max(0)
+        } else {
+            dist
+        }
+    }
+
     let div = match mode {
         ConsensusDistanceNormalization::Sum => {
             block_length_on_query(first_block) + block_length_on_query(second_block)
@@ -166,6 +166,7 @@ pub fn relative_consensus_distance(
             block_length_on_query(first_block).min(block_length_on_query(second_block))
         }
         ConsensusDistanceNormalization::WithLength(length) => length,
+        ConsensusDistanceNormalization::WithUBAndLength(_ub, length) => length,
     };
     (dist as f64 / div as f64, link_type)
 }
@@ -324,24 +325,21 @@ fn gather_join_statistics_single_family<'a>(
         .for_each(|(idx, (a_segment_idx, a_block))| {
             compatable_blocks[idx + 1..].iter().enumerate().for_each(
                 |(idx2, (b_segment_idx, b_block))| {
-                    join_stats.add(
+                    let link_info = &link_info(
                         a_block,
                         b_block,
-                        &link_info(
-                            a_block,
-                            b_block,
-                            args,
-                            calculate_unexplained_bases(
-                                segments,
-                                region_statistics,
-                                *a_segment_idx,
-                                *b_segment_idx,
-                                b_block.col_start,
-                            ),
-                            consensus_length,
-                            idx + 1 == idx2,
+                        args,
+                        calculate_unexplained_bases(
+                            segments,
+                            region_statistics,
+                            *a_segment_idx,
+                            *b_segment_idx,
+                            b_block.col_start,
                         ),
+                        consensus_length,
+                        idx + 1 == idx2,
                     );
+                    join_stats.add(a_block, b_block, link_info);
                 },
             )
         })
@@ -354,7 +352,6 @@ fn link_assemblies<T: JoinEstimator>(
     segments: &SegmentedMatrix,
     query_statistics: &QueryStatistics<T>,
     region_statistics: &RegionStatistics,
-    score_params: &ScoreParams,
     args: &AnnotationArgs,
 ) {
     // this relies on the alignments being sorted by target start
@@ -392,9 +389,9 @@ fn link_assemblies<T: JoinEstimator>(
 
                 if join_prob >= args.join_likelihood_threshold {
                     let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) {
-                        score_params.query_loop_score
+                        1.0
                     } else {
-                        get_link_cost(score_params, join_prob)
+                        join_prob
                     };
 
                     graph.insert(
@@ -429,7 +426,6 @@ impl SegmentAssemblyGraph {
         segments: &SegmentedMatrix,
         region_statistics: &RegionStatistics,
         query_statistics: &[QueryStatistics<T>],
-        score_params: &ScoreParams,
         annotation_args: &AnnotationArgs,
     ) -> Self {
         let alignment_block_map = new_alignment_to_blocks_map(segments, alignments);
@@ -462,7 +458,6 @@ impl SegmentAssemblyGraph {
                     segments,
                     &query_statistics[id],
                     region_statistics,
-                    score_params,
                     annotation_args,
                 );
             });
diff --git a/src/history_tracing.rs b/src/history_tracing.rs
index 5348aa1..a01f220 100644
--- a/src/history_tracing.rs
+++ b/src/history_tracing.rs
@@ -265,7 +265,7 @@ struct JoinLink {
     origin_history: usize,
     linked_history: usize,
     link_side: Side,
-    score: f64,
+    probability: f64,
 }
 
 impl JoinLink {
@@ -273,7 +273,7 @@ impl JoinLink {
         self.origin_history == other.origin_history
             && self.linked_history == other.linked_history
             && self.link_side == other.link_side
-            && ((self.score - other.score).abs() < epsilon)
+            && ((self.probability - other.probability).abs() < epsilon)
     }
 }
 
@@ -297,7 +297,7 @@ impl Ord for JoinLink {
             .cmp(&other.origin_history)
             .then_with(|| self.linked_history.cmp(&other.linked_history))
             .then_with(|| self.link_side.cmp(&other.link_side))
-            .then_with(|| self.score.total_cmp(&other.score))
+            .then_with(|| self.probability.total_cmp(&other.probability))
     }
 }
 
@@ -362,7 +362,7 @@ fn get_valid_joins_for_current_group(
                                         origin_history: prior_origin_history,
                                         linked_history: prior_link_history,
                                         link_side: prior_linkable_side,
-                                        score: weight,
+                                        probability: weight,
                                     });
 
                                 solved_current +=
@@ -705,8 +705,19 @@ fn add_single_join(
         history_depth,
     );
 
-    let right_score = right_join_link.as_ref().map(|v| v.score).unwrap_or(0.0);
-    let left_score = left_join_link.as_ref().map(|v| v.score).unwrap_or(0.0);
+    let prior_is_skip = match &histories[prior_hist_idx] {
+        HistoryEntry::Append(val) | HistoryEntry::Join(val) => val.group_index == 0,
+        HistoryEntry::Root => true,
+    };
+
+    let right_score = right_join_link
+        .as_ref()
+        .map(|v| score_params.join_transition(prior_is_skip, v.probability))
+        .unwrap_or(0.0);
+    let left_score = left_join_link
+        .as_ref()
+        .map(|v| score_params.join_transition(prior_is_skip, v.probability))
+        .unwrap_or(0.0);
     // If two joins, we incurred a expensive query-to-query jump in the past, so now we undo that cost...
     let bonus = if left_join_link.is_some() && right_join_link.is_some() {
         -score_params.query_jump_score
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index 2eba480..aa68ee0 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -29,6 +29,7 @@ pub struct LinkInfo {
     pub link_type: LinkType,
     pub consensus_length: usize,
     pub unexplained_bases: usize,
+    #[allow(dead_code)]
     pub neighbors: bool,
     pub joinable: bool,
 }
@@ -65,7 +66,10 @@ impl JoinEstimator for BayesianJoinEstimator {
         let (rel_con_dist, _join_type) = relative_consensus_distance(
             first_block,
             second_block,
-            ConsensusDistanceNormalization::WithLength(link_info.consensus_length),
+            ConsensusDistanceNormalization::WithUBAndLength(
+                link_info.unexplained_bases,
+                link_info.consensus_length,
+            ),
         );
 
         let join_score = self.join_prior.ln()
@@ -232,6 +236,7 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
                 statistics.joinable_consensus_neg.mean(),
                 statistics.joinable_consensus_pos.mean(),
             ),
+            // TODO: Replace with beta dist, better matches dists we see...
             consensus_distance_nojoin: AssymetricLaplace::symmetric_from_moments(
                 statistics.unjoinable_consensus.mean(),
                 statistics.unjoinable_consensus.standard_deviation(),
@@ -240,8 +245,7 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
             join_prior: (statistics.joinable_target_distance.samples() as f64
                 / (statistics.joinable_target_distance.samples()
                     + statistics.unjoinable_target_distance.samples())
-                .max(1) as f64)
-                .sqrt(),
+                .max(1) as f64),
         }
     }
 }
@@ -289,16 +293,19 @@ impl JoinStatisticsCollector for BayesianJoinStatistics {
     }
 
     fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) {
-        if !link_info.neighbors {
-            return;
-        }
+        //if !link_info.neighbors {
+        //    return;
+        //}
 
         let target_dist = link_info.unexplained_bases;
         let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs();
         let (rel_con_dist, join_type) = relative_consensus_distance(
             first_block,
             second_block,
-            ConsensusDistanceNormalization::WithLength(link_info.consensus_length),
+            ConsensusDistanceNormalization::WithUBAndLength(
+                link_info.unexplained_bases,
+                link_info.consensus_length,
+            ),
         );
 
         if link_info.joinable {
diff --git a/src/main.rs b/src/main.rs
index 2cf56cb..f4a418e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -31,10 +31,13 @@ mod viterbi;
 mod viz;
 mod windowed_scores;
 
+use core::ops::RangeBounds;
 use std::{
     collections::HashMap,
+    fmt::Debug,
     fs::{self, create_dir_all, File},
     io::{BufRead, BufReader, BufWriter, Write},
+    ops::Bound,
     path::PathBuf,
 };
 
@@ -45,6 +48,8 @@ use anyhow::{Ok, Result};
 use clap::{Args, Parser};
 use itertools::Itertools;
 use rayon::prelude::*;
+use std::str::FromStr;
+use thiserror::Error;
 use viz::VizConstraint;
 
 use crate::{
@@ -111,6 +116,31 @@ pub struct PerformanceArgs {
     pub num_threads: usize,
 }
 
+#[derive(Error, Debug)]
+enum ParseRangedError<T: Debug, E> {
+    #[error(transparent)]
+    ParseError(#[from] E), // underlying `FromStr` failure, reported as-is
+    #[error("value {0:?} is not between {1:?} and {2:?}")]
+    RangeError(T, Bound<T>, Bound<T>), // (parsed value, start bound, end bound)
+}
+
+const fn ranged<F: PartialOrd + Debug + FromStr + Send + Sync + Clone + Copy + 'static>(
+    range: impl RangeBounds<F> + Send + Sync + Clone + 'static,
+) -> impl Fn(&str) -> Result<F, ParseRangedError<F, F::Err>> + Clone + Send + Sync + 'static {
+    move |v: &str| {
+        let f = F::from_str(v)?; // parse failures convert into ParseRangedError::ParseError
+        if range.contains(&f) {
+            Result::Ok(f)
+        } else {
+            Result::Err(ParseRangedError::RangeError(
+                f,
+                range.start_bound().cloned(),
+                range.end_bound().cloned(),
+            ))
+        }
+    }
+}
+
 #[derive(Args, Debug, Clone, Default)]
 pub struct AnnotationArgs {
     /// The penalty of jumping between query models
@@ -118,7 +148,8 @@ pub struct AnnotationArgs {
         short = 'J',
         long = "query-jump",
         default_value = "-127.0",
-        value_name = "f"
+        value_name = "f",
+        value_parser = ranged::<f64>(..-1.0)
     )]
     pub query_jump_penalty: f64,
 
@@ -128,7 +159,8 @@ pub struct AnnotationArgs {
         short = 'L',
         long = "skip-loop",
         default_value = "30",
-        value_name = "n"
+        value_name = "n",
+        value_parser = ranged::<usize>(1..)
     )]
     pub num_skip_loops_eq_to_jump: usize,
 
@@ -148,17 +180,19 @@ pub struct AnnotationArgs {
     #[arg(
         long = "join-likelihood-threshold",
         default_value = "0.25",
-        value_name = "f"
+        value_name = "f",
+        value_parser = ranged::<f64>(0.0..1.0)
     )]
     pub join_likelihood_threshold: f64,
 
-    /// The maximum overlap in the consensus at which
+    /// The maximum allowed overlap in the consensus at which
     /// a join is considered between compatible alignments.
     #[arg(
         short = 'O',
         long = "consensus-join-overlap",
         default_value = "200",
-        value_name = "n"
+        value_name = "n",
+        value_parser = ranged::<isize>(0..)
     )]
     pub consensus_join_overlap: isize,
 
@@ -167,14 +201,21 @@ pub struct AnnotationArgs {
     #[arg(
         short = 'C',
         long = "consensus-join-distance",
-        default_value = "3750",
-        value_name = "n"
+        default_value = "2500",
+        value_name = "n",
+        value_parser = ranged::<isize>(0..)
     )]
     pub consensus_join_distance: isize,
 
     /// The maximum seperation or overlap in nucleotides on both target and consensus
     /// for a join to be allowed between inverted alignments.
-    #[arg(long = "inversion-distance", default_value = "200", value_name = "n")]
+    #[arg(
+        short = 'I',
+        long = "inversion-distance",
+        default_value = "200",
+        value_name = "n",
+        value_parser = ranged::<isize>(0..)
+    )]
     pub inversion_distance: isize,
 
     /// The size of the window looked at to determine a single alignment score in nucleotides.
@@ -182,7 +223,8 @@ pub struct AnnotationArgs {
         short = 'W',
         long = "window-size",
         default_value = "31",
-        value_name = "n"
+        value_name = "n",
+        value_parser = ranged::<usize>(1..)
     )]
     pub score_window_size: usize,
 
@@ -191,7 +233,8 @@ pub struct AnnotationArgs {
         short = 'B',
         long = "background-window-size",
         default_value = "61",
-        value_name = "n"
+        value_name = "n",
+        value_parser = ranged::<usize>(1..)
     )]
     pub background_window_size: usize,
 
@@ -208,7 +251,8 @@ pub struct AnnotationArgs {
     #[arg(
         long = "min-segment-confidence",
         default_value = "0.1",
-        value_name = "f"
+        value_name = "f",
+        value_parser = ranged::<f64>(0.0..1.0)
     )]
     pub min_block_confidence: f64,
 
diff --git a/src/score_params.rs b/src/score_params.rs
index 783d0b6..868a948 100644
--- a/src/score_params.rs
+++ b/src/score_params.rs
@@ -69,4 +69,18 @@ impl ScoreParams {
             prior_is_different,
         )
     }
+
+    pub fn join_transition(&self, prior_is_skip: bool, join_probability: f64) -> f64 {
+        let true_join_cost = fast_select(
+            self.query_to_skip_score + (self.query_loop_score - self.query_jump_score).abs(),
+            self.query_loop_score,
+            prior_is_skip,
+        );
+        let false_join_cost = fast_select(
+            self.query_to_skip_score,
+            self.query_jump_score,
+            prior_is_skip,
+        );
+        true_join_cost * join_probability + (1.0 - join_probability) * false_join_cost
+    }
 }
diff --git a/src/segments.rs b/src/segments.rs
index f3b35c3..216015c 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -641,7 +641,6 @@ pub fn assemble_and_link_segments<'a, T: JoinEstimator>(
         &initial_segments.segments,
         region_statistics,
         query_statistics,
-        score_params,
         annotation_args,
     );
     finalize_segments(

From 9175b6a19238b6c2badaa9042144974fd05dbe8f Mon Sep 17 00:00:00 2001
From: isaacr <awesomeisaac2000@gmail.com>
Date: Tue, 9 Jun 2026 17:32:38 -0600
Subject: [PATCH 39/39] Better error handling.

---
 src/alignment.rs           | 66 ++++++++++++++++++---------
 src/join_estimation.rs     |  2 +-
 src/main.rs                | 84 +++++++++++++++++++++--------------
 src/p2estimator.rs         |  4 +-
 src/pipeline.rs            |  5 ++-
 src/segments.rs            |  8 +++-
 src/statistics.rs          |  1 -
 src/substitution_matrix.rs | 91 ++++++++++++++++++++++----------------
 src/support.rs             |  6 +--
 9 files changed, 167 insertions(+), 100 deletions(-)

diff --git a/src/alignment.rs b/src/alignment.rs
index d0d073c..a99580e 100644
--- a/src/alignment.rs
+++ b/src/alignment.rs
@@ -1,4 +1,5 @@
-use anyhow::Result;
+use anyhow::{anyhow, Context, Result};
+use itertools::Itertools;
 use serde::Deserialize;
 use std::collections::HashMap;
 use std::io::{BufRead, BufReader, Read};
@@ -520,7 +521,7 @@ impl AlignmentData {
         //   16: ACCT/GGA/TCT/CGTGGCCT/CGGGGGTTGGGGACCCCTG
         //   17: 14p41g.matrix
 
-        let substitution_matrices = VecMap::from(SubstitutionMatrix::parse(matrices));
+        let substitution_matrices = VecMap::from(SubstitutionMatrix::parse(matrices)?);
 
         let mut target_groups: Vec<TargetGroup> = vec![];
         let mut target_name_map: VecMap<String> = VecMap::new();
@@ -531,31 +532,47 @@ impl AlignmentData {
         let caf_lines = BufReader::new(caf).lines();
 
         caf_lines
-            .map(|l| l.expect("failed to read line"))
-            .filter(|l| !l.is_empty())
+            .filter_ok(|l| !l.is_empty())
             .enumerate()
-            .for_each(|(line_num, line)| {
+            .try_for_each(|(line_num, line_unchecked)| {
+                let error_msg =
+                    |msg, col| move || format!("{} at line '{}', column '{}'", msg, line_num, col);
+
+                let error_msg_str = |msg: String, col: usize| {
+                    move || format!("{} at line '{}', column '{}'", msg, line_num, col)
+                };
+
+                let line = line_unchecked?;
                 let tokens: Vec<&str> = line.split(',').collect();
 
+                if tokens.len() < 18 {
+                    return Err(anyhow!(
+                        "line {} does not have at least 18 columns!",
+                        line_num
+                    ));
+                }
+
                 let target_name = tokens[4].to_string();
-                let target_start =
-                    str::parse::<usize>(tokens[5]).expect("failed to parse target start");
-                let target_end =
-                    str::parse::<usize>(tokens[6]).expect("failed to parse target end");
+                let target_start = str::parse::<usize>(tokens[5])
+                    .with_context(error_msg("failed to parse target start", 5))?;
+                let target_end = str::parse::<usize>(tokens[6])
+                    .with_context(error_msg("failed to parse target end", 6))?;
                 let query_name = tokens[8].to_string();
-                let query_start =
-                    str::parse::<usize>(tokens[10]).expect("failed to parse query start");
-                let query_end = str::parse::<usize>(tokens[11]).expect("failed to parse query end");
-                let query_remaining =
-                    str::parse::<usize>(tokens[12]).expect("failed to parse query remaining");
+                let query_start = str::parse::<usize>(tokens[10])
+                    .with_context(error_msg("failed to parse query start", 10))?;
+                let query_end = str::parse::<usize>(tokens[11])
+                    .with_context(error_msg("failed to parse query end", 11))?;
+                let query_remaining = str::parse::<usize>(tokens[12])
+                    .with_context(error_msg("failed to parse query remaining", 12))?;
 
                 let strand = match tokens[13] {
-                    "0" => Strand::Forward,
-                    "1" => Strand::Reverse,
-                    _ => {
-                        panic!()
-                    }
-                };
+                    "0" => Ok(Strand::Forward),
+                    "1" => Ok(Strand::Reverse),
+                    v => Err(anyhow!(error_msg_str(
+                        format!("invalid strand value: '{}'", v),
+                        13
+                    )())),
+                }?;
 
                 let (query_start, query_end) = match strand {
                     Strand::Forward => (query_start, query_end),
@@ -595,7 +612,10 @@ impl AlignmentData {
                     .values()
                     .enumerate()
                     .find(|(_, m)| m.name == substitution_matrix_name)
-                    .expect("unknown substitution matrix")
+                    .with_context(error_msg_str(
+                        format!("unknown substitution matrix '{}'", substitution_matrix_name),
+                        17,
+                    ))?
                     .0;
 
                 target_group.alignments.push(Alignment {
@@ -610,7 +630,9 @@ impl AlignmentData {
                     query_id,
                     substitution_matrix_id,
                 });
-            });
+
+                Ok(())
+            })?;
 
         if let Some(buf) = ultra {
             let buf_reader = BufReader::new(buf);
diff --git a/src/join_estimation.rs b/src/join_estimation.rs
index aa68ee0..ea259b3 100644
--- a/src/join_estimation.rs
+++ b/src/join_estimation.rs
@@ -236,7 +236,7 @@ impl From<&BayesianJoinStatistics> for BayesianJoinEstimator {
                 statistics.joinable_consensus_neg.mean(),
                 statistics.joinable_consensus_pos.mean(),
             ),
-            // TODO: Replace with beta dist, better matches dists we see...
+            // TODO: Consider replacing with beta dist, better matches dists we see...
             consensus_distance_nojoin: AssymetricLaplace::symmetric_from_moments(
                 statistics.unjoinable_consensus.mean(),
                 statistics.unjoinable_consensus.standard_deviation(),
diff --git a/src/main.rs b/src/main.rs
index f4a418e..d4dcbc4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -44,7 +44,7 @@ use std::{
 use alignment::AlignmentData;
 use chunks::ProximityGroup;
 
-use anyhow::{Ok, Result};
+use anyhow::{anyhow, Context, Ok, Result};
 use clap::{Args, Parser};
 use itertools::Itertools;
 use rayon::prelude::*;
@@ -348,18 +348,18 @@ fn main() -> Result<()> {
         if let Result::Ok(metadata) = fs::metadata(&viz_args.viz_output_path) {
             if metadata.is_dir() {
                 // TODO: real error
-                panic!(
-                    "directory: {} already exists",
-                    viz_args.viz_output_path.to_str().unwrap()
-                )
+                return Result::Err(anyhow!(
+                    "directory: '{}' already exists",
+                    viz_args.viz_output_path.to_str().unwrap_or("?")
+                ));
             }
         }
 
-        create_dir_all(&viz_args.viz_output_path)?;
-        viz_args.viz_output_path = viz_args.viz_output_path.canonicalize()?;
-
         if let Some(path) = &viz_args.viz_reference_bed_path {
-            let file = File::open(path).expect("failed to open viz reference bed file");
+            let file = File::open(path).context(format!(
+                "failed to open viz reference bed file: '{}'",
+                path.to_str().unwrap_or("?")
+            ))?;
             let reader = BufReader::new(file);
 
             let mut chrom_list = vec![String::from("sentinel")];
@@ -369,36 +369,51 @@ fn main() -> Result<()> {
                 .lines()
                 .map(|l| l.unwrap())
                 .enumerate()
-                .for_each(|(line_num, line)| {
+                .try_for_each(|(line_num, line)| {
+                    let line_num_info = || format!("failed to read line {}", line_num);
+
                     let tokens: Vec<&str> = line.split_whitespace().collect();
                     let chrom = tokens[0].to_string();
-                    let start = tokens[1].parse::<usize>().expect("failed to parse int");
+                    let start = tokens[1].parse::<usize>().with_context(line_num_info)?;
 
-                    let last_chrom = chrom_list.last().expect("chrom list is empty");
+                    let last_chrom = chrom_list.last().context("chrom list is empty")?;
 
                     if chrom == *last_chrom {
                         if prev_start > start {
-                            panic!("bed file is unsorted");
+                            return Result::Err(anyhow!("bed file is unsorted"));
                         }
                     } else if !chrom_list.contains(&chrom) {
                         chrom_list.push(chrom.clone());
                         index.insert(chrom, line_num);
                     } else {
-                        panic!("bed file is unsorted");
+                        return Result::Err(anyhow!("bed file is unsorted"));
                     }
 
                     prev_start = start;
-                });
+
+                    Ok(())
+                })
+                .context(format!(
+                    "failed to parse bed file: '{}'",
+                    path.to_str().unwrap_or("?")
+                ))?;
 
             viz_args.viz_reference_bed_index = index;
         }
     }
 
-    let alignments_file = File::open(&args.alignments)?;
-    let matrices_file = File::open(&args.matrices)?;
+    let alignments_file = File::open(&args.alignments).context(format!(
+        "failed to open alignments file: '{}'",
+        args.alignments
+    ))?;
+    let matrices_file = File::open(&args.matrices)
+        .context(format!("failed to open matrices file: '{}'", args.matrices))?;
 
     let ultra_file = match args.ultra_args.ultra_file_path {
-        Some(ref path) => Some(File::open(path)?),
+        Some(ref path) => Some(File::open(path).context(format!(
+            "failed to open ultra file: '{}'",
+            path.to_str().unwrap_or("?")
+        ))?),
         None => None,
     };
 
@@ -422,22 +437,27 @@ fn main() -> Result<()> {
     if let Some(path) = &args.io_args.regions_path {
         let regions_file = File::create(path).unwrap();
         let mut regions_writer = BufWriter::new(regions_file);
-        proximity_groups.iter().enumerate().for_each(|(idx, g)| {
-            writeln!(
-                &mut regions_writer,
-                "{},{},{}:{},{}:{}",
-                idx,
-                alignment_data.target_name_map.get(g.target_id),
-                g.target_start,
-                g.target_end,
-                g.line_start,
-                g.line_end,
-            )
-            .expect("failed to write to regions file")
-        });
+        proximity_groups
+            .iter()
+            .enumerate()
+            .try_for_each(|(idx, g)| {
+                writeln!(
+                    &mut regions_writer,
+                    "{},{},{}:{},{}:{}",
+                    idx,
+                    alignment_data.target_name_map.get(g.target_id),
+                    g.target_start,
+                    g.target_end,
+                    g.line_start,
+                    g.line_end,
+                )
+                .context("failed to write to regions file")
+            })?;
     }
 
     if viz_args.viz {
+        create_dir_all(&viz_args.viz_output_path)?;
+        viz_args.viz_output_path = viz_args.viz_output_path.canonicalize()?;
         let mut index_file = File::create(viz_args.viz_output_path.join("index.html")).unwrap();
 
         write_index_file(
@@ -446,7 +466,7 @@ fn main() -> Result<()> {
             &proximity_groups,
             &viz_args.viz_constraints,
         )
-        .expect("failed to write to index.html");
+        .context("failed to write to index.html file for visualization")?;
     }
 
     debug_assert!(validate_groups(
diff --git a/src/p2estimator.rs b/src/p2estimator.rs
index ede85fe..509d943 100644
--- a/src/p2estimator.rs
+++ b/src/p2estimator.rs
@@ -540,7 +540,9 @@ impl<T: QuantileEstimatorRepresentation> Distribution for QuantileEstimator<T> {
     fn pdf(&self, _x: f64) -> f64 {
         // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)...
         // May be worth splitting out into different class to allow pre-processing this stuff...
-        // TODO: Would prefer quintic splines for this... Allows us to avoid normalization...
+        // TODO: Would prefer monotonic quintic splines for this as we can just use the derivative... Allows us to avoid normalization...
+        // Idea: Traditional splines aren't monotonic, but if we make their derivative take the form (p(x))^2 where p(x) is a polynomial
+        // we are guaranteed to have monotonic splines. Final step would be solving for the basis functions.
         panic!("Currently not supported!");
     }
 
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 7200f78..bf93a42 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -205,7 +205,10 @@ pub fn run_naive_trace<T: JoinStatisticsCollector>(
     .unwrap();
 
     confidence(&mut confidence_matrix);
-    let confidence_by_row = windowed_confidence(&mut confidence_matrix);
+    let confidence_by_row = windowed_confidence(
+        &mut confidence_matrix,
+        args.annotation_args.score_window_size,
+    );
 
     let segments;
     let simple_trace;
diff --git a/src/segments.rs b/src/segments.rs
index 216015c..118e13a 100644
--- a/src/segments.rs
+++ b/src/segments.rs
@@ -497,7 +497,13 @@ pub fn segments_from_matrix_trace(
             .collect_vec();
 
         // Compute scores and start/end points for all rows in this segment....
-        // TODO: This isn't fully correct, if we want it to be we need to track if this segment starts in, and if the segment ends in a skip state to compute correct transitions for history tracing...
+        // TODO: This isn't fully correct,
+        // if we want it to be identical to initial viterbi trace,
+        // we need to track if each block in this segment starts in the skip state,
+        // and if each block in each segment ends in a skip state to compute correct
+        // transitions for history tracing...
+        // This would require a few additional booleans/states for each block and adjustments
+        // to the history tracing to incorporate them...
         for column in seg.col_start..=seg.col_end {
             let rows = &matrix_definition.active_rows_by_col[column];
             let row_iter = rows
diff --git a/src/statistics.rs b/src/statistics.rs
index f5014b8..014aac1 100644
--- a/src/statistics.rs
+++ b/src/statistics.rs
@@ -9,7 +9,6 @@ pub fn ln_add_exp(a: f64, b: f64) -> f64 {
     max + (min - max).exp().ln_1p()
 }
 
-// TODO: Support for generic floating types...
 #[allow(dead_code)]
 pub trait Distribution {
     fn pdf(&self, x: f64) -> f64;
diff --git a/src/substitution_matrix.rs b/src/substitution_matrix.rs
index 40127f5..901a7d1 100644
--- a/src/substitution_matrix.rs
+++ b/src/substitution_matrix.rs
@@ -1,8 +1,7 @@
-use std::{
-    fs::File,
-    io::{BufRead, BufReader, Read},
-    path::Path,
-};
+use std::io::{BufRead, BufReader, Read};
+
+use anyhow::{anyhow, Context};
+use itertools::Itertools;
 
 use crate::alphabet::{
     ALIGNMENT_ALPHABET_STR, GAP_EXTEND_DIGITAL, GAP_OPEN_DIGITAL, STR_TO_DIGITAL_NUCLEOTIDE,
@@ -194,13 +193,7 @@ impl SubstitutionMatrix {
         }
     }
 
-    #[allow(dead_code)]
-    pub fn from_file(path: impl AsRef<Path>) -> Vec<SubstitutionMatrix> {
-        let file = File::open(path).expect("failed to open matrix file");
-        SubstitutionMatrix::parse(file)
-    }
-
-    pub fn parse<R: Read>(matrix_buf: R) -> Vec<SubstitutionMatrix> {
+    pub fn parse<R: Read>(matrix_buf: R) -> anyhow::Result<Vec<SubstitutionMatrix>> {
         let mut matrices = vec![];
 
         let buf_reader = BufReader::new(matrix_buf);
@@ -208,9 +201,8 @@ impl SubstitutionMatrix {
 
         let mut lines: Vec<String> = buf_reader
             .lines()
-            .map(|l| l.expect("failed to read line"))
-            .filter(|l| !l.is_empty())
-            .collect();
+            .filter_ok(|l| !l.is_empty())
+            .try_collect()?;
 
         // add a single blank line to the
         // end to serve as a sentinel
@@ -232,26 +224,38 @@ impl SubstitutionMatrix {
         line_tokens
             .iter()
             .zip(line_tokens.iter().skip(1))
-            .for_each(|(tokens, next_tokens)| {
+            .enumerate()
+            .try_for_each(|(line_num, (tokens, next_tokens))| {
+                let error_msg = |msg| move || format!("{} at line {}", msg, line_num);
+
                 match state {
                     ParserState::Header => match tokens.first() {
                         Some(&"#matrix") => {
                             name = tokens[1].to_string();
                         }
                         Some(&"#gap-open") => {
-                            gap_open = tokens[1].parse::<f64>().expect("failed to parse float");
+                            gap_open = tokens[1]
+                                .parse::<f64>()
+                                .with_context(error_msg("failed to parse float"))?;
                         }
                         Some(&"#gap-ext") => {
-                            gap_extend = tokens[1].parse::<f64>().expect("failed to parse float");
+                            gap_extend = tokens[1]
+                                .parse::<f64>()
+                                .with_context(error_msg("failed to parse float"))?;
                         }
                         Some(&"#lambda") => {
-                            lambda = tokens[1].parse::<f64>().expect("failed to parse float");
+                            lambda = tokens[1]
+                                .parse::<f64>()
+                                .with_context(error_msg("failed to parse float"))?;
                         }
                         Some(&"#fi") => {
                             let freqs: Vec<f64> = tokens[1..=4]
                                 .iter()
-                                .map(|&f| f.parse::<f64>().expect("failed to parse float"))
-                                .collect();
+                                .map(|&f| {
+                                    f.parse::<f64>()
+                                        .with_context(error_msg("failed to parse float"))
+                                })
+                                .try_collect()?;
                             background_freqs_i
                                 .iter_mut()
                                 .zip(freqs)
@@ -260,14 +264,17 @@ impl SubstitutionMatrix {
                         Some(&"#fj") => {
                             let freqs: Vec<f64> = tokens[1..=4]
                                 .iter()
-                                .map(|&f| f.parse::<f64>().expect("failed to parse float"))
-                                .collect();
+                                .map(|&f| {
+                                    f.parse::<f64>()
+                                        .with_context(error_msg("failed to parse float"))
+                                })
+                                .try_collect()?;
                             background_freqs_j
                                 .iter_mut()
                                 .zip(freqs)
                                 .for_each(|(a, b)| *a = b);
                         }
-                        _ => panic!(),
+                        _ => return Err(anyhow!(error_msg("Unknown matrices file syntax!")())),
                     },
                     ParserState::Chars => {
                         chars = tokens.iter().map(|t| t.to_string()).collect();
@@ -275,8 +282,11 @@ impl SubstitutionMatrix {
                     ParserState::Scores => scores_vec.push(
                         tokens
                             .iter()
-                            .map(|t| t.parse::<f64>().expect("failed to parse float"))
-                            .collect(),
+                            .map(|t| {
+                                t.parse::<f64>()
+                                    .with_context(error_msg("failed to parse float"))
+                            })
+                            .try_collect()?,
                     ),
                 }
 
@@ -288,9 +298,12 @@ impl SubstitutionMatrix {
                     let char_indices: Vec<usize> = chars
                         .iter()
                         .map(|c| {
-                            (*STR_TO_DIGITAL_NUCLEOTIDE.get(c).expect("invalid char")) as usize
+                            STR_TO_DIGITAL_NUCLEOTIDE
+                                .get(c)
+                                .ok_or_else(|| anyhow!(error_msg("invalid char")()))
+                                .map(|v| *v as usize)
                         })
-                        .collect();
+                        .try_collect()?;
 
                     scores_vec
                         .iter()
@@ -321,26 +334,30 @@ impl SubstitutionMatrix {
                         scores,
                     ));
                     scores_vec = vec![];
+
+                    Ok::<_, anyhow::Error>(())
                 };
 
                 state = match next_tokens.first() {
                     Some(token) if token.starts_with('#') => {
                         if let ParserState::Scores = state {
-                            add_matrix()
+                            add_matrix()?;
                         }
                         ParserState::Header
                     }
                     Some(token) if token.parse::<f64>().is_err() => ParserState::Chars,
                     Some(token) if token.parse::<f64>().is_ok() => ParserState::Scores,
                     None => {
-                        add_matrix();
-                        return;
+                        add_matrix()?;
+                        return Ok(());
                     }
-                    _ => panic!(),
+                    _ => return Err(anyhow!(error_msg("Unknown syntax in matrices file!")())),
                 };
-            });
 
-        matrices
+                Ok(())
+            })?;
+
+        Ok(matrices)
     }
 }
 
@@ -432,7 +449,7 @@ mod tests {
             [0.0220, 0.1331, 0.0368, 0.9131],
         ];
 
-        let matrix_vec = SubstitutionMatrix::parse(matrix_buf);
+        let matrix_vec = SubstitutionMatrix::parse(matrix_buf)?;
         let matrix = matrix_vec.first().unwrap();
 
         matrix
@@ -477,7 +494,7 @@ mod tests {
             -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30"
             .as_bytes();
 
-        let matrix_vec = SubstitutionMatrix::parse(matrix_buf);
+        let matrix_vec = SubstitutionMatrix::parse(matrix_buf)?;
         let matrix = matrix_vec.first().unwrap();
 
         let correct: [[f64; 12]; 12] = [
@@ -568,7 +585,7 @@ mod tests {
             -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30"
             .as_bytes();
 
-        let matrix_vec = SubstitutionMatrix::parse(matrix_buf);
+        let matrix_vec = SubstitutionMatrix::parse(matrix_buf)?;
         let matrix = matrix_vec.first().unwrap();
 
         let mut ali: Vec<Alignment> = [
diff --git a/src/support.rs b/src/support.rs
index 4250363..1a6f9c0 100644
--- a/src/support.rs
+++ b/src/support.rs
@@ -2,10 +2,8 @@ use crate::matrix::Matrix;
 
 /// Smooth the values of a confidence matrix by convolving it with a fixed size rectangular kernel that adds to 1.
 /// This is the same as averaging values over the window range.
-/// The current kernel size is hardcoded to 31.
-pub fn windowed_confidence(matrix: &mut Matrix<f64>) -> Vec<f64> {
-    // TODO: parameterize this
-    let half_window_size = 15usize;
+pub fn windowed_confidence(matrix: &mut Matrix<f64>, window_size: usize) -> Vec<f64> {
+    let half_window_size = (window_size - 1) / 2;
 
     let mut confidence_by_row = vec![0.0; matrix.num_rows()];