diff --git a/.gitignore b/.gitignore index a744cfb..801aca5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,6 @@ Cargo.lock # Visuals generated by aurora... /viz/ -out.txt + +# Temporary output files... +/out*.txt diff --git a/.zed/settings.json b/.zed/settings.json new file mode 100644 index 0000000..d229d6c --- /dev/null +++ b/.zed/settings.json @@ -0,0 +1,16 @@ +// Folder-specific settings +// +// For a full list of overridable settings, and general information on folder-specific settings, +// see the documentation: https://zed.dev/docs/configuring-zed#settings-files +{ + "terminal": { + "detect_venv": { + "on": { + "directories": [ + ".venv" + ], + "activate_script": "default" + } + } + } +} diff --git a/Cargo.toml b/Cargo.toml index fbf7952..1a636d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ serde_json = "1.0.93" itertools = "0.11.0" rayon = "1.8.0" base64 = "0.22.1" +puruspe = "0.4.4" [target.'cfg(not(target_env = "msvc"))'.dependencies] tikv-jemallocator = "0.5" @@ -29,3 +30,5 @@ lto = "thin" codegen-units = 1 debug = false +[dev-dependencies] +rand = "0.10.1" diff --git a/fixtures/soda/table.html b/fixtures/soda/table.html index 73ccfeb..26ca293 100644 --- a/fixtures/soda/table.html +++ b/fixtures/soda/table.html @@ -2,16 +2,18 @@ PAGE_TITLE - + +

PAGE_TITLE

-
+
< Back -
- TABLE_TARGET +
+
+ +
+ +
+ + + + + +
- \ No newline at end of file + diff --git a/scripts/plot_consensus_distance_caf.py b/scripts/plot_consensus_distance_caf.py new file mode 100644 index 0000000..18d4937 --- /dev/null +++ b/scripts/plot_consensus_distance_caf.py @@ -0,0 +1,44 @@ +import sys + +import matplotlib.pyplot as plt +import numpy as np + +caf_file = sys.argv[1] + +gap_info = {} +prior = {} + +with open(caf_file, "r") as f: + for line in f: + tokens = line.strip().split(",") + name = tokens[8].strip() + qstart = int(tokens[10]) + qend = int(tokens[11]) + tstart = int(tokens[5]) + is_neg_strand = int(tokens[13]) + + if name in prior: + pstart, pend, p_is_neg, p_tstart = prior[name] + + if tstart - p_tstart < 10000: + if name not in gap_info: + gap_info[name] = [] + + if not is_neg_strand and not p_is_neg: + gap = qstart - pend + elif is_neg_strand and p_is_neg: + gap = qend - pstart + else: + continue + + gap_info[name].append(gap) + + prior[name] = (qstart, qend, is_neg_strand, tstart) + + +for gap_name, gap_vals in sorted( + gap_info.items(), key=lambda k: len(k[1]), reverse=True +): + plt.title(gap_name) + plt.hist(gap_vals, 500) + plt.show() diff --git a/scripts/plot_consensus_distance_repeatmasker.py b/scripts/plot_consensus_distance_repeatmasker.py new file mode 100644 index 0000000..23dab81 --- /dev/null +++ b/scripts/plot_consensus_distance_repeatmasker.py @@ -0,0 +1,191 @@ +# This file plots histograms of join distance per tandem repeat family given a repeatmasker formatted bed file as an argument. + +import sys + +import matplotlib.pyplot as plt +import numpy as np + +caf_file = sys.argv[1] + +gap_info = {} +length_estimate = {} +sequences = [] + + +def get_gap( + pstart, + pend, + qstart, + qend, + qremaining, + ppos, + is_pos, + check_valid: bool = True, +): + c_len = max(qremaining + qend, qremaining + qstart) + try: + if is_pos and ppos: + if check_valid: + assert pstart <= qstart <= qend and pstart <= pend <= qend + gap = qstart - pend + gap /= c_len + elif not is_pos and not ppos: + if check_valid: + assert qstart <= qend <= pend and qstart <= pstart <= pend + gap = pstart - qend + gap /= c_len + else: + gap = None + except AssertionError: + if is_pos and ppos: + print(f"Violation: {pstart} => {pend} => {qstart} => {qend}") + else: + print(f"Violation: {pend} <= {pstart} <= {qend} <= {qstart}") + + print(f"Offending annotation: {' '.join(line.split('\t')[:-1])}") + print(f"Offending sequences:\n\t {'\n\t'.join(seqs)}") + gap = None + + return gap + + +with open(caf_file, "r") as f: + for line in f: + shared_name = line.strip().split("\t")[3] # .split("#")[-1] + seqs = line.strip().split("\t")[-1].split(",") + sequences.extend(seqs) + + if len(seqs) <= 1: + continue + + # Sort by location on the target... + seqs = sorted(seqs, key=lambda k: int(k.split(" ")[5])) + + prior = None + length = 0 + + for seq in seqs: + tokens = seq.split(" ") + name = f"{tokens[9]}#{tokens[10]}" + is_pos = tokens[8] == "+" + join_id = int(tokens[14]) + + if is_pos: + qstart = int(tokens[11]) + qend = int(tokens[12]) + qremaining = int(tokens[13].strip("()")) + else: + qstart = int(tokens[13]) + qend = int(tokens[12]) + qremaining = int(tokens[11].strip("()")) + + assert qend >= qstart + length += qend - qstart + + if prior is not None: + pname, ppos, pstart, pend = prior + if pname != name: + gap = None + else: + gap = get_gap(pstart, pend, qstart, qend, qremaining, ppos, is_pos) + + if gap is not None: + if shared_name not in gap_info: + gap_info[shared_name] = [] + + if gap > 2: + print( + f"Gap > 2, Gap Value {gap}: {seq}\n\t {'\n\t'.join(seqs)}" + ) + gap_info[shared_name].append(gap) + + prior = (name, is_pos, qstart, qend) + + old_length_est = length_estimate.get(shared_name, (0, 0, 0)) + length_estimate[shared_name] = ( + old_length_est[0] + length, + old_length_est[1] + 1, + max(old_length_est[2], length), + ) + + +sequences.sort(key=lambda seq: int(seq.split(" ")[5])) +gap_nojoin_info = {} +other_priors = {} +target_nojoin_info = {} + +for seq in sequences: + tokens = seq.split(" ") + name = f"{tokens[9]}#{tokens[10]}" + is_pos = tokens[8] == "+" + join_id = int(tokens[14]) + tstart = int(tokens[5]) + + if is_pos: + qstart = int(tokens[11]) + qend = int(tokens[12]) + qremaining = int(tokens[13].strip("()")) + else: + qstart = int(tokens[13]) + qend = int(tokens[12]) + qremaining = int(tokens[11].strip("()")) + + random_prior = other_priors.get(name, None) + if random_prior is not None: + ppos, pstart, pend, pjoin_id, p_tstart = random_prior + if pjoin_id == join_id: + gap = None + else: + gap = get_gap(pstart, pend, qstart, qend, qremaining, ppos, is_pos, False) + + target_gap = tstart - p_tstart + + if gap is not None: + if name not in target_nojoin_info: + target_nojoin_info[name] = [] + target_nojoin_info[name].append(target_gap) + + if name not in gap_nojoin_info: + gap_nojoin_info[name] = [] + gap_nojoin_info[name].append(gap) + + other_priors[name] = (is_pos, qstart, qend, join_id, tstart) + + +for gap_name, gap_join_vals in sorted( + gap_info.items(), key=lambda k: len(k[1]), reverse=True +): + gap_join_vals = np.array(gap_join_vals) + # gap_vals = gap_vals[np.abs(gap_vals) > 1] + + avg_len = length_estimate[gap_name][0] / length_estimate[gap_name][1] + max_len = length_estimate[gap_name][2] + + avg_pos_d = np.mean(gap_join_vals[gap_join_vals >= 0]) + avg_neg_d = np.mean(gap_join_vals[gap_join_vals <= 0]) + + info = f"Positive Avg: {avg_pos_d:.02f},\n Negative Avg: {avg_neg_d:.02f}, ({avg_len / avg_pos_d:.02f}, {max_len / avg_pos_d:.02f})" + + range = (0, np.max(gap_join_vals)) + x = np.arange(*range) + lamb = 1 / avg_pos_d + + fig, (ax1, ax2, ax3) = plt.subplots(1, 3, squeeze=True) + ax1.set_title(f"{gap_name} (Avg Length: {avg_len:.02f}, Max Length: {max_len})") + + ax1.hist(gap_join_vals, 50, label=info) + # plt.plot(x, lamb * np.exp(-lamb * x)) + ax1.legend() + + ax2.set_title(f"{gap_name} Non-Join Consensus Gaps") + ax2.hist(gap_nojoin_info[gap_name], 50) + + ax3.set_title(f"{gap_name} Scatterplot") + ax3.scatter(target_nojoin_info[gap_name], gap_nojoin_info[gap_name]) + ax3.set_xlabel("Target Distance") + ax3.set_ylabel("Consensus Distance") + + w, h = fig.get_size_inches() + fig.set_size_inches(w * 3, h) + fig.tight_layout() + plt.show() diff --git a/scripts/plot_distributions_aurora.py b/scripts/plot_distributions_aurora.py new file mode 100644 index 0000000..dfe8961 --- /dev/null +++ b/scripts/plot_distributions_aurora.py @@ -0,0 +1,434 @@ +import sys +import typing +from dataclasses import dataclass, fields +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import to_rgba +from scipy.optimize import curve_fit, minimize +from scipy.stats import ( + ecdf, + expon, + genpareto, + gumbel_r, + halfnorm, + invweibull, + laplace_asymmetric, + norm, + weibull_min, +) + + +@dataclass +class AuroraEntry: + annotation_count: int + target: str + target_start: list[int] + target_end: list[int] + query: list[str] + query_start: list[int] + query_end: list[int] + strand: list[str] + score: float + kimera80: list[float] + join_id: int + region: int + + @classmethod + def from_line(cls, line: str) -> typing.Self: + parts = line.strip().split() + + parsed = [] + max_list_len = 0 + + for field, part in zip(fields(cls), parts): + if isinstance(field.type, str): + raise ValueError("Can't parse strings!") + + origin = typing.get_origin(field.type) + if origin is not None and issubclass(origin, list): + tp = typing.get_args(field.type)[0] + parsed.append([tp(v) for v in part.split(",")]) + max_list_len = max(max_list_len, len(parsed[-1])) + else: + parsed.append(field.type(part)) + + if max_list_len > 0: + parsed_new = [] + + for v in parsed: + if isinstance(v, list): + if len(v) == max_list_len: + parsed_new.append(v) + elif len(v) == 1: + parsed_new.append(v * max_list_len) + else: + raise ValueError( + f"One of the elements has and invalid length: {v}" + ) + else: + parsed_new.append(v) + + parsed = parsed_new + + return cls(*parsed) + + def get_key(self) -> tuple[int, int]: + return (self.region, self.join_id) + + def select(self, idx: int) -> typing.Self: + return type(self)( + self.annotation_count, + self.target, + [self.target_start[idx]], + [self.target_end[idx]], + [self.query[idx]], + [self.query_start[idx]], + [self.query_end[idx]], + [self.strand[idx]], + self.score, + [self.kimera80[idx]], + self.join_id, + self.region, + ) + + +if len(sys.argv) not in [2, 3]: + print("Usage:") + print(f"\t{Path(sys.argv[0]).name} AURORA_OUTPUT_FILE [dist|scatter]") + sys.exit(1) + +aurora_file = sys.argv[1] +mode = sys.argv[2] if len(sys.argv) > 2 else "dist" + +if mode not in ["scatter", "dist"]: + print("Second argument (the mode) must be 'scatter' or 'dist'!") + sys.exit(1) + +if mode == "dist": + print("Generating distributions plots...") +else: + print("Generating scatter plots...") + +joined_annots: dict[tuple[int, int], list[AuroraEntry]] = {} + +with open(aurora_file, "r") as f: + for line in f: + entry = AuroraEntry.from_line(line) + key = entry.get_key() + if key not in joined_annots: + joined_annots[key] = [] + joined_annots[key].append(entry) + +for key, annots in joined_annots.items(): + annots.sort(key=lambda k: min(k.target_start)) + + +random_stats = {} +join_stats = {} +prior_vals = {} +seq_size = {} + + +def consensus_dist(a: AuroraEntry, b: AuroraEntry, a_idx: int, b_idx: int): + def best(*a): + return min(a, key=lambda v: abs(v)) + + match (a.strand[a_idx], b.strand[b_idx]): + case ("+", "+"): + return b.query_start[b_idx] - a.query_end[a_idx] - 1 + case ("-", "-"): + return a.query_end[a_idx] - b.query_start[b_idx] - 1 + case ("-", "+"): + return None + return best( + a.query_start[a_idx] - b.query_start[b_idx] - 1, + b.query_end[b_idx] - a.query_end[a_idx] - 1, + ) + case ("+", "-"): + return None + return best( + b.query_start[b_idx] - a.query_start[a_idx] - 1, + a.query_end[a_idx] - b.query_end[b_idx] - 1, + ) + case _: + raise ValueError("Unknown stand configuration.") + + +def _target_distance(a: AuroraEntry, b: AuroraEntry, ai: int, bi: int): + assert b.target_start[bi] - a.target_end[ai] - 1 >= 0 + return b.target_start[bi] - a.target_end[ai] - 1 + + +def _kimura_dist(a, b, ai, bi): + return abs(b.kimera80[bi] - a.kimera80[ai]) + + +def _relative_consensus_dist(a, b, ai, bi): + d = consensus_dist(a, b, ai, bi) + sum_seq_len = max( + [ + abs(v) + for v in ( + a.query_end[ai] - a.query_start[ai], + b.query_end[bi] - b.query_start[bi], + ) + ] + ) + if d is None: + return None + try: + return d / sum_seq_len + except ZeroDivisionError: + return None + + +stats_to_compute = { + "Target Distance": _target_distance, + "Divergence Change": _kimura_dist, + "Relative Consensus Distance": _relative_consensus_dist, +} + + +class Distribution: + def __init__(self, dist, defaults, exclude_location=True): + self._dist = dist + self.DEFAULTS = list(defaults) + self._excl_loc = exclude_location + self.NAME = getattr( + dist, "__name__", getattr(type(dist), "__qualname__", repr(dist)) + ) + + def pdf(self, x, *args): + # print(*args) + if self._excl_loc: + return self._dist.pdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.pdf(x, *args) + + def cdf(self, x, *args): + if self._excl_loc: + return self._dist.cdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.cdf(x, *args) + + def logcdf(self, x, *args): + print(*args) + if self._excl_loc: + return self._dist.logcdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.logcdf(x, *args) + + +estimator = { + "Relative Consensus Distance": Distribution( + invweibull, (1.0, 0.0, 1.0), False + ), # Distribution(invweibull, (1.0, 0.0, 1.0), False), Distribution(laplace_asymmetric, (1.0, 0.0, 1.0), False) + "Target Distance": Distribution( + genpareto, (0.0, 1.0) + ), # Distribution(expon, (1.0,)), # Distribution(genpareto, (0.0, 1.0)), Distribution(weibull_min, (1.0, 10000) + "Divergence Change": Distribution(halfnorm, (1.0,)), +} + + +def fit_dist(data, dist): + emp_cdf = ecdf(data).cdf + + return curve_fit( + dist.cdf, + emp_cdf.quantiles, + emp_cdf.probabilities, + dist.DEFAULTS, + full_output=True, + )[0] + + +random_idx_reference = {} +random_is_join = {} + +for name in stats_to_compute: + join_stats[name] = {} + random_stats[name] = {} + +all_anots_flat = [ann for annots in joined_annots.values() for ann in annots] +all_anots_flat.sort(key=lambda v: min(v.target_start)) + + +for ann_i, ann in enumerate(all_anots_flat): + for i in range(len(ann.query)): + name = ann.query[i] + (pann, j, pann_i) = prior_vals.get(name, (None, None, None)) + if pann is not None: + # if pann.region == ann.region and pann.join_id == ann.join_id: + # continue + + (ann1, idx1), (ann2, idx2) = sorted( + [(pann, j), (ann, i)], key=lambda v: v[0].target_start[v[1]] + ) + + stats = { + stat_name: stats_to_compute[stat_name](ann1, ann2, idx1, idx2) + for stat_name in stats_to_compute + } + + if all([v is not None for v in stats.values()]): + for stat_name, value in stats.items(): + values_per_query = random_stats[stat_name] + + if name not in values_per_query: + values_per_query[name] = [] + values_per_query[name].append(value) + + if name not in random_idx_reference: + random_idx_reference[name] = [] + random_is_join[name] = [] + + random_idx_reference[name].append((ann_i, i, pann_i, j)) + random_is_join[name].append( + ann in joined_annots.get(pann.get_key(), []) + ) + + seq_size[name] = max( + seq_size.get(name, 1), + ann.query_start[i], + ann.query_end[i], + ) + prior_vals[name] = (ann, i, ann_i) + +for k, annots in joined_annots.items(): + if len(annots) <= 1: + continue + + for a, b in zip(annots[:-1], annots[1:]): + for i in range(len(a.query)): + name = a.query[i] + + stats = { + stat_name: stats_to_compute[stat_name](a, b, i, i) + for stat_name in stats_to_compute + } + + if all([v is not None for v in stats.values()]): + for stat_name, value in stats.items(): + values_per_query = join_stats[stat_name] + + if name not in values_per_query: + values_per_query[name] = [] + values_per_query[name].append(value) + + +for query_name, _ in sorted( + next(iter(join_stats.values())).items(), key=lambda k: -len(k[1]) +): + # if not query_name.startswith("sin"): + # continue + join_indexes = np.flatnonzero(random_is_join[query_name]) + not_join_indexes = np.flatnonzero(~np.array(random_is_join[query_name])) + + if mode == "dist": + fig, axs = plt.subplots(3, len(stats_to_compute)) + axs = axs.T + + fig.suptitle(f"{query_name} (Size: {seq_size.get(query_name, 0)})") + + for name, (ax1, ax2, ax3) in zip(stats_to_compute, axs): + est = estimator[name] + + join_samples = np.array(join_stats[name][query_name]) + sx = np.linspace(join_samples.min(), join_samples.max(), 1000) + fit = fit_dist(join_samples, est) + ax1.set_title(f"Join {name}") + ax1.hist( + join_samples, + 200, + density=True, + label=f"Mean: {np.mean(join_samples):.02f}\nSTD: {np.std(join_samples):.02f}", + ) + ax1.plot( + sx, + est.pdf(sx, *fit), + label=f"Fit: {', '.join(f'{v:.02f}' for v in fit)}", + ) + ax1.legend(fontsize="xx-small") + + random_samples = np.array(random_stats[name][query_name]) + sx2 = np.linspace(random_samples.min(), random_samples.max(), 1000) + fit2 = fit_dist(random_samples[not_join_indexes], est) + ax2.set_title(f"All {name}") + ax2.plot( + [0], + [0], + color="black", + visible=False, + label=f"Mean: {np.mean(random_samples):.02f}\nSTD: {np.std(random_samples):.02f}", + ) + ax2.hist( + [random_samples[not_join_indexes], random_samples[join_indexes]], + 200, + label=["Not Joined", "Joined"], + density=True, + stacked=True, + color=[to_rgba(c) for c in ["tab:orange", "tab:blue"]], + ) + ax2.plot( + sx2, + est.pdf(sx2, *fit2), + "black", + label=f"Fit: {', '.join(f'{v:.02f}' for v in fit2)}", + ) + ax2.legend(fontsize="xx-small") + + ax3.set_title("CDFs") + ax3.ecdf(join_samples, label="Joins CDF") + ax3.ecdf(random_samples[not_join_indexes], label="No Joins CDF") + ax3.plot(sx, est.cdf(sx, *fit), label="Est. Join CDF") + ax3.plot(sx2, est.cdf(sx2, *fit2), label="Est. No Join CDF") + ax3.legend(fontsize="xx-small") + + fig.set_size_inches(16, 8) + fig.tight_layout() + plt.show() + else: + plt.title(f"{query_name} (Size: {seq_size.get(query_name, 0)})") + + join_art = plt.plot( + np.array(random_stats["Target Distance"][query_name])[join_indexes], + np.array(random_stats["Consensus Distance"][query_name])[join_indexes], + "ro", + picker=5, + label="Joins", + ) + no_join_art = plt.plot( + np.array(random_stats["Target Distance"][query_name])[not_join_indexes], + np.array(random_stats["Consensus Distance"][query_name])[not_join_indexes], + "bo", + picker=5, + label="Not Joins", + ) + plt.xlabel("Target Distance") + plt.ylabel("Consensus Distance") + plt.legend() + fig = plt.gcf() + + def on_pick(evt): + mask = join_indexes if evt.artist == join_art else not_join_indexes + + for idx in evt.ind: + idx = mask[idx] + annot_idx, sub_i, pann_idx, p_sub_i = random_idx_reference[query_name][ + idx + ] + print(f"Index: {annot_idx}, Sub-Index: {sub_i}") + print( + f"\tTarget Distance: {random_stats['Target Distance'][query_name][idx]}" + ) + print( + f"\tConsensus Distance: {random_stats['Consensus Distance'][query_name][idx]}" + ) + print(f"\tPrior: {all_anots_flat[pann_idx].select(p_sub_i)}") + print(f"\tCurrent: {all_anots_flat[annot_idx].select(sub_i)}") + print(f"\tIs Joined: {random_is_join[query_name][idx]}") + + fig.canvas.mpl_connect("pick_event", on_pick) + plt.show() diff --git a/scripts/plot_target_distance_distribution.py b/scripts/plot_target_distance_distribution.py new file mode 100755 index 0000000..c107e36 --- /dev/null +++ b/scripts/plot_target_distance_distribution.py @@ -0,0 +1,200 @@ +# Plots distributions of distances between sequences of the same family in the Genome +# Takes a single bed file as an argument. +# Results vary between exponential and power-law depending on the family. Basically all look exponential with fatter tails for a good chunk of families. +# +# Most likely, based on results on understanding the genome (and shape!), the actual distributions are very likely Hyper-Exponential Distributions. +# Basically, it means each family is sampled from one of a weighted set of exponential distributions with different rates. +# This would make sense as TE's have different rates depending on what part of the genome your in (basically, there are functional domains or regions) +# +# Testing this though, would require an implementation of Prony's method to determine the fit. I wasn't able to find any implementations so this +# and it seems implementing such a method would take quite some time. It may be worth eventually adding if better fit's are needed. +# +# For now, an exponential distribution seems to provide a good enough approximation for use in aurora. It also is easy to fit well. +# Here's an interesting paper on the topic that seems to have landed in the same space I've been in: https://www.columbia.edu/~ww2040/FittingMixturesPerfEval98.pdf + +bed_file = sys.argv[1] + +seq_info = {} +cs = np.inf +ce = -np.inf + +with open(bed_file, "r") as f: + for line in f: + tokens = line.strip().split() + start = int(tokens[1]) + end = int(tokens[2]) + join_id = int(tokens[12]) + name = str(tokens[3]) + + if name.endswith("Simple_repeat"): + continue + + name = name.upper() + + if name not in seq_info: + seq_info[name] = [] + seq_info[name].append((start, end, join_id)) + + cs = min(cs, start) + ce = max(ce, end) + + +class Distribution: + def __init__(self, dist, defaults, exclude_location=True): + self._dist = dist + self.DEFAULTS = list(defaults) + self._excl_loc = exclude_location + self.NAME = getattr( + dist, "__name__", getattr(type(dist), "__qualname__", repr(dist)) + ) + + def pdf(self, x, *args): + # print(*args) + if self._excl_loc: + return self._dist.pdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.pdf(x, *args) + + def cdf(self, x, *args): + if self._excl_loc: + return self._dist.cdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.cdf(x, *args) + + def logcdf(self, x, *args): + print(*args) + if self._excl_loc: + return self._dist.logcdf(x, *args[:-1], 0.0, args[-1]) + else: + return self._dist.logcdf(x, *args) + + +distributions = [ + Distribution(expon, (1000.0,)), + Distribution(weibull_min, (2.0, 1000.0)), + # Distribution(invweibull, (1.0, 20000.0)), + Distribution(lomax, (1.0, 1000.0)), + # Distribution(burr12, (1.0, 1.0, 1000.0)), + # Distribution(lognorm, (1.0, 1000.0)), + # Distribution(fisk, (1.0, 1000.0)), + Distribution(genpareto, (0.0, 1000.0)), + Distribution(betaprime, (1.0, 2.0, 1000.0)), +] + + +for name, info in sorted(seq_info.items(), key=lambda k: -len(k[1])): + # if not name.startswith("CHARLIE1#"): + # continue + + info = np.array(info) + print(info.shape) + info = info[np.argsort(info[:, 0])] + starts = info[:, 0] + ends = info[:, 1] + + # Get distances between consecutive sequences... + _, indexes = np.unique(info[:, 2], return_index=True) + indexes = np.sort(indexes) + + dists = np.diff(starts[indexes]) + + counts, bins = np.histogram(dists, "auto", density=True) + x = (bins[1:] + bins[:-1]) / 2 + + # Get fit for exponential dist... + beta = np.mean(dists) # np.median(dists) / np.log(2) + + # Get fit for weibull dist... (and CDF)... + + valid_values = np.isfinite(wcdfx) & np.isfinite(wcdfy) + fit_line = curve_fit( + lambda x, m, b: m * x + b, + wcdfx[valid_values], + wcdfy[valid_values], + sigma=wcdfy_err[valid_values], + )[0] + + wk = fit_line[0] + w_beta = np.exp(-fit_line[1] / wk) + + cdf_fits = [ + curve_fit( + distrib.cdf, + cdf.quantiles, + cdf.probabilities, + distrib.DEFAULTS, + full_output=True, + )[0] + for distrib in distributions + ] + # print(cdf_fits) + # raise ValueError + + # Printout results... + print(f"Count: {starts.shape[0]}") + print(f"Genome Size: {ce - cs}") + est_scale = (ce - cs) / starts.shape[0] + print(f"Avg Occurance Rate (in nucleotides): {est_scale}") + print(f"Exponential Fit: Scale (beta): {beta}, Rate (lambda): {1 / beta}") + print( + f"Weibull Line Fit: Shape (k): {wk}, Scale (beta): {w_beta}, Rate (lambda): {1 / w_beta}" + ) + with np.printoptions(suppress=True): + for distrib, fit in zip(distributions, cdf_fits): + print(f"{distrib.NAME} CDF Fit: {fit}") + + fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2) + fig.suptitle(name) + + ax1.set_title("Histogram and Predicted PDFs") + ax1.set_ylabel("Density") + ax1.set_xlabel("Distance (nucleotides)") + ax1.stairs(counts, bins, fill=True, color="tan", label="Histogram") + ax1.plot( + x, weibull_min.pdf(x, wk, 0.0, w_beta), color="orange", label="Weibull Line Fit" + ) + ax1.plot(x, expon.pdf(x, 0.0, est_scale), color="green", label="Naive Fit Exp") + for distrib, fit in zip(distributions, cdf_fits): + ax1.plot(x, distrib.pdf(x, *fit), label=f"{distrib.NAME} CDF Fit") + ax1.legend() + + ax2.set_title("Weibull-Space of EDF") + ax2.set_ylabel(r"$ \ln(x) $") + ax2.set_xlabel(r"$ \ln(-\ln(1 - F(x))) $") + ax2.scatter(wcdfx, wcdfy, label="Empirical Distribution Function") + ax2.plot( + wcdfx, + fit_line[1] + fit_line[0] * wcdfx, + label=f"Line: {fit_line[0]:.02f}x + {fit_line[1]:.02f}", + ) + ax2.legend() + + ax3.set_title("Cumulative Distribution Functions") + ax3.set_ylabel(r"$ P(X <= x) $") + ax3.set_xlabel("Distance (nucleotides)") + ax3.scatter(cdf.quantiles, cdf.probabilities, color="tan", label="EDF") + ax3.plot( + cdf.quantiles, + weibull_min.cdf(cdf.quantiles, wk, 0.0, w_beta), + color="orange", + label="Weibull Line Fit", + ) + ax3.plot( + cdf.quantiles, + expon.cdf(cdf.quantiles, 0.0, est_scale), + color="green", + label="Naive Fit Exp", + ) + for distrib, fit in zip(distributions, cdf_fits): + ax3.plot( + cdf.quantiles, + distrib.cdf(cdf.quantiles, *fit), + label=f"{distrib.NAME} CDF Fit", + ) + ax3.legend() + + ax4.plot(cdf.quantiles, cdf.probabilities) + + fig.set_size_inches(12, 12) + fig.tight_layout() + plt.show() diff --git a/src/alignment.rs b/src/alignment.rs index d0d073c..a99580e 100644 --- a/src/alignment.rs +++ b/src/alignment.rs @@ -1,4 +1,5 @@ -use anyhow::Result; +use anyhow::{anyhow, Context, Result}; +use itertools::Itertools; use serde::Deserialize; use std::collections::HashMap; use std::io::{BufRead, BufReader, Read}; @@ -520,7 +521,7 @@ impl AlignmentData { // 16: ACCT/GGA/TCT/CGTGGCCT/CGGGGGTTGGGGACCCCTG // 17: 14p41g.matrix - let substitution_matrices = VecMap::from(SubstitutionMatrix::parse(matrices)); + let substitution_matrices = VecMap::from(SubstitutionMatrix::parse(matrices)?); let mut target_groups: Vec = vec![]; let mut target_name_map: VecMap = VecMap::new(); @@ -531,31 +532,47 @@ impl AlignmentData { let caf_lines = BufReader::new(caf).lines(); caf_lines - .map(|l| l.expect("failed to read line")) - .filter(|l| !l.is_empty()) + .filter_ok(|l| !l.is_empty()) .enumerate() - .for_each(|(line_num, line)| { + .try_for_each(|(line_num, line_unchecked)| { + let error_msg = + |msg, col| move || format!("{} at line '{}', column '{}'", msg, line_num, col); + + let error_msg_str = |msg: String, col: usize| { + move || format!("{} at line '{}', column '{}'", msg, line_num, col) + }; + + let line = line_unchecked?; let tokens: Vec<&str> = line.split(',').collect(); + if tokens.len() < 18 { + return Err(anyhow!( + "line {} does not have at least 18 columns!", + line_num + )); + } + let target_name = tokens[4].to_string(); - let target_start = - str::parse::(tokens[5]).expect("failed to parse target start"); - let target_end = - str::parse::(tokens[6]).expect("failed to parse target end"); + let target_start = str::parse::(tokens[5]) + .with_context(error_msg("failed to parse target start", 5))?; + let target_end = str::parse::(tokens[6]) + .with_context(error_msg("failed to parse target end", 6))?; let query_name = tokens[8].to_string(); - let query_start = - str::parse::(tokens[10]).expect("failed to parse query start"); - let query_end = str::parse::(tokens[11]).expect("failed to parse query end"); - let query_remaining = - str::parse::(tokens[12]).expect("failed to parse query remaining"); + let query_start = str::parse::(tokens[10]) + .with_context(error_msg("failed to parse query start", 10))?; + let query_end = str::parse::(tokens[11]) + .with_context(error_msg("failed to parse query end", 11))?; + let query_remaining = str::parse::(tokens[12]) + .with_context(error_msg("failed to parse query remaining", 12))?; let strand = match tokens[13] { - "0" => Strand::Forward, - "1" => Strand::Reverse, - _ => { - panic!() - } - }; + "0" => Ok(Strand::Forward), + "1" => Ok(Strand::Reverse), + v => Err(anyhow!(error_msg_str( + format!("invalid strand value: '{}'", v), + 13 + )())), + }?; let (query_start, query_end) = match strand { Strand::Forward => (query_start, query_end), @@ -595,7 +612,10 @@ impl AlignmentData { .values() .enumerate() .find(|(_, m)| m.name == substitution_matrix_name) - .expect("unknown substitution matrix") + .with_context(error_msg_str( + format!("unknown substitution matrix '{}'", substitution_matrix_name), + 17, + ))? .0; target_group.alignments.push(Alignment { @@ -610,7 +630,9 @@ impl AlignmentData { query_id, substitution_matrix_id, }); - }); + + Ok(()) + })?; if let Some(buf) = ultra { let buf_reader = BufReader::new(buf); diff --git a/src/annotation.rs b/src/annotation.rs index c904a57..0c97aba 100644 --- a/src/annotation.rs +++ b/src/annotation.rs @@ -108,7 +108,12 @@ fn get_strings(simple_annotations: &[SimpleAnnotation], simplify: bool) -> [Stri [ get_mutli_option_string(simple_annotations, |v| v.target_start, simplify), get_mutli_option_string(simple_annotations, |v| v.target_end, simplify), - get_mutli_option_string(simple_annotations, |v| v.query_name.clone(), simplify), + get_mutli_option_string( + simple_annotations, + // Remove spaces as it breaks the format... + |v| v.query_name.replace(" ", "-"), + simplify, + ), get_mutli_option_string(simple_annotations, |v| v.query_start, simplify), get_mutli_option_string(simple_annotations, |v| v.query_end, simplify), get_mutli_option_string(simple_annotations, |v| v.strand, simplify), @@ -127,7 +132,8 @@ impl AmbiguousAnnotation { format!( "{:w0$} {:w1$} {:w2$} {:w3$} {:w4$} {:w5$} {:w6$} {:w7$} {:4.3} {:w8$} {:w9$} {}", self.annotations.len(), - self.target_name, + // Remove spaces as it breaks the format... + self.target_name.replace(" ", "-"), ts, te, qn, diff --git a/src/assembly.rs b/src/assembly.rs index 527e08f..0c0c166 100644 --- a/src/assembly.rs +++ b/src/assembly.rs @@ -4,8 +4,10 @@ use itertools::Itertools; use crate::{ alignment::{Alignment, Strand}, - score_params::ScoreParams, - segments::SegmentedMatrix, + chunks::ProximityGroup, + join_estimation::{JoinEstimator, JoinStatisticsCollector, LinkInfo}, + segments::{Block, InitialSegments, SegmentedMatrix, SegmentedMatrixView}, + trace_statistics::{calculate_region_statistics, QueryStatistics, RegionStatistics}, AnnotationArgs, }; @@ -79,73 +81,280 @@ pub struct Edge { pub link_type: LinkType, } -fn piecewise_linear_cost( - neg_start: f64, - pos_start: f64, - neg_slope: f64, - pos_slope: f64, - value: f64, -) -> f64 { - if value < neg_start { - (value - neg_start).abs() * neg_slope - } else if value > pos_slope { - (value - pos_start).abs() * pos_slope +pub fn block_target_distance(first_block: &Block, second_block: &Block) -> isize { + second_block.col_start as isize - first_block.col_end as isize - 1 +} + +pub fn block_consensus_distance(first_block: &Block, second_block: &Block) -> (isize, LinkType) { + let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| { + if prop1.0.abs() < prop2.0.abs() { + prop1 + } else { + prop2 + } + }; + + match (first_block.strand, second_block.strand) { + (Strand::Forward, Strand::Forward) => ( + second_block.query_start as isize - first_block.query_end as isize - 1, + LinkType::Forward, + ), + (Strand::Reverse, Strand::Reverse) => ( + first_block.query_end as isize - second_block.query_start as isize - 1, + LinkType::Reverse, + ), + (Strand::Forward, Strand::Reverse) => select_closest( + ( + first_block.query_start as isize - second_block.query_start as isize - 1, + LinkType::FRInversion1, + ), + ( + second_block.query_end as isize - first_block.query_end as isize - 1, + LinkType::FRInversion2, + ), + ), + (Strand::Reverse, Strand::Forward) => select_closest( + ( + second_block.query_start as isize - first_block.query_start as isize - 1, + LinkType::RFInversion1, + ), + ( + first_block.query_end as isize - second_block.query_end as isize - 1, + LinkType::RFInversion2, + ), + ), + _ => panic!("Invalid strand types!"), + } +} + +pub fn block_length_on_query(b: &Block) -> usize { + b.query_end.abs_diff(b.query_start) + 1 +} + +#[allow(dead_code)] +pub enum ConsensusDistanceNormalization { + Max, + Min, + Sum, + WithLength(usize), + WithUBAndLength(usize, usize), +} + +pub fn relative_consensus_distance( + first_block: &Block, + second_block: &Block, + mode: ConsensusDistanceNormalization, +) -> (f64, LinkType) { + let (mut dist, link_type) = block_consensus_distance(first_block, second_block); + + if let ConsensusDistanceNormalization::WithUBAndLength(ub, _length) = mode { + dist = if dist > 0 { + dist.saturating_sub(ub as isize).max(0) + } else { + dist + } + } + + let div = match mode { + ConsensusDistanceNormalization::Sum => { + block_length_on_query(first_block) + block_length_on_query(second_block) + } + ConsensusDistanceNormalization::Max => { + block_length_on_query(first_block).max(block_length_on_query(second_block)) + } + ConsensusDistanceNormalization::Min => { + block_length_on_query(first_block).min(block_length_on_query(second_block)) + } + ConsensusDistanceNormalization::WithLength(length) => length, + ConsensusDistanceNormalization::WithUBAndLength(_ub, length) => length, + }; + (dist as f64 / div as f64, link_type) +} + +fn is_joinable( + target_distance: isize, + consensus_distance: isize, + link_type: LinkType, + min_block_length: usize, + args: &AnnotationArgs, +) -> bool { + let within_target_distance_threshold = + target_distance < args.target_join_distance as isize && target_distance >= 0; + + let consensus_is_colinear = if link_type.is_inversion() { + consensus_distance.abs() < args.inversion_distance } else { - 0.0 + consensus_distance > -args.consensus_join_overlap + && consensus_distance < args.consensus_join_distance + }; + + // TODO: Hardcoded, change later... + let is_significant = + min_block_length >= 10 && -consensus_distance <= ((min_block_length / 2) as isize); + + within_target_distance_threshold && consensus_is_colinear && is_significant +} + +fn new_alignment_to_blocks_map( + segments: SegmentedMatrixView, + alignments: &[Alignment], +) -> Vec> { + let mut alignment_block_map = vec![Vec::::new(); alignments.len()]; + + for (s_idx, segment) in segments.iter().enumerate() { + for (b_idx, block) in segment.blocks.iter().enumerate() { + if block.row_idx > 0 && block.row_idx <= alignments.len() { + alignment_block_map[block.row_idx - 1].push((s_idx, b_idx)); + } + } } + + alignment_block_map +} + +fn calculate_unexplained_bases( + segments: SegmentedMatrixView, + region_statistics: &RegionStatistics, + first_block_segment: usize, + second_block_segment: usize, + second_block_target_start: usize, +) -> usize { + let ub = region_statistics.unexplained_bases[second_block_segment] + .abs_diff(region_statistics.unexplained_bases[first_block_segment]) + + (second_block_target_start - segments[second_block_segment].start_col); + ub } -fn get_link_cost( +pub fn gather_join_statistics( + group: &ProximityGroup, + initial_segments: &InitialSegments, + query_lengths: &HashMap, annotation_args: &AnnotationArgs, - score_params: &ScoreParams, - consensus_gap: f64, - target_gap: f64, -) -> f64 { - // Minimum cost (a query loop) - let min_value = score_params.query_loop_score; - let value_range = (score_params.query_loop_score - score_params.query_jump_score).abs(); - - // Get overlap and gap ranges with free areas incorperated in, otherwise math is not quite right. - let overlap_range = ((annotation_args.consensus_join_overlap as f64) - - (annotation_args.free_join_consensus_overlap as f64)) - .abs() - .max(1.0); - let gap_range = ((annotation_args.consensus_join_distance as f64) - - (annotation_args.free_join_consensus_gap as f64)) - .abs() - .max(1.0); - - // Compute slopes.... - let lambda = -value_range - * (annotation_args.join_target_gap_penalty - / annotation_args.target_join_distance.max(1) as f64) - .abs(); - let alpha = - -value_range * (annotation_args.join_consensus_overlap_penalty / overlap_range).abs(); - let beta = -value_range * (annotation_args.join_consensus_gap_penalty / gap_range).abs(); - - // Cost = linear consensus cost + linear target gap cost... - min_value - + piecewise_linear_cost( - -(annotation_args.free_join_consensus_overlap as f64).abs(), - (annotation_args.free_join_consensus_gap as f64).abs(), - alpha, - beta, - consensus_gap, - ) - + lambda * target_gap +) -> Vec<(usize, T)> { + let alignments = group.alignments; + + let mut query_ids: Vec = alignments.iter().map(|a| a.query_id).unique().collect(); + query_ids.sort(); + + let region_stats = calculate_region_statistics(initial_segments); + let mut query_stats: Vec<(usize, T)> = Vec::with_capacity(query_ids.len()); + + query_ids + .iter() + // grab the alignments for this ID + .map(|id| { + ( + *id, + alignments + .iter() + .enumerate() + .filter(|&(_, a)| a.query_id == *id) + .map(|(i, a)| { + let b = Block::from_alignment(a, group.target_start, i, 0.0, 0.0); + let seg_i = initial_segments + .view_segments() + .partition_point(|v| v.start_col <= b.col_start) + .saturating_sub(1); + (seg_i, b) + }), + ) + }) + .for_each(|(id, compat_alignments)| { + let mut new_stats = T::new(); + + gather_join_statistics_single_family( + compat_alignments, + *query_lengths + .get(&id) + .expect("Query length missing for alignment!"), + initial_segments.view_segments(), + ®ion_stats, + annotation_args, + &mut new_stats, + ); + + query_stats.push((id, new_stats)); + }); + + query_stats +} + +fn link_info( + first_block: &Block, + second_block: &Block, + annotation_args: &AnnotationArgs, + unexplained_bases: usize, + consensus_length: usize, + neighbors: bool, +) -> LinkInfo { + let (consensus_distance, link_type) = block_consensus_distance(first_block, second_block); + let joinable = is_joinable( + block_target_distance(first_block, second_block), + consensus_distance, + link_type, + block_length_on_query(first_block).min(block_length_on_query(second_block)), + annotation_args, + ); + + LinkInfo { + target_distance: block_target_distance(first_block, second_block), + consensus_distance, + link_type, + consensus_length, + unexplained_bases, + neighbors, + joinable, + } +} + +fn gather_join_statistics_single_family<'a>( + compatable_alignments: impl Iterator, + consensus_length: usize, + segments: SegmentedMatrixView, + region_statistics: &RegionStatistics, + args: &AnnotationArgs, + join_stats: &mut impl JoinStatisticsCollector, +) { + let compatable_blocks = compatable_alignments + .sorted_by_key(|(_u_b, a)| a.col_start) + .collect_vec(); + + compatable_blocks + .iter() + .enumerate() + .for_each(|(idx, (a_segment_idx, a_block))| { + compatable_blocks[idx + 1..].iter().enumerate().for_each( + |(idx2, (b_segment_idx, b_block))| { + let link_info = &link_info( + a_block, + b_block, + args, + calculate_unexplained_bases( + segments, + region_statistics, + *a_segment_idx, + *b_segment_idx, + b_block.col_start, + ), + consensus_length, + idx + 1 == idx2, + ); + join_stats.add(a_block, b_block, link_info); + }, + ) + }) } -fn link_assemblies( +fn link_assemblies( graph: &mut HashMap<(SegmentAndDenseRow, SegmentAndDenseRow), Edge>, compatable_blocks: impl Iterator, - alignments: &[Alignment], + consensus_length: usize, segments: &SegmentedMatrix, - score_params: &ScoreParams, + query_statistics: &QueryStatistics, + region_statistics: &RegionStatistics, args: &AnnotationArgs, ) { // this relies on the alignments being sorted by target start - // note: this assertion iter will only run in debug mode let compatable_blocks = compatable_blocks.sorted().collect_vec(); compatable_blocks.iter().enumerate().for_each(|(idx, a)| { @@ -158,98 +367,43 @@ fn link_assemblies( let a_block = &segments[a.0].blocks[a.1]; let b_block = &segments[b.0].blocks[b.1]; - // We allow this now, otherwise inversions might not properly join... - // If same alignment, and neighboring segments, don't join... - //if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { - // return; - //} - - let target_distance = b_block.col_start as isize - a_block.col_end as isize; - - let a_length = a_block.query_end.abs_diff(a_block.query_start); - let b_length = b_block.query_end.abs_diff(b_block.query_start); - let min_length = a_length.min(b_length); - - let select_closest = |prop1: (isize, LinkType), prop2: (isize, LinkType)| { - if prop1.0.abs() < prop2.0.abs() { - prop1 - } else { - prop2 - } - }; - - // Query bounds are reversed for reverse sequences, so the start is actually greater than the end (Ex. start: 1510 -> end: 105) - - let (consensus_distance, link_type) = match ( - alignments[a_block.row_idx - 1].strand, - alignments[b_block.row_idx - 1].strand, - ) { - (Strand::Forward, Strand::Forward) => ( - b_block.query_start as isize - a_block.query_end as isize, - LinkType::Forward, - ), - (Strand::Reverse, Strand::Reverse) => ( - a_block.query_end as isize - b_block.query_start as isize, - LinkType::Reverse, - ), - (Strand::Forward, Strand::Reverse) => select_closest( - ( - a_block.query_start as isize - b_block.query_start as isize, - LinkType::FRInversion1, - ), - ( - b_block.query_end as isize - a_block.query_end as isize, - LinkType::FRInversion2, - ), - ), - (Strand::Reverse, Strand::Forward) => select_closest( - ( - b_block.query_start as isize - a_block.query_start as isize, - LinkType::RFInversion1, - ), - ( - a_block.query_end as isize - b_block.query_end as isize, - LinkType::RFInversion2, - ), + let link = link_info( + a_block, + b_block, + args, + calculate_unexplained_bases( + segments, + region_statistics, + a.0, + b.0, + b_block.col_start, ), - _ => panic!("Invalid strand types!"), - }; - - let within_target_distance_threshold = - target_distance < args.target_join_distance as isize; - - let consensus_is_colinear = if link_type.is_inversion() { - consensus_distance.abs() < args.inversion_distance - } else { - consensus_distance > -args.consensus_join_overlap - && consensus_distance < args.consensus_join_distance - }; - - // TODO: Hardcoded, change later... - let is_significant = - min_length >= 10 && -consensus_distance <= ((min_length / 2) as isize); - - let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { - score_params.query_loop_score - } else { - get_link_cost( - args, - score_params, - consensus_distance as f64, - target_distance as f64, - ) - }; - - if within_target_distance_threshold && consensus_is_colinear && is_significant { - graph.insert( - ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), - Edge { - weight, - first_sparse_row: a.1, - second_sparse_row: b.1, - link_type, - }, - ); + consensus_length, + a.0 + 1 == b.0, + ); + + if link.joinable { + let join_prob = query_statistics + .estimator + .predict(a_block, b_block, &link, false); + + if join_prob >= args.join_likelihood_threshold { + let weight = if a_block.row_idx == b_block.row_idx && ((b.0 - 1) <= a.0) { + 1.0 + } else { + join_prob + }; + + graph.insert( + ((a.0, a_block.row_idx), (b.0, b_block.row_idx)), + Edge { + weight, + first_sparse_row: a.1, + second_sparse_row: b.1, + link_type: link.link_type, + }, + ); + } } }); }); @@ -266,22 +420,15 @@ pub struct SegmentAssemblyGraph { } impl SegmentAssemblyGraph { - pub fn new( + pub fn new( alignments: &[Alignment], + query_lengths: &HashMap, segments: &SegmentedMatrix, - score_params: &ScoreParams, + region_statistics: &RegionStatistics, + query_statistics: &[QueryStatistics], annotation_args: &AnnotationArgs, ) -> Self { - let mut alignment_block_map = vec![Vec::::new(); alignments.len()]; - - for (s_idx, segment) in segments.iter().enumerate() { - for (b_idx, block) in segment.blocks.iter().enumerate() { - if block.row_idx > 0 && block.row_idx <= alignments.len() { - alignment_block_map[block.row_idx - 1].push((s_idx, b_idx)); - } - } - } - + let alignment_block_map = new_alignment_to_blocks_map(segments, alignments); let mut query_ids: Vec = alignments.iter().map(|a| a.query_id).unique().collect(); query_ids.sort(); @@ -292,19 +439,25 @@ impl SegmentAssemblyGraph { .iter() // grab the alignments for this ID .map(|id| { - alignments - .iter() - .enumerate() - .filter(|&(_, a)| a.query_id == *id) - .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()) + ( + *id, + alignments + .iter() + .enumerate() + .filter(|&(_, a)| a.query_id == *id) + .flat_map(|(a_idx, _)| alignment_block_map[a_idx].iter().copied()), + ) }) - .for_each(|compat_blocks| { + .for_each(|(id, compat_blocks)| { link_assemblies( &mut link_graph, compat_blocks, - alignments, + *query_lengths + .get(&id) + .expect("Unable to find query length for alignment!"), segments, - score_params, + &query_statistics[id], + region_statistics, annotation_args, ); }); diff --git a/src/balanced_tree.rs b/src/balanced_tree.rs index 603b68d..80629cf 100644 --- a/src/balanced_tree.rs +++ b/src/balanced_tree.rs @@ -181,7 +181,7 @@ impl + TryFrom + Copy + Debug> AVLIndexSet { } /// Iterate over the elements in the tree, in sorted order. Returns the element index and the depth of the element in the tree. - pub fn iter(&self) -> AVLInOrderSetIterator { + pub fn iter(&self) -> AVLInOrderSetIterator<'_, T> { let mut stack = Vec::with_capacity(self.depth() + 1); if let Some(root) = self.root { stack.push((root, 0_u8)); @@ -190,7 +190,7 @@ impl + TryFrom + Copy + Debug> AVLIndexSet { AVLInOrderSetIterator { tree: self, stack } } - pub fn bfs(&self) -> AVLBFSSetIterator { + pub fn bfs(&self) -> AVLBFSSetIterator<'_, T> { let mut queue = VecDeque::new(); if let Some(root) = self.root { diff --git a/src/history_tracing.rs b/src/history_tracing.rs index 8f39011..a01f220 100644 --- a/src/history_tracing.rs +++ b/src/history_tracing.rs @@ -265,7 +265,7 @@ struct JoinLink { origin_history: usize, linked_history: usize, link_side: Side, - score: f64, + probability: f64, } impl JoinLink { @@ -273,7 +273,7 @@ impl JoinLink { self.origin_history == other.origin_history && self.linked_history == other.linked_history && self.link_side == other.link_side - && ((self.score - other.score).abs() < epsilon) + && ((self.probability - other.probability).abs() < epsilon) } } @@ -297,7 +297,7 @@ impl Ord for JoinLink { .cmp(&other.origin_history) .then_with(|| self.linked_history.cmp(&other.linked_history)) .then_with(|| self.link_side.cmp(&other.link_side)) - .then_with(|| self.score.total_cmp(&other.score)) + .then_with(|| self.probability.total_cmp(&other.probability)) } } @@ -362,7 +362,7 @@ fn get_valid_joins_for_current_group( origin_history: prior_origin_history, linked_history: prior_link_history, link_side: prior_linkable_side, - score: weight, + probability: weight, }); solved_current += @@ -659,6 +659,15 @@ fn get_join_endpoints_from_links( (left_side, right_side) } +fn prior_history_index(entry: &HistoryEntry) -> usize { + if let HistoryEntry::Append(info) | HistoryEntry::Join(info) = entry { + info.prior_history + } else { + // 0 for the root of the histories... + 0 + } +} + fn add_single_join( histories: &mut Vec, segments: &[Segment], @@ -679,11 +688,13 @@ fn add_single_join( let new_group_index = segment_groups[segment_idx].add_group(&segments[segment_idx], join_blocks); - let join_prior_index = match (left_join_link, right_join_link) { - (Some(lv), Some(rv)) => lv.origin_history.min(rv.origin_history), - (Some(v), None) | (None, Some(v)) => v.origin_history, - _ => prior_hist_idx, - }; + let join_prior_index = prior_history_index( + &histories[match (left_join_link, right_join_link) { + (Some(lv), Some(rv)) => lv.origin_history.min(rv.origin_history), + (Some(v), None) | (None, Some(v)) => v.origin_history, + _ => panic!("Unreachable branch here, something went really wrong..."), + }], + ); // Clean expired history entries from the join path.... let simplified_join_index = remove_expired_history_entries( @@ -694,8 +705,19 @@ fn add_single_join( history_depth, ); - let right_score = right_join_link.as_ref().map(|v| v.score).unwrap_or(0.0); - let left_score = left_join_link.as_ref().map(|v| v.score).unwrap_or(0.0); + let prior_is_skip = match &histories[prior_hist_idx] { + HistoryEntry::Append(val) | HistoryEntry::Join(val) => val.group_index == 0, + HistoryEntry::Root => true, + }; + + let right_score = right_join_link + .as_ref() + .map(|v| score_params.join_transition(prior_is_skip, v.probability)) + .unwrap_or(0.0); + let left_score = left_join_link + .as_ref() + .map(|v| score_params.join_transition(prior_is_skip, v.probability)) + .unwrap_or(0.0); // If two joins, we incurred a expensive query-to-query jump in the past, so now we undo that cost... let bonus = if left_join_link.is_some() && right_join_link.is_some() { -score_params.query_jump_score @@ -1158,6 +1180,7 @@ fn get_joinable_extensions<'a>( .collect_vec() } +#[derive(Debug)] struct JoinStackEntry { joined_history_offset: usize, trace_segment_offset: usize, @@ -1170,6 +1193,7 @@ struct AddedBlockInfo { join_index: usize, } +#[derive(Debug)] struct JoinStack { pub stack: Vec, pub next_join_index: usize, @@ -1193,7 +1217,7 @@ impl JoinStack { /// Try adding one or two joins to the join stack if this block is a join and has linked edges. fn try_push(&mut self, entry: &HistoryEntry, added_block_info: Option<&AddedBlockInfo>) { if let (HistoryEntry::Join(val), Some(info)) = (entry, added_block_info) { - let mut top_stack_entries = 0; + let top_offset = self.stack.len(); if val.join_left_block.caused_by_history != info.history_index { self.stack.push(JoinStackEntry { @@ -1201,7 +1225,6 @@ impl JoinStack { trace_segment_offset: info.trace_stack_index, join_index: info.join_index, }); - top_stack_entries += 1; } if val.join_right_block.caused_by_history != info.history_index { @@ -1210,11 +1233,9 @@ impl JoinStack { trace_segment_offset: info.trace_stack_index, join_index: info.join_index, }); - top_stack_entries += 1; } - let top_vals_offset = self.stack.len() - top_stack_entries; - self.stack[top_vals_offset..].sort_unstable_by_key(|v| v.joined_history_offset); + self.stack[top_offset..].sort_unstable_by_key(|v| v.joined_history_offset); } } @@ -1278,7 +1299,7 @@ fn history_backtrace_append_block( .any(|&b| matches!(b.block_type, BlockType::Alignment | BlockType::TandemRepeat)) { match joiner.check_for_join(current_history_index) { - // Case 2: Involved in a join, add new block, but don't + // Case 2: Involved in a join, add new block. BlockAction::Join(join_index, stack_pos) => { let joins = get_joinable_extensions( blocks.iter().copied(), @@ -1383,6 +1404,13 @@ pub fn backtrace_histories( current_entry = &history.entries[current_idx]; } + if joiner.stack.len() != 0 { + panic!( + "Backtrace not done properly, there are {} leftover values on the join stack!", + joiner.stack.len() + ); + } + // Reverse so trace segments go from start to end instead of end to start. refined_segments.reverse(); refined_segments diff --git a/src/join_estimation.rs b/src/join_estimation.rs new file mode 100644 index 0000000..ea259b3 --- /dev/null +++ b/src/join_estimation.rs @@ -0,0 +1,343 @@ +use std::{ + f64::{self}, + fmt::Debug, + ops, +}; + +use crate::{ + assembly::{relative_consensus_distance, ConsensusDistanceNormalization, LinkType}, + p2estimator::custom_quantile_estimator::{LomaxQuant, MedianEstimator}, + segments::Block, + statistics::{ln_add_exp, AssymetricLaplace, Distribution, ExponentialEstimator, HalfT, Lomax}, +}; + +pub trait JoinEstimator: Clone + Default + Debug { + fn predict( + &self, + first_block: &Block, + second_block: &Block, + link_info: &LinkInfo, + log_space: bool, + ) -> f64; +} + +pub struct LinkInfo { + #[allow(dead_code)] + pub target_distance: isize, + #[allow(dead_code)] + pub consensus_distance: isize, + pub link_type: LinkType, + pub consensus_length: usize, + pub unexplained_bases: usize, + #[allow(dead_code)] + pub neighbors: bool, + pub joinable: bool, +} + +pub trait JoinStatisticsCollector: Clone + Debug { + fn new() -> Self; + fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self; + fn combine(&self, other: &Self) -> Self; + fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo); +} + +#[derive(Debug, Clone, Default)] +pub struct BayesianJoinEstimator { + target_distance_join: ExponentialEstimator, + target_distance_nojoin: ExponentialEstimator, + divergence_join: HalfT, + divergence_nojoin: HalfT, + consensus_distance_join: AssymetricLaplace, + consensus_distance_nojoin: AssymetricLaplace, + join_prior: f64, +} + +impl JoinEstimator for BayesianJoinEstimator { + fn predict( + &self, + first_block: &Block, + second_block: &Block, + link_info: &LinkInfo, + log_space: bool, + ) -> f64 { + let target_dist = link_info.unexplained_bases as f64; + // Absolute value as t-dist is symmetric and we want to get prob in tail, also, we know the mean is 0... + let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); + let (rel_con_dist, _join_type) = relative_consensus_distance( + first_block, + second_block, + ConsensusDistanceNormalization::WithUBAndLength( + link_info.unexplained_bases, + link_info.consensus_length, + ), + ); + + let join_score = self.join_prior.ln() + + self.target_distance_join.logpdf(target_dist) + + self.divergence_join.logpdf(divergence_diff) + + self.consensus_distance_join.logpdf(rel_con_dist); + let nojoin_score = (-self.join_prior).ln_1p() + + self.target_distance_nojoin.logpdf(target_dist) + + self.divergence_nojoin.logpdf(divergence_diff) + + self.consensus_distance_nojoin.logpdf(rel_con_dist); + + let score_norm = ln_add_exp(join_score, nojoin_score); + let score = join_score - score_norm; + + if log_space { + score + } else { + score.exp() + } + } +} + +impl From for BayesianJoinEstimator { + fn from(value: BayesianJoinStatistics) -> Self { + Self::from(&value) + } +} + +#[derive(Debug, Clone, Copy)] +struct MomentEstimator { + sum_square: f64, + sum: f64, + samples: usize, +} + +impl MomentEstimator { + fn new() -> Self { + Self { + sum_square: 0.0, + sum: 0.0, + samples: 0, + } + } + + fn to_psuedo_count(&self, count: usize) -> Self { + Self { + sum_square: (self.sum_square / self.samples.max(1) as f64) * count as f64, + sum: (self.sum / self.samples.max(1) as f64) * count as f64, + samples: count, + } + } + + fn mean(&self) -> f64 { + self.sum / self.samples.max(1) as f64 + } + + fn variance(&self) -> f64 { + // TODO: Use shifted data alg for more accuracy... + (self.sum_square - (self.sum * self.sum) / self.samples.max(1) as f64) + / (self.samples.max(2) as f64 - 1.0) + } + + fn standard_deviation(&self) -> f64 { + self.variance().sqrt() + } + + fn samples(&self) -> usize { + self.samples + } +} + +impl Default for MomentEstimator { + fn default() -> Self { + Self::new() + } +} + +impl ops::Add<&MomentEstimator> for &MomentEstimator { + type Output = MomentEstimator; + + fn add(self, rhs: &MomentEstimator) -> Self::Output { + MomentEstimator { + sum_square: self.sum_square + rhs.sum_square, + sum: self.sum + rhs.sum, + samples: self.samples + rhs.samples, + } + } +} + +impl ops::AddAssign for MomentEstimator { + fn add_assign(&mut self, rhs: MomentEstimator) { + self.sum_square += rhs.sum_square; + self.sum += rhs.sum; + self.samples += rhs.samples; + } +} + +impl ops::AddAssign for MomentEstimator { + fn add_assign(&mut self, rhs: f64) { + self.sum_square += rhs * rhs; + self.sum += rhs; + self.samples += 1; + } +} + +impl From<&MomentEstimator> for ExponentialEstimator { + fn from(value: &MomentEstimator) -> Self { + Self::new(value.mean(), value.samples().max(1)) + } +} + +impl From<&MomentEstimator> for HalfT { + fn from(value: &MomentEstimator) -> Self { + Self::from_sample_mean(value.mean(), value.samples().max(1)) + } +} + +impl From<&LomaxQuant> for Lomax { + fn from(value: &LomaxQuant) -> Self { + // Using quantile selection trick originally developed for frechet... (quant ratio formula) + // Choose first quantile p, then second such that p2 = 1 - (1 - p)^2 and you get this nice closed form for alpha... + let p1 = LomaxQuant::PROB1; + let p2 = LomaxQuant::PROB2; + + let v1 = value.ppf(p1); + let v2 = value.ppf(p2); + + let a = -(1.0 - p1).ln() / (v2 / v1 - 1.0).ln(); + let y = v1 / ((1.0 - p1).powf(-1.0 / a) - 1.0); + Lomax::new(a, y) + } +} + +impl From<&MedianEstimator> for ExponentialEstimator { + fn from(value: &MedianEstimator) -> Self { + Self::new(value.ppf(0.5) / 2.0_f64.ln(), value.samples()) + } +} + +impl From<&MedianEstimator> for HalfT { + fn from(value: &MedianEstimator) -> Self { + Self::new( + value.ppf(0.5) / Self::new(1.0, value.samples()).ppf(0.5), + value.samples(), + ) + } +} + +impl From<&BayesianJoinStatistics> for BayesianJoinEstimator { + fn from(statistics: &BayesianJoinStatistics) -> Self { + Self { + target_distance_join: (&statistics.joinable_target_distance).into(), + target_distance_nojoin: (&statistics.unjoinable_target_distance).into(), + divergence_join: (&statistics.joinable_divergence).into(), + divergence_nojoin: HalfT::from_sample_mean( + statistics + .unjoinable_divergence + .mean() + .max(statistics.joinable_divergence.mean()), + statistics.unjoinable_divergence.samples(), + ), + consensus_distance_join: AssymetricLaplace::from_exponential_halves( + 0.0, + statistics.joinable_consensus_neg.mean(), + statistics.joinable_consensus_pos.mean(), + ), + // TODO: Consider replacing with beta dist, better matches dists we see... + consensus_distance_nojoin: AssymetricLaplace::symmetric_from_moments( + statistics.unjoinable_consensus.mean(), + statistics.unjoinable_consensus.standard_deviation(), + ), + // We take sqrt since we count all pairs, not just neighbors. + join_prior: (statistics.joinable_target_distance.samples() as f64 + / (statistics.joinable_target_distance.samples() + + statistics.unjoinable_target_distance.samples()) + .max(1) as f64), + } + } +} + +#[derive(Debug, Clone, Default)] +pub struct BayesianJoinStatistics { + joinable_target_distance: MomentEstimator, + unjoinable_target_distance: MomentEstimator, + joinable_divergence: MomentEstimator, + unjoinable_divergence: MomentEstimator, + joinable_consensus_pos: MomentEstimator, + joinable_consensus_neg: MomentEstimator, + unjoinable_consensus: MomentEstimator, +} + +impl JoinStatisticsCollector for BayesianJoinStatistics { + fn new() -> Self { + Self::default() + } + + fn new_from_prior(bayesian_prior: &Self, pseudo_count: usize) -> Self { + Self { + joinable_target_distance: bayesian_prior + .joinable_target_distance + .to_psuedo_count(pseudo_count), + unjoinable_target_distance: bayesian_prior + .unjoinable_target_distance + .to_psuedo_count(pseudo_count), + joinable_divergence: bayesian_prior + .joinable_divergence + .to_psuedo_count(pseudo_count), + unjoinable_divergence: bayesian_prior + .unjoinable_divergence + .to_psuedo_count(pseudo_count), + joinable_consensus_pos: bayesian_prior + .joinable_consensus_pos + .to_psuedo_count(pseudo_count), + joinable_consensus_neg: bayesian_prior + .joinable_consensus_neg + .to_psuedo_count(pseudo_count), + unjoinable_consensus: bayesian_prior + .unjoinable_consensus + .to_psuedo_count(pseudo_count), + } + } + + fn add(&mut self, first_block: &Block, second_block: &Block, link_info: &LinkInfo) { + //if !link_info.neighbors { + // return; + //} + + let target_dist = link_info.unexplained_bases; + let divergence_diff = (second_block.kimura80 - first_block.kimura80).abs(); + let (rel_con_dist, join_type) = relative_consensus_distance( + first_block, + second_block, + ConsensusDistanceNormalization::WithUBAndLength( + link_info.unexplained_bases, + link_info.consensus_length, + ), + ); + + if link_info.joinable { + self.joinable_target_distance += target_dist as f64; + self.joinable_divergence += divergence_diff; + if matches!(join_type, LinkType::Forward | LinkType::Reverse) { + if rel_con_dist >= 0.0 { + self.joinable_consensus_pos += rel_con_dist.abs(); + } else { + self.joinable_consensus_neg += rel_con_dist.abs(); + } + } + } else { + self.unjoinable_target_distance += target_dist as f64; + self.unjoinable_divergence += divergence_diff; + if matches!(join_type, LinkType::Forward | LinkType::Reverse) { + self.unjoinable_consensus += rel_con_dist; + } + } + } + + fn combine(&self, other: &Self) -> Self { + Self { + joinable_target_distance: &self.joinable_target_distance + + &other.joinable_target_distance, + unjoinable_target_distance: &self.unjoinable_target_distance + + &other.unjoinable_target_distance, + joinable_divergence: &self.joinable_divergence + &other.joinable_divergence, + unjoinable_divergence: &self.unjoinable_divergence + &other.unjoinable_divergence, + joinable_consensus_pos: &self.joinable_consensus_pos + &other.joinable_consensus_pos, + joinable_consensus_neg: &self.joinable_consensus_neg + &other.joinable_consensus_neg, + unjoinable_consensus: &self.unjoinable_consensus + &other.unjoinable_consensus, + } + } +} diff --git a/src/main.rs b/src/main.rs index 9d6b56b..d4dcbc4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,11 +6,18 @@ mod balanced_tree; mod chunks; mod confidence; mod history_tracing; +mod join_estimation; mod matrix; + +// Keeping around for enhanced parameter estimation work... +#[allow(dead_code)] +mod p2estimator; + mod pipeline; mod score_params; mod segment_groups; mod segments; +mod statistics; mod substitution_matrix; mod support; @@ -18,31 +25,39 @@ mod support; #[allow(dead_code)] mod union_find; +mod trace_statistics; mod util; mod viterbi; mod viz; mod windowed_scores; +use core::ops::RangeBounds; use std::{ collections::HashMap, + fmt::Debug, fs::{self, create_dir_all, File}, io::{BufRead, BufReader, BufWriter, Write}, + ops::Bound, path::PathBuf, }; use alignment::AlignmentData; use chunks::ProximityGroup; -use anyhow::{Ok, Result}; +use anyhow::{anyhow, Context, Ok, Result}; use clap::{Args, Parser}; use itertools::Itertools; use rayon::prelude::*; +use std::str::FromStr; +use thiserror::Error; use viz::VizConstraint; use crate::{ annotation::AmbiguousAnnotation, chunks::validate_groups, + join_estimation::{BayesianJoinEstimator, BayesianJoinStatistics}, pipeline::{run_history_trace, run_naive_trace, NaiveTraceResults}, + trace_statistics::{trace_statistics, OccuranceCountingMode, TraceStatistics}, viz::{ stats::{write_family_statistics, write_inversion_statistics}, write_index_file, ICON_SVG, @@ -101,6 +116,31 @@ pub struct PerformanceArgs { pub num_threads: usize, } +#[derive(Error, Debug)] +enum ParseRangedError { + #[error(transparent)] + ParseError(#[from] E), + #[error("float value {0:?} is not between {1:?} and {2:?}")] + RangeError(T, Bound, Bound), +} + +const fn ranged( + range: impl RangeBounds + Send + Sync + Clone + 'static, +) -> impl Fn(&str) -> Result> + Clone + Send + Sync + 'static { + move |v: &str| { + let f = F::from_str(v)?; + if range.contains(&f) { + Result::Ok(f) + } else { + Result::Err(ParseRangedError::RangeError( + f, + range.start_bound().map(|v| *v), + range.end_bound().map(|v| *v), + )) + } + } +} + #[derive(Args, Debug, Clone, Default)] pub struct AnnotationArgs { /// The penalty of jumping between query models @@ -108,7 +148,8 @@ pub struct AnnotationArgs { short = 'J', long = "query-jump", default_value = "-127.0", - value_name = "f" + value_name = "f", + value_parser = ranged::(..-1.0) )] pub query_jump_penalty: f64, @@ -118,11 +159,12 @@ pub struct AnnotationArgs { short = 'L', long = "skip-loop", default_value = "30", - value_name = "n" + value_name = "n", + value_parser = ranged::(1..) )] pub num_skip_loops_eq_to_jump: usize, - /// The max distance across unaligned positions + /// The max distance across positions /// in the target (genome) at which a join is /// considered between compatible alignments #[arg( @@ -133,13 +175,24 @@ pub struct AnnotationArgs { )] pub target_join_distance: usize, - /// The maximum overlap in the consensus at which + /// Removes joins that fall below this threshold of occuring. + /// Value can be set between 0 and 1. + #[arg( + long = "join-likelihood-threshold", + default_value = "0.25", + value_name = "f", + value_parser = ranged::(0.0..1.0) + )] + pub join_likelihood_threshold: f64, + + /// The maximum allowed overlap in the consensus at which /// a join is considered between compatible alignments. #[arg( short = 'O', long = "consensus-join-overlap", default_value = "200", - value_name = "n" + value_name = "n", + value_parser = ranged::(0..) )] pub consensus_join_overlap: isize, @@ -148,14 +201,21 @@ pub struct AnnotationArgs { #[arg( short = 'C', long = "consensus-join-distance", - default_value = "2000", - value_name = "n" + default_value = "2500", + value_name = "n", + value_parser = ranged::(0..) )] pub consensus_join_distance: isize, /// The maximum seperation or overlap in nucleotides on both target and consensus /// for a join to be allowed between inverted alignments. - #[arg(long = "inversion-distance", default_value = "50", value_name = "n")] + #[arg( + short = 'I', + long = "inversion-distance", + default_value = "200", + value_name = "n", + value_parser = ranged::(0..) + )] pub inversion_distance: isize, /// The size of the window looked at to determine a single alignment score in nucleotides. @@ -163,7 +223,8 @@ pub struct AnnotationArgs { short = 'W', long = "window-size", default_value = "31", - value_name = "n" + value_name = "n", + value_parser = ranged::(1..) )] pub score_window_size: usize, @@ -172,7 +233,8 @@ pub struct AnnotationArgs { short = 'B', long = "background-window-size", default_value = "61", - value_name = "n" + value_name = "n", + value_parser = ranged::(1..) )] pub background_window_size: usize, @@ -189,7 +251,8 @@ pub struct AnnotationArgs { #[arg( long = "min-segment-confidence", default_value = "0.1", - value_name = "f" + value_name = "f", + value_parser = ranged::(0.0..1.0) )] pub min_block_confidence: f64, @@ -208,42 +271,6 @@ pub struct AnnotationArgs { /// Set to 0 or greater to disable. #[arg(long = "min-history-score", default_value = "-500.0", value_name = "f")] pub min_relative_history_score: f64, - - /// The amount of overlap between two joinable sequences in the consensus - /// before a penalty starts being applied to the join. - #[arg(long = "free-join-overlap", default_value = "4", value_name = "n")] - pub free_join_consensus_overlap: usize, - - /// The amount of gap between two joinable sequences - /// before a penalty starts being applied to the join. - #[arg(long = "free-join-gap", default_value = "10", value_name = "n")] - pub free_join_consensus_gap: usize, - - /// The amount of penalty to apply to a join at the maximum allowed consensus overlap - /// A value of 1 means to apply a penalty equal to a query jump. - /// The cost grows linearly to this value as the overlap increases. - #[arg( - long = "consensus-overlap-penalty", - default_value = "1.0", - value_name = "f" - )] - pub join_consensus_overlap_penalty: f64, - - /// The amount of penalty to apply to a join at the maximum allowed consensus gap - /// A value of 1 means to apply a penalty equal to a query jump. - /// The cost grows linearly to this value as the gap increases. - #[arg( - long = "consensus-gap-penalty", - default_value = "0.5", - value_name = "f" - )] - pub join_consensus_gap_penalty: f64, - - /// The amount of penalty to apply to a join at the maximum allowed target gap - /// A value of 1 means to apply a penalty equal to a query jump. - /// The cost grows linearly to this value as the gap between the sequences in the target space increases. - #[arg(long = "target-gap-penalty", default_value = "0.4", value_name = "f")] - pub join_target_gap_penalty: f64, } #[derive(Args, Debug, Clone, Default)] @@ -321,18 +348,18 @@ fn main() -> Result<()> { if let Result::Ok(metadata) = fs::metadata(&viz_args.viz_output_path) { if metadata.is_dir() { // TODO: real error - panic!( - "directory: {} already exists", - viz_args.viz_output_path.to_str().unwrap() - ) + return Result::Err(anyhow!( + "directory: '{}' already exists", + viz_args.viz_output_path.to_str().unwrap_or("?") + )); } } - create_dir_all(&viz_args.viz_output_path)?; - viz_args.viz_output_path = viz_args.viz_output_path.canonicalize()?; - if let Some(path) = &viz_args.viz_reference_bed_path { - let file = File::open(path).expect("failed to open viz reference bed file"); + let file = File::open(path).context(format!( + "failed to open viz reference bed file: '{}'", + path.to_str().unwrap_or("?") + ))?; let reader = BufReader::new(file); let mut chrom_list = vec![String::from("sentinel")]; @@ -342,36 +369,51 @@ fn main() -> Result<()> { .lines() .map(|l| l.unwrap()) .enumerate() - .for_each(|(line_num, line)| { + .try_for_each(|(line_num, line)| { + let line_num_info = || format!("failed to read line {}", line); + let tokens: Vec<&str> = line.split_whitespace().collect(); let chrom = tokens[0].to_string(); - let start = tokens[1].parse::().expect("failed to parse int"); + let start = tokens[1].parse::().with_context(line_num_info)?; - let last_chrom = chrom_list.last().expect("chrom list is empty"); + let last_chrom = chrom_list.last().context("chrom list is empty")?; if chrom == *last_chrom { if prev_start > start { - panic!("bed file is unsorted"); + return Result::Err(anyhow!("bed file is unsorted")); } } else if !chrom_list.contains(&chrom) { chrom_list.push(chrom.clone()); index.insert(chrom, line_num); } else { - panic!("bed file is unsorted"); + return Result::Err(anyhow!("bed file is unsorted")); } prev_start = start; - }); + + Ok(()) + }) + .context(format!( + "failed to parse bed file: '{}'", + path.to_str().unwrap_or("?") + ))?; viz_args.viz_reference_bed_index = index; } } - let alignments_file = File::open(&args.alignments)?; - let matrices_file = File::open(&args.matrices)?; + let alignments_file = File::open(&args.alignments).context(format!( + "failed to open alignments file: '{}'", + args.alignments + ))?; + let matrices_file = File::open(&args.matrices) + .context(format!("failed to open matrices file: '{}'", args.matrices))?; let ultra_file = match args.ultra_args.ultra_file_path { - Some(ref path) => Some(File::open(path)?), + Some(ref path) => Some(File::open(path).context(format!( + "failed to open ultra file: '{}'", + path.to_str().unwrap_or("?") + ))?), None => None, }; @@ -395,22 +437,27 @@ fn main() -> Result<()> { if let Some(path) = &args.io_args.regions_path { let regions_file = File::create(path).unwrap(); let mut regions_writer = BufWriter::new(regions_file); - proximity_groups.iter().enumerate().for_each(|(idx, g)| { - writeln!( - &mut regions_writer, - "{},{},{}:{},{}:{}", - idx, - alignment_data.target_name_map.get(g.target_id), - g.target_start, - g.target_end, - g.line_start, - g.line_end, - ) - .expect("failed to write to regions file") - }); + proximity_groups + .iter() + .enumerate() + .try_for_each(|(idx, g)| { + writeln!( + &mut regions_writer, + "{},{},{}:{},{}:{}", + idx, + alignment_data.target_name_map.get(g.target_id), + g.target_start, + g.target_end, + g.line_start, + g.line_end, + ) + .context("failed to write to regions file") + })?; } if viz_args.viz { + create_dir_all(&viz_args.viz_output_path)?; + viz_args.viz_output_path = viz_args.viz_output_path.canonicalize()?; let mut index_file = File::create(viz_args.viz_output_path.join("index.html")).unwrap(); write_index_file( @@ -419,7 +466,7 @@ fn main() -> Result<()> { &proximity_groups, &viz_args.viz_constraints, ) - .expect("failed to write to index.html"); + .context("failed to write to index.html file for visualization")?; } debug_assert!(validate_groups( @@ -448,22 +495,29 @@ fn main() -> Result<()> { .par_iter() .panic_fuse() .enumerate() - .map(|(region_idx, group)| { - ( - region_idx, - run_naive_trace(group, &alignment_data, region_idx, &args), - ) - }) - .collect::>(); - naive_results.sort_by_key(|v| v.0); + .map(|(region_idx, group)| run_naive_trace(group, &alignment_data, region_idx, &args)) + .collect::>>(); + naive_results.sort_by_key(|v| v.region_index); + + let trace_stats: TraceStatistics = trace_statistics( + &naive_results, + &alignment_data, + OccuranceCountingMode::Segments, + ); let mut results: Vec<(usize, Vec)> = proximity_groups .par_iter() .zip(naive_results) - .map(|(group, (region_idx, mut naive_trace))| { + .map(|(group, mut naive_trace)| { ( - region_idx, - run_history_trace(group, &alignment_data, &mut naive_trace, &args), + naive_trace.region_index, + run_history_trace( + group, + &alignment_data, + &trace_stats, + &mut naive_trace, + &args, + ), ) }) .collect(); @@ -487,7 +541,11 @@ fn main() -> Result<()> { .viz_output_path .join("family_stats.html"), )?; - write_family_statistics(&mut family_stats_writer, &results)?; + write_family_statistics( + &mut family_stats_writer, + &results, + &alignment_data.query_lengths, + )?; let mut inv_stats_writer = File::create( args.visualization_args .viz_output_path diff --git a/src/p2estimator.rs b/src/p2estimator.rs new file mode 100644 index 0000000..509d943 --- /dev/null +++ b/src/p2estimator.rs @@ -0,0 +1,842 @@ +use std::{ + cmp::Ordering, + ops::{Add, AddAssign}, + usize, +}; + +use crate::statistics::Distribution; +use itertools::Itertools; +// Implementation of P2 estimator. +// See "The P2 Algorithm for Dynamic Statistical Computing Calculation of Quantiles and Histograms Without Storing Observations" +// at https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf +// +// We replace the P2 interpolation with PCHIP instead (See paper A Method for Constructing Local Monotone Piecewise Cubic Interpolants by F. N. Fritsch and J. Butland, or https://doi.org/10.1137/0905021) + +pub struct QuantileEstimatorData<'a> { + ranks: &'a [usize], + values: &'a [f64], + targets: &'a [f64], + observations: &'a usize, +} + +pub struct MutableQuantileEstimatorData<'a> { + ranks: &'a mut [usize], + values: &'a mut [f64], + targets: &'a [f64], + observations: &'a mut usize, +} + +fn cubic_hermite_spline(x0: f64, y0: f64, x1: f64, y1: f64, m0: f64, m1: f64, x: f64) -> f64 { + let t = (x - x0) / (x1 - x0); + let ms0 = (x1 - x0) * m0; + let ms1 = (x1 - x0) * m1; + + let h0 = (2.0 * t - 3.0) * t * t + 1.0; + let h1 = ((t - 2.0) * t + 1.0) * t; + let h2 = (-2.0 * t + 3.0) * t * t; + let h3 = (t - 1.0) * t * t; + + h0 * y0 + h1 * ms0 + h2 * y1 + h3 * ms1 +} + +fn pchip_point_derivative(dx0: f64, dy0: f64, dx1: f64, dy1: f64) -> f64 { + let s0 = if dx0 != 0.0 { dy0 / dx0 } else { 0.0 }; + let s1 = if dx1 != 0.0 { dy1 / dx1 } else { 0.0 }; + if s0 * s1 > 0.0 { + let alpha = (1.0 / 3.0) * (1.0 + dx1 / (dx0 + dx1)); + s0 * s1 / (alpha * s1 + (1.0 - alpha) * s0) + } else { + 0.0 + } +} + +fn pchip_prediction(x_points: &[f64; 4], y_points: &[f64; 4], x: f64) -> f64 { + debug_assert!(x_points.is_sorted() && x <= x_points[2] && x >= x_points[1]); + let m0 = pchip_point_derivative( + x_points[1] - x_points[0], + y_points[1] - y_points[0], + x_points[2] - x_points[1], + y_points[2] - y_points[1], + ); + let m1 = pchip_point_derivative( + x_points[2] - x_points[1], + y_points[2] - y_points[1], + x_points[3] - x_points[2], + y_points[3] - y_points[2], + ); + + cubic_hermite_spline( + x_points[1], + y_points[1], + x_points[2], + y_points[2], + m0, + m1, + x, + ) +} + +fn debug_check_valid_estimator( + ranks: &[usize], + values: &[f64], + targets: &[f64], + observations: usize, +) { + debug_assert!(ranks.len() > 2); + debug_assert!(ranks.len() == values.len() && ranks.len() == targets.len()); + debug_assert!(values.is_sorted() && ranks.is_sorted() && targets.is_sorted()); + debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0)); + debug_assert!(observations >= ranks.len()); + debug_assert!(ranks.first() == Some(&0) && ranks.last() == Some(&(observations - 1))); +} + +fn debug_check_uninitialized_estimator(ranks: &[usize], values: &[f64], targets: &[f64]) { + debug_assert!(ranks.len() > 2); + debug_assert!(ranks.len() == values.len() && ranks.len() == targets.len()); + debug_assert!(targets.is_sorted()); + debug_assert!(targets.first() == Some(&0.0) && targets.last() == Some(&1.0)); +} + +fn _add_sample_to_estimator(data: MutableQuantileEstimatorData, sample: f64) { + let MutableQuantileEstimatorData { + ranks, + values, + targets, + observations, + } = data; + debug_check_valid_estimator(ranks, values, targets, *observations); + // Find where sample falls within distribution... + let p = values.partition_point(|&v| v < sample); + let bound_p = p.min(values.len() - 1); + + // Update extremes... + if bound_p == 0 { + values[bound_p] = values[bound_p].min(sample); + } else if bound_p == (values.len() - 1) { + values[bound_p] = values[bound_p].max(sample); + } + + // Increment ranks of markers above newly inserted sample... + for i in bound_p.max(1)..ranks.len() { + ranks[i] = ranks[i] + 1; + } + + // Adjust inner markers to within 1 of their target quantile using p2 formula... + for i in 1..(values.len() - 1) { + // Observations hasn't been incremented yet, don't need to subtract 1... + let target_rank = (targets[i] * (*observations) as f64) as usize; + let current_rank = ranks[i]; + if current_rank.abs_diff(target_rank) > 1 { + //println!("{:?}, {}, {}", ranks, i, ranks[i]); + let new_rank: usize = target_rank.clamp( + ranks[i - 1].saturating_add(1), + ranks[i + 1].saturating_sub(1), + ); + if new_rank == current_rank { + continue; + } + + let idx_shift = if new_rank > current_rank { 1 } else { 0 }; + + let indexes = [ + (i + idx_shift).saturating_sub(2), + (i + idx_shift).saturating_sub(1), + (i + idx_shift), + (i + idx_shift).saturating_add(1).min(ranks.len() - 1), + ]; + + values[i] = pchip_prediction( + &indexes.map(|i| ranks[i] as f64), + &indexes.map(|i| values[i]), + new_rank as f64, + ); + ranks[i] = new_rank; + } + } + + *observations += 1; +} + +fn _merge_estimators( + q1: QuantileEstimatorData, + q2: QuantileEstimatorData, + new_estimator: MutableQuantileEstimatorData, +) { + debug_check_valid_estimator(q1.ranks, q1.values, q1.targets, *q1.observations); + debug_check_valid_estimator(q2.ranks, q2.values, q2.targets, *q2.observations); + debug_check_uninitialized_estimator( + new_estimator.ranks, + new_estimator.values, + new_estimator.targets, + ); + + assert!(new_estimator.targets.len() <= (*q1.observations + *q2.observations)); + + fn get_at(a: &QuantileEstimatorData, i: usize) -> (usize, f64) { + (a.ranks[i], a.values[i]) + } + + // May eventually replace with algorithm that doesn't use extra memory... + // Calculate a "merged" quantiles by linearly iterpolating ranks based on the values we see... + let mut dual_est_quants: Vec<(f64, f64)> = Vec::with_capacity(q1.ranks.len() + q2.ranks.len()); + + let mut q1_prior: Option<(usize, f64)> = None; + let mut q2_prior: Option<(usize, f64)> = None; + + let mut q1_idx = 0; + let mut q2_idx = 0; + + loop { + let q1_past_end = q1_idx >= q1.values.len(); + let q2_past_end = q2_idx >= q2.values.len(); + + if q1_past_end && q2_past_end { + break; + } else if q1_past_end { + let next = get_at(&q2, q2_idx); + dual_est_quants.push(( + (next.0 + q1_prior.map(|v| v.0 + 1).unwrap_or(0)) as f64, + next.1, + )); + q2_prior = Some(next); + q2_idx += 1; + } else if q2_past_end { + let next = get_at(&q1, q1_idx); + dual_est_quants.push(( + (next.0 + q2_prior.map(|v| v.0 + 1).unwrap_or(0)) as f64, + next.1, + )); + q1_prior = Some(next); + q1_idx += 1; + } else if q1.values[q1_idx] <= q2.values[q2_idx] { + let other_next = get_at(&q2, q2_idx); + let next = get_at(&q1, q1_idx); + let w = q2_prior + .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1)) + .unwrap_or(0.0); + let other_rank_est = q2_prior + .map(|other_prior| { + (other_prior.0 + 1) as f64 * (1.0 - w) + (other_next.0 + 1) as f64 * w + }) + .unwrap_or(0.0); + dual_est_quants.push((next.0 as f64 + other_rank_est, next.1)); + q1_prior = Some(next); + q1_idx += 1; + } else { + let other_next = get_at(&q1, q1_idx); + let next = get_at(&q2, q2_idx); + let w = q1_prior + .map(|other_prior| (next.1 - other_prior.1) / (other_next.1 - other_prior.1)) + .unwrap_or(0.0); + let other_rank_est = q1_prior + .map(|other_prior| { + (other_prior.0 + 1) as f64 * (1.0 - w) + (other_next.0 + 1) as f64 * w + }) + .unwrap_or(0.0); + dual_est_quants.push((next.0 as f64 + other_rank_est, next.1)); + q2_prior = Some(next); + q2_idx += 1; + } + } + + // New number of observations is the sum of both... + *(new_estimator.observations) = *(q1.observations) + *(q2.observations); + let rank_range = *(new_estimator.observations) - 1; + + // Solve all inner quantiles using traditional interpolation... + let mut index_between = 0; + + new_estimator.ranks.first_mut().map(|r| *r = 0); + new_estimator.ranks.last_mut().map(|r| *r = rank_range); + new_estimator + .values + .first_mut() + .map(|v| *v = dual_est_quants[0].1); + new_estimator + .values + .last_mut() + .map(|v| *v = dual_est_quants[dual_est_quants.len() - 1].1); + + for ti in 1..new_estimator.targets.len() - 1 { + // Calculate new rank... + let target = new_estimator.targets[ti]; + let approx_obs_rank = ((target * rank_range as f64) as usize) + .clamp(ti, rank_range - (new_estimator.targets.len() - (ti + 1))); + + // Find where it lands in cdf... + while index_between < dual_est_quants.len() + && (approx_obs_rank as f64) > dual_est_quants[index_between].0 + { + index_between += 1; + } + + // Get pchip estimate for the value... + let indexes = [ + index_between.saturating_sub(2), + index_between.saturating_sub(1), + index_between.min(dual_est_quants.len() - 1), + index_between + .saturating_add(1) + .min(dual_est_quants.len() - 1), + ]; + + new_estimator.values[ti] = pchip_prediction( + &indexes.map(|i| dual_est_quants[i].0), + &indexes.map(|i| dual_est_quants[i].1), + approx_obs_rank as f64, + ); + new_estimator.ranks[ti] = approx_obs_rank; + } +} + +trait PrimativeCast { + fn as_(&self) -> T; +} + +impl PrimativeCast for f64 { + #[inline] + fn as_(&self) -> f64 { + *self + } +} + +impl PrimativeCast for usize { + #[inline] + fn as_(&self) -> f64 { + *self as f64 + } +} + +fn _interpolated_value_prediction< + I: PartialOrd + PrimativeCast + Copy, + O: PartialOrd + PrimativeCast + Copy, +>( + xs: &[I], + ys: &[O], + x: f64, + lower_val: f64, + upper_val: f64, + not_enough_data_value: f64, +) -> f64 { + debug_assert!(xs.is_sorted()); + debug_assert!(xs.len() == ys.len()); + + if xs.len() < 1 { + return not_enough_data_value; + } + + let idx = xs.partition_point(|&v| v.as_() < x); + if idx >= xs.len() { + upper_val + } else if idx == 0 { + lower_val + } else { + let indexes = [ + idx.saturating_sub(2), + idx.saturating_sub(1), + idx, + idx.saturating_add(1).min(xs.len() - 1), + ]; + + pchip_prediction( + &indexes.map(|i| xs[i].as_()), + &indexes.map(|i| ys[i].as_()), + x, + ) + } +} + +#[derive(Clone, Debug)] +pub struct QuantileEstimator(T); + +impl QuantileEstimator { + pub fn samples(&self) -> usize { + *self.0._data().observations + } + + pub fn to_psuedo_count(&self, count: usize) -> Self { + let prior_data = self.0._data(); + let mut new_self = Self(T::new_like(&self.0)); + let new_data = new_self.0._mut_data(); + + let new_observations = count.max(1) * prior_data.ranks.len(); + + for i in 0..new_data.targets.len() { + let closest_rank = ((new_data.targets[i] * (new_observations - 1) as f64) as usize) + .clamp( + i, + (new_observations - 1) - (new_data.targets.len() - (i + 1)), + ); + + new_data.ranks[i] = closest_rank; + new_data.values[i] = self.ppf(closest_rank as f64 / (new_observations - 1) as f64) + } + *new_data.observations = new_observations; + + new_self + } +} + +impl QuantileEstimator { + pub fn new_from_slice(targets: &[f64]) -> Self { + Self(VectorQuantileRepresentation::new(targets)) + } +} + +impl QuantileEstimator> { + pub fn new_from_array(targets: &[f64; N]) -> Self { + Self(ArrayQuantileRepresentation::::new(targets)) + } +} + +impl Default for QuantileEstimator { + fn default() -> Self { + Self(T::default()) + } +} + +impl QuantileEstimator { + fn new() -> Self { + Self::default() + } +} + +impl AddAssign for QuantileEstimator { + fn add_assign(&mut self, sample: f64) { + let data = self.0._mut_data(); + + match (*data.observations + 1).cmp(&data.values.len()) { + Ordering::Less => { + data.values[*data.observations] = sample; + *data.observations += 1; + } + Ordering::Equal => { + data.values[*data.observations] = sample; + data.values.sort_by(|a, b| a.total_cmp(b)); + for i in 0..data.ranks.len() { + data.ranks[i] = i; + } + *data.observations += 1; + } + Ordering::Greater => { + _add_sample_to_estimator(data, sample); + } + } + } +} + +impl AddAssign<&[f64]> for QuantileEstimator { + fn add_assign(&mut self, samples: &[f64]) { + for &s in samples.iter() { + *self += s; + } + } +} + +impl Add<&QuantileEstimator> for &QuantileEstimator { + type Output = QuantileEstimator; + + fn add(self, rhs: &QuantileEstimator) -> Self::Output { + match (self.0._is_initialized(), rhs.0._is_initialized()) { + (true, true) => { + let mut new_quant_est = QuantileEstimator::(T::new_like(&self.0)); + + _merge_estimators(self.0._data(), rhs.0._data(), new_quant_est.0._mut_data()); + + new_quant_est + } + (true, false) | (false, false) => { + let other_data = rhs.0._data(); + let mut new_quant_est = self.clone(); + new_quant_est += &other_data.values[..*other_data.observations]; + new_quant_est + } + (false, true) => { + let self_data = self.0._data(); + let mut new_quant_est = rhs.clone(); + new_quant_est += &self_data.values[..*self_data.observations]; + new_quant_est + } + } + } +} + +impl Distribution for QuantileEstimator { + fn cdf(&self, x: f64) -> f64 { + let data = self.0._data(); + if self.0._is_initialized() { + _interpolated_value_prediction( + data.values, + data.ranks, + x, + 0.0, + (*data.observations - 1) as f64, + 0.0, + ) / (*data.observations - 1).max(1) as f64 + } else { + let xs_sorted = data.values[..*data.observations] + .iter() + .copied() + .sorted_by(|a, b| a.total_cmp(b)) + .collect_vec(); + let ys = (0..*data.observations).collect_vec(); + _interpolated_value_prediction( + &xs_sorted, + &ys, + x, + 0.0, + (*data.observations - 1) as f64, + 0.0, + ) + } + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + (-self.cdf(x)).ln_1p() + } + + fn ppf(&self, p: f64) -> f64 { + let data = self.0._data(); + let est_rank = p.clamp(0.0, 1.0) * (*data.observations - 1) as f64; + + let data = self.0._data(); + let (min_val, max_val) = self.support(); + + if self.0._is_initialized() { + _interpolated_value_prediction( + data.ranks, + data.values, + est_rank, + min_val, + max_val, + 0.0_f64.clamp(min_val, max_val), + ) + } else { + let ys_sorted = data.values[..*data.observations] + .iter() + .copied() + .sorted_by(|a, b| a.total_cmp(b)) + .collect_vec(); + let xs = (0..*data.observations).collect_vec(); + _interpolated_value_prediction( + &xs, + &ys_sorted, + est_rank, + min_val, + max_val, + 0.0_f64.clamp(min_val, max_val), + ) + } + } + + fn pdf(&self, _x: f64) -> f64 { + // Will have to calculate derivatives, cache normalization factor (such that area under curve is 1)... + // May be worth splitting out into different class to allow pre-processing this stuff... + // TODO: Would prefer monotonic quintic splines for this as we can just use the derivative... Allows us to avoid normalization... + // Idea: Traditional splines aren't monotonic, but if we make their derivative take the form (p(x))^2 where p(x) is a polynomial + // we are guaranteed to have monotonic splines. Final step would be solving for the basis functions. + panic!("Currently not supported!"); + } + + fn logpdf(&self, x: f64) -> f64 { + self.pdf(x).ln() + } + + fn support(&self) -> (f64, f64) { + let data = self.0._data(); + + if self.0._is_initialized() { + ( + *data.values.first().unwrap_or(&f64::NEG_INFINITY), + *data.values.last().unwrap_or(&f64::INFINITY), + ) + } else { + data.values[..*data.observations] + .iter() + .copied() + .minmax() + .into_option() + .unwrap_or((f64::NEG_INFINITY, f64::INFINITY)) + } + } +} + +pub trait QuantileEstimatorRepresentation: Clone { + fn new_like(other: &Self) -> Self; + fn _data(&self) -> QuantileEstimatorData<'_>; + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_>; + fn _is_initialized(&self) -> bool { + let data = self._data(); + *data.observations >= data.ranks.len() + } +} + +#[derive(Clone, Debug)] +pub struct ArrayQuantileRepresentation { + values: [f64; N], + ranks: [usize; N], + targets: [f64; N], + observations: usize, +} + +impl ArrayQuantileRepresentation { + fn new(targets: &[f64; N]) -> Self { + assert!( + targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) + ); + Self { + values: [0.0; N], + ranks: [0; N], + targets: targets.clone(), + observations: 0, + } + } +} + +impl QuantileEstimatorRepresentation for ArrayQuantileRepresentation { + fn new_like(other: &Self) -> Self { + Self::new(&other.targets) + } + + fn _data(&self) -> QuantileEstimatorData<'_> { + QuantileEstimatorData { + ranks: &self.ranks, + values: &self.values, + targets: &self.targets, + observations: &self.observations, + } + } + + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + MutableQuantileEstimatorData { + ranks: &mut self.ranks, + values: &mut self.values, + targets: &self.targets, + observations: &mut self.observations, + } + } +} + +#[derive(Clone, Debug)] +pub struct VectorQuantileRepresentation { + values: Vec, + ranks: Vec, + targets: Vec, + observations: usize, +} + +impl VectorQuantileRepresentation { + fn new(targets: &[f64]) -> Self { + assert!( + targets.is_sorted() && targets.first() == Some(&0.0) && targets.last() == Some(&1.0) + ); + Self { + values: vec![0.0; targets.len()], + ranks: (0..targets.len()).collect_vec(), + targets: Vec::from(targets), + observations: 0, + } + } +} + +impl QuantileEstimatorRepresentation for VectorQuantileRepresentation { + fn new_like(other: &Self) -> Self { + Self::new(&other.targets) + } + + fn _data(&self) -> QuantileEstimatorData<'_> { + QuantileEstimatorData { + ranks: &self.ranks, + values: &self.values, + targets: &self.targets, + observations: &self.observations, + } + } + + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + MutableQuantileEstimatorData { + ranks: &mut self.ranks, + values: &mut self.values, + targets: &self.targets, + observations: &mut self.observations, + } + } +} + +pub mod custom_quantile_estimator { + use super::*; + + macro_rules! replace_expr { + ($_t:tt,$sub:expr) => { + $sub + }; + } + + macro_rules! count_exprs { + ($($val:expr),+) => {<[()]>::len(&[$(replace_expr!($val,())),+])}; + } + + macro_rules! implement_fixed_quantile_estimator { + ($name:ident, $repr_name:ident, [$($val:expr), +]) => { + #[derive(Clone, Debug)] + pub struct $repr_name { + values: [f64; Self::COUNT], + ranks: [usize; Self::COUNT], + observations: usize, + } + + impl $repr_name { + const TARGETS: [f64; count_exprs!($($val),+) + 2] = [0.0, $($val),+, 1.0]; + const COUNT: usize = Self::TARGETS.len(); + + pub fn new() -> Self { + Self { + values: [0.0; Self::COUNT], + ranks: [0; Self::COUNT], + observations: 0 + } + } + } + + impl Default for $repr_name { + fn default() -> Self { + Self::new() + } + } + + impl QuantileEstimatorRepresentation for $repr_name { + fn new_like(_other: &Self) -> Self { + Self::default() + } + fn _data(&self) -> QuantileEstimatorData<'_> { + QuantileEstimatorData { + ranks: &self.ranks, + values: &self.values, + targets: &Self::TARGETS, + observations: &self.observations, + } + } + fn _mut_data(&mut self) -> MutableQuantileEstimatorData<'_> { + MutableQuantileEstimatorData { + ranks: &mut self.ranks, + values: &mut self.values, + targets: &Self::TARGETS, + observations: &mut self.observations, + } + } + } + + pub type $name = QuantileEstimator<$repr_name>; + }; + } + + implement_fixed_quantile_estimator!( + LomaxQuant, + LomaxQuantRepr, + [0.18, 0.36, 0.4752, 0.5904, 0.7952] + ); + impl LomaxQuant { + pub const PROB1: f64 = 0.36; + pub const PROB2: f64 = 0.59; + } + implement_fixed_quantile_estimator!(MedianEstimator, MedianEstimatorRepr, [0.25, 0.5, 0.75]); +} + +#[cfg(test)] +mod test { + use crate::{ + p2estimator::QuantileEstimator, + statistics::{Distribution, Exponential}, + }; + use itertools::Itertools; + use rand::{rngs::Xoshiro256PlusPlus, RngExt, SeedableRng}; + + pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { + (0..steps) + .map(move |n| n as f64 / (steps as f64 - 1.0)) + .map(move |n| start * (1.0 - n) + stop * n) + } + + fn is_close(a: f64, b: f64) -> bool { + let rel_tol = 1e-9; + let abs_tol = 0.0; + if a == b { + true + } else { + (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol) + } + } + + #[test] + fn quantiles_on_exponential_dist() { + let expon = Exponential::new(1.0); + let mut estimator = + QuantileEstimator::new_from_array(&[0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]); + + let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321); + + for _ in 0..10_000 { + let sample = expon.ppf(rng.random()); + estimator += sample; + } + + assert!(estimator.samples() == 10_000); + + for val in linspace(0.0, 0.90, 90) { + assert!((estimator.ppf(val) - expon.ppf(val)).abs() <= 0.04); + let dist_val = expon.ppf(val); + assert!((estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.04); + + // Basic probability distribution checks... + assert!(is_close( + estimator.ccdf(dist_val), + 1.0 - estimator.cdf(dist_val) + )); + + assert!(is_close( + estimator.logcdf(dist_val), + estimator.cdf(dist_val).ln() + )); + assert!(is_close( + estimator.logccdf(dist_val), + estimator.ccdf(dist_val).ln() + )); + } + } + + #[test] + fn test_quantile_merging() { + let expon = Exponential::new(1.0); + let mut merged_estimator = + QuantileEstimator::new_from_slice(&linspace(0.0, 1.0, 10).collect_vec()); + + let mut rng = Xoshiro256PlusPlus::seed_from_u64(12345654321); + + for _ in 0..100 { + let targets: Vec = linspace(0.0, 1.0, rng.random_range(5..15)).collect(); + let mut estimator = QuantileEstimator::new_from_slice(&targets); + + for _ in 0..100 { + estimator += expon.ppf(rng.random()); + } + + merged_estimator = &merged_estimator + &estimator; + } + + assert!(merged_estimator.samples() == 10_000); + + for val in linspace(0.0, 0.75, 75) { + assert!((merged_estimator.ppf(val) - expon.ppf(val)).abs() <= 0.1); + let dist_val = expon.ppf(val); + assert!((merged_estimator.cdf(dist_val) - expon.cdf(dist_val)).abs() <= 0.04) + } + } +} diff --git a/src/pipeline.rs b/src/pipeline.rs index 64e8d11..bf93a42 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -5,15 +5,18 @@ use itertools::Itertools; use crate::{ alignment::{AlignmentData, Strand}, annotation::{AmbiguousAnnotation, SimpleAnnotation}, + assembly::gather_join_statistics, chunks::ProximityGroup, confidence::confidence, history_tracing::{ backtrace_histories, history_viterbi_on_segments, History, RefinedTraceSegment, }, + join_estimation::{JoinEstimator, JoinStatisticsCollector}, matrix::{Matrix, MatrixDef}, score_params::{approximate_ideal_skip_state_score, ScoreParams}, segments::{assemble_and_link_segments, segments_from_matrix_trace, InitialSegments}, support::windowed_confidence, + trace_statistics::TraceStatistics, viterbi::{trace_segments, traceback, viterbi_collapsed, TraceSegment}, viz::{ debug::{dump_debug_history_info, dump_final_trace_statistics}, @@ -59,7 +62,7 @@ pub fn to_annotations( a.row_idx - proximity_group.alignments.len() - 1; let repeat = &proximity_group.tandem_repeats[tandem_repeat_idx]; format!( - "({}:{})#tandem repeat", + "({}:{})#tandem-repeat", repeat.period, repeat.consensus_pattern, ) } @@ -139,22 +142,25 @@ fn get_active_columns(matrix: &Matrix) -> Vec<(u active_cols } -pub struct NaiveTraceResults { +pub struct NaiveTraceResults { + pub target_start: usize, + pub target_end: usize, pub trace_segments: Vec, pub segments: InitialSegments, pub score_params: ScoreParams, pub alignment_confidences: Vec, pub active_columns: Vec<(usize, usize)>, + pub query_join_statistics: Vec<(usize, T)>, pub viz_writer: AdjudicationSodaWriter, pub region_index: usize, } -pub fn run_naive_trace( +pub fn run_naive_trace( proximity_group: &ProximityGroup, alignment_data: &AlignmentData, region_idx: usize, args: &AuroraArgs, -) -> NaiveTraceResults { +) -> NaiveTraceResults { let annot_args = &args.annotation_args; let score_params = ScoreParams::new( @@ -199,7 +205,10 @@ pub fn run_naive_trace( .unwrap(); confidence(&mut confidence_matrix); - let confidence_by_row = windowed_confidence(&mut confidence_matrix); + let confidence_by_row = windowed_confidence( + &mut confidence_matrix, + args.annotation_args.score_window_size, + ); let segments; let simple_trace; @@ -246,21 +255,32 @@ pub fn run_naive_trace( .expect("Unable to write confidences!!!"); } + let query_join_statistics = gather_join_statistics( + proximity_group, + &segments, + &alignment_data.query_lengths, + &args.annotation_args, + ); + NaiveTraceResults { + target_start: proximity_group.target_start, + target_end: proximity_group.target_end, trace_segments: simple_trace, segments, score_params, alignment_confidences: confidence_by_row, active_columns: get_active_columns(&confidence_matrix), + query_join_statistics, viz_writer, region_index: region_idx, } } -pub fn run_history_trace( +pub fn run_history_trace( proximity_group: &ProximityGroup, alignment_data: &AlignmentData, - naive_trace: &mut NaiveTraceResults, + trace_statistics: &TraceStatistics, + naive_trace: &mut NaiveTraceResults, args: &AuroraArgs, ) -> Vec { let vis_args = &args.visualization_args; @@ -269,8 +289,11 @@ pub fn run_history_trace( proximity_group, &mut naive_trace.segments, &naive_trace.trace_segments, + &trace_statistics.region_statistics[naive_trace.region_index], + &trace_statistics.query_statistics, &naive_trace.score_params, &args.annotation_args, + &alignment_data.query_lengths, ); let history = history_viterbi_on_segments( diff --git a/src/score_params.rs b/src/score_params.rs index 783d0b6..868a948 100644 --- a/src/score_params.rs +++ b/src/score_params.rs @@ -69,4 +69,18 @@ impl ScoreParams { prior_is_different, ) } + + pub fn join_transition(&self, prior_is_skip: bool, join_probability: f64) -> f64 { + let true_join_cost = fast_select( + self.query_to_skip_score + (self.query_loop_score - self.query_jump_score).abs(), + self.query_loop_score, + prior_is_skip, + ); + let false_join_cost = fast_select( + self.query_to_skip_score, + self.query_jump_score, + prior_is_skip, + ); + true_join_cost * join_probability + (1.0 - join_probability) * false_join_cost + } } diff --git a/src/segments.rs b/src/segments.rs index f0914f1..118e13a 100644 --- a/src/segments.rs +++ b/src/segments.rs @@ -1,9 +1,17 @@ use core::f64; -use std::{cmp::Ordering, fmt::Debug, iter::Fuse}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, iter::Fuse}; use crate::{ - assembly::SegmentAssemblyGraph, chunks::ProximityGroup, matrix::Matrix, - score_params::ScoreParams, viterbi::TraceSegment, AnnotationArgs, + alignment::{Alignment, Strand}, + annotation::AmbiguousAnnotation, + assembly::SegmentAssemblyGraph, + chunks::ProximityGroup, + join_estimation::JoinEstimator, + matrix::Matrix, + score_params::ScoreParams, + trace_statistics::{QueryStatistics, RegionStatistics}, + viterbi::TraceSegment, + AnnotationArgs, }; use itertools::Itertools; @@ -53,6 +61,7 @@ pub enum BlockType { pub struct Block { pub row_idx: usize, pub block_type: BlockType, + pub strand: Strand, pub query_id: Option, pub col_start: usize, pub col_end: usize, @@ -60,6 +69,7 @@ pub struct Block { pub query_end: usize, pub avg_confidence: f64, pub alignment_score: f64, + pub kimura80: f64, pub can_join_up_to: usize, } @@ -90,6 +100,52 @@ impl Block { pub fn to_comparable(&self) -> (Option, usize) { (self.query_id, self.row_idx) } + + pub fn from_alignment( + alignment: &Alignment, + group_start: usize, + row: usize, + confidence: f64, + score: f64, + ) -> Self { + Self { + row_idx: row, + block_type: BlockType::Alignment, + strand: alignment.strand, + query_id: Some(alignment.query_id), + col_start: alignment.target_start.saturating_sub(group_start), + col_end: alignment.target_end.saturating_sub(group_start), + query_start: alignment.query_start, + query_end: alignment.query_end, + avg_confidence: confidence, + alignment_score: score, + kimura80: alignment.kimura80(alignment.query_start, alignment.query_end), + can_join_up_to: 0, + } + } + + pub fn from_annotation( + annotation: &AmbiguousAnnotation, + selected_index: usize, + row: usize, + ) -> Self { + let simple_annot = &annotation.annotations[selected_index]; + + Self { + row_idx: row, + block_type: BlockType::Alignment, + strand: simple_annot.strand, + query_id: Some(simple_annot.query_id), + col_start: simple_annot.target_start, + col_end: simple_annot.target_end, + query_start: simple_annot.query_start, + query_end: simple_annot.query_end, + avg_confidence: annotation.confidence, + alignment_score: annotation.confidence.ln(), + kimura80: simple_annot.kimura80, + can_join_up_to: 0, + } + } } #[derive(Debug)] @@ -102,6 +158,7 @@ pub struct Segment { } pub type SegmentedMatrix = Vec; +pub type SegmentedMatrixView<'a> = &'a [Segment]; #[derive(Copy, Clone, Debug)] enum MergeEntry { @@ -172,6 +229,7 @@ pub struct MergeIterator< val1: MergeEntry, val2: MergeEntry, prior_val: MergeEntry, + only_unique: bool, comparator: F, } @@ -180,13 +238,14 @@ impl, F: Fn(&I::Item, &I::Item) -> Orde where I::Item: Copy, { - pub fn new(iter1: I, iter2: J, comparator: F) -> Self { + pub fn new(iter1: I, iter2: J, comparator: F, only_unique: bool) -> Self { Self { iter1: iter1.fuse(), iter2: iter2.fuse(), val1: MergeEntry::Start, val2: MergeEntry::Start, prior_val: MergeEntry::Start, + only_unique, comparator, } } @@ -204,21 +263,14 @@ pub struct InitialSegments { initial_trace_scores: Vec, } -#[allow(dead_code)] -pub struct SegmentView<'a> { - pub start_col: usize, - pub end_col: usize, - pub blocks: &'a [Block], -} - #[allow(dead_code)] impl InitialSegments { - pub fn iter_segments(&self) -> impl Iterator { - self.segments.iter().map(|v| SegmentView { - start_col: v.start_col, - end_col: v.end_col, - blocks: &v.blocks, - }) + pub fn view_segments(&self) -> SegmentedMatrixView<'_> { + return &self.segments; + } + + pub fn len(&self) -> usize { + self.segments.len() } } @@ -249,6 +301,10 @@ where next_val = self.val2; self.val2 = self.iter2.next().into(); } + + if !self.only_unique { + break; + } } self.prior_val = next_val; @@ -275,7 +331,7 @@ pub fn unique_merging_iterator>( where I::Item: Copy + Ord, { - MergeIterator::new(list1, list2, |a, b| a.cmp(b)) + MergeIterator::new(list1, list2, |a, b| a.cmp(b), true) } #[derive(Debug)] @@ -441,7 +497,13 @@ pub fn segments_from_matrix_trace( .collect_vec(); // Compute scores and start/end points for all rows in this segment.... - // TODO: This isn't fully correct, if we want it to be we need to track if this segment starts in, and if the segment ends in a skip state to compute correct transitions for history tracing... + // TODO: This isn't fully correct, + // if we want it to be identical to initial viterbi trace, + // we need to track if each block in this segment starts in the skip state, + // and if each block in each segment ends in a skip state to compute correct + // transitions for history tracing... + // This would require a few additional booleans/states for each block and adjustments + // to the history tracing to incorperate them... for column in seg.col_start..=seg.col_end { let rows = &matrix_definition.active_rows_by_col[column]; let row_iter = rows @@ -524,17 +586,34 @@ pub fn segments_from_matrix_trace( _ => None, }; + let query_start = confidence_matrix.consensus_position(row_idx, start); + let query_end = confidence_matrix.consensus_position(row_idx, end); + + let kimura80 = match block_type { + BlockType::Alignment => { + group.alignments[row_idx - 1].kimura80(query_start, query_end) + } + _ => 0.0, + }; + + let strand = match block_type { + BlockType::Alignment => group.alignments[row_idx - 1].strand, + _ => Strand::Forward, + }; + Block { row_idx, block_type, + strand, query_id, col_start: start, col_end: end, - query_start: confidence_matrix.consensus_position(row_idx, start), - query_end: confidence_matrix.consensus_position(row_idx, end), + query_start, + query_end, avg_confidence: row_conf_sum[row_idx] / (row_valid_cell_count[row_idx].max(1) as f64), alignment_score: row_scores[row_idx], + kimura80, can_join_up_to: s_idx, } }) @@ -552,17 +631,22 @@ pub fn segments_from_matrix_trace( } } -pub fn assemble_and_link_segments<'a>( +pub fn assemble_and_link_segments<'a, T: JoinEstimator>( proximity_group: &ProximityGroup, initial_segments: &'a mut InitialSegments, trace_segments: &[TraceSegment], + region_statistics: &RegionStatistics, + query_statistics: &[QueryStatistics], score_params: &ScoreParams, annotation_args: &AnnotationArgs, + query_lengths: &HashMap, ) -> (&'a SegmentedMatrix, SegmentAssemblyGraph) { let assembly_graph = SegmentAssemblyGraph::new( proximity_group.alignments, + query_lengths, &initial_segments.segments, - score_params, + region_statistics, + query_statistics, annotation_args, ); finalize_segments( diff --git a/src/statistics.rs b/src/statistics.rs new file mode 100644 index 0000000..014aac1 --- /dev/null +++ b/src/statistics.rs @@ -0,0 +1,462 @@ +use core::f64; +use puruspe::{beta, betai, invbetai}; +use std::fmt::Debug; + +pub fn ln_add_exp(a: f64, b: f64) -> f64 { + let max = a.max(b); + let min = a.min(b); + // TODO: Possibly use more stable ln_1p_exp at https://github.com/JuliaStats/LogExpFunctions.jl/files/8218470/log1pexp.pdf (Implemented at https://github.com/JuliaStats/LogExpFunctions.jl/blob/master/src/basicfuns.jl#L263) + max + (min - max).exp().ln_1p() +} + +#[allow(dead_code)] +pub trait Distribution { + fn pdf(&self, x: f64) -> f64; + fn cdf(&self, x: f64) -> f64; + fn ppf(&self, p: f64) -> f64; + fn support(&self) -> (f64, f64); + fn ccdf(&self, x: f64) -> f64; + fn logpdf(&self, x: f64) -> f64; + fn logcdf(&self, x: f64) -> f64; + fn logccdf(&self, x: f64) -> f64; +} + +#[allow(dead_code)] +pub trait ParameterizedDistribution: Distribution + Debug + Default + Clone { + fn unit() -> Self { + Self::default() + } +} + +#[derive(Clone, Debug)] +pub struct Exponential { + lambda: f64, +} + +impl ParameterizedDistribution for Exponential {} + +impl Exponential { + pub fn new(lambda: f64) -> Self { + Self { lambda } + } + + pub fn from_scale(beta: f64) -> Self { + Self::new(1.0 / beta) + } +} + +impl Default for Exponential { + fn default() -> Self { + Self::new(1.0) + } +} + +impl Distribution for Exponential { + fn pdf(&self, x: f64) -> f64 { + self.lambda * (-self.lambda * x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + 1.0 - (-self.lambda * x).exp() + } + + fn ppf(&self, p: f64) -> f64 { + -(1.0 - p).ln() / self.lambda + } + + fn ccdf(&self, x: f64) -> f64 { + (-self.lambda * x).exp() + } + + fn logpdf(&self, x: f64) -> f64 { + self.lambda.ln() - self.lambda * x + } + + fn logcdf(&self, x: f64) -> f64 { + (-(-self.lambda * x).exp()).ln_1p() + } + + fn logccdf(&self, x: f64) -> f64 { + -self.lambda * x + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} + +#[derive(Debug, Clone)] +pub struct ExponentialEstimator { + sample_mean: f64, + degrees_of_freedom: usize, +} + +impl ParameterizedDistribution for ExponentialEstimator {} + +impl ExponentialEstimator { + pub fn new(sample_mean: f64, sample_size: usize) -> Self { + Self { + sample_mean: sample_mean, + degrees_of_freedom: sample_size, + } + } +} + +impl From for Exponential { + fn from(value: ExponentialEstimator) -> Self { + Self::from_scale(value.sample_mean) + } +} + +impl Default for ExponentialEstimator { + fn default() -> Self { + Self { + sample_mean: 1.0, + degrees_of_freedom: 1, + } + } +} + +impl Distribution for ExponentialEstimator { + fn logpdf(&self, x: f64) -> f64 { + let n = self.degrees_of_freedom as f64; + let sm = self.sample_mean; + ((n + 1.0) * n.ln() + n * sm.ln()) - ((n + 1.0) * (n * sm + x).ln()) + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn logccdf(&self, x: f64) -> f64 { + let n = self.degrees_of_freedom as f64; + let sm = self.sample_mean; + n * ((n * sm).ln() - (n * sm + x).ln()) + } + + fn logcdf(&self, x: f64) -> f64 { + (-self.logccdf(x).exp()).ln_1p() + } + + fn cdf(&self, x: f64) -> f64 { + -(self.logccdf(x).exp_m1()) + } + + fn ccdf(&self, x: f64) -> f64 { + self.logccdf(x).exp() + } + + fn ppf(&self, p: f64) -> f64 { + let n = self.degrees_of_freedom as f64; + let sm = self.sample_mean; + (n * sm) * ((1.0 - p).powf(-1.0 / n) - 1.0) + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} + +#[derive(Debug, Clone)] +pub struct HalfT { + standard_deviation: f64, + degrees_of_freedom: usize, +} + +impl ParameterizedDistribution for HalfT {} + +impl HalfT { + #[allow(dead_code)] + pub fn new(standard_deviation: f64, degrees_of_freedom: usize) -> Self { + Self { + standard_deviation, + degrees_of_freedom, + } + } + + pub fn from_sample_mean(mean: f64, degrees_of_freedom: usize) -> Self { + Self { + standard_deviation: mean * (2.0 / f64::consts::PI).sqrt(), + degrees_of_freedom, + } + } +} + +impl Default for HalfT { + fn default() -> Self { + Self { + standard_deviation: 1.0, + degrees_of_freedom: 1, + } + } +} + +impl Distribution for HalfT { + fn logpdf(&self, x: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let s = self.standard_deviation; + let z = x / s; + if z >= 0.0 { + let norm = (2.0_f64).ln() - (0.5 * v.ln() + beta(0.5, 0.5 * v).ln() + s.ln()); + norm - 0.5 * (v + 1.0) * ((z * z) / v).ln_1p() + } else { + 0.0 + } + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let z = x / self.standard_deviation; + if z >= 0.0 { + 1.0 - betai(0.5 * v, 0.5, v / (z * z + v)) + } else { + 0.0 + } + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn ppf(&self, p: f64) -> f64 { + let v = self.degrees_of_freedom as f64; + let inv_out = invbetai(1.0 - p, 0.5 * v, 0.5); + let x_unit = (v / inv_out - v).sqrt(); + x_unit * self.standard_deviation + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + self.ccdf(x).ln() + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} + +#[derive(Debug, Clone)] +pub struct Lomax { + alpha: f64, + lambda: f64, +} + +impl Lomax { + pub fn new(alpha: f64, lambda: f64) -> Self { + Self { alpha, lambda } + } +} + +impl ParameterizedDistribution for Lomax {} + +impl Default for Lomax { + fn default() -> Self { + Self::new(1.0, 1.0) + } +} + +impl Distribution for Lomax { + fn logpdf(&self, x: f64) -> f64 { + let a = self.alpha; + let y = self.lambda; + (a / y).ln() - (a + 1.0) * (1.0 + x / y).ln() + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn logccdf(&self, x: f64) -> f64 { + let a = self.alpha; + let y = self.lambda; + -a * (1.0 + x / y).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + self.logccdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + -(self.logccdf(x).exp_m1()) + } + + fn logcdf(&self, x: f64) -> f64 { + (-self.ccdf(x)).ln_1p() + } + + fn ppf(&self, p: f64) -> f64 { + let a = self.alpha; + let y = self.lambda; + y * ((1.0 - p).powf(-1.0 / a) - 1.0) + } + + fn support(&self) -> (f64, f64) { + (0.0, f64::INFINITY) + } +} + +#[derive(Debug, Clone)] +pub struct AssymetricLaplace { + mode: f64, + scale: f64, + mode_quantile: f64, +} + +impl ParameterizedDistribution for AssymetricLaplace {} + +impl AssymetricLaplace { + pub fn new(mode: f64, scale: f64, mode_quantile: f64) -> Self { + Self { + mode, + scale, + mode_quantile, + } + } + + pub fn from_exponential_halves(mode: f64, negative_mean: f64, positive_mean: f64) -> Self { + let nm = negative_mean.max(1e-8); + let pm = positive_mean.max(1e-8); + Self::new(mode, (nm * pm) / (nm + pm), 1.0 / (pm / nm + 1.0)) + } + + pub fn symmetric_from_moments(mean: f64, standard_deviation: f64) -> Self { + Self::new(mean, standard_deviation / (8.0_f64.sqrt()), 0.5) + } +} + +impl Default for AssymetricLaplace { + fn default() -> Self { + Self::symmetric_from_moments(0.0, 1.0) + } +} + +impl Distribution for AssymetricLaplace { + fn logpdf(&self, x: f64) -> f64 { + let m = self.mode; + let l = self.scale; + let p = self.mode_quantile; + let exp_comp = if x <= m { + ((1.0 - p) / l) * (x - m) + } else { + -(p / l) * (x - m) + }; + + ((p * (1.0 - p)) / l).ln() + exp_comp + } + + fn pdf(&self, x: f64) -> f64 { + self.logpdf(x).exp() + } + + fn cdf(&self, x: f64) -> f64 { + let m = self.mode; + let l = self.scale; + let p = self.mode_quantile; + if x <= m { + p * (((1.0 - p) / l) * (x - m)).exp() + } else { + 1.0 - (1.0 - p) * (-(p / l) * (x - m)).exp() + } + } + + fn logcdf(&self, x: f64) -> f64 { + self.cdf(x).ln() + } + + fn ccdf(&self, x: f64) -> f64 { + 1.0 - self.cdf(x) + } + + fn logccdf(&self, x: f64) -> f64 { + self.ccdf(x).ln() + } + + fn ppf(&self, p: f64) -> f64 { + let m = self.mode; + let l = self.scale; + let pm = self.mode_quantile; + if p <= pm { + m + (l / (1.0 - pm)) * (p / pm).ln() + } else { + m - (l / pm) * ((1.0 - p) / (1.0 - pm)).ln() + } + } + + fn support(&self) -> (f64, f64) { + (f64::NEG_INFINITY, f64::INFINITY) + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::fmt::Debug; + + pub fn linspace(start: f64, stop: f64, steps: usize) -> impl Iterator { + (0..steps) + .map(move |n| n as f64 / (steps as f64 - 1.0)) + .map(move |n| start * (1.0 - n) + stop * n) + } + + // Add debug trait to allow for printout... + pub trait TestDistribution: Distribution + Debug {} + impl TestDistribution for T {} + + fn as_box(d: T) -> Box { + Box::new(d) + } + + use super::{Exponential, ParameterizedDistribution}; + + fn get_dists() -> [Box; 5] { + [ + as_box(Exponential::unit()), + as_box(ExponentialEstimator::unit()), + as_box(HalfT::unit()), + as_box(AssymetricLaplace::unit()), + as_box(Lomax::unit()), + ] + } + + fn is_close(a: f64, b: f64) -> bool { + let rel_tol = 1e-9; + let abs_tol = 0.0; + (a - b).abs() <= (rel_tol * (a.abs()).max(b.abs())).max(abs_tol) + } + + #[test] + fn basic_distribution_propery_checks() { + for dist in get_dists() { + println!("Testing distribution: {:?}", dist); + let (mut low, mut high) = dist.support(); + + assert!(dist.cdf(low) == 0.0); + assert!(dist.cdf(high) == 1.0); + + if high == f64::INFINITY { + high = 5.0; + } + if low == f64::NEG_INFINITY { + low = -5.0; + } + + for x in linspace(low, high, 100) { + // Basic properties... + // println!("{x} -> {} vs {}", dist.tpdf(x), dist.tlogpdf(x).exp()); + assert!(is_close(dist.pdf(x), dist.logpdf(x).exp())); + assert!(is_close(dist.cdf(x), dist.logcdf(x).exp())); + assert!(is_close(dist.ccdf(x), dist.logccdf(x).exp())); + assert!(is_close(dist.ccdf(x), 1.0 - dist.cdf(x))); + // println!("{x} -> {}", dist.tppf(dist.tcdf(x))); + assert!(is_close(dist.ppf(dist.cdf(x)), x)); + } + } + } +} diff --git a/src/substitution_matrix.rs b/src/substitution_matrix.rs index 6364857..901a7d1 100644 --- a/src/substitution_matrix.rs +++ b/src/substitution_matrix.rs @@ -1,14 +1,14 @@ -use std::{ - fs::File, - io::{BufRead, BufReader, Read}, - path::Path, -}; +use std::io::{BufRead, BufReader, Read}; + +use anyhow::{anyhow, Context}; +use itertools::Itertools; use crate::alphabet::{ ALIGNMENT_ALPHABET_STR, GAP_EXTEND_DIGITAL, GAP_OPEN_DIGITAL, STR_TO_DIGITAL_NUCLEOTIDE, }; pub trait AlignmentScore { + #[allow(dead_code)] fn score(&self, target_char: u8, query_char: u8) -> f64; fn score_with_background(&self, target_char: u8, query_char: u8, frequencies: &[f64; 4]) -> f64; @@ -193,13 +193,7 @@ impl SubstitutionMatrix { } } - #[allow(dead_code)] - pub fn from_file(path: impl AsRef) -> Vec { - let file = File::open(path).expect("failed to open matrix file"); - SubstitutionMatrix::parse(file) - } - - pub fn parse(matrix_buf: R) -> Vec { + pub fn parse(matrix_buf: R) -> anyhow::Result> { let mut matrices = vec![]; let buf_reader = BufReader::new(matrix_buf); @@ -207,9 +201,8 @@ impl SubstitutionMatrix { let mut lines: Vec = buf_reader .lines() - .map(|l| l.expect("failed to read line")) - .filter(|l| !l.is_empty()) - .collect(); + .filter_ok(|l| !l.is_empty()) + .try_collect()?; // add a single blank line to the // end to serve as a sentinel @@ -231,26 +224,38 @@ impl SubstitutionMatrix { line_tokens .iter() .zip(line_tokens.iter().skip(1)) - .for_each(|(tokens, next_tokens)| { + .enumerate() + .try_for_each(|(line_num, (tokens, next_tokens))| { + let error_msg = |msg| move || format!("{} at line {}", msg, line_num); + match state { ParserState::Header => match tokens.first() { Some(&"#matrix") => { name = tokens[1].to_string(); } Some(&"#gap-open") => { - gap_open = tokens[1].parse::().expect("failed to parse float"); + gap_open = tokens[1] + .parse::() + .with_context(error_msg("failed to parse int"))?; } Some(&"#gap-ext") => { - gap_extend = tokens[1].parse::().expect("failed to parse float"); + gap_extend = tokens[1] + .parse::() + .with_context(error_msg("failed to parse float"))?; } Some(&"#lambda") => { - lambda = tokens[1].parse::().expect("failed to parse float"); + lambda = tokens[1] + .parse::() + .with_context(error_msg("failed to parse float"))?; } Some(&"#fi") => { let freqs: Vec = tokens[1..=4] .iter() - .map(|&f| f.parse::().expect("failed to parse float")) - .collect(); + .map(|&f| { + f.parse::() + .with_context(error_msg("failed to parse float")) + }) + .try_collect()?; background_freqs_i .iter_mut() .zip(freqs) @@ -259,14 +264,17 @@ impl SubstitutionMatrix { Some(&"#fj") => { let freqs: Vec = tokens[1..=4] .iter() - .map(|&f| f.parse::().expect("failed to parse float")) - .collect(); + .map(|&f| { + f.parse::() + .with_context(error_msg("failed to parse float")) + }) + .try_collect()?; background_freqs_j .iter_mut() .zip(freqs) .for_each(|(a, b)| *a = b); } - _ => panic!(), + _ => return Err(anyhow!(error_msg("Unknown matrices file syntax!")())), }, ParserState::Chars => { chars = tokens.iter().map(|t| t.to_string()).collect(); @@ -274,8 +282,11 @@ impl SubstitutionMatrix { ParserState::Scores => scores_vec.push( tokens .iter() - .map(|t| t.parse::().expect("failed to parse float")) - .collect(), + .map(|t| { + t.parse::() + .with_context(error_msg("failed to parse float")) + }) + .try_collect()?, ), } @@ -287,9 +298,12 @@ impl SubstitutionMatrix { let char_indices: Vec = chars .iter() .map(|c| { - (*STR_TO_DIGITAL_NUCLEOTIDE.get(c).expect("invalid char")) as usize + STR_TO_DIGITAL_NUCLEOTIDE + .get(c) + .ok_or_else(|| anyhow!(error_msg("invalid char")())) + .map(|v| *v as usize) }) - .collect(); + .try_collect()?; scores_vec .iter() @@ -320,26 +334,30 @@ impl SubstitutionMatrix { scores, )); scores_vec = vec![]; + + Ok::<_, anyhow::Error>(()) }; state = match next_tokens.first() { Some(token) if token.starts_with('#') => { if let ParserState::Scores = state { - add_matrix() + add_matrix()?; } ParserState::Header } Some(token) if token.parse::().is_err() => ParserState::Chars, Some(token) if token.parse::().is_ok() => ParserState::Scores, None => { - add_matrix(); - return; + add_matrix()?; + return Ok(()); } - _ => panic!(), + _ => return Err(anyhow!(error_msg("Unknown syntax in matrices file!")())), }; - }); - matrices + Ok(()) + })?; + + Ok(matrices) } } @@ -431,7 +449,7 @@ mod tests { [0.0220, 0.1331, 0.0368, 0.9131], ]; - let matrix_vec = SubstitutionMatrix::parse(matrix_buf); + let matrix_vec = SubstitutionMatrix::parse(matrix_buf)?; let matrix = matrix_vec.first().unwrap(); matrix @@ -476,7 +494,7 @@ mod tests { -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30" .as_bytes(); - let matrix_vec = SubstitutionMatrix::parse(matrix_buf); + let matrix_vec = SubstitutionMatrix::parse(matrix_buf)?; let matrix = matrix_vec.first().unwrap(); let correct: [[f64; 12]; 12] = [ @@ -567,7 +585,7 @@ mod tests { -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30 -30" .as_bytes(); - let matrix_vec = SubstitutionMatrix::parse(matrix_buf); + let matrix_vec = SubstitutionMatrix::parse(matrix_buf)?; let matrix = matrix_vec.first().unwrap(); let mut ali: Vec = [ diff --git a/src/support.rs b/src/support.rs index 4250363..1a6f9c0 100644 --- a/src/support.rs +++ b/src/support.rs @@ -2,10 +2,8 @@ use crate::matrix::Matrix; /// Smooth the values of a confidence matrix by convolving it with a fixed size rectangular kernel that adds to 1. /// This is the same as averaging values over the window range. -/// The current kernel size is hardcoded to 31. -pub fn windowed_confidence(matrix: &mut Matrix) -> Vec { - // TODO: parameterize this - let half_window_size = 15usize; +pub fn windowed_confidence(matrix: &mut Matrix, window_size: usize) -> Vec { + let half_window_size = (window_size - 1) / 2; let mut confidence_by_row = vec![0.0; matrix.num_rows()]; diff --git a/src/trace_statistics.rs b/src/trace_statistics.rs new file mode 100644 index 0000000..f71fa39 --- /dev/null +++ b/src/trace_statistics.rs @@ -0,0 +1,174 @@ +use std::fmt::Debug; + +use itertools::izip; + +use crate::{ + alignment::AlignmentData, + join_estimation::{JoinEstimator, JoinStatisticsCollector}, + pipeline::NaiveTraceResults, + segments::{InitialSegments, Segment}, +}; + +#[derive(Debug)] +pub struct RegionStatistics { + pub total_bases: usize, + pub unexplained_bases: Vec, +} + +#[derive(Debug, Clone)] +pub struct QueryStatistics { + pub occurances: usize, + pub coverage: usize, + pub target_span: usize, + pub estimator: T, +} + +#[derive(Debug)] +pub struct TraceStatistics { + #[allow(dead_code)] + pub total_bases: usize, + pub query_statistics: Vec>, + pub region_statistics: Vec, +} + +pub enum OccuranceCountingMode { + Segments, + #[allow(dead_code)] + Trace, +} + +pub fn calculate_region_statistics(segments: &InitialSegments) -> RegionStatistics { + let mut region_stat = RegionStatistics { + total_bases: 0, + unexplained_bases: Vec::with_capacity(segments.len()), + }; + + let mut unexplained_bases_up_to: usize = 0; + let mut prior_segment: Option<&Segment> = None; + + for seg in segments.view_segments() { + if let Some(prior_segment) = prior_segment { + // If a skip block was the prior block, add it's bases as unexplained. + if prior_segment.blocks.len() == 1 && prior_segment.blocks[0].row_idx == 0 { + unexplained_bases_up_to += seg.end_col - seg.start_col + 1; + } + unexplained_bases_up_to += seg.start_col - prior_segment.end_col - 1; + region_stat.total_bases += seg.start_col - prior_segment.end_col - 1; + } + region_stat.total_bases += seg.end_col - seg.start_col + 1; + region_stat.unexplained_bases.push(unexplained_bases_up_to); + + prior_segment = Some(seg); + } + + region_stat +} + +pub fn trace_statistics, E: JoinEstimator>( + naive_traces: &[NaiveTraceResults], + alignment_data: &AlignmentData, + count_mode: OccuranceCountingMode, +) -> TraceStatistics { + // Asumption... All regions are sorted, no gaps. At least 1 region expected... + debug_assert!(naive_traces.first().map(|v| v.region_index) == Some(0)); + debug_assert!(naive_traces + .iter() + .zip(naive_traces.iter().skip(1)) + .all(|(v1, v2)| v1.region_index + 1 == v2.region_index)); + + assert!(naive_traces + .iter() + .zip(naive_traces.iter().skip(1)) + .all(|(v1, v2)| v1.region_index + 1 == v2.region_index && v1.target_end < v2.target_start)); + + let mut query_stats: Vec> = vec![ + QueryStatistics { + occurances: 0, + coverage: 0, + target_span: 0, + estimator: E::default(), + }; + alignment_data.query_name_map.size() + ]; + + let mut query_span: Vec> = + vec![None; alignment_data.query_name_map.size()]; + + let mut all_region_stats: Vec = Vec::with_capacity(naive_traces.len()); + // We combine stats for all families to use as a prior (psuedo-count, single sample) for all stats... + let mut all_family_stats: S = S::new(); + + for trace_results in naive_traces.iter() { + for (_query_id, stats) in trace_results.query_join_statistics.iter() { + all_family_stats = all_family_stats.combine(stats); + } + + match count_mode { + OccuranceCountingMode::Segments => { + for seg in trace_results.segments.view_segments().iter() { + for blk in seg.blocks.iter() { + if let Some(query_id) = blk.query_id { + query_stats[query_id].occurances += 1; + query_stats[query_id].coverage += blk.col_end - blk.col_start + 1; + query_span[query_id] = match query_span[query_id] { + None => Some(( + trace_results.target_start + blk.col_start, + trace_results.target_start + blk.col_end, + )), + Some((start, end)) => Some(( + start.min(trace_results.target_start + blk.col_start), + end.max(trace_results.target_start + blk.col_end), + )), + } + } + } + } + } + OccuranceCountingMode::Trace => { + for trace_blk in trace_results.trace_segments.iter() { + query_stats[trace_blk.query_id].occurances += 1; + query_stats[trace_blk.query_id].coverage += + trace_blk.col_end - trace_blk.col_start + 1; + + query_span[trace_blk.query_id] = match query_span[trace_blk.query_id] { + None => Some(( + trace_results.target_start + trace_blk.col_start, + trace_results.target_start + trace_blk.col_end, + )), + Some((start, end)) => Some(( + start.min(trace_results.target_start + trace_blk.col_start), + end.max(trace_results.target_start + trace_blk.col_end), + )), + } + } + } + } + + all_region_stats.push(calculate_region_statistics(&trace_results.segments)); + } + + // Calculate join statistics for all families using combined prior as a starting point... + let mut all_join_stats: Vec = + vec![S::new_from_prior(&all_family_stats, 1); alignment_data.query_name_map.size()]; + + for trace_results in naive_traces.iter() { + for (query_id, stats) in trace_results.query_join_statistics.iter() { + all_join_stats[*query_id] = all_join_stats[*query_id].combine(stats); + } + } + + for (query_info, query_span, join_stat) in + izip!(query_stats.iter_mut(), query_span.iter(), all_join_stats) + { + if let Some((start, end)) = query_span { + query_info.target_span = end - start + 1; + } + query_info.estimator = join_stat.into(); + } + + TraceStatistics { + total_bases: all_region_stats.iter().map(|v| v.total_bases).sum(), + query_statistics: query_stats, + region_statistics: all_region_stats, + } +} diff --git a/src/util.rs b/src/util.rs index 62cabe0..1c7fe5c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -15,7 +15,7 @@ impl VecMap { Self { values } } - pub fn values(&self) -> std::slice::Iter { + pub fn values(&self) -> std::slice::Iter<'_, T> { self.values.iter() } diff --git a/src/viz/block.rs b/src/viz/block.rs index 6005b35..4065e6d 100644 --- a/src/viz/block.rs +++ b/src/viz/block.rs @@ -189,7 +189,7 @@ impl BlockGroup { None } else { match elems[8] { - "C" => Some(bed.strand), + "C" => Some(Strand::Reverse), _ => Some(Strand::from_str(elems[8])), } } diff --git a/src/viz/mod.rs b/src/viz/mod.rs index ad03f7f..06e12c3 100644 --- a/src/viz/mod.rs +++ b/src/viz/mod.rs @@ -48,7 +48,7 @@ pub fn write_index_file( .enumerate() .for_each(|(idx, c)| { index_links.push_str(&format!( - "
\n", + "\n", name = c.target_name, start = c.target_start, end = c.target_end, @@ -58,7 +58,7 @@ pub fn write_index_file( proximity_groups.iter().enumerate().for_each(|(idx, g)| { index_links.push_str(&format!( - "

region {idx} | {name} {start}:{end}

\n", + "\n", name = alignment_data.target_name_map.get(g.target_id), start = g.target_start, end = g.target_end, @@ -746,7 +746,7 @@ impl<'a> AdjudicationSodaData<'a> { let columns = self .active_columns .iter() - .flat_map(|&(start, end)| (start..=end)) + .flat_map(|&(start, end)| start..=end) .collect_vec(); vec![columns] } diff --git a/src/viz/stats.rs b/src/viz/stats.rs index 4b74fb1..82f7201 100644 --- a/src/viz/stats.rs +++ b/src/viz/stats.rs @@ -1,5 +1,11 @@ use crate::{ - alignment::Strand, annotation::AmbiguousAnnotation, segments::Unordered, viz::SODA_JS, + alignment::Strand, + annotation::AmbiguousAnnotation, + assembly::{ + block_target_distance, relative_consensus_distance, ConsensusDistanceNormalization, + }, + segments::{Block, Unordered}, + viz::SODA_JS, }; use core::str; use itertools::Itertools; @@ -12,6 +18,12 @@ struct FamilyInfo { pub occurrences: usize, pub coverage: usize, pub kimura80_values: Unordered>, + pub join_consensus_dist: Unordered>, + pub nojoin_consensus_dist: Unordered>, + pub join_target_dist: Unordered>, + pub nojoin_target_dist: Unordered>, + pub join_kimura_dist: Unordered>, + pub nojoin_kimura_dist: Unordered>, } #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -20,29 +32,17 @@ struct InversionInfo { pub normal_joins: usize, } -fn write_table( +fn write_tsv( writer: &mut impl Write, header: &[A; N], data: &[[B; N]], ) -> std::io::Result<()> { - writeln!(writer, "\n")?; - - for heading in header.iter() { - writeln!(writer, "", heading)?; - } - - writeln!(writer, "\n")?; + writeln!(writer, "{}", header.iter().join("\t"))?; for row in data.iter() { - write!(writer, "")?; - for entry in row.iter() { - write!(writer, "", entry)?; - } - writeln!(writer, "")?; + writeln!(writer, "{}", row.iter().join("\t"))?; } - writeln!(writer, "\n
{}
{}
")?; - Ok(()) } @@ -54,13 +54,13 @@ fn write_statistics_table_page std::io::Result<()> { let mut tmp_writer = Vec::::new(); - write_table(&mut tmp_writer, header, data)?; + write_tsv(&mut tmp_writer, header, data)?; let table_page = TABLE_HTML .replace("PAGE_TITLE", title) .replace("SODA_TARGET", SODA_JS) .replace( - "TABLE_TARGET", + "TSV_TARGET", str::from_utf8(&tmp_writer).expect("UTF8 decoding failed!"), ); @@ -72,14 +72,39 @@ fn write_statistics_table_page)], + query_lengths: &HashMap, ) -> std::io::Result<()> { let mut family_stats = HashMap::<&String, FamilyInfo>::new(); for (_region, annotations) in region_annotations.iter() { + let mut prior_elem = HashMap::::new(); + for amb_annot in annotations.iter() { - for query in amb_annot.annotations.iter() { + for (query_idx, query) in amb_annot.annotations.iter().enumerate() { let name = &query.query_name; let target_range = query.target_end - query.target_start; + let (join_cons, nojoin_cons, join_target, nojoin_target, join_div, nojoin_div) = + if let Some(&(prior_annot, prior_query_idx)) = prior_elem.get(&query.query_id) { + let block_c = Block::from_annotation(amb_annot, query_idx, 0); + let block_p = Block::from_annotation(prior_annot, prior_query_idx, 0); + let q_len = *query_lengths.get(&query.query_id).unwrap_or(&1); + + let (c_dist, _join_type) = relative_consensus_distance( + &block_p, + &block_c, + ConsensusDistanceNormalization::WithLength(q_len), + ); + let t_dist = block_target_distance(&block_p, &block_c) as f64; + let d_dist = (block_c.kimura80 - block_p.kimura80).abs(); + + if prior_annot.join_id == amb_annot.join_id { + (Some(c_dist), None, Some(t_dist), None, Some(d_dist), None) + } else { + (None, Some(c_dist), None, Some(t_dist), None, Some(d_dist)) + } + } else { + (None, None, None, None, None, None) + }; family_stats .entry(name) @@ -87,12 +112,28 @@ pub fn write_family_statistics( f.occurrences += 1; f.coverage += target_range; f.kimura80_values.0.push(query.kimura80); + f.join_consensus_dist.0.extend(join_cons.iter()); + f.nojoin_consensus_dist.0.extend(nojoin_cons.iter()); + f.join_target_dist.0.extend(join_target.iter()); + f.nojoin_target_dist.0.extend(nojoin_target.iter()); + f.join_kimura_dist.0.extend(join_div.iter()); + f.nojoin_kimura_dist.0.extend(nojoin_div.iter()); }) .or_insert(FamilyInfo { occurrences: 1, coverage: target_range, kimura80_values: Unordered(vec![query.kimura80]), + join_consensus_dist: Unordered(join_cons.iter().copied().collect()), + nojoin_consensus_dist: Unordered(nojoin_cons.iter().copied().collect()), + join_target_dist: Unordered(join_target.iter().copied().collect()), + nojoin_target_dist: Unordered(nojoin_target.iter().copied().collect()), + join_kimura_dist: Unordered(join_div.iter().copied().collect()), + nojoin_kimura_dist: Unordered(nojoin_div.iter().copied().collect()), }); + + prior_elem + .entry(query.query_id) + .insert_entry((amb_annot, query_idx)); } } } @@ -101,11 +142,16 @@ pub fn write_family_statistics( stats_writer, "Family Statistics", &[ - "Family", - "Occurrences", - "Coverage", - "Kimura80 Boxplot", - "Kimura80 KDE", + "Family_string", + "Occurrences_int", + "Coverage_int", + "Kimura80_KDE_violin", + "Joined_Consensus_Distance_violin", + "Unjoined_Consensus_Distance_violin", + "Joined_Target_Distance_violin", + "Unjoined_Target_Distance_violin", + "Joined_Kimura_Difference_violin", + "Unjoined_Kimura_Difference_violin", ], &family_stats .iter() @@ -115,14 +161,13 @@ pub fn write_family_statistics( k.to_string(), v.occurrences.to_string(), v.coverage.to_string(), - format!( - "
", - v.kimura80_values.0.iter().join(",") - ), - format!( - "
", - v.kimura80_values.0.iter().join(",") - ), + v.kimura80_values.0.iter().join(":"), + v.join_consensus_dist.0.iter().join(":"), + v.nojoin_consensus_dist.0.iter().join(":"), + v.join_target_dist.0.iter().join(":"), + v.nojoin_target_dist.0.iter().join(":"), + v.join_kimura_dist.0.iter().join(":"), + v.nojoin_kimura_dist.0.iter().join(":"), ] }) .collect_vec(), @@ -165,13 +210,13 @@ pub fn write_inversion_statistics( write_statistics_table_page( stats_writer, "Inversion Statistics", - &["Region", "Inversions", "Normal Joins"], + &["Region_region", "Inversions_int", "Normal_Joins_int"], &inversion_stats .iter() .sorted_by(|v1, v2| v2.1.cmp(v1.1)) .map(|(k, v)| { [ - format!("{}", k, k), + k.to_string(), v.inversions.to_string(), v.normal_joins.to_string(), ] diff --git a/src/windowed_scores.rs b/src/windowed_scores.rs index 6044e30..39456b4 100644 --- a/src/windowed_scores.rs +++ b/src/windowed_scores.rs @@ -45,6 +45,7 @@ impl BackgroundFrequencies for Background<'_> { } } +#[allow(dead_code)] pub struct DummyBackground {} impl BackgroundFrequencies for DummyBackground {