From 2817d01374ad0aeab98d6f48a3dae9b30f878a8a Mon Sep 17 00:00:00 2001 From: Adam Getchell Date: Mon, 8 Jun 2026 13:10:28 -0700 Subject: [PATCH 1/4] feat(bench): archive release performance reports - Add an archive-performance utility that promotes curated benchmark reports into docs/PERFORMANCE.md while archiving prior release comparisons - Generate release comparisons in isolated temporary worktrees, including legacy command fallback for published tags - Wire release and historical archive recipes into just, Python packaging, and release documentation --- CHANGELOG.md | 10 + docs/BENCHMARKING.md | 20 + docs/RELEASING.md | 28 +- justfile | 10 + pyproject.toml | 4 +- scripts/README.md | 17 + scripts/archive_performance.py | 468 ++++++++++++++++++ scripts/tests/test_archive_performance.py | 561 ++++++++++++++++++++++ 8 files changed, 1112 insertions(+), 6 deletions(-) create mode 100644 scripts/archive_performance.py create mode 100644 scripts/tests/test_archive_performance.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e826f64..eed0080 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Move error and tolerance contracts into first-class modules with prelude exports - Update exact benchmarks to distinguish strict Result paths from rounded f64 paths - Document and exercise the rounded fallback pattern for RequiresRounding errors +- [**breaking**] Make exact f64 conversions strict [`89f3720`](https://github.com/acgetchell/la-stack/commit/89f3720ecde9f12d7a0f42e79394836615e8fd97) + - Make Matrix and Vector the finite-by-construction public types for exact arithmetic. + - Add rounded exact-to-f64 APIs for determinant and solve callers that want explicit lossy conversion. + - Return typed Unrepresentable reasons when strict exact-to-f64 conversion would round or become non-finite. 
+ - Specialize D4 exact determinants and keep determinant/error-bound zero coefficients from evaluating overflowing absent terms. + - Update exact benchmark comparison reporting to compare strict and rounded APIs against legacy v0.4.2 rows. ### Changed @@ -72,6 +78,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Use a literal regex pattern for the malformed Criterion JSON diagnostic so Windows paths with backslashes do not break pytest's match expression. - Align ty with Python 3.13 [`b9e0ba0`](https://github.com/acgetchell/la-stack/commit/b9e0ba08e54a15d8eddd5c5c53edc37bbc03939a) +- Preserve coordinates for overflowed accumulators [`1d976b3`](https://github.com/acgetchell/la-stack/commit/1d976b346172ad4eca37c68a3ec31817eeca8529) + + - Return matrix-cell metadata when inf-norm row sums or symmetry tolerance scaling overflow. + - Avoid reparsing finite-by-construction RHS vectors in LU and LDLT solves. ## [0.4.2] - 2026-06-04 diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md index dfd27d7..0e2ce23 100644 --- a/docs/BENCHMARKING.md +++ b/docs/BENCHMARKING.md @@ -193,6 +193,26 @@ local. The report includes per-dimension tables showing median times, percent change, speedup, and last-release nalgebra/faer context where a matching `vs_linalg` peer exists. +Release PRs promote one curated comparison into committed docs: + +```bash +just performance-release v0.4.3 v0.4.2 +``` + +This runs the release-signal benchmark set, renders the comparison into +an isolated temporary worktree, copies the finished report to +`docs/PERFORMANCE.md`, and archives the previous committed report under +`docs/archive/performance/`. Archive filenames are release-pair names such as +`v0.4.2-vs-v0.4.1.md`, so the directory and generated index stay +lexicographically sorted. 
+ +To regenerate and archive a historical published release comparison without +touching the current checkout: + +```bash +just performance-archive-published v0.4.2 v0.4.1 +``` + For exact-arithmetic comparisons against v0.4.2 or older baselines, rows such as `det_exact_rounded_f64 (vs det_exact_f64)` mean the current rounded API is being compared to the historical lossy `*_exact_f64` benchmark. Rows such as diff --git a/docs/RELEASING.md b/docs/RELEASING.md index 07f41a1..797cd96 100644 --- a/docs/RELEASING.md +++ b/docs/RELEASING.md @@ -20,6 +20,7 @@ Set these variables to avoid repeating the version string: # tag has the leading v, version does not TAG=vX.Y.Z VERSION=${TAG#v} +PREVIOUS_TAG=vA.B.C ``` Verify your git remotes: @@ -100,7 +101,24 @@ just plot-vs-linalg-readme Review the updated table in `README.md` and the plot in `docs/assets/` for accuracy. -5. Save benchmark baselines for this release +5. Update the release performance comparison + +```bash +# Runs the release-signal benchmark set in a temporary worktree, compares TAG +# against PREVIOUS_TAG, writes docs/PERFORMANCE.md, and archives the previous +# docs/PERFORMANCE.md under docs/archive/performance/. +just performance-release "$TAG" "$PREVIOUS_TAG" +``` + +Review `docs/PERFORMANCE.md` for the latest release-to-release comparison. Older +committed comparisons are archived under `docs/archive/performance/` with +lexicographically sorted filenames such as `v0.4.2-vs-v0.4.1.md`. Iterative +local reports still live under `target/bench-reports/`. To regenerate a +historical published release comparison, use +`just performance-archive-published <current_tag> <baseline_tag>`; it also runs +inside a temporary worktree. + +6. Save benchmark baselines for this release ```bash # Save a named full baseline for this release @@ -125,7 +143,7 @@ uploads a short-lived Actions artifact for debugging the run. See `docs/BENCHMARKING.md` for the full comparison workflow. -6. Validate the release branch +7. 
Validate the release branch ```bash just ci @@ -133,7 +151,7 @@ just citation-check cargo publish --locked --dry-run ``` -7. Stage and commit release artifacts +8. Stage and commit release artifacts ```bash git add Cargo.toml Cargo.lock CITATION.cff pyproject.toml CHANGELOG.md README.md docs/ @@ -143,11 +161,11 @@ git commit -m "chore(release): release $TAG - Bump version to $TAG - Update citation and utility package metadata - Update changelog with latest changes -- Update benchmark comparison table +- Update benchmark comparison table and release performance report - Update documentation for release" ``` -8. Push the branch and open a PR +9. Push the branch and open a PR ```bash git push -u origin "release/$TAG" diff --git a/justfile b/justfile index 811f19b..a6503e4 100644 --- a/justfile +++ b/justfile @@ -186,6 +186,14 @@ bench-compare baseline="last" suite="all" scope="release-signal": python-sync baseline="{{baseline}}" uv run bench-compare "$baseline" --suite "{{suite}}" --scope "{{scope}}" +# Generate release-signal measurements in a temp worktree, then promote/archive docs. +performance-release current_tag baseline_tag: python-sync + uv run archive-performance "{{current_tag}}" "{{baseline_tag}}" --generate-in-temp-worktree --worktree-ref HEAD + +# Generate a published-tag comparison in a temp worktree, then promote/archive docs. +performance-archive-published current_tag baseline_tag: python-sync + uv run archive-performance "{{current_tag}}" "{{baseline_tag}}" --generate-in-temp-worktree --worktree-ref "{{current_tag}}" --no-apply-current-diff + # Run the exact-arithmetic benchmark suite. 
bench-exact: cargo bench --features bench,exact --bench exact @@ -390,6 +398,8 @@ help-workflows: @echo " just bench-compile # Compile benches with warnings-as-errors" @echo " just bench-latest # Run cheap latest measurements" @echo " just bench-latest-vs-last # Run latest and compare against last" + @echo " just performance-release # Promote release performance docs" + @echo " just performance-archive-published # Archive published release comparison" @echo " just bench-save-last # Save full baseline as 'last'" @echo " just bench-vs-linalg # Run vs_linalg bench (optional filter)" @echo " just bench-vs-linalg-la-stack # Run la-stack rows from vs_linalg" diff --git a/pyproject.toml b/pyproject.toml index 6f9ff45..9136cf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ ] [project.scripts] archive-changelog = "archive_changelog:main" +archive-performance = "archive_performance:main" bench-compare = "bench_compare:main" criterion-dim-plot = "criterion_dim_plot:main" postprocess-changelog = "postprocess_changelog:main" @@ -45,7 +46,7 @@ check-docs-version-sync = "check_docs_version_sync:main" # Configure setuptools to find modules in scripts/ directory. 
[tool.setuptools] package-dir = { "" = "scripts" } -py-modules = [ "archive_changelog", "bench_compare", "check_docs_version_sync", "check_semgrep_fixtures", "criterion_dim_plot", "postprocess_changelog", "subprocess_utils", "tag_release" ] +py-modules = [ "archive_changelog", "archive_performance", "bench_compare", "check_docs_version_sync", "check_semgrep_fixtures", "criterion_dim_plot", "postprocess_changelog", "subprocess_utils", "tag_release" ] [tool.ruff] line-length = 160 @@ -88,6 +89,7 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = [ "archive_changelog", + "archive_performance", "bench_compare", "check_semgrep_fixtures", "criterion_dim_plot", diff --git a/scripts/README.md b/scripts/README.md index 4b0e5e6..501dfb4 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -33,6 +33,21 @@ just bench-compare Use `uv run bench-compare --snapshot` for a no-baseline snapshot, or `uv run bench-compare ` to compare against a named saved baseline. +For release PRs, promote one curated release-to-release comparison into +committed docs and archive the previous committed report. Benchmark generation +runs in a temporary worktree: + +```bash +just performance-release v0.4.3 v0.4.2 +``` + +To regenerate a historical published-tag comparison without changing the current +checkout: + +```bash +just performance-archive-published v0.4.2 v0.4.1 +``` + ### Plotting Criterion benchmarks (la-stack vs nalgebra/faer) The plotter reads Criterion output under: @@ -169,6 +184,8 @@ tag-annotation size limit. 
| Script | Purpose | |---|---| +| `archive_performance.py` | Promote release performance docs and archive older comparisons | +| `bench_compare.py` | Compare Criterion benchmark baselines and render Markdown reports | | `criterion_dim_plot.py` | Plot Criterion benchmark results (CSV + SVG + README table) | | `tag_release.py` | Create annotated git tags from CHANGELOG.md sections | | `postprocess_changelog.py` | Strip trailing blank lines from git-cliff output | diff --git a/scripts/archive_performance.py b/scripts/archive_performance.py new file mode 100644 index 0000000..52d60da --- /dev/null +++ b/scripts/archive_performance.py @@ -0,0 +1,468 @@ +#!/usr/bin/env -S uv run +"""Promote a benchmark report into docs/PERFORMANCE.md and archive the old one. + +Release performance docs have two different lifetimes: + + - ``target/bench-reports/performance.md`` is local scratch output for the + current machine and branch. + - ``docs/PERFORMANCE.md`` is the latest curated release-to-release comparison. + - ``docs/archive/performance/*.md`` stores older curated comparisons. + +This script copies a freshly generated local report into ``docs/PERFORMANCE.md`` +and archives the previous committed report under a filename derived from the +report metadata, such as ``v0.4.2-vs-v0.4.1.md``. 
+""" + +from __future__ import annotations + +import argparse +import os +import re +import subprocess +import sys +import tarfile +import tempfile +from dataclasses import dataclass +from pathlib import Path + +from subprocess_utils import run_git_command, run_git_command_with_input, run_safe_command + +_VERSION_RE = re.compile(r"^\*\*la-stack\*\* v(?P<version>[^\s`]+)", re.MULTILINE) +_BASELINE_RE = re.compile(r"^Comparison against baseline \*\*(?P<baseline>[^*]+)\*\*:", re.MULTILINE) +_SEMVER_IDENTIFIER_RE = r"(?:0|[1-9][0-9]*|[0-9A-Za-z-]*[A-Za-z-][0-9A-Za-z-]*)" +_TAG_RE = re.compile( + rf"^v?(?:0|[1-9][0-9]*)\.(?:0|[1-9][0-9]*)\.(?:0|[1-9][0-9]*)" + rf"(?:-{_SEMVER_IDENTIFIER_RE}(?:\.{_SEMVER_IDENTIFIER_RE})*)?" + r"(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?$" +) + +_DEFAULT_SOURCE = "target/bench-reports/performance.md" +_DEFAULT_CURRENT = "docs/PERFORMANCE.md" +_DEFAULT_ARCHIVE_DIR = "docs/archive/performance" +_DEFAULT_SUITE = "all" +_DEFAULT_SCOPE = "release-signal" +_BENCH_TIMEOUT_SECONDS = 7200 +_COMMAND_TIMEOUT_SECONDS = 600 + + +@dataclass(frozen=True) +class ReportId: + """Release-pair identity parsed from a benchmark report.""" + + current_tag: str + baseline_tag: str + + @property + def archive_name(self) -> str: + """Return the canonical archive filename for this report.""" + return f"{self.current_tag}-vs-{self.baseline_tag}.md" + + +@dataclass(frozen=True) +class GenerationConfig: + """Configuration for benchmark report generation in a temp worktree.""" + + repo_root: Path + current_tag: str + baseline_tag: str + worktree_ref: str + suite: str = _DEFAULT_SUITE + scope: str = _DEFAULT_SCOPE + apply_current_diff: bool = True + + +def normalize_tag(tag: str) -> str: + """Return *tag* with a leading ``v`` and no surrounding whitespace.""" + normalized = tag.strip() + if not normalized: + msg = "tag must not be empty" + raise ValueError(msg) + if not normalized.startswith("v"): + normalized = f"v{normalized}" + if not _TAG_RE.fullmatch(normalized): + msg = f"expected a 
semver tag like v0.4.2, got {tag!r}" + raise ValueError(msg) + return normalized + + +def parse_report_id(text: str) -> ReportId: + """Parse the current version and baseline tag from a benchmark report.""" + version_match = _VERSION_RE.search(text) + if version_match is None: + msg = "could not find la-stack version line in benchmark report" + raise ValueError(msg) + + baseline_match = _BASELINE_RE.search(text) + if baseline_match is None: + msg = "could not find comparison baseline line in benchmark report" + raise ValueError(msg) + + return ReportId( + current_tag=normalize_tag(version_match.group("version")), + baseline_tag=normalize_tag(baseline_match.group("baseline")), + ) + + +def _read_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def _replace_file(src: Path, dst: Path) -> None: + src.replace(dst) + + +def _write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp_path: Path | None = None + try: + with tempfile.NamedTemporaryFile( + "w", + encoding="utf-8", + dir=path.parent, + prefix=f".{path.name}.", + suffix=".tmp", + delete=False, + ) as tmp: + tmp_path = Path(tmp.name) + tmp.write(text) + tmp.flush() + os.fsync(tmp.fileno()) + _replace_file(tmp_path, path) + finally: + if tmp_path is not None and tmp_path.exists(): + tmp_path.unlink() + + +def _archive_readme(archive_dir: Path) -> str: + reports = sorted(path.name for path in archive_dir.glob("*.md") if path.name != "README.md") + lines = [ + "# Archived Performance Reports", + "", + "Older release-to-release benchmark comparisons are archived here.", + "`docs/PERFORMANCE.md` contains the latest curated comparison.", + "", + ] + if reports: + lines.extend(f"- [{name.removesuffix('.md')}]({name})" for name in reports) + else: + lines.append("- No archived performance reports yet.") + return "\n".join(lines) + "\n" + + +def update_archive_index(archive_dir: Path) -> None: + """Write a lexicographically sorted archive index.""" + 
_write_text(archive_dir / "README.md", _archive_readme(archive_dir)) + + +def _format_command_failure(command: list[str], exc: subprocess.CalledProcessError) -> str: + parts = [f"command failed ({exc.returncode}): {' '.join(command)}"] + if exc.stdout: + parts.append(f"stdout:\n{exc.stdout.strip()}") + if exc.stderr: + parts.append(f"stderr:\n{exc.stderr.strip()}") + return "\n".join(parts) + + +def _run_git(args: list[str], *, cwd: Path, timeout: int = _COMMAND_TIMEOUT_SECONDS) -> None: + try: + run_git_command(args, cwd=cwd, timeout=timeout) + except subprocess.CalledProcessError as exc: + raise RuntimeError(_format_command_failure(["git", *args], exc)) from exc + + +def _run_tool(command: str, args: list[str], *, cwd: Path, timeout: int = _COMMAND_TIMEOUT_SECONDS) -> None: + try: + run_safe_command(command, args, cwd=cwd, timeout=timeout) + except subprocess.CalledProcessError as exc: + raise RuntimeError(_format_command_failure([command, *args], exc)) from exc + + +def _safe_extract_tar(archive: Path, target_dir: Path) -> None: + target_dir.mkdir(parents=True, exist_ok=True) + target_root = target_dir.resolve() + with tarfile.open(archive, "r:gz") as tar: + for member in tar.getmembers(): + member_path = (target_dir / member.name).resolve() + if not member_path.is_relative_to(target_root): + msg = f"refusing to extract unsafe archive member {member.name!r}" + raise ValueError(msg) + tar.extractall(target_dir, filter="data") + + +def _download_release_baseline(*, baseline_tag: str, download_dir: Path, repo_root: Path) -> Path: + artifact = download_dir / f"la-stack-{baseline_tag}-criterion-baseline.tar.gz" + _run_tool( + "gh", + [ + "release", + "download", + baseline_tag, + "--pattern", + artifact.name, + "--dir", + str(download_dir), + ], + cwd=repo_root, + ) + if not artifact.exists(): + msg = f"release baseline asset was not downloaded: {artifact}" + raise FileNotFoundError(msg) + return artifact + + +def _apply_current_diff_to_worktree(*, repo_root: Path, 
worktree: Path) -> None: + diff = run_git_command(["diff", "--binary", "HEAD"], cwd=repo_root).stdout + if diff.strip(): + try: + run_git_command_with_input(["apply", "--binary"], diff, cwd=worktree) + except subprocess.CalledProcessError as exc: + raise RuntimeError(_format_command_failure(["git", "apply", "--binary"], exc)) from exc + + +def _has_current_release_signal_tooling(worktree: Path) -> bool: + justfile = worktree / "justfile" + bench_compare = worktree / "scripts" / "bench_compare.py" + if not justfile.exists() or not bench_compare.exists(): + return False + + justfile_text = _read_text(justfile) + bench_compare_text = _read_text(bench_compare) + return re.search(r"(?m)^bench-latest(?:[ :]|$)", justfile_text) is not None and '"--suite"' in bench_compare_text and '"--scope"' in bench_compare_text + + +def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: GenerationConfig) -> None: + if _has_current_release_signal_tooling(worktree): + _run_tool("just", ["bench-latest"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS) + _run_tool( + "uv", + [ + "run", + "bench-compare", + config.baseline_tag, + "--suite", + config.suite, + "--scope", + config.scope, + "--output", + str(report), + ], + cwd=worktree, + timeout=_COMMAND_TIMEOUT_SECONDS, + ) + else: + _run_tool("just", ["bench-exact"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS) + _run_tool( + "uv", + [ + "run", + "bench-compare", + config.baseline_tag, + "--output", + str(report), + ], + cwd=worktree, + timeout=_COMMAND_TIMEOUT_SECONDS, + ) + + +def _generate_report_in_temp_worktree( + *, + config: GenerationConfig, +) -> str: + with tempfile.TemporaryDirectory(prefix="la-stack-performance-") as tmp: + tmp_dir = Path(tmp) + worktree = tmp_dir / "worktree" + report = tmp_dir / f"{config.current_tag}-vs-{config.baseline_tag}.md" + + _run_git(["worktree", "add", "--detach", str(worktree), config.worktree_ref], cwd=config.repo_root) + try: + if config.apply_current_diff: + 
_apply_current_diff_to_worktree(repo_root=config.repo_root, worktree=worktree) + baseline_archive = _download_release_baseline( + baseline_tag=config.baseline_tag, + download_dir=tmp_dir, + repo_root=config.repo_root, + ) + _safe_extract_tar(baseline_archive, worktree / "target") + _run_benchmarks_and_render_report(worktree=worktree, report=report, config=config) + return _read_text(report) + finally: + try: + _run_git(["worktree", "remove", "--force", str(worktree)], cwd=config.repo_root) + except RuntimeError as exc: + print(f"archive-performance: failed to remove temporary worktree: {exc}", file=sys.stderr) + + +def promote_report( + *, + source: Path, + current: Path, + archive_dir: Path, + expected_current_tag: str, + expected_baseline_tag: str, +) -> ReportId: + """Archive the old committed report and promote *source* as the current one.""" + source_text = _read_text(source) + source_id = parse_report_id(source_text) + expected_source_id = ReportId( + current_tag=normalize_tag(expected_current_tag), + baseline_tag=normalize_tag(expected_baseline_tag), + ) + if source_id != expected_source_id: + msg = ( + "benchmark report does not match requested release pair: " + f"found {source_id.current_tag} vs {source_id.baseline_tag}, " + f"expected {expected_source_id.current_tag} vs {expected_source_id.baseline_tag}" + ) + raise ValueError(msg) + + if current.exists(): + current_text = _read_text(current) + current_id = parse_report_id(current_text) + if current_id != source_id: + _write_text(archive_dir / current_id.archive_name, current_text) + + _write_text(current, source_text) + update_archive_index(archive_dir) + return source_id + + +def generate_and_promote_worktree_report( + *, + current: Path, + archive_dir: Path, + config: GenerationConfig, +) -> ReportId: + """Generate a comparison in a temp worktree, then promote it.""" + current_tag = normalize_tag(config.current_tag) + baseline_tag = normalize_tag(config.baseline_tag) + config = GenerationConfig( + 
repo_root=config.repo_root, + current_tag=current_tag, + baseline_tag=baseline_tag, + worktree_ref=config.worktree_ref, + suite=config.suite, + scope=config.scope, + apply_current_diff=config.apply_current_diff, + ) + report_text = _generate_report_in_temp_worktree( + config=config, + ) + with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".md", delete=False) as tmp: + source = Path(tmp.name) + tmp.write(report_text) + try: + return promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag=current_tag, + expected_baseline_tag=baseline_tag, + ) + finally: + if source.exists(): + source.unlink() + + +def build_parser() -> argparse.ArgumentParser: + """Build the CLI argument parser.""" + parser = argparse.ArgumentParser( + description="Promote a benchmark comparison into docs/PERFORMANCE.md and archive the previous report.", + ) + parser.add_argument("current_tag", help="Release tag for the new report, e.g. v0.4.3") + parser.add_argument("baseline_tag", help="Previous release tag used as the comparison baseline, e.g. 
v0.4.2") + parser.add_argument( + "--source", + default=_DEFAULT_SOURCE, + help=f"Generated benchmark report to promote (default: {_DEFAULT_SOURCE})", + ) + parser.add_argument( + "--current", + default=_DEFAULT_CURRENT, + help=f"Committed performance report path (default: {_DEFAULT_CURRENT})", + ) + parser.add_argument( + "--archive-dir", + default=_DEFAULT_ARCHIVE_DIR, + help=f"Archive directory for older reports (default: {_DEFAULT_ARCHIVE_DIR})", + ) + parser.add_argument( + "--generate-in-temp-worktree", + action="store_true", + help="Generate the comparison in a temporary detached worktree before promoting it.", + ) + parser.add_argument( + "--worktree-ref", + default="HEAD", + help="Git ref to check out in the temporary worktree (default: HEAD).", + ) + parser.add_argument( + "--no-apply-current-diff", + action="store_true", + help="Do not apply the current checkout's tracked diff to the temporary worktree.", + ) + parser.add_argument( + "--suite", + default=_DEFAULT_SUITE, + help=f"Benchmark suite for --generate-in-temp-worktree (default: {_DEFAULT_SUITE})", + ) + parser.add_argument( + "--scope", + default=_DEFAULT_SCOPE, + help=f"Comparison scope for --generate-in-temp-worktree (default: {_DEFAULT_SCOPE})", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + """CLI entry point.""" + args = build_parser().parse_args(argv) + root = Path.cwd() + source = Path(args.source) + current = Path(args.current) + archive_dir = Path(args.archive_dir) + if not source.is_absolute(): + source = root / source + if not current.is_absolute(): + current = root / current + if not archive_dir.is_absolute(): + archive_dir = root / archive_dir + + try: + if args.generate_in_temp_worktree: + report_id = generate_and_promote_worktree_report( + current=current, + archive_dir=archive_dir, + config=GenerationConfig( + repo_root=root, + current_tag=args.current_tag, + baseline_tag=args.baseline_tag, + worktree_ref=args.worktree_ref, + suite=args.suite, + 
scope=args.scope, + apply_current_diff=not args.no_apply_current_diff, + ), + ) + else: + report_id = promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag=args.current_tag, + expected_baseline_tag=args.baseline_tag, + ) + except Exception as exc: + print(f"archive-performance: {exc}", file=sys.stderr) + return 1 + + if args.generate_in_temp_worktree: + print(f"Generated benchmark report in a temporary worktree and promoted it to {current}") + else: + print(f"Promoted {source} to {current}") + print(f"Current performance report: {report_id.current_tag} vs {report_id.baseline_tag}") + print(f"Archive directory: {archive_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/tests/test_archive_performance.py b/scripts/tests/test_archive_performance.py new file mode 100644 index 0000000..7e45298 --- /dev/null +++ b/scripts/tests/test_archive_performance.py @@ -0,0 +1,561 @@ +"""Tests for archive_performance.py.""" + +from __future__ import annotations + +import io +import subprocess +import tarfile +from pathlib import Path + +import pytest + +import archive_performance +from archive_performance import GenerationConfig, generate_and_promote_worktree_report, main, normalize_tag, parse_report_id, promote_report + + +def _report(version: str, baseline: str) -> str: + return ( + "# Benchmark Performance\n\n" + f"**la-stack** v{version} · `abc1234` (release/test) · 2026-06-08 12:00:00 UTC\n" + "**Statistic**: median\n" + "**Suite**: all\n" + "**Scope**: release-signal\n\n" + "## Benchmark Results\n\n" + f"Comparison against baseline **{baseline}**:\n\n" + "Negative change = faster. 
Speedup > 1.00x = improvement.\n\n" + "## Exact arithmetic\n\n" + "| Benchmark | Baseline | Latest | Change | Speedup |\n" + "|-----------|---------:|-------:|-------:|--------:|\n" + "| det_exact | 1.0 ns | 0.9 ns | -10.0% | 1.11x |\n" + ) + + +def _write_baseline_archive(path: Path) -> None: + fixture_dir = path.parent / "baseline-fixture" + criterion_dir = fixture_dir / "criterion" + criterion_dir.mkdir(parents=True) + (criterion_dir / "placeholder.txt").write_text("baseline\n", encoding="utf-8") + with tarfile.open(path, "w:gz") as tar: + tar.add(criterion_dir, arcname="criterion") + + +def _write_unsafe_baseline_archive(path: Path) -> None: + payload = b"escape\n" + info = tarfile.TarInfo("../escape.txt") + info.size = len(payload) + with tarfile.open(path, "w:gz") as tar: + tar.addfile(info, io.BytesIO(payload)) + + +def _write_current_benchmark_tooling(worktree: Path) -> None: + (worktree / "scripts").mkdir(parents=True, exist_ok=True) + (worktree / "justfile").write_text("bench-latest: bench-vs-linalg-la-stack bench-exact\n", encoding="utf-8") + (worktree / "scripts" / "bench_compare.py").write_text('parser.add_argument("--suite")\nparser.add_argument("--scope")\n', encoding="utf-8") + + +def _write_legacy_benchmark_tooling(worktree: Path) -> None: + (worktree / "scripts").mkdir(parents=True, exist_ok=True) + (worktree / "justfile").write_text("bench-exact:\n", encoding="utf-8") + (worktree / "scripts" / "bench_compare.py").write_text('parser.add_argument("--output")\n', encoding="utf-8") + + +def test_normalize_tag_adds_leading_v() -> None: + assert normalize_tag("0.4.2") == "v0.4.2" + assert normalize_tag("v0.4.2") == "v0.4.2" + assert normalize_tag("v1.2.3-rc.1+build.7") == "v1.2.3-rc.1+build.7" + + +def test_normalize_tag_rejects_non_semver_baseline_names() -> None: + with pytest.raises(ValueError, match="semver tag"): + normalize_tag("last") + + +def test_parse_report_id_reads_current_and_baseline_tags() -> None: + report_id = 
parse_report_id(_report("0.4.2", "v0.4.1")) + + assert report_id.current_tag == "v0.4.2" + assert report_id.baseline_tag == "v0.4.1" + assert report_id.archive_name == "v0.4.2-vs-v0.4.1.md" + + +def test_promote_report_archives_previous_and_updates_sorted_index(tmp_path) -> None: + source = tmp_path / "target" / "bench-reports" / "performance.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + + source.parent.mkdir(parents=True) + current.parent.mkdir(parents=True) + archive_dir.mkdir(parents=True) + source.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + current.write_text(_report("0.4.1", "v0.4.0"), encoding="utf-8") + (archive_dir / "v0.3.1-vs-v0.3.0.md").write_text(_report("0.3.1", "v0.3.0"), encoding="utf-8") + + promoted = promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag="v0.4.2", + expected_baseline_tag="v0.4.1", + ) + + assert promoted.archive_name == "v0.4.2-vs-v0.4.1.md" + assert current.read_text(encoding="utf-8") == source.read_text(encoding="utf-8") + assert (archive_dir / "v0.4.1-vs-v0.4.0.md").read_text(encoding="utf-8") == _report("0.4.1", "v0.4.0") + assert (archive_dir / "README.md").read_text(encoding="utf-8") == ( + "# Archived Performance Reports\n\n" + "Older release-to-release benchmark comparisons are archived here.\n" + "`docs/PERFORMANCE.md` contains the latest curated comparison.\n\n" + "- [v0.3.1-vs-v0.3.0](v0.3.1-vs-v0.3.0.md)\n" + "- [v0.4.1-vs-v0.4.0](v0.4.1-vs-v0.4.0.md)\n" + ) + + +def test_promote_report_is_idempotent_for_same_release_pair(tmp_path) -> None: + source = tmp_path / "performance-new.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + + source.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + current.parent.mkdir(parents=True) + current.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + + promote_report( + 
source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag="v0.4.2", + expected_baseline_tag="v0.4.1", + ) + + assert not (archive_dir / "v0.4.2-vs-v0.4.1.md").exists() + assert "- No archived performance reports yet." in (archive_dir / "README.md").read_text(encoding="utf-8") + + +def test_promote_report_rejects_unexpected_release_pair(tmp_path) -> None: + source = tmp_path / "performance-new.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + source.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + + with pytest.raises(ValueError, match="does not match requested release pair"): + promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag="v0.4.3", + expected_baseline_tag="v0.4.2", + ) + + +def test_main_promotes_generated_report_to_docs_performance(tmp_path, capsys) -> None: + source = tmp_path / "target" / "bench-reports" / "performance.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + generated = _report("0.4.3", "v0.4.2") + + source.parent.mkdir(parents=True) + source.write_text(generated, encoding="utf-8") + current.parent.mkdir(parents=True) + current.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + + rc = main( + [ + "v0.4.3", + "v0.4.2", + "--source", + str(source), + "--current", + str(current), + "--archive-dir", + str(archive_dir), + ] + ) + + assert rc == 0 + assert current.read_text(encoding="utf-8") == generated + assert (archive_dir / "v0.4.2-vs-v0.4.1.md").exists() + assert "Current performance report: v0.4.3 vs v0.4.2" in capsys.readouterr().out + + +def test_main_reports_release_pair_mismatch_to_stderr(tmp_path, capsys) -> None: + source = tmp_path / "target" / "bench-reports" / "performance.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + 
source.parent.mkdir(parents=True) + source.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + + rc = main( + [ + "v0.4.4", + "v0.4.3", + "--source", + str(source), + "--current", + str(current), + "--archive-dir", + str(archive_dir), + ] + ) + + captured = capsys.readouterr() + assert rc == 1 + assert "does not match requested release pair" in captured.err + assert not current.exists() + + +def test_main_generates_report_in_temp_worktree(tmp_path, monkeypatch, capsys) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + + def fake_run_git(args, cwd=None, **kwargs): + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return type("Result", (), {"stdout": ""})() + + def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + calls.append(("git-stdin", tuple(args), cwd)) + return type("Result", (), {"stdout": ""})() + + def fake_run_safe(command, args, cwd=None, **kwargs): + calls.append((command, tuple(args), cwd)) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "uv": + output = Path(args[args.index("--output") + 1]) + output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + return type("Result", (), {"stdout": ""})() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "v0.4.3", + "v0.4.2", + "--current", + str(current), + "--archive-dir", + str(archive_dir), + "--generate-in-temp-worktree", + 
"--worktree-ref", + "v0.4.3", + "--no-apply-current-diff", + "--suite", + "exact", + "--scope", + "release-signal", + ] + ) + + captured = capsys.readouterr() + assert rc == 0 + assert current.read_text(encoding="utf-8") == _report("0.4.3", "v0.4.2") + assert "Generated benchmark report in a temporary worktree" in captured.out + assert "target/bench-reports/performance.md" not in captured.out + assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.3" for kind, args, _ in calls) + assert any(kind == "uv" and "--suite" in args and args[args.index("--suite") + 1] == "exact" for kind, args, _ in calls) + assert not any(kind == "git" and args == ("diff", "--binary", "HEAD") for kind, args, _ in calls) + assert not any(kind == "git-stdin" for kind, _, _ in calls) + + +def test_temp_worktree_is_removed_when_benchmark_command_fails(tmp_path, monkeypatch, capsys) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + + def fake_run_git(args, cwd=None, **kwargs): + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return type("Result", (), {"stdout": ""})() + + def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + calls.append(("git-stdin", tuple(args), cwd)) + return type("Result", (), {"stdout": ""})() + + def fake_run_safe(command, args, cwd=None, **kwargs): + calls.append((command, tuple(args), cwd)) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "just" and args == ["bench-latest"]: + raise subprocess.CalledProcessError(42, [command, *args], output="bench stdout", stderr="bench stderr") + return type("Result", (), 
{"stdout": ""})() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "v0.4.3", + "v0.4.2", + "--current", + str(current), + "--archive-dir", + str(archive_dir), + "--generate-in-temp-worktree", + "--worktree-ref", + "HEAD", + "--no-apply-current-diff", + ] + ) + + captured = capsys.readouterr() + assert rc == 1 + assert "command failed (42): just bench-latest" in captured.err + assert "bench stderr" in captured.err + assert not current.exists() + assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in calls) + + +def test_generate_report_rejects_unsafe_baseline_archive(tmp_path, monkeypatch, capsys) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + + def fake_run_git(args, cwd=None, **kwargs): + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return type("Result", (), {"stdout": ""})() + + def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + calls.append(("git-stdin", tuple(args), cwd)) + return type("Result", (), {"stdout": ""})() + + def fake_run_safe(command, args, cwd=None, **kwargs): + calls.append((command, tuple(args), cwd)) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + _write_unsafe_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + return type("Result", (), {"stdout": ""})() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + 
monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "v0.4.3", + "v0.4.2", + "--current", + str(current), + "--archive-dir", + str(archive_dir), + "--generate-in-temp-worktree", + "--worktree-ref", + "HEAD", + "--no-apply-current-diff", + ] + ) + + captured = capsys.readouterr() + assert rc == 1 + assert "refusing to extract unsafe archive member '../escape.txt'" in captured.err + assert not (tmp_path / "escape.txt").exists() + assert not current.exists() + assert not any(kind in {"just", "uv"} for kind, _, _ in calls) + assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in calls) + + +def test_generate_report_fails_when_release_baseline_asset_missing(tmp_path, monkeypatch, capsys) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + + def fake_run_git(args, cwd=None, **kwargs): + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return type("Result", (), {"stdout": ""})() + + def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + calls.append(("git-stdin", tuple(args), cwd)) + return type("Result", (), {"stdout": ""})() + + def fake_run_safe(command, args, cwd=None, **kwargs): + calls.append((command, tuple(args), cwd)) + return type("Result", (), {"stdout": ""})() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "v0.4.3", + "v0.4.2", + "--current", 
+ str(current), + "--archive-dir", + str(archive_dir), + "--generate-in-temp-worktree", + "--worktree-ref", + "HEAD", + "--no-apply-current-diff", + ] + ) + + captured = capsys.readouterr() + assert rc == 1 + assert "release baseline asset was not downloaded" in captured.err + assert not current.exists() + assert not any(kind in {"just", "uv"} for kind, _, _ in calls) + assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in calls) + + +def test_failed_atomic_replace_preserves_existing_report(tmp_path, monkeypatch) -> None: + source = tmp_path / "performance-new.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + original = _report("0.4.2", "v0.4.1") + + source.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + current.parent.mkdir(parents=True) + current.write_text(original, encoding="utf-8") + + def fail_replace(src, dst) -> None: + msg = f"simulated replace failure for {dst}" + raise OSError(msg) + + monkeypatch.setattr(archive_performance, "_replace_file", fail_replace) + + with pytest.raises(OSError, match="simulated replace failure"): + promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag="v0.4.3", + expected_baseline_tag="v0.4.2", + ) + + assert current.read_text(encoding="utf-8") == original + assert not list(current.parent.glob(".PERFORMANCE.md.*.tmp")) + + +def test_generate_and_promote_uses_temp_worktree_and_current_diff(tmp_path, monkeypatch) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + current.parent.mkdir(parents=True) + current.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + + def fake_run_git(args, cwd=None, **kwargs): + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + 
worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + if args == ["diff", "--binary", "HEAD"]: + return type("Result", (), {"stdout": "diff --git a/README.md b/README.md\n"})() + return type("Result", (), {"stdout": ""})() + + def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + calls.append(("git-stdin", tuple(args), cwd)) + assert "diff --git" in input_data + return type("Result", (), {"stdout": ""})() + + def fake_run_safe(command, args, cwd=None, **kwargs): + calls.append((command, tuple(args), cwd)) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "uv": + output = Path(args[args.index("--output") + 1]) + output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + return type("Result", (), {"stdout": ""})() + + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + report_id = generate_and_promote_worktree_report( + current=current, + archive_dir=archive_dir, + config=GenerationConfig( + repo_root=tmp_path, + current_tag="v0.4.3", + baseline_tag="v0.4.2", + worktree_ref="HEAD", + apply_current_diff=True, + ), + ) + + assert report_id.archive_name == "v0.4.3-vs-v0.4.2.md" + assert current.read_text(encoding="utf-8") == _report("0.4.3", "v0.4.2") + assert (archive_dir / "v0.4.2-vs-v0.4.1.md").exists() + assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "HEAD" for kind, args, _ in calls) + assert any(kind == "git-stdin" and args == ("apply", "--binary") for kind, args, _ in calls) + assert any(kind == "just" and args == ("bench-latest",) for kind, args, _ in calls) + assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in 
calls) + + +def test_generate_and_promote_legacy_published_tag_uses_legacy_commands(tmp_path, monkeypatch) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + + def fake_run_git(args, cwd=None, **kwargs): + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_legacy_benchmark_tooling(worktree) + return type("Result", (), {"stdout": ""})() + + def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + calls.append(("git-stdin", tuple(args), cwd)) + return type("Result", (), {"stdout": ""})() + + def fake_run_safe(command, args, cwd=None, **kwargs): + calls.append((command, tuple(args), cwd)) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + _write_baseline_archive(download_dir / "la-stack-v0.4.1-criterion-baseline.tar.gz") + if command == "uv": + output = Path(args[args.index("--output") + 1]) + output.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + return type("Result", (), {"stdout": ""})() + + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + report_id = generate_and_promote_worktree_report( + current=current, + archive_dir=archive_dir, + config=GenerationConfig( + repo_root=tmp_path, + current_tag="v0.4.2", + baseline_tag="v0.4.1", + worktree_ref="v0.4.2", + apply_current_diff=False, + ), + ) + + assert report_id.archive_name == "v0.4.2-vs-v0.4.1.md" + assert current.read_text(encoding="utf-8") == _report("0.4.2", "v0.4.1") + assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.2" for kind, args, _ in calls) + assert any(kind == "just" and args 
== ("bench-exact",) for kind, args, _ in calls) + assert not any(kind == "just" and args == ("bench-latest",) for kind, args, _ in calls) + assert not any(kind == "uv" and "--suite" in args for kind, args, _ in calls) + assert not any(kind == "uv" and "--scope" in args for kind, args, _ in calls) + assert not any(kind == "git" and args == ("diff", "--binary", "HEAD") for kind, args, _ in calls) + assert not any(kind == "git-stdin" for kind, _, _ in calls) From d31e26a9d7a47a6c3089028630640bcff5afe7c0 Mon Sep 17 00:00:00 2001 From: Adam Getchell Date: Mon, 8 Jun 2026 15:13:14 -0700 Subject: [PATCH 2/4] feat(bench): automate published performance report archiving - Track the latest curated release comparison in docs/PERFORMANCE.md and archive older comparisons under docs/archive/performance/ - Let performance-archive-published discover the latest stable GitHub release and previous stable baseline automatically - Generate release comparisons in isolated temporary worktrees, with release-asset restore and local baseline fallback paths - Update benchmark and release docs to use the scripted workflow instead of manual checkout steps --- .gitignore | 3 - docs/BENCHMARKING.md | 46 +- docs/PERFORMANCE.md | 114 +++++ docs/RELEASING.md | 9 +- docs/archive/performance/README.md | 6 + docs/archive/performance/v0.4.1-vs-v0.4.0.md | 85 ++++ justfile | 16 +- scripts/README.md | 7 +- scripts/archive_performance.py | 282 ++++++++++++- scripts/bench_compare.py | 30 +- scripts/tests/test_archive_performance.py | 420 ++++++++++++++++--- scripts/tests/test_bench_compare.py | 5 +- 12 files changed, 882 insertions(+), 141 deletions(-) create mode 100644 docs/PERFORMANCE.md create mode 100644 docs/archive/performance/README.md create mode 100644 docs/archive/performance/v0.4.1-vs-v0.4.0.md diff --git a/.gitignore b/.gitignore index 84f6f10..82ae0c2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,9 +5,6 @@ /cobertura.xml .DS_Store -# Generated benchmark results (machine-specific) 
-docs/PERFORMANCE.md - # Python / uv **/__pycache__/ *.egg-info/ diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md index 0e2ce23..af3bee7 100644 --- a/docs/BENCHMARKING.md +++ b/docs/BENCHMARKING.md @@ -151,40 +151,27 @@ benchmarks on every iteration. ### Workflow ```bash -# 1. Check out the old release and save its full baseline -git checkout v0.2.0 -just bench-save-baseline v0.2.0 +# Latest published release vs previous stable release +just performance-archive-published -# 2. Switch to current code and run latest la-stack measurements -git checkout main # or your feature branch -just bench-latest # populates target/criterion/*/new/ - -# 3. Generate a local comparison report -just bench-compare v0.2.0 +# Explicit historical repair +just performance-archive-published v0.4.2 v0.4.1 ``` -You can save multiple baselines and compare against any of them. +These recipes create isolated temporary worktrees, restore release baselines +from GitHub Release assets when available, and fall back to generating a missing +baseline in a second temporary worktree. They do not require changing the current +checkout. -If the release baseline is already present in `target/criterion/`, skip the -checkout step and compare directly. For example, to compare current code against -the saved `v0.4.2` release baseline: +For local scratch comparisons, you can save multiple baselines and compare +against any of them. 
If the release baseline is already present in +`target/criterion/`, compare directly: ```bash just bench-latest # gather latest la-stack measurements just bench-compare v0.4.2 # compare latest measurements against v0.4.2 ``` -If the release baseline is not present locally, download and restore the release -asset first: - -```bash -gh release download v0.4.2 --pattern "la-stack-v0.4.2-criterion-baseline.tar.gz" # fetch archived release baseline -mkdir -p target # ensure Criterion parent directory exists -tar -C target -xzf la-stack-v0.4.2-criterion-baseline.tar.gz # restore target/criterion baseline data -just bench-latest # gather latest la-stack measurements -just bench-compare v0.4.2 # compare latest measurements against v0.4.2 -``` - ### Output `just bench-compare` writes `target/bench-reports/performance.md` by @@ -206,13 +193,17 @@ an isolated temporary worktree, copies the finished report to `v0.4.2-vs-v0.4.1.md`, so the directory and generated index stay lexicographically sorted. -To regenerate and archive a historical published release comparison without +To regenerate and archive the latest published release comparison without touching the current checkout: ```bash -just performance-archive-published v0.4.2 v0.4.1 +just performance-archive-published ``` +The recipe discovers the latest stable published GitHub release and its previous +stable release automatically. For explicit historical repair, pass both tags: +`just performance-archive-published v0.4.2 v0.4.1`. + For exact-arithmetic comparisons against v0.4.2 or older baselines, rows such as `det_exact_rounded_f64 (vs det_exact_f64)` mean the current rounded API is being compared to the historical lossy `*_exact_f64` benchmark. Rows such as @@ -261,4 +252,5 @@ just bench-save-last When the GitHub Release is published, `.github/workflows/release-benchmarks.yml` saves a full release baseline and attaches `la-stack-$TAG-criterion-baseline.tar.gz` to the release as the durable archive. 
-See `docs/RELEASING.md` step 5 for where this fits in the release process. +See the `just performance-release` step in `docs/RELEASING.md` for where the +curated `docs/PERFORMANCE.md` comparison fits in the release process. diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md new file mode 100644 index 0000000..0824ce0 --- /dev/null +++ b/docs/PERFORMANCE.md @@ -0,0 +1,114 @@ +# Exact Arithmetic Performance + +**la-stack** v0.4.2 · `7e11f93` (HEAD) · 2026-06-08 20:39:03 UTC +**Statistic**: median + +## Benchmark Results + +Comparison against baseline **v0.4.1**: + +Negative change = faster. Speedup > 1.00x = improvement. + +### D=2 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 0.6 ns | 0.9 ns | +61.1% | 0.62x | +| det_direct | 0.7 ns | 1.0 ns | +44.7% | 0.69x | +| det_exact | 315.5 ns | 318.4 ns | +0.9% | 0.99x | +| det_exact_f64 | 555.7 ns | 555.7 ns | -0.0% | 1.00x | +| det_sign_exact | 0.7 ns | 1.5 ns | +128.2% | 0.44x | +| solve_exact | 7.05 µs | 7.06 µs | +0.2% | 1.00x | +| solve_exact_f64 | 7.50 µs | 7.67 µs | +2.3% | 0.98x | + +### D=3 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 1.3 ns | 1.8 ns | +36.3% | 0.73x | +| det_direct | 4.7 ns | 2.2 ns | **-51.9%** | 2.08x | +| det_exact | 936.9 ns | 924.3 ns | **-1.3%** | 1.01x | +| det_exact_f64 | 1.18 µs | 1.19 µs | +1.1% | 0.99x | +| det_sign_exact | 2.4 ns | 4.2 ns | +78.1% | 0.56x | +| solve_exact | 27.02 µs | 27.41 µs | +1.5% | 0.99x | +| solve_exact_f64 | 28.06 µs | 27.98 µs | -0.3% | 1.00x | + +### D=4 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 2.4 ns | 3.3 ns | +36.8% | 0.73x | +| det_direct | 2.4 ns | 4.1 ns | +70.2% | 0.59x | +| det_exact | 2.33 µs | 2.33 µs | -0.0% | 1.00x | +| det_exact_f64 | 2.59 µs | 2.58 µs | -0.7% | 1.01x | +| det_sign_exact | 5.3 ns | 6.9 ns | +30.5% | 0.77x | +| 
solve_exact | 67.14 µs | 67.99 µs | +1.3% | 0.99x | +| solve_exact_f64 | 67.86 µs | 68.51 µs | +1.0% | 0.99x | + +### D=5 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 21.6 ns | 24.5 ns | +13.7% | 0.88x | +| det_direct | 2.3 ns | 4.7 ns | +104.8% | 0.49x | +| det_exact | 5.04 µs | 4.99 µs | -1.0% | 1.01x | +| det_exact_f64 | 5.32 µs | 5.31 µs | -0.1% | 1.00x | +| det_sign_exact | 4.97 µs | 4.99 µs | +0.3% | 1.00x | +| solve_exact | 134.99 µs | 136.04 µs | +0.8% | 0.99x | +| solve_exact_f64 | 137.11 µs | 138.97 µs | +1.4% | 0.99x | + +### Near-singular 3x3 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 871.8 ns | 877.6 ns | +0.7% | 0.99x | +| det_exact | 907.3 ns | 904.4 ns | -0.3% | 1.00x | +| solve_exact | 4.31 µs | 4.25 µs | **-1.5%** | 1.02x | +| solve_exact_f64 | 4.29 µs | 4.32 µs | +0.7% | 0.99x | + +### Large entries 3x3 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 3.14 µs | 3.09 µs | **-1.3%** | 1.01x | +| det_exact | 3.19 µs | 3.11 µs | **-2.3%** | 1.02x | +| solve_exact | 84.77 µs | 83.89 µs | **-1.0%** | 1.01x | +| solve_exact_f64 | 84.62 µs | 83.92 µs | -0.8% | 1.01x | + +### Hilbert 4x4 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 5.3 ns | 6.9 ns | +30.4% | 0.77x | +| det_exact | 2.39 µs | 2.31 µs | **-3.2%** | 1.03x | +| solve_exact | 51.69 µs | 52.27 µs | +1.1% | 0.99x | +| solve_exact_f64 | 52.90 µs | 53.26 µs | +0.7% | 0.99x | + +### Hilbert 5x5 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 5.03 µs | 4.88 µs | **-2.9%** | 1.03x | +| det_exact | 5.07 µs | 4.96 µs | **-2.1%** | 1.02x | +| solve_exact | 105.35 µs | 102.72 µs | **-2.5%** | 1.03x | +| 
solve_exact_f64 | 104.99 µs | 103.94 µs | -1.0% | 1.01x | + +## How to Update + +Release performance docs are generated in isolated temporary worktrees: + +```bash +# Release PR: update docs/PERFORMANCE.md and archive the previous report +just performance-release <current_tag> <baseline_tag> + +# Historical published comparison +just performance-archive-published + +# Explicit historical repair +just performance-archive-published <current_tag> <baseline_tag> +``` + +For local scratch comparisons, use `just bench-latest` and `just bench-compare`. +Those write `target/bench-reports/performance.md`. + +See `docs/BENCHMARKING.md` for the full comparison workflow. diff --git a/docs/RELEASING.md b/docs/RELEASING.md index 797cd96..b2ca91d 100644 --- a/docs/RELEASING.md +++ b/docs/RELEASING.md @@ -113,10 +113,11 @@ just performance-release "$TAG" "$PREVIOUS_TAG" Review `docs/PERFORMANCE.md` for the latest release-to-release comparison. Older committed comparisons are archived under `docs/archive/performance/` with lexicographically sorted filenames such as `v0.4.2-vs-v0.4.1.md`. Iterative -local reports still live under `target/bench-reports/`. To regenerate a -historical published release comparison, use -`just performance-archive-published <current_tag> <baseline_tag>`; it also runs -inside a temporary worktree. +local reports still live under `target/bench-reports/`. To regenerate the +latest published release comparison, use `just performance-archive-published`; +it discovers the latest stable published GitHub release and its previous stable +release automatically, then runs inside a temporary worktree. For explicit +historical repair, pass both tags. 6. Save benchmark baselines for this release diff --git a/docs/archive/performance/README.md b/docs/archive/performance/README.md new file mode 100644 index 0000000..dd5fc68 --- /dev/null +++ b/docs/archive/performance/README.md @@ -0,0 +1,6 @@ +# Archived Performance Reports + +Older release-to-release benchmark comparisons are archived here. +`docs/PERFORMANCE.md` contains the latest curated comparison.
+ +- [v0.4.1-vs-v0.4.0](v0.4.1-vs-v0.4.0.md) diff --git a/docs/archive/performance/v0.4.1-vs-v0.4.0.md b/docs/archive/performance/v0.4.1-vs-v0.4.0.md new file mode 100644 index 0000000..0f4f39b --- /dev/null +++ b/docs/archive/performance/v0.4.1-vs-v0.4.0.md @@ -0,0 +1,85 @@ +# Exact Arithmetic Performance + +**la-stack** v0.4.1 · `c6e04fd` (main) · 2026-04-21 22:30:49 UTC +**Statistic**: median + +## Benchmark Results + +Comparison against baseline **v0.4.0**: + +Negative change = faster. Speedup > 1.00x = improvement. + +### D=2 + +| Benchmark | v0.4.0 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 0.6 ns | 0.6 ns | +0.9% | 0.99x | +| det_direct | 0.7 ns | 0.7 ns | +0.6% | 0.99x | +| det_exact | 250.3 ns | 250.0 ns | -0.1% | 1.00x | +| det_exact_f64 | 434.2 ns | 421.7 ns | **-2.9%** | 1.03x | +| det_sign_exact | 1.1 ns | 0.7 ns | **-39.6%** | 1.66x | +| solve_exact | 15.74 µs | 6.51 µs | **-58.7%** | 2.42x | +| solve_exact_f64 | 16.76 µs | 7.06 µs | **-57.9%** | 2.38x | + +### D=3 + +| Benchmark | v0.4.0 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 1.4 ns | 1.3 ns | **-3.0%** | 1.03x | +| det_direct | 4.7 ns | 4.6 ns | **-2.3%** | 1.02x | +| det_exact | 719.2 ns | 741.2 ns | +3.1% | 0.97x | +| det_exact_f64 | 942.0 ns | 933.1 ns | -0.9% | 1.01x | +| det_sign_exact | 4.1 ns | 2.3 ns | **-43.3%** | 1.76x | +| solve_exact | 51.03 µs | 25.19 µs | **-50.6%** | 2.03x | +| solve_exact_f64 | 53.31 µs | 26.64 µs | **-50.0%** | 2.00x | + +### D=4 + +| Benchmark | v0.4.0 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 2.4 ns | 2.4 ns | **-2.6%** | 1.03x | +| det_direct | 2.4 ns | 2.4 ns | -0.6% | 1.01x | +| det_exact | 1.89 µs | 1.88 µs | -0.3% | 1.00x | +| det_exact_f64 | 2.09 µs | 2.09 µs | +0.2% | 1.00x | +| det_sign_exact | 6.4 ns | 5.4 ns | **-15.7%** | 1.19x | +| solve_exact | 147.18 µs | 64.62 µs | **-56.1%** | 2.28x | +| 
solve_exact_f64 | 147.75 µs | 64.03 µs | **-56.7%** | 2.31x | + +### D=5 + +| Benchmark | v0.4.0 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 26.3 ns | 24.4 ns | **-7.5%** | 1.08x | +| det_direct | 2.3 ns | 2.3 ns | **-1.7%** | 1.02x | +| det_exact | 4.24 µs | 4.13 µs | **-2.6%** | 1.03x | +| det_exact_f64 | 4.40 µs | 4.48 µs | +1.9% | 0.98x | +| det_sign_exact | 4.19 µs | 4.08 µs | **-2.7%** | 1.03x | +| solve_exact | 339.25 µs | 132.55 µs | **-60.9%** | 2.56x | +| solve_exact_f64 | 340.14 µs | 135.75 µs | **-60.1%** | 2.51x | + +### Near-singular 3x3 + +| Benchmark | v0.4.0 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 909.2 ns | 705.9 ns | **-22.4%** | 1.29x | +| det_exact | 929.6 ns | 729.8 ns | **-21.5%** | 1.27x | + +## How to Update + +Release performance docs are generated in isolated temporary worktrees: + +```bash +# Release PR: update docs/PERFORMANCE.md and archive the previous report +just performance-release <current_tag> <baseline_tag> + +# Historical published comparison +just performance-archive-published + +# Explicit historical repair +just performance-archive-published <current_tag> <baseline_tag> +``` + +For local scratch comparisons, use `just bench-latest` and `just bench-compare`. +Those write `target/bench-reports/performance.md`. + +See `docs/BENCHMARKING.md` for the full comparison workflow. diff --git a/justfile b/justfile index a6503e4..79aab56 100644 --- a/justfile +++ b/justfile @@ -191,8 +191,20 @@ performance-release current_tag baseline_tag: python-sync uv run archive-performance "{{current_tag}}" "{{baseline_tag}}" --generate-in-temp-worktree --worktree-ref HEAD # Generate a published-tag comparison in a temp worktree, then promote/archive docs.
-performance-archive-published current_tag baseline_tag: python-sync - uv run archive-performance "{{current_tag}}" "{{baseline_tag}}" --generate-in-temp-worktree --worktree-ref "{{current_tag}}" --no-apply-current-diff +performance-archive-published current_tag="" baseline_tag="": python-sync + #!/usr/bin/env bash + set -euo pipefail + current_tag="{{current_tag}}" + baseline_tag="{{baseline_tag}}" + if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then + if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then + echo "current_tag and baseline_tag must be provided together" >&2 + exit 2 + fi + uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref "$current_tag" --no-apply-current-diff + else + uv run archive-performance --published-latest --generate-in-temp-worktree --no-apply-current-diff + fi # Run the exact-arithmetic benchmark suite. bench-exact: diff --git a/scripts/README.md b/scripts/README.md index 501dfb4..189a723 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -41,13 +41,16 @@ runs in a temporary worktree: just performance-release v0.4.3 v0.4.2 ``` -To regenerate a historical published-tag comparison without changing the current +To regenerate the latest published-tag comparison without changing the current checkout: ```bash -just performance-archive-published v0.4.2 v0.4.1 +just performance-archive-published ``` +For explicit historical repair, pass both tags: +`just performance-archive-published v0.4.2 v0.4.1`. 
+ ### Plotting Criterion benchmarks (la-stack vs nalgebra/faer) The plotter reads Criterion output under: diff --git a/scripts/archive_performance.py b/scripts/archive_performance.py index 52d60da..50f67d4 100644 --- a/scripts/archive_performance.py +++ b/scripts/archive_performance.py @@ -16,14 +16,19 @@ from __future__ import annotations import argparse +import json import os import re +import shutil import subprocess import sys import tarfile import tempfile +import tomllib +from collections.abc import Mapping from dataclasses import dataclass from pathlib import Path +from typing import Any, cast from subprocess_utils import run_git_command, run_git_command_with_input, run_safe_command @@ -35,6 +40,7 @@ rf"(?:-{_SEMVER_IDENTIFIER_RE}(?:\.{_SEMVER_IDENTIFIER_RE})*)?" r"(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?$" ) +_SEMVER_PARTS_RE = re.compile(r"^v?(?P<major>0|[1-9][0-9]*)\.(?P<minor>0|[1-9][0-9]*)\.(?P<patch>0|[1-9][0-9]*)$") _DEFAULT_SOURCE = "target/bench-reports/performance.md" _DEFAULT_CURRENT = "docs/PERFORMANCE.md" @@ -43,6 +49,7 @@ _DEFAULT_SCOPE = "release-signal" _BENCH_TIMEOUT_SECONDS = 7200 _COMMAND_TIMEOUT_SECONDS = 600 +_HOW_TO_UPDATE_RE = re.compile(r"(?ms)^## How to Update\n.*\Z") @dataclass(frozen=True) @@ -71,6 +78,24 @@ class GenerationConfig: apply_current_diff: bool = True +@dataclass(frozen=True) +class ResolvedArchiveRequest: + """Release pair and worktree ref resolved from CLI arguments.""" + + current_tag: str + baseline_tag: str + worktree_ref: str + fetch_tags: bool = False + + +@dataclass(frozen=True) +class PublishedRelease: + """Stable GitHub release metadata used to infer release pairs.""" + + tag: str + published_at: str + + def normalize_tag(tag: str) -> str: """Return *tag* with a leading ``v`` and no surrounding whitespace.""" normalized = tag.strip() @@ -103,10 +128,117 @@ def parse_report_id(text: str) -> ReportId: ) +def _semver_sort_key(tag: str) -> tuple[int, int, int]: + match = _SEMVER_PARTS_RE.fullmatch(normalize_tag(tag)) + if match is
None: + msg = f"expected a stable semver tag like v0.4.2, got {tag!r}" + raise ValueError(msg) + return (int(match.group("major")), int(match.group("minor")), int(match.group("patch"))) + + +def _stable_published_releases(releases: object) -> list[PublishedRelease]: + if not isinstance(releases, list): + msg = "expected GitHub release list to be a JSON array" + raise TypeError(msg) + + stable_releases: dict[str, PublishedRelease] = {} + for release in releases: + if not isinstance(release, Mapping): + continue + release = cast("Mapping[str, Any]", release) + if release.get("isDraft") or release.get("isPrerelease"): + continue + tag_name = release.get("tagName") + published_at = release.get("publishedAt") + if not isinstance(tag_name, str) or not isinstance(published_at, str) or not published_at: + continue + try: + normalized = normalize_tag(tag_name) + _semver_sort_key(normalized) + except ValueError: + continue + stable_releases[normalized] = PublishedRelease(tag=normalized, published_at=published_at) + + return list(stable_releases.values()) + + +def _published_release_pair(repo_root: Path) -> ReportId: + command = [ + "release", + "list", + "--json", + "tagName,isDraft,isPrerelease,publishedAt", + "--limit", + "100", + ] + try: + result = run_safe_command( + "gh", + command, + cwd=repo_root, + timeout=_COMMAND_TIMEOUT_SECONDS, + ) + except subprocess.CalledProcessError as exc: + raise RuntimeError(_format_command_failure(["gh", *command], exc)) from exc + try: + releases = json.loads(result.stdout) + except json.JSONDecodeError as exc: + msg = "could not parse GitHub release list JSON" + raise RuntimeError(msg) from exc + stable_releases = _stable_published_releases(releases) + if len(stable_releases) < 2: + msg = "expected at least two published stable semver releases" + raise RuntimeError(msg) + + current = max(stable_releases, key=lambda release: release.published_at) + current_key = _semver_sort_key(current.tag) + previous_tags = sorted( + (release.tag for 
release in stable_releases if _semver_sort_key(release.tag) < current_key), + key=_semver_sort_key, + ) + if not previous_tags: + msg = f"could not find a previous stable semver release before {current.tag}" + raise RuntimeError(msg) + return ReportId(current_tag=current.tag, baseline_tag=previous_tags[-1]) + + def _read_text(path: Path) -> str: return path.read_text(encoding="utf-8") +def _how_to_update_section() -> str: + lines = [ + "## How to Update", + "", + "Release performance docs are generated in isolated temporary worktrees:", + "", + "```bash", + "# Release PR: update docs/PERFORMANCE.md and archive the previous report", + "just performance-release ", + "", + "# Historical published comparison", + "just performance-archive-published", + "", + "# Explicit historical repair", + "just performance-archive-published ", + "```", + "", + "For local scratch comparisons, use `just bench-latest` and `just bench-compare`.", + "Those write `target/bench-reports/performance.md`.", + "", + "See `docs/BENCHMARKING.md` for the full comparison workflow.", + "", + ] + return "\n".join(lines) + + +def _normalize_how_to_update(text: str) -> str: + section = _how_to_update_section() + if _HOW_TO_UPDATE_RE.search(text): + return _HOW_TO_UPDATE_RE.sub(section, text) + return f"{text.rstrip()}\n\n{section}" + + def _replace_file(src: Path, dst: Path) -> None: src.replace(dst) @@ -170,13 +302,41 @@ def _run_git(args: list[str], *, cwd: Path, timeout: int = _COMMAND_TIMEOUT_SECO raise RuntimeError(_format_command_failure(["git", *args], exc)) from exc -def _run_tool(command: str, args: list[str], *, cwd: Path, timeout: int = _COMMAND_TIMEOUT_SECONDS) -> None: +def _fetch_release_tags(*, repo_root: Path, tags: list[str]) -> None: + refspecs = [f"refs/tags/{tag}:refs/tags/{tag}" for tag in tags] + _run_git(["fetch", "origin", *refspecs], cwd=repo_root) + + +def _run_tool(command: str, args: list[str], *, cwd: Path, timeout: int = _COMMAND_TIMEOUT_SECONDS, env: dict[str, str] | None 
= None) -> None: try: - run_safe_command(command, args, cwd=cwd, timeout=timeout) + run_safe_command(command, args, cwd=cwd, timeout=timeout, env=env) except subprocess.CalledProcessError as exc: raise RuntimeError(_format_command_failure([command, *args], exc)) from exc +def _current_rust_toolchain(repo_root: Path) -> str | None: + rust_toolchain = repo_root / "rust-toolchain.toml" + if not rust_toolchain.exists(): + return None + data = tomllib.loads(_read_text(rust_toolchain)) + toolchain = data.get("toolchain") + if not isinstance(toolchain, dict): + return None + channel = toolchain.get("channel") + return channel if isinstance(channel, str) else None + + +def _benchmark_env(repo_root: Path) -> dict[str, str] | None: + if "RUSTUP_TOOLCHAIN" in os.environ: + return None + toolchain = _current_rust_toolchain(repo_root) + if toolchain is None: + return None + env = os.environ.copy() + env["RUSTUP_TOOLCHAIN"] = toolchain + return env + + def _safe_extract_tar(archive: Path, target_dir: Path) -> None: target_dir.mkdir(parents=True, exist_ok=True) target_root = target_dir.resolve() @@ -210,6 +370,44 @@ def _download_release_baseline(*, baseline_tag: str, download_dir: Path, repo_ro return artifact +def _generate_release_baseline(*, baseline_tag: str, repo_root: Path, target_worktree: Path, tmp_dir: Path) -> None: + baseline_worktree = tmp_dir / "baseline-worktree" + _run_git(["worktree", "add", "--detach", str(baseline_worktree), baseline_tag], cwd=repo_root) + try: + _run_tool("just", ["bench-save-baseline", baseline_tag], cwd=baseline_worktree, timeout=_BENCH_TIMEOUT_SECONDS, env=_benchmark_env(repo_root)) + baseline_criterion = baseline_worktree / "target" / "criterion" + if not baseline_criterion.is_dir(): + msg = f"generated baseline Criterion results were not found: {baseline_criterion}" + raise FileNotFoundError(msg) + target_criterion = target_worktree / "target" / "criterion" + target_criterion.parent.mkdir(parents=True, exist_ok=True) + 
shutil.copytree(baseline_criterion, target_criterion, dirs_exist_ok=True) + finally: + try: + _run_git(["worktree", "remove", "--force", str(baseline_worktree)], cwd=repo_root) + except RuntimeError as exc: + print(f"archive-performance: failed to remove baseline worktree: {exc}", file=sys.stderr) + + +def _prepare_release_baseline(*, baseline_tag: str, repo_root: Path, target_worktree: Path, tmp_dir: Path) -> None: + try: + baseline_archive = _download_release_baseline( + baseline_tag=baseline_tag, + download_dir=tmp_dir, + repo_root=repo_root, + ) + except (FileNotFoundError, RuntimeError) as exc: + print(f"archive-performance: release baseline asset unavailable; generating {baseline_tag} locally ({exc})", file=sys.stderr) + _generate_release_baseline( + baseline_tag=baseline_tag, + repo_root=repo_root, + target_worktree=target_worktree, + tmp_dir=tmp_dir, + ) + else: + _safe_extract_tar(baseline_archive, target_worktree / "target") + + def _apply_current_diff_to_worktree(*, repo_root: Path, worktree: Path) -> None: diff = run_git_command(["diff", "--binary", "HEAD"], cwd=repo_root).stdout if diff.strip(): @@ -231,8 +429,9 @@ def _has_current_release_signal_tooling(worktree: Path) -> bool: def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: GenerationConfig) -> None: + benchmark_env = _benchmark_env(config.repo_root) if _has_current_release_signal_tooling(worktree): - _run_tool("just", ["bench-latest"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS) + _run_tool("just", ["bench-latest"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS, env=benchmark_env) _run_tool( "uv", [ @@ -250,7 +449,7 @@ def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: G timeout=_COMMAND_TIMEOUT_SECONDS, ) else: - _run_tool("just", ["bench-exact"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS) + _run_tool("just", ["bench-exact"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS, env=benchmark_env) _run_tool( "uv", [ @@ -278,12 +477,12 @@ def 
_generate_report_in_temp_worktree( try: if config.apply_current_diff: _apply_current_diff_to_worktree(repo_root=config.repo_root, worktree=worktree) - baseline_archive = _download_release_baseline( + _prepare_release_baseline( baseline_tag=config.baseline_tag, - download_dir=tmp_dir, repo_root=config.repo_root, + target_worktree=worktree, + tmp_dir=tmp_dir, ) - _safe_extract_tar(baseline_archive, worktree / "target") _run_benchmarks_and_render_report(worktree=worktree, report=report, config=config) return _read_text(report) finally: @@ -302,7 +501,7 @@ def promote_report( expected_baseline_tag: str, ) -> ReportId: """Archive the old committed report and promote *source* as the current one.""" - source_text = _read_text(source) + source_text = _normalize_how_to_update(_read_text(source)) source_id = parse_report_id(source_text) expected_source_id = ReportId( current_tag=normalize_tag(expected_current_tag), @@ -317,10 +516,12 @@ def promote_report( raise ValueError(msg) if current.exists(): - current_text = _read_text(current) + current_text = _normalize_how_to_update(_read_text(current)) current_id = parse_report_id(current_text) if current_id != source_id: - _write_text(archive_dir / current_id.archive_name, current_text) + archive_path = archive_dir / current_id.archive_name + if not archive_path.exists(): + _write_text(archive_path, current_text) _write_text(current, source_text) update_archive_index(archive_dir) @@ -364,13 +565,45 @@ def generate_and_promote_worktree_report( source.unlink() +def resolve_archive_request( + *, + current_tag: str | None, + baseline_tag: str | None, + published_latest: bool, + worktree_ref: str, + repo_root: Path, +) -> ResolvedArchiveRequest: + """Resolve explicit or latest-published release arguments.""" + if published_latest: + if current_tag is not None or baseline_tag is not None: + msg = "do not pass current_tag or baseline_tag with --published-latest" + raise ValueError(msg) + published_pair = 
_published_release_pair(repo_root) + resolved_worktree_ref = published_pair.current_tag if worktree_ref == "HEAD" else worktree_ref + return ResolvedArchiveRequest( + current_tag=published_pair.current_tag, + baseline_tag=published_pair.baseline_tag, + worktree_ref=resolved_worktree_ref, + fetch_tags=True, + ) + + if current_tag is None or baseline_tag is None: + msg = "current_tag and baseline_tag are required unless --published-latest is used" + raise ValueError(msg) + return ResolvedArchiveRequest( + current_tag=current_tag, + baseline_tag=baseline_tag, + worktree_ref=worktree_ref, + ) + + def build_parser() -> argparse.ArgumentParser: """Build the CLI argument parser.""" parser = argparse.ArgumentParser( description="Promote a benchmark comparison into docs/PERFORMANCE.md and archive the previous report.", ) - parser.add_argument("current_tag", help="Release tag for the new report, e.g. v0.4.3") - parser.add_argument("baseline_tag", help="Previous release tag used as the comparison baseline, e.g. v0.4.2") + parser.add_argument("current_tag", nargs="?", help="Release tag for the new report, e.g. v0.4.3") + parser.add_argument("baseline_tag", nargs="?", help="Previous release tag used as the comparison baseline, e.g. 
v0.4.2") parser.add_argument( "--source", default=_DEFAULT_SOURCE, @@ -391,6 +624,11 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Generate the comparison in a temporary detached worktree before promoting it.", ) + parser.add_argument( + "--published-latest", + action="store_true", + help="Infer the latest stable published GitHub release and its previous stable release.", + ) parser.add_argument( "--worktree-ref", default="HEAD", @@ -429,15 +667,25 @@ def main(argv: list[str] | None = None) -> int: archive_dir = root / archive_dir try: + request = resolve_archive_request( + current_tag=args.current_tag, + baseline_tag=args.baseline_tag, + published_latest=args.published_latest, + worktree_ref=args.worktree_ref, + repo_root=root, + ) + if request.fetch_tags: + _fetch_release_tags(repo_root=root, tags=[request.current_tag, request.baseline_tag]) + if args.generate_in_temp_worktree: report_id = generate_and_promote_worktree_report( current=current, archive_dir=archive_dir, config=GenerationConfig( repo_root=root, - current_tag=args.current_tag, - baseline_tag=args.baseline_tag, - worktree_ref=args.worktree_ref, + current_tag=request.current_tag, + baseline_tag=request.baseline_tag, + worktree_ref=request.worktree_ref, suite=args.suite, scope=args.scope, apply_current_diff=not args.no_apply_current_diff, @@ -448,8 +696,8 @@ def main(argv: list[str] | None = None) -> int: source=source, current=current, archive_dir=archive_dir, - expected_current_tag=args.current_tag, - expected_baseline_tag=args.baseline_tag, + expected_current_tag=request.current_tag, + expected_baseline_tag=request.baseline_tag, ) except Exception as exc: print(f"archive-performance: {exc}", file=sys.stderr) diff --git a/scripts/bench_compare.py b/scripts/bench_compare.py index 24dcea3..c843850 100644 --- a/scripts/bench_compare.py +++ b/scripts/bench_compare.py @@ -707,31 +707,21 @@ def _generate_markdown( [ "## How to Update", "", - "```bash", - "# Save a full 
last-release baseline", - "just bench-save-last", - "", - "# Run the cheaper latest measurements and compare against last", - "just bench-latest-vs-last", - "", - "# Re-render the report from existing Criterion output", - "just bench-compare", + "Release performance docs are generated in isolated temporary worktrees:", "", - "# Generate a snapshot without comparison", - "uv run bench-compare --snapshot", - "```", + "```bash", + "# Release PR: update docs/PERFORMANCE.md and archive the previous report", + "just performance-release ", "", - "To compare against a *previous* release, check out the old tag first:", + "# Historical published comparison", + "just performance-archive-published", "", - "```bash", - "git checkout v0.2.0", - "just bench-save-baseline v0.2.0", - "git checkout main", - "just bench-latest", - "just bench-compare v0.2.0", + "# Explicit historical repair", + "just performance-archive-published ", "```", "", - "Baselines persist in `target/criterion/` across checkouts (but not `cargo clean`).", + "For local scratch comparisons, use `just bench-latest` and `just bench-compare`.", + "Those write `target/bench-reports/performance.md`.", "", "See `docs/BENCHMARKING.md` for the full comparison workflow.", ] diff --git a/scripts/tests/test_archive_performance.py b/scripts/tests/test_archive_performance.py index 7e45298..ade4445 100644 --- a/scripts/tests/test_archive_performance.py +++ b/scripts/tests/test_archive_performance.py @@ -6,12 +6,23 @@ import subprocess import tarfile from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Any import pytest import archive_performance from archive_performance import GenerationConfig, generate_and_promote_worktree_report, main, normalize_tag, parse_report_id, promote_report +if TYPE_CHECKING: + from collections.abc import Sequence + +type RunnerCall = tuple[str, tuple[str, ...], Path | None] + + +def _result(stdout: str = "") -> SimpleNamespace: + return 
SimpleNamespace(stdout=stdout) + def _report(version: str, baseline: str) -> str: return ( @@ -30,6 +41,24 @@ def _report(version: str, baseline: str) -> str: ) +def _normalized_report(version: str, baseline: str) -> str: + return archive_performance._normalize_how_to_update(_report(version, baseline)) + + +def _legacy_report(version: str, baseline: str) -> str: + return ( + _report(version, baseline) + + "\n" + + "## How to Update\n\n" + + "```bash\n" + + "git checkout v0.2.0\n" + + "just bench-save-baseline v0.2.0\n" + + "git checkout main\n" + + "just bench-compare v0.2.0\n" + + "```\n" + ) + + def _write_baseline_archive(path: Path) -> None: fixture_dir = path.parent / "baseline-fixture" criterion_dir = fixture_dir / "criterion" @@ -78,7 +107,75 @@ def test_parse_report_id_reads_current_and_baseline_tags() -> None: assert report_id.archive_name == "v0.4.2-vs-v0.4.1.md" -def test_promote_report_archives_previous_and_updates_sorted_index(tmp_path) -> None: +def test_published_release_pair_discovers_latest_stable_semver_pair(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + assert command == "gh" + assert args == [ + "release", + "list", + "--json", + "tagName,isDraft,isPrerelease,publishedAt", + "--limit", + "100", + ] + assert cwd == tmp_path + return _result( + "[" + '{"tagName":"v0.4.2","isDraft":false,"isPrerelease":false,"publishedAt":"2026-01-01T00:00:00Z"},' + '{"tagName":"v0.4.10","isDraft":false,"isPrerelease":false,"publishedAt":"2026-04-01T00:00:00Z"},' + '{"tagName":"v0.4.11-rc.1","isDraft":false,"isPrerelease":true,"publishedAt":"2026-06-01T00:00:00Z"},' + '{"tagName":"v0.4.11","isDraft":true,"isPrerelease":false,"publishedAt":"2026-06-02T00:00:00Z"},' + '{"tagName":"not-semver","isDraft":false,"isPrerelease":false,"publishedAt":"2026-06-03T00:00:00Z"},' + 
'{"tagName":"v0.4.3","isDraft":false,"isPrerelease":false,"publishedAt":"2026-03-01T00:00:00Z"}' + "]" + ) + + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + report_id = archive_performance._published_release_pair(tmp_path) + + assert report_id.current_tag == "v0.4.10" + assert report_id.baseline_tag == "v0.4.3" + + +def test_published_release_pair_uses_latest_published_release_not_highest_semver(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + assert command == "gh" + assert cwd == tmp_path + return _result( + "[" + '{"tagName":"v0.5.0","isDraft":false,"isPrerelease":false,"publishedAt":"2026-01-01T00:00:00Z"},' + '{"tagName":"v0.4.9","isDraft":false,"isPrerelease":false,"publishedAt":"2026-02-01T00:00:00Z"},' + '{"tagName":"v0.4.8","isDraft":false,"isPrerelease":false,"publishedAt":"2025-12-01T00:00:00Z"}' + "]" + ) + + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + report_id = archive_performance._published_release_pair(tmp_path) + + assert report_id.current_tag == "v0.4.9" + assert report_id.baseline_tag == "v0.4.8" + + +def test_benchmark_env_uses_current_repo_toolchain(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("RUSTUP_TOOLCHAIN", raising=False) + (tmp_path / "rust-toolchain.toml").write_text('[toolchain]\nchannel = "1.96.0"\n', encoding="utf-8") + + env = archive_performance._benchmark_env(tmp_path) + + assert env is not None + assert env["RUSTUP_TOOLCHAIN"] == "1.96.0" + + +def test_benchmark_env_respects_existing_toolchain_override(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("RUSTUP_TOOLCHAIN", "nightly") + (tmp_path / "rust-toolchain.toml").write_text('[toolchain]\nchannel = "1.96.0"\n', encoding="utf-8") + + assert archive_performance._benchmark_env(tmp_path) is None + + +def 
test_promote_report_archives_previous_and_updates_sorted_index(tmp_path: Path) -> None: source = tmp_path / "target" / "bench-reports" / "performance.md" current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -99,8 +196,8 @@ def test_promote_report_archives_previous_and_updates_sorted_index(tmp_path) -> ) assert promoted.archive_name == "v0.4.2-vs-v0.4.1.md" - assert current.read_text(encoding="utf-8") == source.read_text(encoding="utf-8") - assert (archive_dir / "v0.4.1-vs-v0.4.0.md").read_text(encoding="utf-8") == _report("0.4.1", "v0.4.0") + assert current.read_text(encoding="utf-8") == _normalized_report("0.4.2", "v0.4.1") + assert (archive_dir / "v0.4.1-vs-v0.4.0.md").read_text(encoding="utf-8") == _normalized_report("0.4.1", "v0.4.0") assert (archive_dir / "README.md").read_text(encoding="utf-8") == ( "# Archived Performance Reports\n\n" "Older release-to-release benchmark comparisons are archived here.\n" @@ -110,7 +207,7 @@ def test_promote_report_archives_previous_and_updates_sorted_index(tmp_path) -> ) -def test_promote_report_is_idempotent_for_same_release_pair(tmp_path) -> None: +def test_promote_report_is_idempotent_for_same_release_pair(tmp_path: Path) -> None: source = tmp_path / "performance-new.md" current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -131,7 +228,30 @@ def test_promote_report_is_idempotent_for_same_release_pair(tmp_path) -> None: assert "- No archived performance reports yet." 
in (archive_dir / "README.md").read_text(encoding="utf-8") -def test_promote_report_rejects_unexpected_release_pair(tmp_path) -> None: +def test_promote_report_does_not_overwrite_existing_archive(tmp_path: Path) -> None: + source = tmp_path / "performance-new.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + archived = archive_dir / "v0.4.1-vs-v0.4.0.md" + + source.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") + current.parent.mkdir(parents=True) + current.write_text(_report("0.4.1", "v0.4.0"), encoding="utf-8") + archive_dir.mkdir(parents=True) + archived.write_text("already archived\n", encoding="utf-8") + + promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag="v0.4.2", + expected_baseline_tag="v0.4.1", + ) + + assert archived.read_text(encoding="utf-8") == "already archived\n" + + +def test_promote_report_rejects_unexpected_release_pair(tmp_path: Path) -> None: source = tmp_path / "performance-new.md" current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -147,7 +267,33 @@ def test_promote_report_rejects_unexpected_release_pair(tmp_path) -> None: ) -def test_main_promotes_generated_report_to_docs_performance(tmp_path, capsys) -> None: +def test_promote_report_rewrites_legacy_update_instructions(tmp_path: Path) -> None: + source = tmp_path / "performance-new.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + source.write_text(_legacy_report("0.4.3", "v0.4.2"), encoding="utf-8") + current.parent.mkdir(parents=True) + current.write_text(_legacy_report("0.4.2", "v0.4.1"), encoding="utf-8") + + promote_report( + source=source, + current=current, + archive_dir=archive_dir, + expected_current_tag="v0.4.3", + expected_baseline_tag="v0.4.2", + ) + + current_text = current.read_text(encoding="utf-8") + archived_text = 
(archive_dir / "v0.4.2-vs-v0.4.1.md").read_text(encoding="utf-8") + assert "just performance-release " in current_text + assert "just performance-archive-published" in current_text + assert "just performance-archive-published " in current_text + assert "git checkout" not in current_text + assert "just performance-release " in archived_text + assert "git checkout" not in archived_text + + +def test_main_promotes_generated_report_to_docs_performance(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: source = tmp_path / "target" / "bench-reports" / "performance.md" current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -172,12 +318,12 @@ def test_main_promotes_generated_report_to_docs_performance(tmp_path, capsys) -> ) assert rc == 0 - assert current.read_text(encoding="utf-8") == generated + assert current.read_text(encoding="utf-8") == archive_performance._normalize_how_to_update(generated) assert (archive_dir / "v0.4.2-vs-v0.4.1.md").exists() assert "Current performance report: v0.4.3 vs v0.4.2" in capsys.readouterr().out -def test_main_reports_release_pair_mismatch_to_stderr(tmp_path, capsys) -> None: +def test_main_reports_release_pair_mismatch_to_stderr(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: source = tmp_path / "target" / "bench-reports" / "performance.md" current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -203,24 +349,24 @@ def test_main_reports_release_pair_mismatch_to_stderr(tmp_path, capsys) -> None: assert not current.exists() -def test_main_generates_report_in_temp_worktree(tmp_path, monkeypatch, capsys) -> None: +def test_main_generates_report_in_temp_worktree(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" - calls: list[tuple[str, tuple[str, ...], Path | None]] 
= [] + calls: list[RunnerCall] = [] - def fake_run_git(args, cwd=None, **kwargs): + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git", tuple(args), cwd)) if args[:3] == ["worktree", "add", "--detach"]: worktree = Path(args[3]) worktree.mkdir(parents=True) _write_current_benchmark_tooling(worktree) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git-stdin", tuple(args), cwd)) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_safe(command, args, cwd=None, **kwargs): + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) @@ -228,7 +374,7 @@ def fake_run_safe(command, args, cwd=None, **kwargs): if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") - return type("Result", (), {"stdout": ""})() + return _result() monkeypatch.chdir(tmp_path) monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) @@ -256,7 +402,7 @@ def fake_run_safe(command, args, cwd=None, **kwargs): captured = capsys.readouterr() assert rc == 0 - assert current.read_text(encoding="utf-8") == _report("0.4.3", "v0.4.2") + assert current.read_text(encoding="utf-8") == _normalized_report("0.4.3", "v0.4.2") assert "Generated benchmark report in a temporary worktree" in captured.out assert "target/bench-reports/performance.md" not in captured.out assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.3" for kind, args, _ in calls) @@ -265,31 +411,33 @@ def fake_run_safe(command, args, 
cwd=None, **kwargs): assert not any(kind == "git-stdin" for kind, _, _ in calls) -def test_temp_worktree_is_removed_when_benchmark_command_fails(tmp_path, monkeypatch, capsys) -> None: +def test_temp_worktree_is_removed_when_benchmark_command_fails(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" - calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + calls: list[RunnerCall] = [] - def fake_run_git(args, cwd=None, **kwargs): + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git", tuple(args), cwd)) + if args[:2] == ["fetch", "origin"]: + return _result() if args[:3] == ["worktree", "add", "--detach"]: worktree = Path(args[3]) worktree.mkdir(parents=True) _write_current_benchmark_tooling(worktree) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git-stdin", tuple(args), cwd)) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_safe(command, args, cwd=None, **kwargs): + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") if command == "just" and args == ["bench-latest"]: raise subprocess.CalledProcessError(42, [command, *args], output="bench stdout", stderr="bench stderr") - return type("Result", (), {"stdout": ""})() + return _result() monkeypatch.chdir(tmp_path) monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) @@ 
-319,29 +467,29 @@ def fake_run_safe(command, args, cwd=None, **kwargs): assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in calls) -def test_generate_report_rejects_unsafe_baseline_archive(tmp_path, monkeypatch, capsys) -> None: +def test_generate_report_rejects_unsafe_baseline_archive(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" - calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + calls: list[RunnerCall] = [] - def fake_run_git(args, cwd=None, **kwargs): + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git", tuple(args), cwd)) if args[:3] == ["worktree", "add", "--detach"]: worktree = Path(args[3]) worktree.mkdir(parents=True) _write_current_benchmark_tooling(worktree) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git-stdin", tuple(args), cwd)) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_safe(command, args, cwd=None, **kwargs): + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) _write_unsafe_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") - return type("Result", (), {"stdout": ""})() + return _result() monkeypatch.chdir(tmp_path) monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) @@ -372,26 +520,39 @@ def fake_run_safe(command, args, cwd=None, **kwargs): assert any(kind == "git" and args[:3] == 
("worktree", "remove", "--force") for kind, args, _ in calls) -def test_generate_report_fails_when_release_baseline_asset_missing(tmp_path, monkeypatch, capsys) -> None: +def test_generate_report_falls_back_when_release_baseline_asset_is_missing( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" - calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + calls: list[RunnerCall] = [] - def fake_run_git(args, cwd=None, **kwargs): + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git", tuple(args), cwd)) if args[:3] == ["worktree", "add", "--detach"]: worktree = Path(args[3]) worktree.mkdir(parents=True) - _write_current_benchmark_tooling(worktree) - return type("Result", (), {"stdout": ""})() + if worktree.name == "baseline-worktree": + _write_legacy_benchmark_tooling(worktree) + else: + _write_current_benchmark_tooling(worktree) + return _result() - def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git-stdin", tuple(args), cwd)) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_safe(command, args, cwd=None, **kwargs): + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) - return type("Result", (), {"stdout": ""})() + if command == "just" and args == ["bench-save-baseline", "v0.4.2"]: + assert cwd is not None + criterion_dir = cwd / "target" / "criterion" + criterion_dir.mkdir(parents=True) + (criterion_dir / "baseline.txt").write_text("baseline\n", encoding="utf-8") + if command == "uv": + output = Path(args[args.index("--output") + 1]) + 
output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + return _result() monkeypatch.chdir(tmp_path) monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) @@ -413,15 +574,144 @@ def fake_run_safe(command, args, cwd=None, **kwargs): ] ) + captured = capsys.readouterr() + assert rc == 0 + assert "release baseline asset unavailable; generating v0.4.2 locally" in captured.err + assert current.read_text(encoding="utf-8") == _normalized_report("0.4.3", "v0.4.2") + assert any(kind == "just" and args == ("bench-save-baseline", "v0.4.2") for kind, args, _ in calls) + assert any(kind == "just" and args == ("bench-latest",) for kind, args, _ in calls) + assert any(kind == "uv" and "--suite" in args for kind, args, _ in calls) + assert sum(1 for kind, args, _ in calls if kind == "git" and args[:3] == ("worktree", "remove", "--force")) == 2 + + +def test_main_generates_latest_published_report_from_github_releases(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[RunnerCall] = [] + + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return _result() + + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append(("git-stdin", tuple(args), cwd)) + return _result() + + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append((command, tuple(args), cwd)) + if command == "gh" and args[:2] == ["release", "list"]: + return _result( + "[" + '{"tagName":"v0.4.2","isDraft":false,"isPrerelease":false,"publishedAt":"2026-01-01T00:00:00Z"},' 
+ '{"tagName":"v0.4.3","isDraft":false,"isPrerelease":false,"publishedAt":"2026-02-01T00:00:00Z"}' + "]" + ) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "uv": + output = Path(args[args.index("--output") + 1]) + output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + return _result() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "--current", + str(current), + "--archive-dir", + str(archive_dir), + "--published-latest", + "--generate-in-temp-worktree", + "--no-apply-current-diff", + ] + ) + + assert rc == 0 + assert current.read_text(encoding="utf-8") == _normalized_report("0.4.3", "v0.4.2") + assert any( + kind == "git" + and args + == ( + "fetch", + "origin", + "refs/tags/v0.4.3:refs/tags/v0.4.3", + "refs/tags/v0.4.2:refs/tags/v0.4.2", + ) + for kind, args, _ in calls + ) + fetch_index = next(index for index, (kind, args, _) in enumerate(calls) if kind == "git" and args[:2] == ("fetch", "origin")) + worktree_index = next(index for index, (kind, args, _) in enumerate(calls) if kind == "git" and args[:3] == ("worktree", "add", "--detach")) + assert fetch_index < worktree_index + assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.3" for kind, args, _ in calls) + + +def test_main_published_latest_fetch_failure_stops_before_worktree(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[RunnerCall] = [] + + def fake_run_git(args: Sequence[str], cwd: Path | None = None, 
**kwargs: Any) -> SimpleNamespace: + calls.append(("git", tuple(args), cwd)) + if args[:2] == ["fetch", "origin"]: + raise subprocess.CalledProcessError(128, ["git", *args], output="fetch stdout", stderr="missing tag") + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return _result() + + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append(("git-stdin", tuple(args), cwd)) + return _result() + + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append((command, tuple(args), cwd)) + if command == "gh" and args[:2] == ["release", "list"]: + return _result( + "[" + '{"tagName":"v0.4.2","isDraft":false,"isPrerelease":false,"publishedAt":"2026-01-01T00:00:00Z"},' + '{"tagName":"v0.4.3","isDraft":false,"isPrerelease":false,"publishedAt":"2026-02-01T00:00:00Z"}' + "]" + ) + return _result() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "--current", + str(current), + "--archive-dir", + str(archive_dir), + "--published-latest", + "--generate-in-temp-worktree", + "--no-apply-current-diff", + ] + ) + captured = capsys.readouterr() assert rc == 1 - assert "release baseline asset was not downloaded" in captured.err + assert "command failed (128): git fetch origin" in captured.err + assert "missing tag" in captured.err assert not current.exists() + assert not any(kind == "git" and args[:3] == ("worktree", "add", "--detach") for kind, args, _ in calls) assert not any(kind in {"just", "uv"} for kind, _, _ in calls) - assert any(kind == "git" and args[:3] == 
("worktree", "remove", "--force") for kind, args, _ in calls) + assert not any(kind == "git-stdin" for kind, _, _ in calls) -def test_failed_atomic_replace_preserves_existing_report(tmp_path, monkeypatch) -> None: +def test_failed_atomic_replace_preserves_existing_report(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: source = tmp_path / "performance-new.md" current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -431,7 +721,7 @@ def test_failed_atomic_replace_preserves_existing_report(tmp_path, monkeypatch) current.parent.mkdir(parents=True) current.write_text(original, encoding="utf-8") - def fail_replace(src, dst) -> None: + def fail_replace(src: Path, dst: Path) -> None: msg = f"simulated replace failure for {dst}" raise OSError(msg) @@ -450,29 +740,29 @@ def fail_replace(src, dst) -> None: assert not list(current.parent.glob(".PERFORMANCE.md.*.tmp")) -def test_generate_and_promote_uses_temp_worktree_and_current_diff(tmp_path, monkeypatch) -> None: +def test_generate_and_promote_uses_temp_worktree_and_current_diff(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" current.parent.mkdir(parents=True) current.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") - calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + calls: list[RunnerCall] = [] - def fake_run_git(args, cwd=None, **kwargs): + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git", tuple(args), cwd)) if args[:3] == ["worktree", "add", "--detach"]: worktree = Path(args[3]) worktree.mkdir(parents=True) _write_current_benchmark_tooling(worktree) if args == ["diff", "--binary", "HEAD"]: - return type("Result", (), {"stdout": "diff --git a/README.md b/README.md\n"})() - return type("Result", (), {"stdout": ""})() + return _result("diff --git a/README.md 
b/README.md\n") + return _result() - def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git-stdin", tuple(args), cwd)) assert "diff --git" in input_data - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_safe(command, args, cwd=None, **kwargs): + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) @@ -480,7 +770,7 @@ def fake_run_safe(command, args, cwd=None, **kwargs): if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") - return type("Result", (), {"stdout": ""})() + return _result() monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) @@ -499,7 +789,7 @@ def fake_run_safe(command, args, cwd=None, **kwargs): ) assert report_id.archive_name == "v0.4.3-vs-v0.4.2.md" - assert current.read_text(encoding="utf-8") == _report("0.4.3", "v0.4.2") + assert current.read_text(encoding="utf-8") == _normalized_report("0.4.3", "v0.4.2") assert (archive_dir / "v0.4.2-vs-v0.4.1.md").exists() assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "HEAD" for kind, args, _ in calls) assert any(kind == "git-stdin" and args == ("apply", "--binary") for kind, args, _ in calls) @@ -507,24 +797,24 @@ def fake_run_safe(command, args, cwd=None, **kwargs): assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in calls) -def test_generate_and_promote_legacy_published_tag_uses_legacy_commands(tmp_path, monkeypatch) -> None: +def 
test_generate_and_promote_legacy_published_tag_uses_legacy_commands(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" - calls: list[tuple[str, tuple[str, ...], Path | None]] = [] + calls: list[RunnerCall] = [] - def fake_run_git(args, cwd=None, **kwargs): + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git", tuple(args), cwd)) if args[:3] == ["worktree", "add", "--detach"]: worktree = Path(args[3]) worktree.mkdir(parents=True) _write_legacy_benchmark_tooling(worktree) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_git_with_input(args, input_data, cwd=None, **kwargs): + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append(("git-stdin", tuple(args), cwd)) - return type("Result", (), {"stdout": ""})() + return _result() - def fake_run_safe(command, args, cwd=None, **kwargs): + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) @@ -532,7 +822,7 @@ def fake_run_safe(command, args, cwd=None, **kwargs): if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") - return type("Result", (), {"stdout": ""})() + return _result() monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) @@ -551,7 +841,7 @@ def fake_run_safe(command, args, cwd=None, **kwargs): ) assert report_id.archive_name == "v0.4.2-vs-v0.4.1.md" - assert current.read_text(encoding="utf-8") == _report("0.4.2", "v0.4.1") + assert 
current.read_text(encoding="utf-8") == _normalized_report("0.4.2", "v0.4.1") assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.2" for kind, args, _ in calls) assert any(kind == "just" and args == ("bench-exact",) for kind, args, _ in calls) assert not any(kind == "just" and args == ("bench-latest",) for kind, args, _ in calls) diff --git a/scripts/tests/test_bench_compare.py b/scripts/tests/test_bench_compare.py index e0744a2..45c7004 100644 --- a/scripts/tests/test_bench_compare.py +++ b/scripts/tests/test_bench_compare.py @@ -375,7 +375,10 @@ def test_main_snapshot_writes_output(tmp_path: Path) -> None: assert "### D=2" in text assert "### Random percentile D=3" in text assert "### Near-singular 3x3" in text - assert "just bench-compare" in text + assert "just performance-release " in text + assert "just performance-archive-published" in text + assert "just performance-archive-published " in text + assert "git checkout" not in text def test_main_no_criterion_dir(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: From 7258525590f2ed68d41879e71c833010e408e7f7 Mon Sep 17 00:00:00 2001 From: Adam Getchell Date: Mon, 8 Jun 2026 16:06:27 -0700 Subject: [PATCH 3/4] feat(bench): split local and release performance comparisons - Add default performance-local and performance-release workflows that infer the relevant release tags and run in temporary worktrees. - Add a performance-github-assets workflow for comparing stored GitHub Actions release benchmark assets without local cargo runs. - Normalize release tags before fetching, downloading assets, or checking out detached worktrees. - Update performance docs, release guidance, and generated report instructions to use the new benchmark workflows. 
--- CONTRIBUTING.md | 14 + docs/BENCHMARKING.md | 42 +- docs/PERFORMANCE.md | 19 +- docs/RELEASING.md | 17 +- docs/archive/performance/v0.4.1-vs-v0.4.0.md | 19 +- justfile | 39 +- scripts/README.md | 21 +- scripts/archive_performance.py | 474 ++++++++++++++----- scripts/bench_compare.py | 19 +- scripts/tests/test_archive_performance.py | 213 ++++++++- scripts/tests/test_bench_compare.py | 5 +- 11 files changed, 683 insertions(+), 199 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 51dfa87..611d9fa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,6 +22,20 @@ to an explicit allowlist, and kept with readable version comments for review. CI runs `just ci` on Ubuntu, macOS, and Windows to keep platform coverage aligned with the local comprehensive validation path. +## Performance checks + +Performance-sensitive changes should compare the current tree against the +latest published release: + +```bash +just performance-local +``` + +This writes `target/bench-reports/performance.md` without changing committed +release docs. Regressions are worth treating as design feedback: if a slowdown +is intentional, document the correctness, API clarity, or composability benefit +that justifies it. + For coverage commands and report locations, see [`docs/COVERAGE.md`](docs/COVERAGE.md). For benchmark methodology, see [`docs/BENCHMARKING.md`](docs/BENCHMARKING.md). For the full set of developer commands, run `just --list`. diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md index af3bee7..db226d0 100644 --- a/docs/BENCHMARKING.md +++ b/docs/BENCHMARKING.md @@ -151,17 +151,18 @@ benchmarks on every iteration. 
### Workflow ```bash -# Latest published release vs previous stable release -just performance-archive-published +# Current in-tree code vs latest published release, all measured locally +just performance-local -# Explicit historical repair -just performance-archive-published v0.4.2 v0.4.1 +# Stored GitHub Actions release assets, no local cargo runs +just performance-github-assets ``` -These recipes create isolated temporary worktrees, restore release baselines -from GitHub Release assets when available, and fall back to generating a missing -baseline in a second temporary worktree. They do not require changing the current -checkout. +`performance-local` creates isolated temporary worktrees, generates the latest +published release baseline locally, then benchmarks the current in-tree code on +the same machine. It uses the current checkout's Rust toolchain for both sides +unless `RUSTUP_TOOLCHAIN` is already set. `performance-github-assets` compares +stored GitHub Actions release artifacts and does not run cargo locally. For local scratch comparisons, you can save multiple baselines and compare against any of them. If the release baseline is already present in @@ -183,26 +184,27 @@ matching `vs_linalg` peer exists. Release PRs promote one curated comparison into committed docs: ```bash -just performance-release v0.4.3 v0.4.2 +just performance-release ``` -This runs the release-signal benchmark set, renders the comparison into -an isolated temporary worktree, copies the finished report to -`docs/PERFORMANCE.md`, and archives the previous committed report under -`docs/archive/performance/`. Archive filenames are release-pair names such as -`v0.4.2-vs-v0.4.1.md`, so the directory and generated index stay -lexicographically sorted. 
+This infers the current release tag from `Cargo.toml`, discovers the previous +stable published release, generates both sides locally in temporary worktrees, +copies the finished report to `docs/PERFORMANCE.md`, and archives the previous +committed report under `docs/archive/performance/`. Archive filenames are +release-pair names such as `v0.4.2-vs-v0.4.1.md`, so the directory and generated +index stay lexicographically sorted. For explicit release repair, pass both +tags: `just performance-release v0.4.3 v0.4.2`. -To regenerate and archive the latest published release comparison without -touching the current checkout: +To compare the latest stored GitHub Actions release assets without touching the +current checkout: ```bash -just performance-archive-published +just performance-github-assets ``` The recipe discovers the latest stable published GitHub release and its previous stable release automatically. For explicit historical repair, pass both tags: -`just performance-archive-published v0.4.2 v0.4.1`. +`just performance-github-assets v0.4.2 v0.4.1`. For exact-arithmetic comparisons against v0.4.2 or older baselines, rows such as `det_exact_rounded_f64 (vs det_exact_f64)` mean the current rounded API is @@ -245,7 +247,7 @@ See `scripts/criterion_dim_plot.py --help` for options. At release time, save a local baseline so future work can compare against it: ```bash -just bench-save-baseline $TAG +just bench-save-baseline just bench-save-last ``` diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md index 0824ce0..961762b 100644 --- a/docs/PERFORMANCE.md +++ b/docs/PERFORMANCE.md @@ -95,20 +95,23 @@ Negative change = faster. Speedup > 1.00x = improvement. 
## How to Update -Release performance docs are generated in isolated temporary worktrees: +Local performance reports are generated in isolated temporary worktrees: ```bash +# Local development: compare the current tree with the latest release +just performance-local + # Release PR: update docs/PERFORMANCE.md and archive the previous report -just performance-release +just performance-release -# Historical published comparison -just performance-archive-published +# GitHub Actions release assets +just performance-github-assets -# Explicit historical repair -just performance-archive-published +# Explicit repair +just performance-release ``` -For local scratch comparisons, use `just bench-latest` and `just bench-compare`. -Those write `target/bench-reports/performance.md`. +`just performance-local` writes `target/bench-reports/performance.md`. +`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`. See `docs/BENCHMARKING.md` for the full comparison workflow. diff --git a/docs/RELEASING.md b/docs/RELEASING.md index b2ca91d..7a6aff0 100644 --- a/docs/RELEASING.md +++ b/docs/RELEASING.md @@ -104,20 +104,19 @@ accuracy. 5. Update the release performance comparison ```bash -# Runs the release-signal benchmark set in a temporary worktree, compares TAG -# against PREVIOUS_TAG, writes docs/PERFORMANCE.md, and archives the previous -# docs/PERFORMANCE.md under docs/archive/performance/. -just performance-release "$TAG" "$PREVIOUS_TAG" +# Infers TAG from Cargo.toml, compares it against the previous stable published +# release, writes docs/PERFORMANCE.md, and archives the previous docs/PERFORMANCE.md +# under docs/archive/performance/. +just performance-release ``` Review `docs/PERFORMANCE.md` for the latest release-to-release comparison. Older committed comparisons are archived under `docs/archive/performance/` with lexicographically sorted filenames such as `v0.4.2-vs-v0.4.1.md`. 
Iterative -local reports still live under `target/bench-reports/`. To regenerate the -latest published release comparison, use `just performance-archive-published`; -it discovers the latest stable published GitHub release and its previous stable -release automatically, then runs inside a temporary worktree. For explicit -historical repair, pass both tags. +local reports still live under `target/bench-reports/`. For an explicit release +repair, run `just performance-release `. To compare +the stored GitHub Actions release assets instead of running cargo locally, use +`just performance-github-assets`. 6. Save benchmark baselines for this release diff --git a/docs/archive/performance/v0.4.1-vs-v0.4.0.md b/docs/archive/performance/v0.4.1-vs-v0.4.0.md index 0f4f39b..fce355b 100644 --- a/docs/archive/performance/v0.4.1-vs-v0.4.0.md +++ b/docs/archive/performance/v0.4.1-vs-v0.4.0.md @@ -66,20 +66,23 @@ Negative change = faster. Speedup > 1.00x = improvement. ## How to Update -Release performance docs are generated in isolated temporary worktrees: +Local performance reports are generated in isolated temporary worktrees: ```bash +# Local development: compare the current tree with the latest release +just performance-local + # Release PR: update docs/PERFORMANCE.md and archive the previous report -just performance-release +just performance-release -# Historical published comparison -just performance-archive-published +# GitHub Actions release assets +just performance-github-assets -# Explicit historical repair -just performance-archive-published +# Explicit repair +just performance-release ``` -For local scratch comparisons, use `just bench-latest` and `just bench-compare`. -Those write `target/bench-reports/performance.md`. +`just performance-local` writes `target/bench-reports/performance.md`. +`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`. See `docs/BENCHMARKING.md` for the full comparison workflow. 
diff --git a/justfile b/justfile index 79aab56..7bbb61b 100644 --- a/justfile +++ b/justfile @@ -186,12 +186,12 @@ bench-compare baseline="last" suite="all" scope="release-signal": python-sync baseline="{{baseline}}" uv run bench-compare "$baseline" --suite "{{suite}}" --scope "{{scope}}" -# Generate release-signal measurements in a temp worktree, then promote/archive docs. -performance-release current_tag baseline_tag: python-sync - uv run archive-performance "{{current_tag}}" "{{baseline_tag}}" --generate-in-temp-worktree --worktree-ref HEAD +# Backward-compatible alias for the GitHub Actions release-asset comparison. +performance-archive-published current_tag="" baseline_tag="": + just performance-github-assets "{{current_tag}}" "{{baseline_tag}}" -# Generate a published-tag comparison in a temp worktree, then promote/archive docs. -performance-archive-published current_tag="" baseline_tag="": python-sync +# Compare stored GitHub Actions release benchmark assets without local cargo runs. 
+performance-github-assets current_tag="" baseline_tag="": python-sync #!/usr/bin/env bash set -euo pipefail current_tag="{{current_tag}}" @@ -201,9 +201,29 @@ performance-archive-published current_tag="" baseline_tag="": python-sync echo "current_tag and baseline_tag must be provided together" >&2 exit 2 fi - uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref "$current_tag" --no-apply-current-diff + uv run archive-performance "$current_tag" "$baseline_tag" --github-assets --generate-in-temp-worktree --worktree-ref "$current_tag" --output-only --output target/bench-reports/github-assets-performance.md else - uv run archive-performance --published-latest --generate-in-temp-worktree --no-apply-current-diff + uv run archive-performance --published-latest --github-assets --generate-in-temp-worktree --output-only --output target/bench-reports/github-assets-performance.md + fi + +# Compare the current tree against the latest published release locally. +performance-local: python-sync + uv run archive-performance --current-vs-latest --generate-in-temp-worktree --output-only --output target/bench-reports/performance.md + +# Generate local release-signal measurements in a temp worktree, then promote/archive docs. +performance-release current_tag="" baseline_tag="": python-sync + #!/usr/bin/env bash + set -euo pipefail + current_tag="{{current_tag}}" + baseline_tag="{{baseline_tag}}" + if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then + if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then + echo "current_tag and baseline_tag must be provided together" >&2 + exit 2 + fi + uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref HEAD + else + uv run archive-performance --infer-release --generate-in-temp-worktree --worktree-ref HEAD fi # Run the exact-arithmetic benchmark suite. 
@@ -410,8 +430,9 @@ help-workflows: @echo " just bench-compile # Compile benches with warnings-as-errors" @echo " just bench-latest # Run cheap latest measurements" @echo " just bench-latest-vs-last # Run latest and compare against last" - @echo " just performance-release # Promote release performance docs" - @echo " just performance-archive-published # Archive published release comparison" + @echo " just performance-github-assets # Compare stored GitHub Actions release assets" + @echo " just performance-local # Compare current tree against latest release locally" + @echo " just performance-release # Promote local release performance docs" @echo " just bench-save-last # Save full baseline as 'last'" @echo " just bench-vs-linalg # Run vs_linalg bench (optional filter)" @echo " just bench-vs-linalg-la-stack # Run la-stack rows from vs_linalg" diff --git a/scripts/README.md b/scripts/README.md index 189a723..c19e1ea 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -35,21 +35,28 @@ Use `uv run bench-compare --snapshot` for a no-baseline snapshot, or For release PRs, promote one curated release-to-release comparison into committed docs and archive the previous committed report. Benchmark generation -runs in a temporary worktree: +runs locally in temporary worktrees: ```bash -just performance-release v0.4.3 v0.4.2 +just performance-release ``` -To regenerate the latest published-tag comparison without changing the current -checkout: +For local development regression checks, compare the current in-tree code +against the latest published release: ```bash -just performance-archive-published +just performance-local ``` -For explicit historical repair, pass both tags: -`just performance-archive-published v0.4.2 v0.4.1`. +To compare stored GitHub Actions release benchmark assets without local cargo +runs: + +```bash +just performance-github-assets +``` + +For explicit release repair, pass both tags: +`just performance-release v0.4.3 v0.4.2`. 
### Plotting Criterion benchmarks (la-stack vs nalgebra/faer) diff --git a/scripts/archive_performance.py b/scripts/archive_performance.py index 50f67d4..db98879 100644 --- a/scripts/archive_performance.py +++ b/scripts/archive_performance.py @@ -28,7 +28,7 @@ from collections.abc import Mapping from dataclasses import dataclass from pathlib import Path -from typing import Any, cast +from typing import Any, Literal, cast from subprocess_utils import run_git_command, run_git_command_with_input, run_safe_command @@ -50,6 +50,7 @@ _BENCH_TIMEOUT_SECONDS = 7200 _COMMAND_TIMEOUT_SECONDS = 600 _HOW_TO_UPDATE_RE = re.compile(r"(?ms)^## How to Update\n.*\Z") +type BaselineSource = Literal["local", "github-assets"] @dataclass(frozen=True) @@ -76,6 +77,7 @@ class GenerationConfig: suite: str = _DEFAULT_SUITE scope: str = _DEFAULT_SCOPE apply_current_diff: bool = True + baseline_source: BaselineSource = "local" @dataclass(frozen=True) @@ -85,7 +87,38 @@ class ResolvedArchiveRequest: current_tag: str baseline_tag: str worktree_ref: str - fetch_tags: bool = False + tags_to_fetch: tuple[str, ...] 
= () + + +@dataclass(frozen=True) +class ArchiveRequestOptions: + """CLI options used to resolve release tags.""" + + current_tag: str | None + baseline_tag: str | None + published_latest: bool + infer_release: bool + current_vs_latest: bool + worktree_ref: str + repo_root: Path + + +@dataclass(frozen=True) +class ArchivePaths: + """Filesystem paths used by the archive CLI.""" + + source: Path + current: Path + output: Path + archive_dir: Path + + +@dataclass(frozen=True) +class ArchiveResult: + """Result and destination metadata for a completed archive operation.""" + + report_id: ReportId + action: Literal["output", "promote-generated", "promote-source"] @dataclass(frozen=True) @@ -162,7 +195,7 @@ def _stable_published_releases(releases: object) -> list[PublishedRelease]: return list(stable_releases.values()) -def _published_release_pair(repo_root: Path) -> ReportId: +def _github_release_list(repo_root: Path) -> object: command = [ "release", "list", @@ -181,25 +214,71 @@ def _published_release_pair(repo_root: Path) -> ReportId: except subprocess.CalledProcessError as exc: raise RuntimeError(_format_command_failure(["gh", *command], exc)) from exc try: - releases = json.loads(result.stdout) + return json.loads(result.stdout) except json.JSONDecodeError as exc: msg = "could not parse GitHub release list JSON" raise RuntimeError(msg) from exc - stable_releases = _stable_published_releases(releases) + + +def _published_stable_releases(repo_root: Path) -> list[PublishedRelease]: + return _stable_published_releases(_github_release_list(repo_root)) + + +def _latest_published_release(repo_root: Path) -> PublishedRelease: + stable_releases = _published_stable_releases(repo_root) + if not stable_releases: + msg = "expected at least one published stable semver release" + raise RuntimeError(msg) + return max(stable_releases, key=lambda release: release.published_at) + + +def _previous_release_from_list(stable_releases: list[PublishedRelease], current_tag: str) -> 
PublishedRelease: + current_key = _semver_sort_key(current_tag) + previous_releases = sorted( + (release for release in stable_releases if _semver_sort_key(release.tag) < current_key), + key=lambda release: _semver_sort_key(release.tag), + ) + if not previous_releases: + msg = f"could not find a previous stable semver release before {current_tag}" + raise RuntimeError(msg) + return previous_releases[-1] + + +def _previous_published_release(repo_root: Path, current_tag: str) -> PublishedRelease: + return _previous_release_from_list(_published_stable_releases(repo_root), current_tag) + + +def _normalize_worktree_ref_for_tag(worktree_ref: str, current_tag: str) -> str: + try: + normalized_ref = normalize_tag(worktree_ref) + except ValueError: + return worktree_ref + return current_tag if normalized_ref == current_tag else worktree_ref + + +def _current_package_tag(repo_root: Path) -> str: + cargo_toml = repo_root / "Cargo.toml" + data = tomllib.loads(_read_text(cargo_toml)) + package = data.get("package") + if not isinstance(package, dict): + msg = f"could not find [package] in {cargo_toml}" + raise TypeError(msg) + version = package.get("version") + if not isinstance(version, str): + msg = f"could not find package.version in {cargo_toml}" + raise TypeError(msg) + return normalize_tag(version) + + +def _published_release_pair(repo_root: Path) -> ReportId: + stable_releases = _published_stable_releases(repo_root) if len(stable_releases) < 2: msg = "expected at least two published stable semver releases" raise RuntimeError(msg) current = max(stable_releases, key=lambda release: release.published_at) - current_key = _semver_sort_key(current.tag) - previous_tags = sorted( - (release.tag for release in stable_releases if _semver_sort_key(release.tag) < current_key), - key=_semver_sort_key, - ) - if not previous_tags: - msg = f"could not find a previous stable semver release before {current.tag}" - raise RuntimeError(msg) - return ReportId(current_tag=current.tag, 
baseline_tag=previous_tags[-1]) + previous = _previous_release_from_list(stable_releases, current.tag) + return ReportId(current_tag=current.tag, baseline_tag=previous.tag) def _read_text(path: Path) -> str: @@ -210,21 +289,24 @@ def _how_to_update_section() -> str: lines = [ "## How to Update", "", - "Release performance docs are generated in isolated temporary worktrees:", + "Local performance reports are generated in isolated temporary worktrees:", "", "```bash", + "# Local development: compare the current tree with the latest release", + "just performance-local", + "", "# Release PR: update docs/PERFORMANCE.md and archive the previous report", - "just performance-release ", + "just performance-release", "", - "# Historical published comparison", - "just performance-archive-published", + "# GitHub Actions release assets", + "just performance-github-assets", "", - "# Explicit historical repair", - "just performance-archive-published ", + "# Explicit repair", + "just performance-release ", "```", "", - "For local scratch comparisons, use `just bench-latest` and `just bench-compare`.", - "Those write `target/bench-reports/performance.md`.", + "`just performance-local` writes `target/bench-reports/performance.md`.", + "`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.", "", "See `docs/BENCHMARKING.md` for the full comparison workflow.", "", @@ -314,8 +396,8 @@ def _run_tool(command: str, args: list[str], *, cwd: Path, timeout: int = _COMMA raise RuntimeError(_format_command_failure([command, *args], exc)) from exc -def _current_rust_toolchain(repo_root: Path) -> str | None: - rust_toolchain = repo_root / "rust-toolchain.toml" +def _current_rust_toolchain(checkout: Path) -> str | None: + rust_toolchain = checkout / "rust-toolchain.toml" if not rust_toolchain.exists(): return None data = tomllib.loads(_read_text(rust_toolchain)) @@ -326,10 +408,10 @@ def _current_rust_toolchain(repo_root: Path) -> str | None: return channel 
if isinstance(channel, str) else None -def _benchmark_env(repo_root: Path) -> dict[str, str] | None: +def _benchmark_env(checkout: Path) -> dict[str, str] | None: if "RUSTUP_TOOLCHAIN" in os.environ: return None - toolchain = _current_rust_toolchain(repo_root) + toolchain = _current_rust_toolchain(checkout) if toolchain is None: return None env = os.environ.copy() @@ -370,6 +452,21 @@ def _download_release_baseline(*, baseline_tag: str, download_dir: Path, repo_ro return artifact +def _copy_criterion_sample(*, criterion_dir: Path, source_sample: str, target_sample: str) -> None: + copied = 0 + for source in list(criterion_dir.rglob(source_sample)): + if not source.is_dir() or not (source / "estimates.json").exists(): + continue + target = source.parent / target_sample + if target.exists(): + shutil.rmtree(target) + shutil.copytree(source, target) + copied += 1 + if copied == 0: + msg = f"could not find Criterion sample {source_sample!r} under {criterion_dir}" + raise FileNotFoundError(msg) + + def _generate_release_baseline(*, baseline_tag: str, repo_root: Path, target_worktree: Path, tmp_dir: Path) -> None: baseline_worktree = tmp_dir / "baseline-worktree" _run_git(["worktree", "add", "--detach", str(baseline_worktree), baseline_tag], cwd=repo_root) @@ -389,23 +486,30 @@ def _generate_release_baseline(*, baseline_tag: str, repo_root: Path, target_wor print(f"archive-performance: failed to remove baseline worktree: {exc}", file=sys.stderr) -def _prepare_release_baseline(*, baseline_tag: str, repo_root: Path, target_worktree: Path, tmp_dir: Path) -> None: - try: - baseline_archive = _download_release_baseline( - baseline_tag=baseline_tag, - download_dir=tmp_dir, - repo_root=repo_root, - ) - except (FileNotFoundError, RuntimeError) as exc: - print(f"archive-performance: release baseline asset unavailable; generating {baseline_tag} locally ({exc})", file=sys.stderr) - _generate_release_baseline( - baseline_tag=baseline_tag, - repo_root=repo_root, - 
target_worktree=target_worktree, - tmp_dir=tmp_dir, - ) - else: - _safe_extract_tar(baseline_archive, target_worktree / "target") +def _prepare_local_release_baseline(*, baseline_tag: str, repo_root: Path, target_worktree: Path, tmp_dir: Path) -> None: + _generate_release_baseline( + baseline_tag=baseline_tag, + repo_root=repo_root, + target_worktree=target_worktree, + tmp_dir=tmp_dir, + ) + + +def _prepare_github_release_assets(*, current_tag: str, baseline_tag: str, repo_root: Path, target_worktree: Path, tmp_dir: Path) -> None: + baseline_archive = _download_release_baseline( + baseline_tag=baseline_tag, + download_dir=tmp_dir, + repo_root=repo_root, + ) + current_archive = _download_release_baseline( + baseline_tag=current_tag, + download_dir=tmp_dir, + repo_root=repo_root, + ) + target_dir = target_worktree / "target" + _safe_extract_tar(baseline_archive, target_dir) + _safe_extract_tar(current_archive, target_dir) + _copy_criterion_sample(criterion_dir=target_dir / "criterion", source_sample=current_tag, target_sample="new") def _apply_current_diff_to_worktree(*, repo_root: Path, worktree: Path) -> None: @@ -428,10 +532,8 @@ def _has_current_release_signal_tooling(worktree: Path) -> bool: return re.search(r"(?m)^bench-latest(?:[ :]|$)", justfile_text) is not None and '"--suite"' in bench_compare_text and '"--scope"' in bench_compare_text -def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: GenerationConfig) -> None: - benchmark_env = _benchmark_env(config.repo_root) +def _render_report(*, worktree: Path, report: Path, config: GenerationConfig) -> None: if _has_current_release_signal_tooling(worktree): - _run_tool("just", ["bench-latest"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS, env=benchmark_env) _run_tool( "uv", [ @@ -449,7 +551,6 @@ def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: G timeout=_COMMAND_TIMEOUT_SECONDS, ) else: - _run_tool("just", ["bench-exact"], cwd=worktree, 
timeout=_BENCH_TIMEOUT_SECONDS, env=benchmark_env) _run_tool( "uv", [ @@ -464,6 +565,15 @@ def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: G ) +def _run_benchmarks_and_render_report(*, worktree: Path, report: Path, config: GenerationConfig) -> None: + benchmark_env = _benchmark_env(config.repo_root) + if _has_current_release_signal_tooling(worktree): + _run_tool("just", ["bench-latest"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS, env=benchmark_env) + else: + _run_tool("just", ["bench-exact"], cwd=worktree, timeout=_BENCH_TIMEOUT_SECONDS, env=benchmark_env) + _render_report(worktree=worktree, report=report, config=config) + + def _generate_report_in_temp_worktree( *, config: GenerationConfig, @@ -477,13 +587,23 @@ def _generate_report_in_temp_worktree( try: if config.apply_current_diff: _apply_current_diff_to_worktree(repo_root=config.repo_root, worktree=worktree) - _prepare_release_baseline( - baseline_tag=config.baseline_tag, - repo_root=config.repo_root, - target_worktree=worktree, - tmp_dir=tmp_dir, - ) - _run_benchmarks_and_render_report(worktree=worktree, report=report, config=config) + if config.baseline_source == "github-assets": + _prepare_github_release_assets( + current_tag=config.current_tag, + baseline_tag=config.baseline_tag, + repo_root=config.repo_root, + target_worktree=worktree, + tmp_dir=tmp_dir, + ) + _render_report(worktree=worktree, report=report, config=config) + else: + _prepare_local_release_baseline( + baseline_tag=config.baseline_tag, + repo_root=config.repo_root, + target_worktree=worktree, + tmp_dir=tmp_dir, + ) + _run_benchmarks_and_render_report(worktree=worktree, report=report, config=config) return _read_text(report) finally: try: @@ -545,6 +665,7 @@ def generate_and_promote_worktree_report( suite=config.suite, scope=config.scope, apply_current_diff=config.apply_current_diff, + baseline_source=config.baseline_source, ) report_text = _generate_report_in_temp_worktree( config=config, @@ -565,15 +686,52 
@@ def generate_and_promote_worktree_report( source.unlink() -def resolve_archive_request( +def generate_worktree_report( *, - current_tag: str | None, - baseline_tag: str | None, - published_latest: bool, - worktree_ref: str, - repo_root: Path, -) -> ResolvedArchiveRequest: - """Resolve explicit or latest-published release arguments.""" + output: Path, + config: GenerationConfig, +) -> ReportId: + """Generate a comparison in a temp worktree and write it to *output*.""" + current_tag = normalize_tag(config.current_tag) + baseline_tag = normalize_tag(config.baseline_tag) + config = GenerationConfig( + repo_root=config.repo_root, + current_tag=current_tag, + baseline_tag=baseline_tag, + worktree_ref=config.worktree_ref, + suite=config.suite, + scope=config.scope, + apply_current_diff=config.apply_current_diff, + baseline_source=config.baseline_source, + ) + report_text = _normalize_how_to_update(_generate_report_in_temp_worktree(config=config)) + report_id = parse_report_id(report_text) + expected = ReportId(current_tag=current_tag, baseline_tag=baseline_tag) + if report_id != expected: + msg = ( + "benchmark report does not match requested release pair: " + f"found {report_id.current_tag} vs {report_id.baseline_tag}, " + f"expected {expected.current_tag} vs {expected.baseline_tag}" + ) + raise ValueError(msg) + _write_text(output, report_text) + return report_id + + +def resolve_archive_request(options: ArchiveRequestOptions) -> ResolvedArchiveRequest: + """Resolve explicit, package-inferred, or latest-published release arguments.""" + current_tag = options.current_tag + baseline_tag = options.baseline_tag + worktree_ref = options.worktree_ref + repo_root = options.repo_root + published_latest = options.published_latest + infer_release = options.infer_release + current_vs_latest = options.current_vs_latest + requested_modes = sum((published_latest, infer_release, current_vs_latest)) + if requested_modes > 1: + msg = "choose only one of --published-latest, 
--infer-release, or --current-vs-latest" + raise ValueError(msg) + if published_latest: if current_tag is not None or baseline_tag is not None: msg = "do not pass current_tag or baseline_tag with --published-latest" @@ -584,16 +742,45 @@ def resolve_archive_request( current_tag=published_pair.current_tag, baseline_tag=published_pair.baseline_tag, worktree_ref=resolved_worktree_ref, - fetch_tags=True, + tags_to_fetch=(published_pair.current_tag, published_pair.baseline_tag), + ) + + if infer_release: + if current_tag is not None or baseline_tag is not None: + msg = "do not pass current_tag or baseline_tag with --infer-release" + raise ValueError(msg) + inferred_current = _current_package_tag(repo_root) + inferred_baseline = _previous_published_release(repo_root, inferred_current).tag + return ResolvedArchiveRequest( + current_tag=inferred_current, + baseline_tag=inferred_baseline, + worktree_ref=worktree_ref, + tags_to_fetch=(inferred_baseline,), + ) + + if current_vs_latest: + if current_tag is not None or baseline_tag is not None: + msg = "do not pass current_tag or baseline_tag with --current-vs-latest" + raise ValueError(msg) + inferred_current = _current_package_tag(repo_root) + latest = _latest_published_release(repo_root).tag + return ResolvedArchiveRequest( + current_tag=inferred_current, + baseline_tag=latest, + worktree_ref=worktree_ref, + tags_to_fetch=(latest,), ) if current_tag is None or baseline_tag is None: - msg = "current_tag and baseline_tag are required unless --published-latest is used" + msg = "current_tag and baseline_tag are required unless an inference mode is used" raise ValueError(msg) + normalized_current = normalize_tag(current_tag) + normalized_baseline = normalize_tag(baseline_tag) return ResolvedArchiveRequest( - current_tag=current_tag, - baseline_tag=baseline_tag, - worktree_ref=worktree_ref, + current_tag=normalized_current, + baseline_tag=normalized_baseline, + worktree_ref=_normalize_worktree_ref_for_tag(worktree_ref, 
normalized_current), + tags_to_fetch=(normalized_baseline,), ) @@ -614,6 +801,11 @@ def build_parser() -> argparse.ArgumentParser: default=_DEFAULT_CURRENT, help=f"Committed performance report path (default: {_DEFAULT_CURRENT})", ) + parser.add_argument( + "--output", + default=_DEFAULT_SOURCE, + help=f"Generated report path for --output-only (default: {_DEFAULT_SOURCE})", + ) parser.add_argument( "--archive-dir", default=_DEFAULT_ARCHIVE_DIR, @@ -629,6 +821,26 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Infer the latest stable published GitHub release and its previous stable release.", ) + parser.add_argument( + "--infer-release", + action="store_true", + help="Infer current_tag from Cargo.toml and baseline_tag from the previous stable published release.", + ) + parser.add_argument( + "--current-vs-latest", + action="store_true", + help="Infer current_tag from Cargo.toml and baseline_tag from the latest stable published release.", + ) + parser.add_argument( + "--github-assets", + action="store_true", + help="Compare stored GitHub Release benchmark assets instead of generating the baseline locally.", + ) + parser.add_argument( + "--output-only", + action="store_true", + help="Write the generated report to --output without promoting docs/PERFORMANCE.md.", + ) parser.add_argument( "--worktree-ref", default="HEAD", @@ -652,63 +864,113 @@ def build_parser() -> argparse.ArgumentParser: return parser -def main(argv: list[str] | None = None) -> int: - """CLI entry point.""" - args = build_parser().parse_args(argv) - root = Path.cwd() +def _resolve_cli_paths(root: Path, args: argparse.Namespace) -> ArchivePaths: source = Path(args.source) current = Path(args.current) + output = Path(args.output) archive_dir = Path(args.archive_dir) if not source.is_absolute(): source = root / source if not current.is_absolute(): current = root / current + if not output.is_absolute(): + output = root / output if not archive_dir.is_absolute(): archive_dir = 
root / archive_dir + return ArchivePaths(source=source, current=current, output=output, archive_dir=archive_dir) + + +def _fetch_required_tags(*, request: ResolvedArchiveRequest, repo_root: Path, include_current: bool) -> None: + tags_to_fetch = request.tags_to_fetch + if include_current and request.current_tag not in tags_to_fetch: + tags_to_fetch = (*tags_to_fetch, request.current_tag) + if tags_to_fetch: + _fetch_release_tags(repo_root=repo_root, tags=list(dict.fromkeys(tags_to_fetch))) + + +def _generation_config(*, args: argparse.Namespace, request: ResolvedArchiveRequest, repo_root: Path) -> GenerationConfig: + return GenerationConfig( + repo_root=repo_root, + current_tag=request.current_tag, + baseline_tag=request.baseline_tag, + worktree_ref=request.worktree_ref, + suite=args.suite, + scope=args.scope, + apply_current_diff=not args.no_apply_current_diff and not args.github_assets, + baseline_source="github-assets" if args.github_assets else "local", + ) - try: - request = resolve_archive_request( - current_tag=args.current_tag, - baseline_tag=args.baseline_tag, - published_latest=args.published_latest, - worktree_ref=args.worktree_ref, - repo_root=root, - ) - if request.fetch_tags: - _fetch_release_tags(repo_root=root, tags=[request.current_tag, request.baseline_tag]) - - if args.generate_in_temp_worktree: - report_id = generate_and_promote_worktree_report( - current=current, - archive_dir=archive_dir, - config=GenerationConfig( - repo_root=root, - current_tag=request.current_tag, - baseline_tag=request.baseline_tag, - worktree_ref=request.worktree_ref, - suite=args.suite, - scope=args.scope, - apply_current_diff=not args.no_apply_current_diff, + +def _run_archive_request(*, args: argparse.Namespace, paths: ArchivePaths, request: ResolvedArchiveRequest, repo_root: Path) -> ArchiveResult: + if args.generate_in_temp_worktree: + _fetch_required_tags(request=request, repo_root=repo_root, include_current=args.github_assets) + config = 
_generation_config(args=args, request=request, repo_root=repo_root) + if args.output_only: + return ArchiveResult( + report_id=generate_worktree_report( + output=paths.output, + config=config, ), + action="output", ) - else: - report_id = promote_report( - source=source, - current=current, - archive_dir=archive_dir, - expected_current_tag=request.current_tag, - expected_baseline_tag=request.baseline_tag, + return ArchiveResult( + report_id=generate_and_promote_worktree_report( + current=paths.current, + archive_dir=paths.archive_dir, + config=config, + ), + action="promote-generated", + ) + + if args.output_only: + msg = "--output-only requires --generate-in-temp-worktree" + raise ValueError(msg) + if args.github_assets: + msg = "--github-assets requires --generate-in-temp-worktree" + raise ValueError(msg) + return ArchiveResult( + report_id=promote_report( + source=paths.source, + current=paths.current, + archive_dir=paths.archive_dir, + expected_current_tag=request.current_tag, + expected_baseline_tag=request.baseline_tag, + ), + action="promote-source", + ) + + +def main(argv: list[str] | None = None) -> int: + """CLI entry point.""" + args = build_parser().parse_args(argv) + root = Path.cwd() + paths = _resolve_cli_paths(root, args) + + try: + request = resolve_archive_request( + ArchiveRequestOptions( + current_tag=args.current_tag, + baseline_tag=args.baseline_tag, + published_latest=args.published_latest, + infer_release=args.infer_release, + current_vs_latest=args.current_vs_latest, + worktree_ref=args.worktree_ref, + repo_root=root, ) + ) + result = _run_archive_request(args=args, paths=paths, request=request, repo_root=root) except Exception as exc: print(f"archive-performance: {exc}", file=sys.stderr) return 1 - if args.generate_in_temp_worktree: - print(f"Generated benchmark report in a temporary worktree and promoted it to {current}") + if result.action == "output": + print(f"Generated benchmark report in a temporary worktree and wrote it to 
{paths.output}") + elif result.action == "promote-generated": + print(f"Generated benchmark report in a temporary worktree and promoted it to {paths.current}") else: - print(f"Promoted {source} to {current}") - print(f"Current performance report: {report_id.current_tag} vs {report_id.baseline_tag}") - print(f"Archive directory: {archive_dir}") + print(f"Promoted {paths.source} to {paths.current}") + print(f"Current performance report: {result.report_id.current_tag} vs {result.report_id.baseline_tag}") + print(f"Archive directory: {paths.archive_dir}") return 0 diff --git a/scripts/bench_compare.py b/scripts/bench_compare.py index c843850..df9b689 100644 --- a/scripts/bench_compare.py +++ b/scripts/bench_compare.py @@ -707,21 +707,24 @@ def _generate_markdown( [ "## How to Update", "", - "Release performance docs are generated in isolated temporary worktrees:", + "Local performance reports are generated in isolated temporary worktrees:", "", "```bash", + "# Local development: compare the current tree with the latest release", + "just performance-local", + "", "# Release PR: update docs/PERFORMANCE.md and archive the previous report", - "just performance-release ", + "just performance-release", "", - "# Historical published comparison", - "just performance-archive-published", + "# GitHub Actions release assets", + "just performance-github-assets", "", - "# Explicit historical repair", - "just performance-archive-published ", + "# Explicit repair", + "just performance-release ", "```", "", - "For local scratch comparisons, use `just bench-latest` and `just bench-compare`.", - "Those write `target/bench-reports/performance.md`.", + "`just performance-local` writes `target/bench-reports/performance.md`.", + "`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.", "", "See `docs/BENCHMARKING.md` for the full comparison workflow.", ] diff --git a/scripts/tests/test_archive_performance.py b/scripts/tests/test_archive_performance.py 
index ade4445..c9c7672 100644 --- a/scripts/tests/test_archive_performance.py +++ b/scripts/tests/test_archive_performance.py @@ -60,10 +60,14 @@ def _legacy_report(version: str, baseline: str) -> str: def _write_baseline_archive(path: Path) -> None: - fixture_dir = path.parent / "baseline-fixture" + tag = path.name.removeprefix("la-stack-").removesuffix("-criterion-baseline.tar.gz") + fixture_dir = path.parent / f"baseline-fixture-{tag}" criterion_dir = fixture_dir / "criterion" criterion_dir.mkdir(parents=True) (criterion_dir / "placeholder.txt").write_text("baseline\n", encoding="utf-8") + sample_dir = criterion_dir / "exact_d2" / "det_exact" / tag + sample_dir.mkdir(parents=True) + (sample_dir / "estimates.json").write_text('{"median":{"point_estimate":1.0}}\n', encoding="utf-8") with tarfile.open(path, "w:gz") as tar: tar.add(criterion_dir, arcname="criterion") @@ -158,6 +162,74 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** assert report_id.baseline_tag == "v0.4.8" +def test_resolve_archive_request_infer_release_uses_package_version_and_previous_release(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + (tmp_path / "Cargo.toml").write_text('[package]\nversion = "0.4.3"\n', encoding="utf-8") + + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + assert command == "gh" + assert args[:2] == ["release", "list"] + assert cwd == tmp_path + return _result( + "[" + '{"tagName":"v0.4.1","isDraft":false,"isPrerelease":false,"publishedAt":"2026-01-01T00:00:00Z"},' + '{"tagName":"v0.4.2","isDraft":false,"isPrerelease":false,"publishedAt":"2026-02-01T00:00:00Z"}' + "]" + ) + + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + request = archive_performance.resolve_archive_request( + archive_performance.ArchiveRequestOptions( + current_tag=None, + baseline_tag=None, + published_latest=False, + infer_release=True, + 
current_vs_latest=False, + worktree_ref="HEAD", + repo_root=tmp_path, + ) + ) + + assert request.current_tag == "v0.4.3" + assert request.baseline_tag == "v0.4.2" + assert request.worktree_ref == "HEAD" + assert request.tags_to_fetch == ("v0.4.2",) + + +def test_resolve_archive_request_current_vs_latest_uses_package_version_and_latest_release(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + (tmp_path / "Cargo.toml").write_text('[package]\nversion = "0.4.3"\n', encoding="utf-8") + + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + assert command == "gh" + assert args[:2] == ["release", "list"] + assert cwd == tmp_path + return _result( + "[" + '{"tagName":"v0.4.1","isDraft":false,"isPrerelease":false,"publishedAt":"2026-01-01T00:00:00Z"},' + '{"tagName":"v0.4.2","isDraft":false,"isPrerelease":false,"publishedAt":"2026-02-01T00:00:00Z"}' + "]" + ) + + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + request = archive_performance.resolve_archive_request( + archive_performance.ArchiveRequestOptions( + current_tag=None, + baseline_tag=None, + published_latest=False, + infer_release=False, + current_vs_latest=True, + worktree_ref="HEAD", + repo_root=tmp_path, + ) + ) + + assert request.current_tag == "v0.4.3" + assert request.baseline_tag == "v0.4.2" + assert request.worktree_ref == "HEAD" + assert request.tags_to_fetch == ("v0.4.2",) + + def test_benchmark_env_uses_current_repo_toolchain(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("RUSTUP_TOOLCHAIN", raising=False) (tmp_path / "rust-toolchain.toml").write_text('[toolchain]\nchannel = "1.96.0"\n', encoding="utf-8") @@ -285,11 +357,13 @@ def test_promote_report_rewrites_legacy_update_instructions(tmp_path: Path) -> N current_text = current.read_text(encoding="utf-8") archived_text = (archive_dir / "v0.4.2-vs-v0.4.1.md").read_text(encoding="utf-8") + assert "just performance-local" in 
current_text + assert "just performance-release" in current_text + assert "just performance-github-assets" in current_text assert "just performance-release " in current_text - assert "just performance-archive-published" in current_text - assert "just performance-archive-published " in current_text assert "git checkout" not in current_text - assert "just performance-release " in archived_text + assert "just performance-local" in archived_text + assert "just performance-github-assets" in archived_text assert "git checkout" not in archived_text @@ -368,9 +442,11 @@ def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | No def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) - if command == "gh": - download_dir = Path(args[args.index("--dir") + 1]) - _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "just" and args == ["bench-save-baseline", "v0.4.2"]: + assert cwd is not None + criterion_dir = cwd / "target" / "criterion" + criterion_dir.mkdir(parents=True) + (criterion_dir / "baseline.txt").write_text("baseline\n", encoding="utf-8") if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") @@ -432,9 +508,11 @@ def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | No def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) - if command == "gh": - download_dir = Path(args[args.index("--dir") + 1]) - _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "just" and args == ["bench-save-baseline", "v0.4.2"]: + assert cwd is not None + criterion_dir = cwd / "target" / "criterion" + criterion_dir.mkdir(parents=True) + (criterion_dir / 
"baseline.txt").write_text("baseline\n", encoding="utf-8") if command == "just" and args == ["bench-latest"]: raise subprocess.CalledProcessError(42, [command, *args], output="bench stdout", stderr="bench stderr") return _result() @@ -488,7 +566,8 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** calls.append((command, tuple(args), cwd)) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) - _write_unsafe_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + tag = args[2] + _write_unsafe_baseline_archive(download_dir / f"la-stack-{tag}-criterion-baseline.tar.gz") return _result() monkeypatch.chdir(tmp_path) @@ -508,6 +587,7 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** "--worktree-ref", "HEAD", "--no-apply-current-diff", + "--github-assets", ] ) @@ -520,9 +600,9 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** assert any(kind == "git" and args[:3] == ("worktree", "remove", "--force") for kind, args, _ in calls) -def test_generate_report_falls_back_when_release_baseline_asset_is_missing( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] -) -> None: +def test_generate_report_generates_release_baseline_locally(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + monkeypatch.delenv("RUSTUP_TOOLCHAIN", raising=False) + (tmp_path / "rust-toolchain.toml").write_text('[toolchain]\nchannel = "1.96.0"\n', encoding="utf-8") current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" calls: list[RunnerCall] = [] @@ -545,10 +625,13 @@ def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | No def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) if command == "just" and args == 
["bench-save-baseline", "v0.4.2"]: + assert kwargs["env"]["RUSTUP_TOOLCHAIN"] == "1.96.0" assert cwd is not None criterion_dir = cwd / "target" / "criterion" criterion_dir.mkdir(parents=True) (criterion_dir / "baseline.txt").write_text("baseline\n", encoding="utf-8") + if command == "just" and args == ["bench-latest"]: + assert kwargs["env"]["RUSTUP_TOOLCHAIN"] == "1.96.0" if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") @@ -576,8 +659,9 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** captured = capsys.readouterr() assert rc == 0 - assert "release baseline asset unavailable; generating v0.4.2 locally" in captured.err + assert captured.err == "" assert current.read_text(encoding="utf-8") == _normalized_report("0.4.3", "v0.4.2") + assert not any(kind == "gh" for kind, _, _ in calls) assert any(kind == "just" and args == ("bench-save-baseline", "v0.4.2") for kind, args, _ in calls) assert any(kind == "just" and args == ("bench-latest",) for kind, args, _ in calls) assert any(kind == "uv" and "--suite" in args for kind, args, _ in calls) @@ -612,7 +696,8 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** ) if command == "gh": download_dir = Path(args[args.index("--dir") + 1]) - _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + tag = args[2] + _write_baseline_archive(download_dir / f"la-stack-{tag}-criterion-baseline.tar.gz") if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") @@ -630,6 +715,7 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, ** "--archive-dir", str(archive_dir), "--published-latest", + "--github-assets", "--generate-in-temp-worktree", "--no-apply-current-diff", ] @@ -654,6 +740,80 @@ def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = 
None, ** assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.3" for kind, args, _ in calls) +def test_main_normalizes_explicit_bare_tags_before_fetching_and_checkout(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + output = tmp_path / "target" / "bench-reports" / "github-assets-performance.md" + current = tmp_path / "docs" / "PERFORMANCE.md" + archive_dir = tmp_path / "docs" / "archive" / "performance" + calls: list[RunnerCall] = [] + + def fake_run_git(args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append(("git", tuple(args), cwd)) + if args[:3] == ["worktree", "add", "--detach"]: + worktree = Path(args[3]) + worktree.mkdir(parents=True) + _write_current_benchmark_tooling(worktree) + return _result() + + def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append(("git-stdin", tuple(args), cwd)) + return _result() + + def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: + calls.append((command, tuple(args), cwd)) + if command == "gh": + download_dir = Path(args[args.index("--dir") + 1]) + tag = args[2] + _write_baseline_archive(download_dir / f"la-stack-{tag}-criterion-baseline.tar.gz") + if command == "uv": + report = Path(args[args.index("--output") + 1]) + report.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") + return _result() + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "run_git_command", fake_run_git) + monkeypatch.setattr(archive_performance, "run_git_command_with_input", fake_run_git_with_input) + monkeypatch.setattr(archive_performance, "run_safe_command", fake_run_safe) + + rc = main( + [ + "0.4.3", + "0.4.2", + "--current", + str(current), + "--archive-dir", + str(archive_dir), + "--github-assets", + "--generate-in-temp-worktree", + "--worktree-ref", + "0.4.3", + "--output-only", + 
"--output", + str(output), + ] + ) + + assert rc == 0 + assert output.read_text(encoding="utf-8") == _normalized_report("0.4.3", "v0.4.2") + assert not current.exists() + assert any( + kind == "git" + and args + == ( + "fetch", + "origin", + "refs/tags/v0.4.2:refs/tags/v0.4.2", + "refs/tags/v0.4.3:refs/tags/v0.4.3", + ) + for kind, args, _ in calls + ) + fetch_index = next(index for index, (kind, args, _) in enumerate(calls) if kind == "git" and args[:2] == ("fetch", "origin")) + worktree_index = next(index for index, (kind, args, _) in enumerate(calls) if kind == "git" and args[:3] == ("worktree", "add", "--detach")) + assert fetch_index < worktree_index + assert any(kind == "git" and args[:3] == ("worktree", "add", "--detach") and args[4] == "v0.4.3" for kind, args, _ in calls) + assert any(kind == "gh" and args[:3] == ("release", "download", "v0.4.2") for kind, args, _ in calls) + assert any(kind == "gh" and args[:3] == ("release", "download", "v0.4.3") for kind, args, _ in calls) + + def test_main_published_latest_fetch_failure_stops_before_worktree(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" @@ -741,6 +901,8 @@ def fail_replace(src: Path, dst: Path) -> None: def test_generate_and_promote_uses_temp_worktree_and_current_diff(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("RUSTUP_TOOLCHAIN", raising=False) + (tmp_path / "rust-toolchain.toml").write_text('[toolchain]\nchannel = "1.96.0"\n', encoding="utf-8") current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance" current.parent.mkdir(parents=True) @@ -764,9 +926,14 @@ def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | No def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: 
calls.append((command, tuple(args), cwd)) - if command == "gh": - download_dir = Path(args[args.index("--dir") + 1]) - _write_baseline_archive(download_dir / "la-stack-v0.4.2-criterion-baseline.tar.gz") + if command == "just" and args == ["bench-save-baseline", "v0.4.2"]: + assert kwargs["env"]["RUSTUP_TOOLCHAIN"] == "1.96.0" + assert cwd is not None + criterion_dir = cwd / "target" / "criterion" + criterion_dir.mkdir(parents=True) + (criterion_dir / "baseline.txt").write_text("baseline\n", encoding="utf-8") + if command == "just" and args == ["bench-latest"]: + assert kwargs["env"]["RUSTUP_TOOLCHAIN"] == "1.96.0" if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.3", "v0.4.2"), encoding="utf-8") @@ -816,9 +983,11 @@ def fake_run_git_with_input(args: Sequence[str], input_data: str, cwd: Path | No def fake_run_safe(command: str, args: Sequence[str], cwd: Path | None = None, **kwargs: Any) -> SimpleNamespace: calls.append((command, tuple(args), cwd)) - if command == "gh": - download_dir = Path(args[args.index("--dir") + 1]) - _write_baseline_archive(download_dir / "la-stack-v0.4.1-criterion-baseline.tar.gz") + if command == "just" and args == ["bench-save-baseline", "v0.4.1"]: + assert cwd is not None + criterion_dir = cwd / "target" / "criterion" + criterion_dir.mkdir(parents=True) + (criterion_dir / "baseline.txt").write_text("baseline\n", encoding="utf-8") if command == "uv": output = Path(args[args.index("--output") + 1]) output.write_text(_report("0.4.2", "v0.4.1"), encoding="utf-8") diff --git a/scripts/tests/test_bench_compare.py b/scripts/tests/test_bench_compare.py index 45c7004..04c59bc 100644 --- a/scripts/tests/test_bench_compare.py +++ b/scripts/tests/test_bench_compare.py @@ -375,9 +375,10 @@ def test_main_snapshot_writes_output(tmp_path: Path) -> None: assert "### D=2" in text assert "### Random percentile D=3" in text assert "### Near-singular 3x3" in text + assert "just performance-local" in text + 
assert "just performance-release" in text + assert "just performance-github-assets" in text assert "just performance-release " in text - assert "just performance-archive-published" in text - assert "just performance-archive-published " in text assert "git checkout" not in text From 7938386166f1f3f5cf594c5def67458d48e19a98 Mon Sep 17 00:00:00 2001 From: Adam Getchell Date: Mon, 8 Jun 2026 16:30:47 -0700 Subject: [PATCH 4/4] fix(bench): re-raise unexpected archive failures - Limit archive-performance CLI error handling to expected validation, filesystem, subprocess, and runtime failures. - Let unexpected exceptions propagate so benchmark archiving bugs surface during development. --- scripts/archive_performance.py | 4 +++- scripts/tests/test_archive_performance.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/scripts/archive_performance.py b/scripts/archive_performance.py index db98879..ac4dcf4 100644 --- a/scripts/archive_performance.py +++ b/scripts/archive_performance.py @@ -959,9 +959,11 @@ def main(argv: list[str] | None = None) -> int: ) ) result = _run_archive_request(args=args, paths=paths, request=request, repo_root=root) - except Exception as exc: + except (ValueError, RuntimeError, FileNotFoundError, subprocess.CalledProcessError) as exc: print(f"archive-performance: {exc}", file=sys.stderr) return 1 + except Exception: + raise if result.action == "output": print(f"Generated benchmark report in a temporary worktree and wrote it to {paths.output}") diff --git a/scripts/tests/test_archive_performance.py b/scripts/tests/test_archive_performance.py index c9c7672..41d5329 100644 --- a/scripts/tests/test_archive_performance.py +++ b/scripts/tests/test_archive_performance.py @@ -423,6 +423,18 @@ def test_main_reports_release_pair_mismatch_to_stderr(tmp_path: Path, capsys: py assert not current.exists() +def test_main_reraises_unexpected_errors(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + def fail_unexpected(*, 
args: object, paths: object, request: object, repo_root: Path) -> archive_performance.ArchiveResult: + msg = "unexpected test failure" + raise AssertionError(msg) + + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(archive_performance, "_run_archive_request", fail_unexpected) + + with pytest.raises(AssertionError, match="unexpected test failure"): + main(["v0.4.3", "v0.4.2"]) + + def test_main_generates_report_in_temp_worktree(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: current = tmp_path / "docs" / "PERFORMANCE.md" archive_dir = tmp_path / "docs" / "archive" / "performance"