From 74db3a0064d14e45a0bd9bbb7f67726b81a11ae3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 4 Jul 2026 08:38:37 -0400 Subject: [PATCH 1/3] ci: serialize uv installs against the shared node-local cache Self-hosted Frontier/Frontier-AMD matrix legs (acc/omp/cpu x shards) run their "Fetch Dependencies" step directly on the same login node as the same OS user, all pointed at the same UV_CACHE_DIR (introduced in #1385 to dodge NFS file-lock errors on ~/.cache/uv). uv's own cache lock guards individual entries, but concurrent installs from separate uv processes can still race while one extracts/prunes the shared archive-v0 store, leaving a corrupted entry behind (e.g. a missing dist-info METADATA file) that fails every subsequent install until the cache is cleared by hand -- as happened on PR #1414's Frontier gpu-acc [2/2] job. Serialize the actual `uv pip install` call with flock so only one process touches a given cache dir at a time, while keeping the cache itself shared and warm across runs. --- toolchain/bootstrap/python.sh | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/toolchain/bootstrap/python.sh b/toolchain/bootstrap/python.sh index 2809918b76..32a7e1e84c 100644 --- a/toolchain/bootstrap/python.sh +++ b/toolchain/bootstrap/python.sh @@ -188,11 +188,27 @@ if ! cmp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/pyproject.toml" > /dev/ if [ "${GITHUB_ACTIONS:-}" = "true" ] && [ -w "${TMPDIR:-/tmp}" ]; then export UV_CACHE_DIR="${TMPDIR:-/tmp}/uv-cache-${USER:-$(id -un)}" fi + + # Self-hosted HPC runners (Frontier, Phoenix) run several matrix legs + # (interfaces/shards) as the same OS user on the same login node at + # once, all sharing the UV_CACHE_DIR above. uv's own cache lock + # protects individual entries, but concurrent installs can still race + # while uv extracts/prunes the shared archive-v0 store, leaving a + # corrupted entry (e.g. 
a missing dist-info METADATA file) that fails + # every subsequent install until the cache is manually cleared. + # Serialize the install call itself so only one uv process touches a + # given cache dir at a time. + UV_INSTALL_LOCK="${TMPDIR:-/tmp}/mfc-uv-install-${USER:-$(id -un)}.lock" + if command -v flock > /dev/null 2>&1; then + uv_install() { flock "$UV_INSTALL_LOCK" uv pip install "$@"; } + else + uv_install() { uv pip install "$@"; } + fi log "(venv) Using$MAGENTA uv$COLOR_RESET for fast installation..." if [ "$verbose" = "1" ]; then # Verbose mode: show full uv output - if uv pip install "$(pwd)/toolchain"; then + if uv_install "$(pwd)/toolchain"; then ok "(venv) Installation succeeded." cp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/" else @@ -203,7 +219,7 @@ if ! cmp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/pyproject.toml" > /dev/ fi else # Default: show progress but filter out individual package lines (+ pkg==ver) - uv pip install "$(pwd)/toolchain" > "$PIP_LOG" 2>&1 + uv_install "$(pwd)/toolchain" > "$PIP_LOG" 2>&1 UV_EXIT=$? # Show filtered output (progress info without package list) # Filter out lines like " + pkg==1.0", " - pkg==1.0", " ~ pkg==1.0" From 3604852cf061a295d64ba906ae9c9399d066087c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 4 Jul 2026 08:48:47 -0400 Subject: [PATCH 2/3] ci: fall back to /tmp for the uv install lock when TMPDIR is unwritable Addresses Claude Code Review on #1630: - flock hard-fails (and skips running uv pip install entirely) if its lock file's directory doesn't exist or isn't writable. TMPDIR can be stale on HPC login nodes (e.g. left over from a prior job's since-deleted scratch dir), so an unconditional flock target risked breaking installs that worked fine before this lock existed. Guard with the same -w check already used for the UV_CACHE_DIR redirect, falling back to /tmp. 
- Clarified the comment: uv's cache is shared per-user by default (~/.cache/uv), not just in the CI node-local-redirect case, so the serialization also protects concurrent local builds, not only self-hosted CI matrix legs. --- toolchain/bootstrap/python.sh | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/toolchain/bootstrap/python.sh b/toolchain/bootstrap/python.sh index 32a7e1e84c..86318fb28b 100644 --- a/toolchain/bootstrap/python.sh +++ b/toolchain/bootstrap/python.sh @@ -189,16 +189,23 @@ if ! cmp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/pyproject.toml" > /dev/ export UV_CACHE_DIR="${TMPDIR:-/tmp}/uv-cache-${USER:-$(id -un)}" fi - # Self-hosted HPC runners (Frontier, Phoenix) run several matrix legs - # (interfaces/shards) as the same OS user on the same login node at - # once, all sharing the UV_CACHE_DIR above. uv's own cache lock - # protects individual entries, but concurrent installs can still race - # while uv extracts/prunes the shared archive-v0 store, leaving a - # corrupted entry (e.g. a missing dist-info METADATA file) that fails - # every subsequent install until the cache is manually cleared. - # Serialize the install call itself so only one uv process touches a - # given cache dir at a time. - UV_INSTALL_LOCK="${TMPDIR:-/tmp}/mfc-uv-install-${USER:-$(id -un)}.lock" + # uv's cache (~/.cache/uv by default, or the node-local redirect above + # in CI) is shared per-user across every repo/worktree/matrix leg. uv's + # own cache lock protects individual entries, but concurrent installs + # from separate uv processes can still race while one extracts/prunes + # the shared archive-v0 store, leaving a corrupted entry (e.g. a + # missing dist-info METADATA file) that fails every subsequent install + # until the cache is manually cleared -- both across self-hosted CI + # matrix legs (Frontier, Phoenix) sharing a login node, and across + # concurrent local builds by the same user. 
Serialize the install + # call itself so only one uv process touches a given cache dir at a + # time. Fall back to /tmp for the lock file itself if TMPDIR isn't + # writable (e.g. a stale TMPDIR left over from a prior job's + # since-deleted scratch dir), so a bad TMPDIR can't break installs + # that used to work fine before this lock existed. + UV_LOCK_DIR="${TMPDIR:-/tmp}" + [ -w "$UV_LOCK_DIR" ] || UV_LOCK_DIR=/tmp + UV_INSTALL_LOCK="${UV_LOCK_DIR}/mfc-uv-install-${USER:-$(id -un)}.lock" if command -v flock > /dev/null 2>&1; then uv_install() { flock "$UV_INSTALL_LOCK" uv pip install "$@"; } else From 79c69c334cd25937e4f8c0150b1d4b002e9d4324 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 4 Jul 2026 08:58:05 -0400 Subject: [PATCH 3/3] ci: add -d guard and self-heal retry for uv cache corruption Addresses Copilot review on #1630, plus a live recurrence caught on this PR's own CI run (job 85133634699, "Frontier (AMD) cpu [1/2]"): - UV_LOCK_DIR's guard only checked -w, so a writable non-directory TMPDIR would pass and then get used as a directory prefix, breaking flock. Add a -d check alongside -w, per Copilot's suggestion. - That same CI run hit a *new* corruption symptom ("The wheel is invalid: Missing .dist-info directory" for pandas) on the same physical login node (login05) as the original incident on #1414, even with the new lock in place. Root cause: a cache entry corrupted before the lock existed (or by any other cause) just fails forever until someone manually clears it -- which is exactly what had happened here; login05's ~1.2GiB cache had never actually been cleared (an earlier `uv cache clean` in this investigation was run from a different login node's session and never touched login05). Since self-hosted runners are spread across login nodes that we can't individually SSH into and inspect every time this happens, make the script self-heal instead: on install failure, clear the uv cache and retry once before giving up.
--- toolchain/bootstrap/python.sh | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/toolchain/bootstrap/python.sh b/toolchain/bootstrap/python.sh index 86318fb28b..8a5d283706 100644 --- a/toolchain/bootstrap/python.sh +++ b/toolchain/bootstrap/python.sh @@ -204,18 +204,33 @@ if ! cmp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/pyproject.toml" > /dev/ # since-deleted scratch dir), so a bad TMPDIR can't break installs # that used to work fine before this lock existed. UV_LOCK_DIR="${TMPDIR:-/tmp}" - [ -w "$UV_LOCK_DIR" ] || UV_LOCK_DIR=/tmp + [ -d "$UV_LOCK_DIR" ] && [ -w "$UV_LOCK_DIR" ] || UV_LOCK_DIR=/tmp UV_INSTALL_LOCK="${UV_LOCK_DIR}/mfc-uv-install-${USER:-$(id -un)}.lock" if command -v flock > /dev/null 2>&1; then uv_install() { flock "$UV_INSTALL_LOCK" uv pip install "$@"; } else uv_install() { uv pip install "$@"; } fi + + # A cache entry corrupted before this lock existed (or by any other + # cause) will otherwise fail every subsequent install until someone + # notices and clears the cache by hand -- which is exactly what + # happened here (a stale corrupted entry on a self-hosted runner kept + # failing across multiple PRs before anyone caught it). Self-heal: on + # the first failure, clear the cache and retry once before giving up. + uv_install_with_retry() { + if uv_install "$@"; then + return 0 + fi + warn "(venv) uv install failed; clearing the uv cache and retrying once, in case a corrupted cache entry is the cause..." + uv cache clean > /dev/null 2>&1 || true + uv_install "$@" + } log "(venv) Using$MAGENTA uv$COLOR_RESET for fast installation..." if [ "$verbose" = "1" ]; then # Verbose mode: show full uv output - if uv_install "$(pwd)/toolchain"; then + if uv_install_with_retry "$(pwd)/toolchain"; then ok "(venv) Installation succeeded." cp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/" else @@ -226,7 +241,7 @@ if ! 
cmp "$(pwd)/toolchain/pyproject.toml" "$(pwd)/build/pyproject.toml" > /dev/ fi else # Default: show progress but filter out individual package lines (+ pkg==ver) - uv_install "$(pwd)/toolchain" > "$PIP_LOG" 2>&1 + uv_install_with_retry "$(pwd)/toolchain" > "$PIP_LOG" 2>&1 UV_EXIT=$? # Show filtered output (progress info without package list) # Filter out lines like " + pkg==1.0", " - pkg==1.0", " ~ pkg==1.0"