From d886aa5f209c7573aa74b32fd05ee6789139f6e2 Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Thu, 28 May 2026 15:50:45 +0900 Subject: [PATCH] fix(compactor): await HeartBeat goroutine in blocks cleaner BlocksCleaner.cleanUpActiveUsers and cleanDeletedUsers each launched a VisitMarkerManager.HeartBeat goroutine and only signalled it via errChan <- nil before returning, never awaiting the goroutine. When HeartBeat received the nil it still ran a terminal MarkWithStatus( Completed) using the parent ctx followed by DeleteVisitMarker( context.Background()) against the bucket. In tests using a filesystem bucket, those writes raced with t.Cleanup's RemoveAll on the temp dir, producing the arm64-flaky failure: unlinkat .../visit-mark.json: directory not empty Wrap the heartbeat goroutine in a sync.WaitGroup so the per-user callback waits for HeartBeat to fully exit (including its terminal writes) before returning. This drains all bucket I/O before test teardown removes the directory, eliminating the race. Fixes #7564 Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sandy Chen --- CHANGELOG.md | 1 + pkg/compactor/blocks_cleaner.go | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f86f69e38..a5d3d9f813 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ * [BUGFIX] Compactor: Fix stale `cortex_bucket_index_last_successful_update_timestamp_seconds` metric not being cleaned up when tenant ownership changes due to ring rebalancing. This caused false alarms on bucket index update rate when a tenant moved between compactors. #7485 * [BUGFIX] Security: Fix stored XSS vulnerability in Alertmanager and Store Gateway status pages by replacing `text/template` with `html/template`. #7512 * [BUGFIX] Security: Limit decompressed gzip output in `ParseProtoReader` and OTLP ingestion path. The decompressed body is now capped by `-distributor.otlp-max-recv-msg-size`. #7515 +* [BUGFIX] Compactor: Wait for the blocks cleaner `VisitMarkerManager.HeartBeat` goroutine to exit before returning from per-user cleanup callbacks, preventing a late `MarkWithStatus(Completed)` and `DeleteVisitMarker` write from racing with test teardown and causing `unlinkat: directory not empty` flakes in `TestBlocksCleaner`. #7564 ## 1.21.0 2026-04-24 diff --git a/pkg/compactor/blocks_cleaner.go b/pkg/compactor/blocks_cleaner.go index e7633c9d67..508a34432e 100644 --- a/pkg/compactor/blocks_cleaner.go +++ b/pkg/compactor/blocks_cleaner.go @@ -357,9 +357,15 @@ func (c *BlocksCleaner) cleanUpActiveUsers(ctx context.Context, users []string, return nil } errChan := make(chan error, 1) - go visitMarkerManager.HeartBeat(ctx, errChan, c.cleanerVisitMarkerFileUpdateInterval, true) + var hbWG sync.WaitGroup + hbWG.Add(1) + go func() { + defer hbWG.Done() + visitMarkerManager.HeartBeat(ctx, errChan, c.cleanerVisitMarkerFileUpdateInterval, true) + }() defer func() { errChan <- nil + hbWG.Wait() }() return errors.Wrapf(c.cleanUser(ctx, userLogger, userBucket, userID, firstRun), "failed to delete blocks for user: %s", userID) }) @@ -392,9 +398,15 @@ func (c *BlocksCleaner) cleanDeletedUsers(ctx context.Context, users []string) e return nil } errChan := make(chan error, 1) - go visitMarkerManager.HeartBeat(ctx, errChan, c.cleanerVisitMarkerFileUpdateInterval, true) + var hbWG sync.WaitGroup + hbWG.Add(1) + go func() { + defer hbWG.Done() + visitMarkerManager.HeartBeat(ctx, errChan, c.cleanerVisitMarkerFileUpdateInterval, true) + }() defer func() { errChan <- nil + hbWG.Wait() }() return errors.Wrapf(c.deleteUserMarkedForDeletion(ctx, userLogger, userBucket, userID), "failed to delete user marked for deletion: %s", userID) })