From 1cd2da67f204c3e5311199a3d955217a356252b5 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 11:06:11 +0100 Subject: [PATCH 1/7] feat(redis-worker,webapp): mollifier buffer extensions + snapshot type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the buffer-side data layer used by phase-3 work: - buffer.ts gains entry inspection (getEntry), idempotency lookup (lookupIdempotency), in-place snapshot mutation (mutateSnapshot), and dwell tracking — all atomic via Lua. - snapshot.server.ts: shared MollifierSnapshot type + (de)serialise. - Drops the entry-TTL config — the drainer is the recovery mechanism. Adds methods to the buffer interface; nothing consumes them yet. Subsequent PRs in the stack wire trigger-time mollify, read-fallback, and mutation paths against this surface. Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/mollifier-buffer-extensions.md | 6 + apps/webapp/app/env.server.ts | 1 - .../v3/mollifier/mollifierBuffer.server.ts | 1 - .../v3/mollifier/mollifierSnapshot.server.ts | 16 + .../redis-worker/src/mollifier/buffer.test.ts | 1221 +++++++++++++++-- packages/redis-worker/src/mollifier/buffer.ts | 646 ++++++++- packages/redis-worker/src/mollifier/index.ts | 11 +- .../redis-worker/src/mollifier/schemas.ts | 22 + 8 files changed, 1786 insertions(+), 138 deletions(-) create mode 100644 .changeset/mollifier-buffer-extensions.md create mode 100644 apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md new file mode 100644 index 00000000000..b1f38f51ecc --- /dev/null +++ b/.changeset/mollifier-buffer-extensions.md @@ -0,0 +1,6 @@ +--- +"@trigger.dev/redis-worker": minor +"@trigger.dev/core": patch +--- + +Mollifier buffer feature set built on top of the initial primitives: idempotency-lookup with SETNX dedup, atomic snapshot-mutation API (`mutateSnapshot` with tag/metadata/delay/cancel patches), metadata CAS for lossless concurrent updates, watermark-paginated listing, claim primitives for pre-gate idempotency, ZSET-backed per-env queue, 30s post-ack grace TTL, and drop the accept-time entry TTL (drainer is now the only removal mechanism). `@trigger.dev/core` gains an optional `notice` field on the trigger response so the SDK can surface mollifier-queued guidance to customers. diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 97b8333e279..9e4743d740f 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1095,7 +1095,6 @@ const EnvironmentSchema = z TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().positive().default(100), TRIGGER_MOLLIFIER_HOLD_MS: z.coerce.number().int().positive().default(500), TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY: z.coerce.number().int().positive().default(50), - TRIGGER_MOLLIFIER_ENTRY_TTL_S: z.coerce.number().int().positive().default(600), TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3), TRIGGER_MOLLIFIER_DRAIN_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().positive().default(30_000), TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK: z.coerce.number().int().positive().default(500), diff --git a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts index 9c8917623e4..09b52aa9da3 100644 --- a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts @@ -22,7 +22,6 @@ function initializeMollifierBuffer(): MollifierBuffer { enableAutoPipelining: true, ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), }, - entryTtlSeconds: env.TRIGGER_MOLLIFIER_ENTRY_TTL_S, }); } diff --git a/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts new file mode 100644 index 00000000000..a0732a3542e --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts @@ -0,0 +1,16 @@ +import { serialiseSnapshot, deserialiseSnapshot } from "@trigger.dev/redis-worker"; + +// MollifierSnapshot is the JSON-serialisable shape of the input that would be +// passed to engine.trigger(). The drainer deserialises and replays it. +// Kept as Record at this layer — the engine.trigger call site +// casts it to the engine's typed input. This keeps the mollifier subdirectory +// from depending on @internal/run-engine internals. +export type MollifierSnapshot = Record; + +export function serialiseMollifierSnapshot(input: MollifierSnapshot): string { + return serialiseSnapshot(input); +} + +export function deserialiseMollifierSnapshot(serialised: string): MollifierSnapshot { + return deserialiseSnapshot(serialised); +} diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index c8f7b95c97a..a4c1be35eb3 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -20,12 +20,14 @@ describe("schemas", () => { status: "QUEUED", attempts: "0", createdAt: "2026-05-11T10:00:00.000Z", + createdAtMicros: "1747044000000000", }; const parsed = BufferEntrySchema.parse(raw); expect(parsed.runId).toBe("run_abc"); expect(parsed.status).toBe("QUEUED"); expect(parsed.attempts).toBe(0); expect(parsed.createdAt).toBeInstanceOf(Date); + expect(parsed.createdAtMicros).toBe(1747044000000000); }); it("BufferEntrySchema parses a FAILED entry with lastError", () => { @@ -37,6 +39,7 @@ describe("schemas", () => { status: "FAILED", attempts: "3", createdAt: "2026-05-11T10:00:00.000Z", + createdAtMicros: "1747044000000000", lastError: JSON.stringify({ code: "P2024", message: "connection lost" }), }; const parsed = BufferEntrySchema.parse(raw); @@ -52,7 +55,6 @@ describe("MollifierBuffer construction", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -68,7 +70,6 @@ describe("MollifierBuffer.accept", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -105,7 +106,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -132,7 +132,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -151,7 +150,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -169,24 +167,56 @@ describe("MollifierBuffer.pop", () => { }); describe("MollifierBuffer.ack", () => { - redisTest("ack deletes the entry", { timeout: 20_000 }, async ({ redisContainer }) => { + redisTest( + "ack marks entry materialised and applies the grace TTL — entry persists as a read-fallback safety net", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.pop("env_a"); + await buffer.ack("run_x"); + + const after = await buffer.getEntry("run_x"); + expect(after).not.toBeNull(); + expect(after!.materialised).toBe(true); + + // ack grace TTL is the only context where an entry hash gets + // an EXPIRE — accept no longer sets one. Should be at most 30s. + const ttl = await buffer.getEntryTtlSeconds("run_x"); + expect(ttl).toBeGreaterThan(0); + expect(ttl).toBeLessThanOrEqual(30); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("ack on missing entry is a no-op", { timeout: 20_000 }, async ({ redisContainer }) => { const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer.pop("env_a"); - await buffer.ack("run_x"); - - const after = await buffer.getEntry("run_x"); - expect(after).toBeNull(); + await buffer.ack("run_ghost"); + const stored = await buffer.getEntry("run_ghost"); + expect(stored).toBeNull(); + // Critical: no partial hash created. + const raw = await buffer["redis"].hgetall("mollifier:entries:run_ghost"); + expect(Object.keys(raw)).toHaveLength(0); } finally { await buffer.close(); } @@ -204,13 +234,12 @@ describe("MollifierBuffer.pop orphan handling", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { // Simulate a TTL-expired orphan: queue ref exists, entry hash does not. - await buffer["redis"].lpush("mollifier:queue:env_a", "run_orphan"); + await buffer["redis"].zadd("mollifier:queue:env_a", 1, "run_orphan"); const popped = await buffer.pop("env_a"); expect(popped).toBeNull(); @@ -220,7 +249,7 @@ describe("MollifierBuffer.pop orphan handling", () => { expect(Object.keys(raw)).toHaveLength(0); // Queue is drained — the loop pops orphans until empty. - const qLen = await buffer["redis"].llen("mollifier:queue:env_a"); + const qLen = await buffer["redis"].zcard("mollifier:queue:env_a"); expect(qLen).toBe(0); } finally { await buffer.close(); @@ -238,17 +267,16 @@ describe("MollifierBuffer.pop orphan handling", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - // Layout (oldest-first, since RPOP takes from tail): orphan, valid, orphan. - // LPUSH puts items at the head, so to get RPOP order [orphan_a, valid, orphan_b] - // we LPUSH in reverse: orphan_b first, then valid, then orphan_a. - await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b"); + // Layout by score (lowest-first, since ZPOPMIN takes the min): + // orphan_a (score 1) → valid (score = its createdAtMicros, large) → orphan_b (score 1e18). + // First pop skips orphan_a, returns valid; orphan_b remains. + await buffer["redis"].zadd("mollifier:queue:env_a", 1, "orphan_a"); await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_a"); + await buffer["redis"].zadd("mollifier:queue:env_a", 1e18, "orphan_b"); const popped = await buffer.pop("env_a"); expect(popped).not.toBeNull(); @@ -256,7 +284,7 @@ describe("MollifierBuffer.pop orphan handling", () => { expect(popped!.status).toBe("DRAINING"); // The trailing orphan_b is still in the queue (single pop call). - const remaining = await buffer["redis"].llen("mollifier:queue:env_a"); + const remaining = await buffer["redis"].zcard("mollifier:queue:env_a"); expect(remaining).toBe(1); // A second pop drains the trailing orphan_b. The queue is now @@ -283,7 +311,6 @@ describe("MollifierBuffer.requeue", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -305,30 +332,43 @@ describe("MollifierBuffer.requeue", () => { }); describe("MollifierBuffer.fail", () => { - redisTest("fail transitions to FAILED and stores lastError", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - entryTtlSeconds: 600, - logger: new Logger("test", "log"), - }); + redisTest( + "fail returns true and tears the entry down (drainer-terminal cleanup)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: the drainer's createFailedTaskRun has + // already written a SYSTEM_FAILURE PG row by the time we call + // fail(), so the entry hash is no longer load-bearing. fail + // returns true and removes the entry; without this teardown + // failed entries would accrete forever now that there's no + // accept-time TTL. The Lua also DELs the idempotency lookup so + // future retries with the same key go through to PG instead of + // hitting an orphan dedup record. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); - try { - await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer.pop("env_a"); - const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" }); - expect(failed).toBe(true); + try { + await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.pop("env_a"); + const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" }); + expect(failed).toBe(true); - const entry = await buffer.getEntry("run_f"); - expect(entry!.status).toBe("FAILED"); - expect(entry!.lastError).toEqual({ code: "VALIDATION", message: "boom" }); - } finally { - await buffer.close(); - } - }); + // Entry hash is gone post-fail. + const entry = await buffer.getEntry("run_f"); + expect(entry).toBeNull(); + const raw = await buffer["redis"].hgetall("mollifier:entries:run_f"); + expect(Object.keys(raw)).toHaveLength(0); + } finally { + await buffer.close(); + } + }, + ); redisTest( "fail on missing entry is a no-op (returns false; no partial hash created)", @@ -340,7 +380,6 @@ describe("MollifierBuffer.fail", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -361,27 +400,35 @@ describe("MollifierBuffer.fail", () => { }); describe("MollifierBuffer TTL", () => { - redisTest("entry has TTL applied on accept", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - entryTtlSeconds: 600, - logger: new Logger("test", "log"), - }); + redisTest( + "entry has NO TTL applied on accept — drainer is the only cleanup path", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Regression guard for the design change: buffer entries must + // persist until the drainer ACKs or FAILs them. An accept-time + // EXPIRE would re-introduce the silent-loss-when-drainer-offline + // failure mode that the stale-entry alerting pipeline depends on + // *not* happening. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); - try { - await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" }); + try { + await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" }); - const ttl = await buffer.getEntryTtlSeconds("run_t"); - expect(ttl).toBeGreaterThan(0); - expect(ttl).toBeLessThanOrEqual(600); - } finally { - await buffer.close(); - } - }); + // Redis returns -1 when the key exists but has no TTL set. + const ttl = await buffer.getEntryTtlSeconds("run_t"); + expect(ttl).toBe(-1); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer payload encoding", () => { @@ -395,7 +442,6 @@ describe("MollifierBuffer payload encoding", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -437,7 +483,6 @@ describe("MollifierBuffer.requeue on missing entry", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -458,22 +503,27 @@ describe("MollifierBuffer.requeue on missing entry", () => { describe("MollifierBuffer.requeue ordering", () => { redisTest( - "requeued entry is popped AFTER other queued entries on the same env (FIFO retry)", + "requeued entry retains its original createdAt and pops next (oldest-first by createdAt)", { timeout: 20_000 }, async ({ redisContainer }) => { + // Score == createdAtMicros; requeue does not bump the score. The + // oldest entry continues to pop first across retries. `maxAttempts` + // in the drainer bounds the retry loop for a persistently failing + // entry (after which it goes to the `fail` path, not requeue). const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { await buffer.accept({ runId: "a", envId: "env_a", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 2)); await buffer.accept({ runId: "b", envId: "env_a", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 2)); await buffer.accept({ runId: "c", envId: "env_a", orgId: "org_1", payload: "{}" }); const first = await buffer.pop("env_a"); @@ -481,12 +531,13 @@ describe("MollifierBuffer.requeue ordering", () => { await buffer.requeue("a"); + // a still has the smallest createdAtMicros → pops next. const next = await buffer.pop("env_a"); - expect(next!.runId).toBe("b"); + expect(next!.runId).toBe("a"); const after = await buffer.pop("env_a"); - expect(after!.runId).toBe("c"); + expect(after!.runId).toBe("b"); const last = await buffer.pop("env_a"); - expect(last!.runId).toBe("a"); + expect(last!.runId).toBe("c"); } finally { await buffer.close(); } @@ -508,7 +559,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -530,7 +580,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -557,7 +606,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -585,7 +633,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -610,7 +657,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -638,7 +684,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -671,7 +716,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -707,22 +751,21 @@ describe("MollifierBuffer entry lifecycle invariants", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { await buffer.accept({ runId: "run_ttl", envId: "env_a", orgId: "org_1", payload: "{}" }); const beforeTtl = await buffer.getEntryTtlSeconds("run_ttl"); - expect(beforeTtl).toBeGreaterThan(0); + expect(beforeTtl).toBe(-1); await buffer.pop("env_a"); const afterTtl = await buffer.getEntryTtlSeconds("run_ttl"); - // TTL must still be present (>0). Redis returns -1 if the key has no - // TTL — that's the leak shape we're guarding against. - expect(afterTtl).toBeGreaterThan(0); - expect(afterTtl).toBeLessThanOrEqual(beforeTtl); + // No TTL applied at any point during accept/pop — the entry + // persists until the drainer ACKs or FAILs. Returning -1 from + // Redis here is the expected steady state, not a leak. + expect(afterTtl).toBe(-1); } finally { await buffer.close(); } @@ -739,7 +782,6 @@ describe("MollifierBuffer entry lifecycle invariants", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -795,7 +837,6 @@ describe("MollifierBuffer.accept idempotency", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -813,8 +854,8 @@ describe("MollifierBuffer.accept idempotency", () => { payload: serialiseSnapshot({ first: false }), }); - expect(first).toBe(true); - expect(second).toBe(false); + expect(first).toEqual({ kind: "accepted" }); + expect(second).toEqual({ kind: "duplicate_run_id" }); // First payload preserved; second was a no-op. const stored = await buffer.getEntry("run_dup"); @@ -844,7 +885,6 @@ describe("MollifierBuffer.accept idempotency", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -855,7 +895,7 @@ describe("MollifierBuffer.accept idempotency", () => { expect(stored!.status).toBe("DRAINING"); const dup = await buffer.accept({ runId: "run_dr", envId: "env_a", orgId: "org_1", payload: "{}" }); - expect(dup).toBe(false); + expect(dup).toEqual({ kind: "duplicate_run_id" }); const afterDup = await buffer.getEntry("run_dr"); expect(afterDup!.status).toBe("DRAINING"); // unchanged @@ -866,16 +906,21 @@ describe("MollifierBuffer.accept idempotency", () => { ); redisTest( - "accept refused while existing entry is FAILED", + "runId slot is reclaimable after fail tears the entry down", { timeout: 20_000 }, async ({ redisContainer }) => { + // Post-TTL-drop design: fail() deletes the entry hash because + // the SYSTEM_FAILURE PG row is the canonical record of the + // failure. The runId slot is therefore free for a fresh accept + // afterwards — runIds are server-generated CUIDs and don't + // collide in practice, but the contract pinning here documents + // that a re-acceptance does NOT see a phantom "FAILED" entry. const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -883,15 +928,20 @@ describe("MollifierBuffer.accept idempotency", () => { await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" }); await buffer.pop("env_a"); await buffer.fail("run_fl", { code: "VALIDATION", message: "boom" }); - const stored = await buffer.getEntry("run_fl"); - expect(stored!.status).toBe("FAILED"); - const dup = await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" }); - expect(dup).toBe(false); + // Entry hash gone after fail (see "fail returns true and tears + // the entry down" — this test pins the accept-side effect). + expect(await buffer.getEntry("run_fl")).toBeNull(); - const afterDup = await buffer.getEntry("run_fl"); - expect(afterDup!.status).toBe("FAILED"); // unchanged - expect(afterDup!.lastError).toEqual({ code: "VALIDATION", message: "boom" }); + const fresh = await buffer.accept({ + runId: "run_fl", + envId: "env_a", + orgId: "org_1", + payload: '{"fresh":true}', + }); + expect(fresh).toEqual({ kind: "accepted" }); + const after = await buffer.getEntry("run_fl"); + expect(after?.status).toBe("QUEUED"); } finally { await buffer.close(); } @@ -899,16 +949,21 @@ describe("MollifierBuffer.accept idempotency", () => { ); redisTest( - "re-accept after ack works (terminal entry can be re-accepted)", + "accept refused while a previously-acked (materialised) entry is still inside its grace TTL", { timeout: 20_000 }, async ({ redisContainer }) => { + // After ack, the entry hash persists for the grace window as a + // read-fallback safety net (Q1 D2). RunIds are server-generated and + // never collide in practice, but defense-in-depth: accept refuses + // while *any* entry exists for the runId, including materialised + // ones. The entry hash's TTL is now ~30s instead of the original + // entryTtlSeconds. const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -922,7 +977,6 @@ describe("MollifierBuffer.accept idempotency", () => { await buffer.pop("env_a"); await buffer.ack("run_x"); - // Entry is gone — re-accept should succeed. const reAccept = await buffer.accept({ runId: "run_x", envId: "env_a", @@ -930,8 +984,11 @@ describe("MollifierBuffer.accept idempotency", () => { payload: "{}", }); - expect(first).toBe(true); - expect(reAccept).toBe(true); + expect(first).toEqual({ kind: "accepted" }); + expect(reAccept).toEqual({ kind: "duplicate_run_id" }); + + const stored = await buffer.getEntry("run_x"); + expect(stored!.materialised).toBe(true); } finally { await buffer.close(); } @@ -950,7 +1007,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -976,7 +1032,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -1006,7 +1061,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -1025,3 +1079,952 @@ describe("MollifierBuffer envs set lifecycle", () => { }, ); }); + +describe("MollifierBuffer idempotency lookup", () => { + redisTest( + "accept with idempotencyKey + taskIdentifier writes the lookup with no TTL", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: the idempotency lookup has no TTL, so it + // can never expire ahead of the entry hash (which used to cause + // a dedup-drift bug — once the lookup expired but the entry + // didn't, a retry with the same key would create a *new* + // buffered run for the same key). The drainer's ack and fail + // both DEL the lookup as part of teardown. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.accept({ + runId: "ri1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-1", + taskIdentifier: "my-task", + }); + expect(result).toEqual({ kind: "accepted" }); + + const lookupKey = "mollifier:idempotency:env_i:my-task:ikey-1"; + const stored = await buffer["redis"].get(lookupKey); + expect(stored).toBe("ri1"); + // -1 = key exists with no TTL set. + expect(await buffer["redis"].ttl(lookupKey)).toBe(-1); + + const entry = await buffer.getEntry("ri1"); + expect(entry!.idempotencyLookupKey).toBe(lookupKey); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "second accept with same (env, task, idempotencyKey) returns duplicate_idempotency with the winner's runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const first = await buffer.accept({ + runId: "ri-a", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-2", + taskIdentifier: "my-task", + }); + const second = await buffer.accept({ + runId: "ri-b", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-2", + taskIdentifier: "my-task", + }); + + expect(first).toEqual({ kind: "accepted" }); + expect(second).toEqual({ + kind: "duplicate_idempotency", + existingRunId: "ri-a", + }); + + // The loser's runId entry was never created. + const loserEntry = await buffer.getEntry("ri-b"); + expect(loserEntry).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency hits when the run is buffered", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rl1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "k1", + taskIdentifier: "t", + }); + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "k1", + }); + expect(found).toBe("rl1"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency returns null when no lookup is bound", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "absent", + }); + expect(found).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency self-heals when the lookup points at an expired entry", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Plant a stale lookup pointing at a non-existent entry. + const lookupKey = "mollifier:idempotency:env_i:t:stale"; + await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600); + expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale"); + + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "stale", + }); + expect(found).toBeNull(); + // Self-healed. + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "ack DELs the idempotency lookup along with marking materialised", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "ra1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ka", + taskIdentifier: "t", + }); + await buffer.pop("env_i"); + await buffer.ack("ra1"); + + const lookupKey = "mollifier:idempotency:env_i:t:ka"; + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + const entry = await buffer.getEntry("ra1"); + expect(entry!.materialised).toBe(true); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency clears snapshot fields + lookup; returns the runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rr1", + envId: "env_i", + orgId: "org_1", + payload: serialiseSnapshot({ + idempotencyKey: "kr", + idempotencyKeyExpiresAt: "2026-12-01T00:00:00Z", + other: "field", + }), + idempotencyKey: "kr", + taskIdentifier: "t", + }); + + const result = await buffer.resetIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "kr", + }); + expect(result.clearedRunId).toBe("rr1"); + + // Lookup is gone. + const lookupKey = "mollifier:idempotency:env_i:t:kr"; + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + + // Snapshot's idempotency fields are nulled, other fields kept. + const entry = await buffer.getEntry("rr1"); + const payload = JSON.parse(entry!.payload) as { + idempotencyKey: unknown; + idempotencyKeyExpiresAt: unknown; + other: string; + }; + expect(payload.idempotencyKey).toBeNull(); + expect(payload.idempotencyKeyExpiresAt).toBeNull(); + expect(payload.other).toBe("field"); + expect(entry!.idempotencyLookupKey).toBe(""); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency returns null when nothing is bound", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.resetIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "absent", + }); + expect(result.clearedRunId).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.casSetMetadata", () => { + redisTest( + "applies when expectedVersion matches; increments version; updates payload", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas1", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }), + }); + const result = await buffer.casSetMetadata({ + runId: "cas1", + expectedVersion: 0, + newMetadata: '{"v":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "applied", newVersion: 1 }); + + const entry = await buffer.getEntry("cas1"); + expect(entry!.metadataVersion).toBe(1); + const payload = JSON.parse(entry!.payload) as { metadata: string }; + expect(payload.metadata).toBe('{"v":2}'); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns version_conflict when expectedVersion is stale", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas2", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.casSetMetadata({ + runId: "cas2", + expectedVersion: 0, + newMetadata: '{"a":1}', + newMetadataType: "application/json", + }); + + // Second write with stale expectedVersion = 0 must conflict. + const result = await buffer.casSetMetadata({ + runId: "cas2", + expectedVersion: 0, + newMetadata: '{"a":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns not_found / busy on missing or terminal entries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const nf = await buffer.casSetMetadata({ + runId: "absent", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(nf).toEqual({ kind: "not_found" }); + + await buffer.accept({ + runId: "cas3", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.pop("env_c"); + const busy = await buffer.casSetMetadata({ + runId: "cas3", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(busy).toEqual({ kind: "busy" }); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.mutateSnapshot", () => { + redisTest( + "returns not_found when no entry exists for the runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.mutateSnapshot("nope", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("not_found"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags on QUEUED entry appends and dedupes", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r1", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: ["existing"] }), + }); + const first = await buffer.mutateSnapshot("r1", { + type: "append_tags", + tags: ["existing", "new"], + }); + expect(first).toBe("applied_to_snapshot"); + + const entry = await buffer.getEntry("r1"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags).toEqual(["existing", "new"]); + + // Second mutation appends without duplicating + const second = await buffer.mutateSnapshot("r1", { + type: "append_tags", + tags: ["new", "third"], + }); + expect(second).toBe("applied_to_snapshot"); + const e2 = await buffer.getEntry("r1"); + const p2 = JSON.parse(e2!.payload) as { tags: string[] }; + expect(p2.tags).toEqual(["existing", "new", "third"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags creates payload.tags when absent", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r2", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r2", { + type: "append_tags", + tags: ["a", "b"], + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r2"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags).toEqual(["a", "b"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "set_metadata replaces metadata + metadataType (last-write-wins)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r3", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }), + }); + const result = await buffer.mutateSnapshot("r3", { + type: "set_metadata", + metadata: '{"v":2}', + metadataType: "application/json", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r3"); + const payload = JSON.parse(entry!.payload) as { + metadata: string; + metadataType: string; + }; + expect(payload.metadata).toBe('{"v":2}'); + expect(payload.metadataType).toBe("application/json"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "set_delay sets payload.delayUntil", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r4", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r4", { + type: "set_delay", + delayUntil: "2026-06-01T00:00:00.000Z", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r4"); + const payload = JSON.parse(entry!.payload) as { delayUntil: string }; + expect(payload.delayUntil).toBe("2026-06-01T00:00:00.000Z"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "mark_cancelled stamps cancelledAt + cancelReason", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r5", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r5", { + type: "mark_cancelled", + cancelledAt: "2026-05-19T12:00:00.000Z", + cancelReason: "user-initiated", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r5"); + const payload = JSON.parse(entry!.payload) as { + cancelledAt: string; + cancelReason: string; + }; + expect(payload.cancelledAt).toBe("2026-05-19T12:00:00.000Z"); + expect(payload.cancelReason).toBe("user-initiated"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy when entry is DRAINING", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rd", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + const result = await buffer.mutateSnapshot("rd", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("busy"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns not_found when entry was FAILED (drainer-terminal teardown)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: fail() DELs the entry hash because the + // drainer has already written the canonical SYSTEM_FAILURE PG + // row, and without an accept-time TTL we'd otherwise accrete + // failed entries in Redis forever. Late mutations against a + // failed run therefore see `not_found`, matching the same shape + // they'd get for any other already-cleaned-up runId. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rf", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + await buffer.fail("rf", { code: "X", message: "boom" }); + const result = await buffer.mutateSnapshot("rf", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("not_found"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy when entry is materialised (post-ack grace window)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rm", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + await buffer.ack("rm"); + const result = await buffer.mutateSnapshot("rm", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("busy"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "Lua atomicity serialises concurrent mutations per-runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rcc", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + + const tagsToAdd = Array.from({ length: 50 }, (_, i) => `t${i}`); + await Promise.all( + tagsToAdd.map((t) => buffer.mutateSnapshot("rcc", { type: "append_tags", tags: [t] })), + ); + + const entry = await buffer.getEntry("rcc"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags.sort()).toEqual(tagsToAdd.sort()); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer ZSET storage", () => { + redisTest( + "queue key is a ZSET scored by entry's createdAtMicros", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" }); + + // ZSET-only commands must succeed against the queue key. + const card = await buffer["redis"].zcard("mollifier:queue:env_z"); + expect(card).toBe(1); + + const score = await buffer["redis"].zscore("mollifier:queue:env_z", "z1"); + expect(score).not.toBeNull(); + const scoreNum = Number(score); + expect(Number.isFinite(scoreNum)).toBe(true); + + // Score matches the entry hash's createdAtMicros field. + const micros = await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros"); + expect(micros).not.toBeNull(); + expect(Number(micros)).toBe(scoreNum); + + // Score is plausibly recent (within last minute as microseconds). + const nowMicros = Date.now() * 1000; + expect(scoreNum).toBeGreaterThan(nowMicros - 60_000_000); + expect(scoreNum).toBeLessThanOrEqual(nowMicros + 1_000_000); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "pop returns entries in ascending createdAtMicros order (FIFO by time, not by member)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + // Insert runIds in reverse-lex order to prove ordering is by score, not member. + await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 5)); + await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 5)); + await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" }); + + const first = await buffer.pop("env_o"); + expect(first!.runId).toBe("zzz"); + const second = await buffer.pop("env_o"); + expect(second!.runId).toBe("mmm"); + const third = await buffer.pop("env_o"); + expect(third!.runId).toBe("aaa"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "requeue keeps original score; createdAt is immutable across retries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" }); + const originalScore = Number( + await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"), + ); + const originalMicros = Number( + await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"), + ); + + await buffer.pop("env_rq"); + await new Promise((r) => setTimeout(r, 5)); + await buffer.requeue("rq"); + + const newScore = Number( + await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"), + ); + const newMicros = Number( + await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"), + ); + expect(newScore).toBe(originalScore); + expect(newMicros).toBe(originalMicros); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.listEntriesForEnv", () => { + redisTest( + "returns up to maxCount entries from the queue without consuming them", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "r1", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r2", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r3", envId: "env_a", orgId: "org_1", payload: "{}" }); + + const entries = await buffer.listEntriesForEnv("env_a", 2); + expect(entries).toHaveLength(2); + const runIds = entries.map((e) => e.runId); + expect(new Set(runIds).size).toBe(2); + for (const id of runIds) expect(["r1", "r2", "r3"]).toContain(id); + + // Non-destructive: the drainer can still pop all three. + const popped: string[] = []; + for (let i = 0; i < 3; i++) { + const entry = await buffer.pop("env_a"); + if (entry) popped.push(entry.runId); + } + expect(new Set(popped)).toEqual(new Set(["r1", "r2", "r3"])); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("returns empty array when env queue is empty", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + expect(await buffer.listEntriesForEnv("env_empty", 10)).toEqual([]); + } finally { + await buffer.close(); + } + }); + + redisTest("maxCount <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + expect(await buffer.listEntriesForEnv("env_a", 0)).toEqual([]); + expect(await buffer.listEntriesForEnv("env_a", -5)).toEqual([]); + } finally { + await buffer.close(); + } + }); +}); diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index f739e3ff362..fd53f59efea 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -10,17 +10,66 @@ import { BufferEntry, BufferEntrySchema } from "./schemas.js"; export type MollifierBufferOptions = { redisOptions: RedisOptions; - entryTtlSeconds: number; logger?: Logger; }; +// Grace TTL applied to the entry hash on drainer ack. The entry survives +// this long after materialisation so direct reads (retrieve, trace, etc.) +// have a safety net while PG replica lag settles. Q1 D2. +const ACK_GRACE_TTL_SECONDS = 30; + +export type SnapshotPatch = + | { type: "append_tags"; tags: string[] } + | { type: "set_metadata"; metadata: string; metadataType: string } + | { type: "set_delay"; delayUntil: string } + | { type: "mark_cancelled"; cancelledAt: string; cancelReason?: string }; + +export type MutateSnapshotResult = "applied_to_snapshot" | "not_found" | "busy"; + +export type CasSetMetadataResult = + | { kind: "applied"; newVersion: number } + | { kind: "version_conflict"; currentVersion: number } + | { kind: "not_found" } + | { kind: "busy" }; + +export type AcceptResult = + | { kind: "accepted" } + | { kind: "duplicate_run_id" } + | { kind: "duplicate_idempotency"; existingRunId: string }; + +export type IdempotencyLookupInput = { + envId: string; + taskIdentifier: string; + idempotencyKey: string; +}; + +function makeIdempotencyLookupKey(input: IdempotencyLookupInput): string { + return `mollifier:idempotency:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`; +} + +// Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the +// existing B6a buffer-side dedup stays isolated. The claim is the +// authoritative cross-store "this idempotency key is in flight or +// resolved" pointer used by the trigger hot path +// (`_plans/2026-05-21-mollifier-idempotency-claim.md`). Values: +// "pending" → a trigger pipeline owns the key and hasn't published yet +// → the winning trigger's runId (resolved) +export const IDEMPOTENCY_CLAIM_PENDING = "pending"; + +function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string { + return `mollifier:claim:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`; +} + +export type IdempotencyClaimResult = + | { kind: "claimed" } + | { kind: "pending" } + | { kind: "resolved"; runId: string }; + export class MollifierBuffer { private readonly redis: Redis; - private readonly entryTtlSeconds: number; private readonly logger: Logger; constructor(options: MollifierBufferOptions) { - this.entryTtlSeconds = options.entryTtlSeconds; this.logger = options.logger ?? new Logger("MollifierBuffer", "debug"); this.redis = createRedisClient( @@ -41,19 +90,47 @@ export class MollifierBuffer { this.#registerCommands(); } - // Returns true if the entry was newly written; false if a duplicate runId - // was already buffered (idempotent no-op). Callers can use the boolean to - // record a duplicate-accept metric without affecting buffer state. + // Three outcomes: + // - { kind: "accepted" } — entry was newly written. + // - { kind: "duplicate_run_id" } — runId was already buffered (idempotent + // no-op, same semantic as the previous boolean-false return). + // - { kind: "duplicate_idempotency", existingRunId } — the (env, task, + // idempotencyKey) tuple was already bound to another buffered run. + // The Lua's atomic SETNX is the race-winner; the second caller gets + // the winner's runId so it can return that as the trigger response. async accept(input: { runId: string; envId: string; orgId: string; payload: string; - }): Promise { + // Optional idempotency-key triple. When all three are present we + // SETNX a Redis lookup at `mollifier:idempotency:{env}:{task}:{key}` + // pointing at the runId so trigger-time dedup during the buffered + // window resolves the same way PG's unique constraint resolves it + // post-materialisation (Q5). + idempotencyKey?: string; + taskIdentifier?: string; + }): Promise { const entryKey = `mollifier:entries:${input.runId}`; const queueKey = `mollifier:queue:${input.envId}`; const orgsKey = "mollifier:orgs"; - const createdAt = new Date().toISOString(); + const nowMs = Date.now(); + const createdAt = new Date(nowMs).toISOString(); + // Microsecond epoch. JS only has millisecond precision, so multiple + // accepts in the same ms share a score; ZSET ties resolve by member + // (runId) lex order, which is deterministic and acceptable for FIFO + // pop. The hash carries the same value as `createdAtMicros` so the + // listing helper (Phase E) can read a stable per-run timestamp + // without re-fetching the score. + const createdAtMicros = nowMs * 1000; + const idempotencyLookupKey = + input.idempotencyKey && input.taskIdentifier + ? makeIdempotencyLookupKey({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + }) + : ""; const result = await this.redis.acceptMollifierEntry( entryKey, queueKey, @@ -63,10 +140,17 @@ export class MollifierBuffer { input.orgId, input.payload, createdAt, - String(this.entryTtlSeconds), + String(createdAtMicros), "mollifier:org-envs:", + idempotencyLookupKey, ); - return result === 1; + // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId + // (duplicate idempotency — value is the existing winner's runId). + if (typeof result === "string" && result.length > 0) { + return { kind: "duplicate_idempotency", existingRunId: result }; + } + if (result === 1) return { kind: "accepted" }; + return { kind: "duplicate_run_id" }; } async pop(envId: string): Promise { @@ -128,8 +212,247 @@ export class MollifierBuffer { return this.redis.smembers(`mollifier:org-envs:${orgId}`); } + // Paginated read of currently-queued entries newest-first, bounded by + // an optional `(createdAtMicros, runId)` watermark. Q1 listing design. + // Returns hydrated `BufferEntry` rows up to `pageSize`. Skips orphans + // (queue ref without an entry hash) silently. Non-destructive — the + // drainer keeps popping these entries in createdAt order regardless. + async listForEnvWithWatermark(input: { + envId: string; + watermark?: { createdAtMicros: number; runId: string }; + pageSize: number; + }): Promise { + if (input.pageSize <= 0) return []; + const queueKey = `mollifier:queue:${input.envId}`; + + let runIds: string[]; + if (!input.watermark) { + // Page 1 — newest first. + runIds = await this.redis.zrevrangebyscore( + queueKey, + "+inf", + "-inf", + "LIMIT", + 0, + input.pageSize, + ); + } else { + // Page N — strictly below the watermark score. + const belowScore = await this.redis.zrevrangebyscore( + queueKey, + `(${input.watermark.createdAtMicros}`, + "-inf", + "LIMIT", + 0, + input.pageSize, + ); + runIds = belowScore; + // Tied-score scan: ZSET ties broken by member-DESC, so entries + // sharing the watermark score with a lex-smaller runId still + // need to surface. Cheap second range over the tied band. + if (belowScore.length < input.pageSize) { + const remaining = input.pageSize - belowScore.length; + const tied = await this.redis.zrangebyscore( + queueKey, + input.watermark.createdAtMicros, + input.watermark.createdAtMicros, + ); + // Filter to runIds lex-less than the watermark anchor, sort + // member-DESC, take `remaining`. + const tiedFiltered = tied + .filter((r) => r < input.watermark!.runId) + .sort((a, b) => (a < b ? 1 : a > b ? -1 : 0)) + .slice(0, remaining); + runIds = [...belowScore, ...tiedFiltered]; + } + } + + if (runIds.length === 0) return []; + + // Parallel HGETALL — one round-trip per entry, all in flight. + const fetched = await Promise.all( + runIds.map((runId) => this.redis.hgetall(`mollifier:entries:${runId}`)), + ); + const entries: BufferEntry[] = []; + for (const value of fetched) { + if (!value || Object.keys(value).length === 0) continue; + const parsed = BufferEntrySchema.safeParse(value); + if (parsed.success) entries.push(parsed.data); + } + return entries; + } + + // Read-only listing of currently-queued entries for a single env. Used by + // the dashboard's "Recently queued" surface — non-destructive, so the + // drainer still pops these entries in order. Returns up to `maxCount` + // entries newest-first (highest score, which is `createdAtMicros`). + // Each entry hash is fetched separately; a `null` from getEntry (TTL + // expired between ZREVRANGE and HGETALL) is skipped. + async listEntriesForEnv(envId: string, maxCount: number): Promise { + if (maxCount <= 0) return []; + const runIds = await this.redis.zrevrange( + `mollifier:queue:${envId}`, + 0, + maxCount - 1, + ); + const entries: BufferEntry[] = []; + for (const runId of runIds) { + const entry = await this.getEntry(runId); + if (entry) entries.push(entry); + } + return entries; + } + + // Atomic snapshot mutation. Used by customer-mutation API endpoints + // (tags, metadata-put, reschedule, cancel) when the run is still in + // the buffer. Three outcomes: + // - "applied_to_snapshot": entry was QUEUED + not materialised; the + // drainer will read the patched payload on its next pop. + // - "not_found": no entry hash exists for this runId. + // - "busy": entry is DRAINING / FAILED / materialised. The API + // wait-and-bounces through PG (Q3 design). + async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise { + const result = (await this.redis.mutateMollifierSnapshot( + `mollifier:entries:${runId}`, + JSON.stringify(patch), + )) as string; + if ( + result === "applied_to_snapshot" || + result === "not_found" || + result === "busy" + ) { + return result; + } + throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`); + } + + // Optimistic compare-and-swap on the snapshot's metadata. Caller reads + // the current metadataVersion via getEntry, applies operations in JS via + // `applyMetadataOperations`, then calls this with the new metadata + the + // expected version. Lua refuses if the version has moved (caller retries + // up to N times). Mirrors the PG-side `UpdateMetadataService` retry + // loop so concurrent increment/append operations don't lose deltas. + async casSetMetadata(input: { + runId: string; + expectedVersion: number; + newMetadata: string; + newMetadataType: string; + }): Promise { + const entryKey = `mollifier:entries:${input.runId}`; + const raw = (await this.redis.casSetMollifierMetadata( + entryKey, + String(input.expectedVersion), + input.newMetadata, + input.newMetadataType, + )) as string; + if (raw === "not_found") return { kind: "not_found" }; + if (raw === "busy") return { kind: "busy" }; + if (raw.startsWith("conflict:")) { + return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) }; + } + if (raw.startsWith("applied:")) { + return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) }; + } + throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`); + } + + // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One + // call across both PG and buffer paths serialises through this claim; + // closes the race the buffer-side B6a SETNX leaves open during the + // gate-transition burst window (see + // `_plans/2026-05-21-mollifier-idempotency-claim.md`). + // + // - "claimed": we now own the claim, the caller proceeds with the + // trigger pipeline and must `publishClaim` on success or + // `releaseClaim` on failure. + // - "pending": another trigger owns the claim and hasn't published + // yet; the caller should poll. + // - "resolved": the claim already holds a runId; the caller can + // return that runId as a cached hit. + async claimIdempotency( + input: IdempotencyLookupInput & { ttlSeconds: number }, + ): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const raw = (await this.redis.claimMollifierIdempotency( + claimKey, + IDEMPOTENCY_CLAIM_PENDING, + String(input.ttlSeconds), + )) as string; + if (raw === "claimed") return { kind: "claimed" }; + if (raw === "pending") return { kind: "pending" }; + if (raw.startsWith("resolved:")) { + return { kind: "resolved", runId: raw.slice("resolved:".length) }; + } + throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`); + } + + // Publish the winning runId to the claim so subsequent claimants / + // waiters see "resolved". TTL bounded by the customer's + // `idempotencyKeyExpiresAt` minus now; caller computes. + async publishClaim( + input: IdempotencyLookupInput & { runId: string; ttlSeconds: number }, + ): Promise { + const claimKey = makeIdempotencyClaimKey(input); + await this.redis.set(claimKey, input.runId, "EX", input.ttlSeconds); + } + + // Release the claim on pipeline error so waiters can re-claim and + // retry. Idempotent. + async releaseClaim(input: IdempotencyLookupInput): Promise { + const claimKey = makeIdempotencyClaimKey(input); + await this.redis.del(claimKey); + } + + // Read the current claim value, used by the wait/poll loop on losers + // to detect "pending" → "resolved" transitions and timeouts. + async readClaim(input: IdempotencyLookupInput): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const value = await this.redis.get(claimKey); + if (value === null) return null; + if (value === IDEMPOTENCY_CLAIM_PENDING) return { kind: "pending" }; + return { kind: "resolved", runId: value }; + } + + // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by + // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check + // misses — same key may belong to a buffered run waiting to drain. The + // lookup self-heals: if the lookup points at an entry hash that's + // expired, we DEL the lookup and report a miss. + async lookupIdempotency(input: IdempotencyLookupInput): Promise { + const lookupKey = makeIdempotencyLookupKey(input); + const runId = await this.redis.get(lookupKey); + if (!runId) return null; + const entry = await this.getEntry(runId); + if (!entry) { + await this.redis.del(lookupKey); + return null; + } + return runId; + } + + // Clear the idempotency binding from a buffered run. Used by + // `ResetIdempotencyKeyService` alongside the existing PG-side + // `updateMany`. Returns the runId that was cleared, or null if no + // buffered run held this key. + async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> { + const lookupKey = makeIdempotencyLookupKey(input); + const clearedRunId = (await this.redis.resetMollifierIdempotency( + lookupKey, + "mollifier:entries:", + )) as string; + return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null }; + } + + // Marks the entry as materialised (PG row written) and resets its TTL to + // the grace window. Entry hash persists past ack as a read-fallback + // safety net for the brief PG replica-lag window between drainer-side + // write and reader-side visibility (Q1 D2). Also clears the associated + // idempotency lookup if one was set on accept (Q5). async ack(runId: string): Promise { - await this.redis.del(`mollifier:entries:${runId}`); + await this.redis.ackMollifierEntry( + `mollifier:entries:${runId}`, + String(ACK_GRACE_TTL_SECONDS), + ); } async requeue(runId: string): Promise { @@ -153,10 +476,16 @@ export class MollifierBuffer { return result === 1; } + // Returns Redis-side TTL on the entry hash. Returns -1 for entries + // with no TTL — the steady state under the current design, where + // entries persist until drainer ack/fail. The ack grace TTL (30s + // post-materialise) is the only context where this returns a + // positive value; tests around the grace TTL still rely on it. async getEntryTtlSeconds(runId: string): Promise { return this.redis.ttl(`mollifier:entries:${runId}`); } + async evaluateTrip( envId: string, options: { windowMs: number; threshold: number; holdMs: number }, @@ -190,8 +519,9 @@ export class MollifierBuffer { local orgId = ARGV[3] local payload = ARGV[4] local createdAt = ARGV[5] - local ttlSeconds = tonumber(ARGV[6]) + local createdAtMicros = ARGV[6] local orgEnvsPrefix = ARGV[7] + local idempotencyLookupKey = ARGV[8] or '' -- Idempotent: refuse if an entry for this runId already exists in any -- state. Caller-side dedup is also enforced via API idempotency keys, @@ -200,6 +530,20 @@ export class MollifierBuffer { return 0 end + -- Idempotency-key dedup (Q5). If the caller passed a lookup key + -- and it's already bound to another buffered run, return the + -- winner's runId so the loser's API response can echo it as a + -- cached hit. Otherwise SET the lookup (no TTL — lifecycle is + -- paired with the entry hash; drainer ack/fail clear it + -- explicitly). + if idempotencyLookupKey ~= '' then + local existing = redis.call('GET', idempotencyLookupKey) + if existing then + return existing + end + redis.call('SET', idempotencyLookupKey, runId) + end + redis.call('HSET', entryKey, 'runId', runId, 'envId', envId, @@ -207,9 +551,22 @@ export class MollifierBuffer { 'payload', payload, 'status', 'QUEUED', 'attempts', '0', - 'createdAt', createdAt) - redis.call('EXPIRE', entryKey, ttlSeconds) - redis.call('LPUSH', queueKey, runId) + 'createdAt', createdAt, + 'createdAtMicros', createdAtMicros, + 'idempotencyLookupKey', idempotencyLookupKey, + 'metadataVersion', '0') + -- No EXPIRE on the entry hash. Buffer entries persist until the + -- drainer ACKs (post-materialise grace) or FAILs them — the + -- drainer is the only recovery mechanism, so silent TTL-based + -- eviction would lose runs with no customer-visible signal. + -- Memory pressure from an offline drainer is the alertable + -- failure mode instead; see _ops/mollifier-ops.md. + -- ZSET keyed by createdAtMicros: ZPOPMIN drains oldest-first + -- (FIFO); listing pagination uses ZREVRANGEBYSCORE with a + -- (createdAt, runId) cursor anchor. Score is stable across the + -- entry's lifecycle — requeue does not bump it (see Phase 3b / + -- Q1 design). + redis.call('ZADD', queueKey, createdAtMicros, runId) -- Org-level membership: maintained atomically with the per-env -- queue so the drainer can walk orgs → envs-for-org and -- schedule one env per org per tick. SADDs are idempotent if the @@ -231,7 +588,8 @@ export class MollifierBuffer { local envId = redis.call('HGET', entryKey, 'envId') local orgId = redis.call('HGET', entryKey, 'orgId') - if not envId then + local createdAtMicros = redis.call('HGET', entryKey, 'createdAtMicros') + if not envId or not createdAtMicros then return 0 end @@ -239,7 +597,11 @@ export class MollifierBuffer { local nextAttempts = tonumber(currentAttempts or '0') + 1 redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts)) - redis.call('LPUSH', queuePrefix .. envId, runId) + -- Requeue re-adds with the ORIGINAL createdAtMicros score. + -- createdAt is immutable across retries (Phase 3b decision). + -- The drainer's maxAttempts caps the retry loop so a poisoned + -- entry doesn't head-of-line forever. + redis.call('ZADD', queuePrefix .. envId, tonumber(createdAtMicros), runId) -- Re-track the org/env: pop may have SREM'd them when the queue -- last emptied. SADDs are idempotent if the values are still -- present. @@ -279,7 +641,9 @@ export class MollifierBuffer { -- hash without a TTL, leaking memory. The loop is bounded by queue -- length; entire Lua script remains atomic. while true do - local runId = redis.call('RPOP', queueKey) + -- ZPOPMIN returns {member, score} as a flat array, or {} when empty. + local popped = redis.call('ZPOPMIN', queueKey) + local runId = popped[1] if not runId then -- Queue is empty AND we have no entry to read orgId from, so -- skip org-level cleanup. Stale org-envs entries are bounded @@ -296,9 +660,9 @@ export class MollifierBuffer { result[raw[i]] = raw[i + 1] end -- Prune org-level membership if this pop drained the queue. - -- Atomic with the RPOP above — a concurrent accept AFTER this - -- script will SADD both back along with its LPUSH. - if redis.call('LLEN', queueKey) == 0 then + -- Atomic with the ZPOPMIN above — a concurrent accept AFTER + -- this script will SADD both back along with its ZADD. + if redis.call('ZCARD', queueKey) == 0 then pruneOrgMembership(result['orgId']) end return cjson.encode(result) @@ -309,19 +673,220 @@ export class MollifierBuffer { `, }); + this.redis.defineCommand("casSetMollifierMetadata", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local expectedVersion = tonumber(ARGV[1]) + local newMetadata = ARGV[2] + local newMetadataType = ARGV[3] + + if redis.call('EXISTS', entryKey) == 0 then + return 'not_found' + end + + local status = redis.call('HGET', entryKey, 'status') + local materialised = redis.call('HGET', entryKey, 'materialised') + if status ~= 'QUEUED' or materialised == 'true' then + return 'busy' + end + + local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0' + local currentVersion = tonumber(currentVersionStr) or 0 + if currentVersion ~= expectedVersion then + return 'conflict:' .. tostring(currentVersion) + end + + -- Write the new metadata onto the snapshot's payload JSON. We + -- keep the rest of the payload intact — only metadata/metadataType + -- change. metadataVersion is denormalised on the hash for cheap + -- CAS reads; it's intentionally NOT stored inside the payload + -- itself (PG-side metadataVersion is a column, not a JSON field). + local payloadJson = redis.call('HGET', entryKey, 'payload') + local ok, payload = pcall(cjson.decode, payloadJson) + if not ok then return 'busy' end + payload.metadata = newMetadata + payload.metadataType = newMetadataType + + local newVersion = currentVersion + 1 + redis.call('HSET', entryKey, + 'payload', cjson.encode(payload), + 'metadataVersion', tostring(newVersion)) + return 'applied:' .. tostring(newVersion) + `, + }); + + this.redis.defineCommand("claimMollifierIdempotency", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local pending = ARGV[1] + local ttl = tonumber(ARGV[2]) + + -- SETNX-with-TTL: atomic; only one caller can win. + local won = redis.call('SET', claimKey, pending, 'NX', 'EX', ttl) + if won then + return 'claimed' + end + + local existing = redis.call('GET', claimKey) + if existing == pending then + return 'pending' + end + return 'resolved:' .. existing + `, + }); + + this.redis.defineCommand("resetMollifierIdempotency", { + numberOfKeys: 1, + lua: ` + local lookupKey = KEYS[1] + local entryPrefix = ARGV[1] + + local runId = redis.call('GET', lookupKey) + if not runId then + return '' + end + + local entryKey = entryPrefix .. runId + if redis.call('EXISTS', entryKey) == 0 then + -- Stale lookup. Lazy cleanup. + redis.call('DEL', lookupKey) + return '' + end + + -- Clear the idempotency fields on the snapshot payload so the + -- drainer's eventual engine.trigger call inserts a PG row + -- without the key set. + local payloadJson = redis.call('HGET', entryKey, 'payload') + if payloadJson then + local ok, payload = pcall(cjson.decode, payloadJson) + if ok then + payload.idempotencyKey = cjson.null + payload.idempotencyKeyExpiresAt = cjson.null + redis.call('HSET', entryKey, 'payload', cjson.encode(payload)) + end + end + -- Clear the denormalised lookup pointer on the hash so a later + -- ack doesn't try to DEL a key that's already gone. + redis.call('HSET', entryKey, 'idempotencyLookupKey', '') + redis.call('DEL', lookupKey) + return runId + `, + }); + + this.redis.defineCommand("mutateMollifierSnapshot", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local patchJson = ARGV[1] + + if redis.call('EXISTS', entryKey) == 0 then + return 'not_found' + end + + local status = redis.call('HGET', entryKey, 'status') + local materialised = redis.call('HGET', entryKey, 'materialised') + if status ~= 'QUEUED' or materialised == 'true' then + return 'busy' + end + + local payloadJson = redis.call('HGET', entryKey, 'payload') + local ok, payload = pcall(cjson.decode, payloadJson) + if not ok then return 'busy' end + + local patch = cjson.decode(patchJson) + + if patch.type == 'append_tags' then + -- cjson decode of an absent or empty-array field gives nil or + -- an empty table; we rebuild as a dense array. Existing tags + -- are preserved; new tags are appended only if not present. + local existing = payload.tags or {} + local seen = {} + local merged = {} + for _, t in ipairs(existing) do + if not seen[t] then + seen[t] = true + table.insert(merged, t) + end + end + for _, t in ipairs(patch.tags or {}) do + if not seen[t] then + seen[t] = true + table.insert(merged, t) + end + end + payload.tags = merged + elseif patch.type == 'set_metadata' then + payload.metadata = patch.metadata + payload.metadataType = patch.metadataType + elseif patch.type == 'set_delay' then + payload.delayUntil = patch.delayUntil + elseif patch.type == 'mark_cancelled' then + payload.cancelledAt = patch.cancelledAt + payload.cancelReason = patch.cancelReason + else + return 'busy' + end + + redis.call('HSET', entryKey, 'payload', cjson.encode(payload)) + return 'applied_to_snapshot' + `, + }); + + this.redis.defineCommand("ackMollifierEntry", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local graceTtlSeconds = tonumber(ARGV[1]) + + -- Guard: never create a partial entry. If the hash expired between + -- pop and ack, the run is gone — nothing to mark materialised. + if redis.call('EXISTS', entryKey) == 0 then + return 0 + end + + -- If the entry was accepted with an idempotency key, the lookup + -- string was stored on the hash at accept time. Clear it now — + -- PG becomes canonical for the key post-materialisation (Q5). + local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey') + if lookupKey and lookupKey ~= '' then + redis.call('DEL', lookupKey) + end + + redis.call('HSET', entryKey, 'materialised', 'true') + redis.call('EXPIRE', entryKey, graceTtlSeconds) + return 1 + `, + }); + this.redis.defineCommand("failMollifierEntry", { numberOfKeys: 1, lua: ` local entryKey = KEYS[1] local errorPayload = ARGV[1] - -- Guard: never create a partial entry. If the hash expired between - -- pop and fail, the run is gone — nothing to mark FAILED. + -- Guard: nothing to mark FAILED if the hash is gone (concurrent + -- ack/manual cleanup). Returning 0 lets the caller distinguish + -- "marked failed" from "no-op". if redis.call('EXISTS', entryKey) == 0 then return 0 end redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload) + + -- The drainer has already written a SYSTEM_FAILURE PG row for + -- terminal failures (see mollifierDrainerHandler.server.ts), so + -- the buffer entry is no longer load-bearing. Clear the + -- idempotency lookup — PG's unique constraint is the canonical + -- dedup mechanism post-materialise — and drop the entry hash so + -- failed runs don't accrete forever now that there's no + -- accept-time TTL. + local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey') + if lookupKey and lookupKey ~= '' then + redis.call('DEL', lookupKey) + end + redis.call('DEL', entryKey) return 1 `, }); @@ -362,10 +927,11 @@ declare module "@internal/redis" { orgId: string, payload: string, createdAt: string, - ttlSeconds: string, + createdAtMicros: string, orgEnvsPrefix: string, - callback?: Callback, - ): Result; + idempotencyLookupKey: string, + callback?: Callback, + ): Result; popAndMarkDraining( queueKey: string, orgsKey: string, @@ -382,6 +948,34 @@ declare module "@internal/redis" { orgEnvsPrefix: string, callback?: Callback, ): Result; + mutateMollifierSnapshot( + entryKey: string, + patchJson: string, + callback?: Callback, + ): Result; + casSetMollifierMetadata( + entryKey: string, + expectedVersion: string, + newMetadata: string, + newMetadataType: string, + callback?: Callback, + ): Result; + resetMollifierIdempotency( + lookupKey: string, + entryPrefix: string, + callback?: Callback, + ): Result; + claimMollifierIdempotency( + claimKey: string, + pendingMarker: string, + ttlSeconds: string, + callback?: Callback, + ): Result; + ackMollifierEntry( + entryKey: string, + graceTtlSeconds: string, + callback?: Callback, + ): Result; failMollifierEntry( entryKey: string, errorPayload: string, diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts index 5e6fe202e3d..2751a6615eb 100644 --- a/packages/redis-worker/src/mollifier/index.ts +++ b/packages/redis-worker/src/mollifier/index.ts @@ -1,4 +1,13 @@ -export { MollifierBuffer, type MollifierBufferOptions } from "./buffer.js"; +export { + MollifierBuffer, + type MollifierBufferOptions, + type SnapshotPatch, + type MutateSnapshotResult, + type CasSetMetadataResult, + type IdempotencyClaimResult, + type IdempotencyLookupInput, + IDEMPOTENCY_CLAIM_PENDING, +} from "./buffer.js"; export { MollifierDrainer, type MollifierDrainerOptions, diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts index f93b0f0a3c3..c5d9915575a 100644 --- a/packages/redis-worker/src/mollifier/schemas.ts +++ b/packages/redis-worker/src/mollifier/schemas.ts @@ -27,6 +27,10 @@ const stringToDate = z.string().transform((v, ctx) => { return d; }); +const stringToBool = z + .union([z.literal("true"), z.literal("false")]) + .transform((v) => v === "true"); + const stringToError = z.string().transform((v, ctx) => { try { return BufferEntryError.parse(JSON.parse(v)); @@ -44,6 +48,24 @@ export const BufferEntrySchema = z.object({ status: BufferEntryStatus, attempts: stringToInt, createdAt: stringToDate, + // Microsecond epoch matching the ZSET queue score. Stable across + // requeues — the score never moves once set at accept time. + createdAtMicros: stringToInt, + // Drainer-ack flag: `true` once the drainer has materialised this run + // into PG. The hash persists for a short grace TTL after ack so direct + // reads (retrieve, trace, etc.) still resolve while PG replica lag + // settles. Absent on pre-ack entries. + materialised: stringToBool.default("false"), + // Denormalised pointer to the Redis idempotency lookup key (set when + // the run was accepted with an idempotency key, empty otherwise). The + // ack Lua reads this to DEL the lookup atomically with marking the + // entry materialised (Q5). + idempotencyLookupKey: z.string().optional().default(""), + // Optimistic-lock counter for the snapshot's `metadata` field. + // Incremented atomically by the CAS metadata Lua. Matches the + // semantic of `TaskRun.metadataVersion` on the PG side (which the + // UpdateMetadataService uses for the same retry-on-conflict pattern). + metadataVersion: stringToInt.default("0"), lastError: stringToError.optional(), }); From 51911a150da51caa58781d1837ef65714226f830 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 12:11:51 +0100 Subject: [PATCH 2/7] test(redis-worker): align drainer tests with buffer ack/fail semantics After the buffer extensions in this PR: - ack() keeps the entry alive with a grace TTL as a read-fallback safety net. Test asserts the entry persists with materialised=true. - fail() deletes the entry once the drainer-handler has written the canonical SYSTEM_FAILURE PG row. Tests assert the entry is null and use runOnce()'s `failed` counter as the surviving signal. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/mollifier/drainer.test.ts | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/redis-worker/src/mollifier/drainer.test.ts b/packages/redis-worker/src/mollifier/drainer.test.ts index c8f68977f69..c98c733b0fc 100644 --- a/packages/redis-worker/src/mollifier/drainer.test.ts +++ b/packages/redis-worker/src/mollifier/drainer.test.ts @@ -6,7 +6,6 @@ import { MollifierDrainer } from "./drainer.js"; import { serialiseSnapshot } from "./schemas.js"; const noopOptions = { - entryTtlSeconds: 600, logger: new Logger("test", "log"), }; @@ -87,8 +86,11 @@ describe("MollifierDrainer.runOnce", () => { payload: { foo: 1 }, }); + // After ack the entry persists as a read-fallback safety net with + // materialised=true and a fresh grace TTL (Q1 D2 / Phase B2). const entry = await buffer.getEntry("run_1"); - expect(entry).toBeNull(); + expect(entry).not.toBeNull(); + expect(entry!.materialised).toBe(true); } finally { await buffer.close(); } @@ -167,9 +169,14 @@ describe("MollifierDrainer error handling", () => { expect(after2!.status).toBe("QUEUED"); expect(after2!.attempts).toBe(2); - await drainer.runOnce(); + const result3 = await drainer.runOnce(); + // On attempt 3 the drainer hits maxAttempts and calls fail(), + // which deletes the entry — once the drainer-handler has written + // the SYSTEM_FAILURE PG row the buffer entry is no longer + // load-bearing. The runOnce result is the surviving signal. const after3 = await buffer.getEntry("run_r"); - expect(after3!.status).toBe("FAILED"); + expect(after3).toBeNull(); + expect(result3.failed).toBe(1); expect(calls).toBe(3); } finally { await buffer.close(); @@ -202,11 +209,13 @@ describe("MollifierDrainer error handling", () => { try { await buffer.accept({ runId: "run_nr", envId: "env_a", orgId: "org_1", payload: "{}" }); - await drainer.runOnce(); + const result = await drainer.runOnce(); + // fail() deletes the entry once the drainer-handler has written + // the canonical SYSTEM_FAILURE PG row. const entry = await buffer.getEntry("run_nr"); - expect(entry!.status).toBe("FAILED"); - expect(entry!.lastError).toEqual({ code: "Error", message: "validation failure" }); + expect(entry).toBeNull(); + expect(result.failed).toBe(1); } finally { await buffer.close(); } From 2be2ddd74b3158d5faf8ff1fafeea624926016d3 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 14:10:13 +0100 Subject: [PATCH 3/7] fix(redis-worker): encode mollifier composite-key segments + per-claim ownership token Addresses code-review feedback on the buffer's idempotency keying: - Encode `envId` / `taskIdentifier` / `idempotencyKey` with base64url before concatenation so customer-supplied segments containing `:` cannot alias each other onto the same Redis key. Exports `idempotencyLookupKeyFor` so tests assert against the same encoding the buffer writes. - Replace the shared `"pending"` claim marker with a caller-supplied ownership token (`"pending:"`). `publishClaim` and `releaseClaim` become compare-and-set / compare-and-delete via Lua, so a late release from a previous claimant whose TTL expired cannot erase a new owner's claim. New buffer tests cover the alias-collision case, the encoded-key-shape contract, and the token-ownership safety properties (stale release is a no-op, wrong-token publish is a no-op, fresh claim survives the post-TTL-expiry stale-release race). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../redis-worker/src/mollifier/buffer.test.ts | 301 +++++++++++++++++- packages/redis-worker/src/mollifier/buffer.ts | 148 +++++++-- packages/redis-worker/src/mollifier/index.ts | 2 +- 3 files changed, 420 insertions(+), 31 deletions(-) diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index a4c1be35eb3..5ce6be7f09b 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest"; import { BufferEntrySchema, serialiseSnapshot, deserialiseSnapshot } from "./schemas.js"; import { redisTest } from "@internal/testcontainers"; import { Logger } from "@trigger.dev/core/logger"; -import { MollifierBuffer } from "./buffer.js"; +import { MollifierBuffer, idempotencyLookupKeyFor } from "./buffer.js"; describe("schemas", () => { it("serialiseSnapshot then deserialiseSnapshot is identity for plain objects", () => { @@ -1110,7 +1110,11 @@ describe("MollifierBuffer idempotency lookup", () => { }); expect(result).toEqual({ kind: "accepted" }); - const lookupKey = "mollifier:idempotency:env_i:my-task:ikey-1"; + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "my-task", + idempotencyKey: "ikey-1", + }); const stored = await buffer["redis"].get(lookupKey); expect(stored).toBe("ri1"); // -1 = key exists with no TTL set. @@ -1241,7 +1245,11 @@ describe("MollifierBuffer idempotency lookup", () => { }); try { // Plant a stale lookup pointing at a non-existent entry. - const lookupKey = "mollifier:idempotency:env_i:t:stale"; + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "stale", + }); await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600); expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale"); @@ -1283,7 +1291,11 @@ describe("MollifierBuffer idempotency lookup", () => { await buffer.pop("env_i"); await buffer.ack("ra1"); - const lookupKey = "mollifier:idempotency:env_i:t:ka"; + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "ka", + }); expect(await buffer["redis"].get(lookupKey)).toBeNull(); const entry = await buffer.getEntry("ra1"); expect(entry!.materialised).toBe(true); @@ -1327,7 +1339,11 @@ describe("MollifierBuffer idempotency lookup", () => { expect(result.clearedRunId).toBe("rr1"); // Lookup is gone. - const lookupKey = "mollifier:idempotency:env_i:t:kr"; + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "kr", + }); expect(await buffer["redis"].get(lookupKey)).toBeNull(); // Snapshot's idempotency fields are nulled, other fields kept. @@ -2028,3 +2044,278 @@ describe("MollifierBuffer.listEntriesForEnv", () => { } }); }); + +// Composite-key safety. The Redis-key builders concatenate +// `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without +// per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"` +// would map to the same key as `taskIdentifier="a"` and +// `idempotencyKey="b:x"`. base64url encoding has no `:` in its alphabet, +// so the encoded keys are unique per tuple. +describe("MollifierBuffer composite-key encoding (collision resistance)", () => { + redisTest( + "two accepts whose unencoded keys would alias don't collide on the idempotency lookup", + { timeout: 30_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Aliased tuples under raw `:` concatenation: + // env_x : "a:b" : "x" → "mollifier:idempotency:env_x:a:b:x" + // env_x : "a" : "b:x" → "mollifier:idempotency:env_x:a:b:x" + const r1 = await buffer.accept({ + runId: "ck_run_1", + envId: "env_x", + orgId: "org_1", + payload: "{}", + taskIdentifier: "a:b", + idempotencyKey: "x", + }); + const r2 = await buffer.accept({ + runId: "ck_run_2", + envId: "env_x", + orgId: "org_1", + payload: "{}", + taskIdentifier: "a", + idempotencyKey: "b:x", + }); + // Both accepted — no false-positive collision. + expect(r1).toEqual({ kind: "accepted" }); + expect(r2).toEqual({ kind: "accepted" }); + + // Each tuple resolves to its own runId. + const hit1 = await buffer.lookupIdempotency({ + envId: "env_x", + taskIdentifier: "a:b", + idempotencyKey: "x", + }); + const hit2 = await buffer.lookupIdempotency({ + envId: "env_x", + taskIdentifier: "a", + idempotencyKey: "b:x", + }); + expect(hit1).toBe("ck_run_1"); + expect(hit2).toBe("ck_run_2"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "encoded lookup key contains no ':' separator beyond the namespace", + { timeout: 20_000 }, + async () => { + // Pure-function test — verifies the encoding bijection without + // needing a live buffer. Re-uses the redisTest fixture for + // parallelism with other describe blocks but doesn't touch redis. + const key = idempotencyLookupKeyFor({ + envId: "env_x", + taskIdentifier: "a:b", + idempotencyKey: "x:y:z", + }); + // namespace prefix is exactly `mollifier:idempotency:` (two `:`), + // then three base64url segments separated by two more `:` — + // never the customer-supplied colons. + const colonCount = key.split(":").length - 1; + expect(colonCount).toBe(4); + // base64url alphabet has no `:`, `+`, `/`, or `=`. + const afterNamespace = key.slice("mollifier:idempotency:".length); + expect(afterNamespace).toMatch(/^[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+$/); + }, + ); +}); + +// Pre-gate claim ownership protection. The claim slot stores +// `"pending:"` so publish and release compare-and-act on the +// caller's token — a late release from a previous claimant whose TTL +// expired cannot erase a new owner's claim. +describe("MollifierBuffer pre-gate claim — ownership token safety", () => { + const claimInput = { + envId: "env_c", + taskIdentifier: "task_c", + idempotencyKey: "key_c", + }; + + redisTest( + "claimIdempotency: first caller gets 'claimed', second concurrent caller gets 'pending'", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const first = await buffer.claimIdempotency({ + ...claimInput, + token: "token-A", + ttlSeconds: 30, + }); + expect(first.kind).toBe("claimed"); + + // Second concurrent caller with a different token sees pending. + const second = await buffer.claimIdempotency({ + ...claimInput, + token: "token-B", + ttlSeconds: 30, + }); + expect(second.kind).toBe("pending"); + + // readClaim distinguishes pending from resolved without leaking + // the token to the loser. + const read = await buffer.readClaim(claimInput); + expect(read?.kind).toBe("pending"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "releaseClaim with the wrong token is a no-op (compare-and-delete)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 }); + + // Pretend a stale claimant fires a release with their old token. + await buffer.releaseClaim({ ...claimInput, token: "stale-impostor" }); + + // The owner's claim survives. + const stillThere = await buffer.readClaim(claimInput); + expect(stillThere?.kind).toBe("pending"); + + // The owner can still release. + await buffer.releaseClaim({ ...claimInput, token: "owner" }); + expect(await buffer.readClaim(claimInput)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "publishClaim with the wrong token is a no-op and returns false", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 }); + + const wrongTokenPublish = await buffer.publishClaim({ + ...claimInput, + token: "stale-impostor", + runId: "imposter-run", + ttlSeconds: 60, + }); + expect(wrongTokenPublish).toBe(false); + + // Claim slot unchanged. + const stillPending = await buffer.readClaim(claimInput); + expect(stillPending?.kind).toBe("pending"); + + const goodPublish = await buffer.publishClaim({ + ...claimInput, + token: "owner", + runId: "real-run", + ttlSeconds: 60, + }); + expect(goodPublish).toBe(true); + + const resolved = await buffer.readClaim(claimInput); + expect(resolved).toEqual({ kind: "resolved", runId: "real-run" }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "regression: stale release after TTL expiry does NOT erase a fresh claim", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Hazard from CodeRabbit r3290070707: + // 1. Claimant A SETNXs the slot with their token, then stalls. + // 2. TTL expires, slot vanishes. + // 3. Claimant B SETNXs the slot with a DIFFERENT token. + // 4. Claimant A finally finishes (or errors) and calls + // releaseClaim with their original token. + // Without compare-and-delete, A's release would wipe B's slot and + // any concurrent customer of B's idempotency key would see "no + // claim" and re-issue, breaking same-key dedup. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Step 1: A claims with token "A". + const a = await buffer.claimIdempotency({ + ...claimInput, + token: "A", + ttlSeconds: 1, // short TTL to simulate expiry quickly + }); + expect(a.kind).toBe("claimed"); + + // Step 2: simulate TTL expiry — DEL the slot directly so the + // test doesn't rely on wall-clock sleeping. + await buffer["redis"].del(`mollifier:claim:${[claimInput.envId, claimInput.taskIdentifier, claimInput.idempotencyKey] + .map((s) => Buffer.from(s, "utf8").toString("base64url")) + .join(":")}`); + + // Step 3: B claims with token "B". + const b = await buffer.claimIdempotency({ + ...claimInput, + token: "B", + ttlSeconds: 30, + }); + expect(b.kind).toBe("claimed"); + + // Step 4: A's late release. MUST be a no-op. + await buffer.releaseClaim({ ...claimInput, token: "A" }); + + // B's claim survives intact. + const after = await buffer.readClaim(claimInput); + expect(after?.kind).toBe("pending"); + + // B can still publish. + const published = await buffer.publishClaim({ + ...claimInput, + token: "B", + runId: "B-run", + ttlSeconds: 60, + }); + expect(published).toBe(true); + } finally { + await buffer.close(); + } + }, + ); +}); diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index fd53f59efea..37752d21623 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -43,21 +43,38 @@ export type IdempotencyLookupInput = { idempotencyKey: string; }; -function makeIdempotencyLookupKey(input: IdempotencyLookupInput): string { - return `mollifier:idempotency:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`; +// Reversible encoding for Redis-key segments. The composite-key builders +// concatenate `envId`, `taskIdentifier`, and `idempotencyKey` with `:` +// separators; if any segment contains a literal `:` (envId is internal +// and `:`-free, but taskIdentifier and idempotencyKey are +// customer-supplied) different tuples would map to the same Redis key +// and dedupe the wrong run. base64url has no `:` in its alphabet and is +// bijective on the input string, so the encoded keys are +// collision-free. +function encodeKeyPart(value: string): string { + return Buffer.from(value, "utf8").toString("base64url"); +} + +// Exported so tests can compute the same Redis key the buffer writes +// without hard-coding the encoding (which is a buffer-internal detail). +export function idempotencyLookupKeyFor(input: IdempotencyLookupInput): string { + return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`; } // Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the // existing B6a buffer-side dedup stays isolated. The claim is the // authoritative cross-store "this idempotency key is in flight or -// resolved" pointer used by the trigger hot path -// (`_plans/2026-05-21-mollifier-idempotency-claim.md`). Values: -// "pending" → a trigger pipeline owns the key and hasn't published yet -// → the winning trigger's runId (resolved) -export const IDEMPOTENCY_CLAIM_PENDING = "pending"; +// resolved" pointer used by the trigger hot path. Values: +// "pending:" → claimed by a trigger pipeline; `` is the +// caller-supplied ownership token. Release and +// publish compare-and-act on this token so a +// late release from a previous claimant whose TTL +// expired cannot erase a new owner's claim. +// → the winning trigger's resolved runId. +const PENDING_PREFIX = "pending:"; function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string { - return `mollifier:claim:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`; + return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`; } export type IdempotencyClaimResult = @@ -125,7 +142,7 @@ export class MollifierBuffer { const createdAtMicros = nowMs * 1000; const idempotencyLookupKey = input.idempotencyKey && input.taskIdentifier - ? makeIdempotencyLookupKey({ + ? idempotencyLookupKeyFor({ envId: input.envId, taskIdentifier: input.taskIdentifier, idempotencyKey: input.idempotencyKey, @@ -359,8 +376,12 @@ export class MollifierBuffer { // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One // call across both PG and buffer paths serialises through this claim; // closes the race the buffer-side B6a SETNX leaves open during the - // gate-transition burst window (see - // `_plans/2026-05-21-mollifier-idempotency-claim.md`). + // gate-transition burst window. + // + // The caller supplies an opaque `token` (UUID) on claim. The same token + // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and- + // act so a late release from a previous claimant whose TTL expired + // cannot erase a new owner's claim. // // - "claimed": we now own the claim, the caller proceeds with the // trigger pipeline and must `publishClaim` on success or @@ -370,12 +391,13 @@ export class MollifierBuffer { // - "resolved": the claim already holds a runId; the caller can // return that runId as a cached hit. async claimIdempotency( - input: IdempotencyLookupInput & { ttlSeconds: number }, + input: IdempotencyLookupInput & { token: string; ttlSeconds: number }, ): Promise { const claimKey = makeIdempotencyClaimKey(input); const raw = (await this.redis.claimMollifierIdempotency( claimKey, - IDEMPOTENCY_CLAIM_PENDING, + `${PENDING_PREFIX}${input.token}`, + PENDING_PREFIX, String(input.ttlSeconds), )) as string; if (raw === "claimed") return { kind: "claimed" }; @@ -389,18 +411,39 @@ export class MollifierBuffer { // Publish the winning runId to the claim so subsequent claimants / // waiters see "resolved". TTL bounded by the customer's // `idempotencyKeyExpiresAt` minus now; caller computes. + // + // Compare-and-set on the caller's token: if the current value isn't + // our pending marker (TTL expired and another claimant moved in, or + // someone else already published), the publish is a no-op. The caller + // can treat any such case as "we lost the claim" and re-read. + // Returns true if we published; false if the claim slot was no longer + // ours. async publishClaim( - input: IdempotencyLookupInput & { runId: string; ttlSeconds: number }, - ): Promise { + input: IdempotencyLookupInput & { token: string; runId: string; ttlSeconds: number }, + ): Promise { const claimKey = makeIdempotencyClaimKey(input); - await this.redis.set(claimKey, input.runId, "EX", input.ttlSeconds); + const result = (await this.redis.publishMollifierClaim( + claimKey, + `${PENDING_PREFIX}${input.token}`, + input.runId, + String(input.ttlSeconds), + )) as number; + return result === 1; } // Release the claim on pipeline error so waiters can re-claim and // retry. Idempotent. - async releaseClaim(input: IdempotencyLookupInput): Promise { + // + // Compare-and-delete on the caller's token: only deletes if the + // current value is exactly our pending marker. A late release from a + // claimant whose TTL expired is a no-op, so a new owner's claim is + // never wiped by a slow predecessor. + async releaseClaim(input: IdempotencyLookupInput & { token: string }): Promise { const claimKey = makeIdempotencyClaimKey(input); - await this.redis.del(claimKey); + await this.redis.releaseMollifierClaim( + claimKey, + `${PENDING_PREFIX}${input.token}`, + ); } // Read the current claim value, used by the wait/poll loop on losers @@ -409,7 +452,7 @@ export class MollifierBuffer { const claimKey = makeIdempotencyClaimKey(input); const value = await this.redis.get(claimKey); if (value === null) return null; - if (value === IDEMPOTENCY_CLAIM_PENDING) return { kind: "pending" }; + if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" }; return { kind: "resolved", runId: value }; } @@ -419,7 +462,7 @@ export class MollifierBuffer { // lookup self-heals: if the lookup points at an entry hash that's // expired, we DEL the lookup and report a miss. async lookupIdempotency(input: IdempotencyLookupInput): Promise { - const lookupKey = makeIdempotencyLookupKey(input); + const lookupKey = idempotencyLookupKeyFor(input); const runId = await this.redis.get(lookupKey); if (!runId) return null; const entry = await this.getEntry(runId); @@ -435,7 +478,7 @@ export class MollifierBuffer { // `updateMany`. Returns the runId that was cleared, or null if no // buffered run held this key. async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> { - const lookupKey = makeIdempotencyLookupKey(input); + const lookupKey = idempotencyLookupKeyFor(input); const clearedRunId = (await this.redis.resetMollifierIdempotency( lookupKey, "mollifier:entries:", @@ -720,23 +763,65 @@ export class MollifierBuffer { numberOfKeys: 1, lua: ` local claimKey = KEYS[1] - local pending = ARGV[1] - local ttl = tonumber(ARGV[2]) + local pendingMarker = ARGV[1] -- "pending:" + local pendingPrefix = ARGV[2] -- "pending:" + local ttl = tonumber(ARGV[3]) -- SETNX-with-TTL: atomic; only one caller can win. - local won = redis.call('SET', claimKey, pending, 'NX', 'EX', ttl) + local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl) if won then return 'claimed' end local existing = redis.call('GET', claimKey) - if existing == pending then + -- Any "pending:*" value is a live claim — the caller-supplied + -- token differentiates ownership but is opaque to losers. + if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then return 'pending' end return 'resolved:' .. existing `, }); + // Publish a winning runId to a claim slot we own. Compare-and-set on + // the caller's pending marker: if the slot is no longer ours (TTL + // expired and another claimant moved in, or already resolved by + // someone else), we no-op. Returns 1 on publish, 0 on no-op. + this.redis.defineCommand("publishMollifierClaim", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local ownerMarker = ARGV[1] -- "pending:" + local runId = ARGV[2] + local ttl = tonumber(ARGV[3]) + + local existing = redis.call('GET', claimKey) + if existing == ownerMarker then + redis.call('SET', claimKey, runId, 'EX', ttl) + return 1 + end + return 0 + `, + }); + + // Release a claim slot we own. Compare-and-delete on the caller's + // pending marker: a late release from a previous claimant whose TTL + // expired is a no-op, so a new owner's claim is never wiped. + this.redis.defineCommand("releaseMollifierClaim", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local ownerMarker = ARGV[1] -- "pending:" + + local existing = redis.call('GET', claimKey) + if existing == ownerMarker then + redis.call('DEL', claimKey) + return 1 + end + return 0 + `, + }); + this.redis.defineCommand("resetMollifierIdempotency", { numberOfKeys: 1, lua: ` @@ -968,9 +1053,22 @@ declare module "@internal/redis" { claimMollifierIdempotency( claimKey: string, pendingMarker: string, + pendingPrefix: string, ttlSeconds: string, callback?: Callback, ): Result; + publishMollifierClaim( + claimKey: string, + ownerMarker: string, + runId: string, + ttlSeconds: string, + callback?: Callback, + ): Result; + releaseMollifierClaim( + claimKey: string, + ownerMarker: string, + callback?: Callback, + ): Result; ackMollifierEntry( entryKey: string, graceTtlSeconds: string, diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts index 2751a6615eb..6e93f105fee 100644 --- a/packages/redis-worker/src/mollifier/index.ts +++ b/packages/redis-worker/src/mollifier/index.ts @@ -6,7 +6,7 @@ export { type CasSetMetadataResult, type IdempotencyClaimResult, type IdempotencyLookupInput, - IDEMPOTENCY_CLAIM_PENDING, + idempotencyLookupKeyFor, } from "./buffer.js"; export { MollifierDrainer, From 0a22252aaeab8c00fff68392d4dfecd0f4ac6f26 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 12:58:34 +0100 Subject: [PATCH 4/7] docs(changeset): scope mollifier buffer-extensions changeset to redis-worker Drop the @trigger.dev/core entry and notice-field claim (no core change in this PR) and trim drainer-replay wording that belongs to a later PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/mollifier-buffer-extensions.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md index b1f38f51ecc..2334c6d70b6 100644 --- a/.changeset/mollifier-buffer-extensions.md +++ b/.changeset/mollifier-buffer-extensions.md @@ -1,6 +1,5 @@ --- "@trigger.dev/redis-worker": minor -"@trigger.dev/core": patch --- -Mollifier buffer feature set built on top of the initial primitives: idempotency-lookup with SETNX dedup, atomic snapshot-mutation API (`mutateSnapshot` with tag/metadata/delay/cancel patches), metadata CAS for lossless concurrent updates, watermark-paginated listing, claim primitives for pre-gate idempotency, ZSET-backed per-env queue, 30s post-ack grace TTL, and drop the accept-time entry TTL (drainer is now the only removal mechanism). `@trigger.dev/core` gains an optional `notice` field on the trigger response so the SDK can surface mollifier-queued guidance to customers. +Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, paginated listing, claim primitives, and a `MollifierSnapshot` type. From 8febda21470c392a7d2483446ced7637c77d8ce8 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 13:51:38 +0100 Subject: [PATCH 5/7] fix(redis-worker): mollifier CAS isolation, reset claim cleanup + test gaps Close review findings on the mollifier buffer extensions: - mutateSnapshot(set_metadata) now bumps metadataVersion so a concurrent casSetMetadata conflicts instead of clobbering under a stale version - resetIdempotency clears the pre-gate claim slot, not just the lookup, so a resolved/pending claim can't keep deduping past reset - claimMollifierIdempotency guards a falsy GET before string.sub - re-export AcceptResult (accept's public return type) and makeIdempotencyClaimKey from the package index - correct stale "TTL expired" comments (no accept-time TTL anymore) and the mutateSnapshot doc (FAILED returns not_found, not busy) Tests: listForEnvWithWatermark coverage (page-1, page-N, tied-score continuation, orphan skip, empty/guard), fail clears idempotency lookup, casSetMetadata busy on materialised entry, set_metadata/CAS version interleave, resetIdempotency clears claim slot. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../redis-worker/src/mollifier/buffer.test.ts | 374 +++++++++++++++++- packages/redis-worker/src/mollifier/buffer.ts | 66 +++- packages/redis-worker/src/mollifier/index.ts | 2 + 3 files changed, 421 insertions(+), 21 deletions(-) diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index 5ce6be7f09b..e9e004fb851 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest"; import { BufferEntrySchema, serialiseSnapshot, deserialiseSnapshot } from "./schemas.js"; import { redisTest } from "@internal/testcontainers"; import { Logger } from "@trigger.dev/core/logger"; -import { MollifierBuffer, idempotencyLookupKeyFor } from "./buffer.js"; +import { MollifierBuffer, idempotencyLookupKeyFor, makeIdempotencyClaimKey } from "./buffer.js"; describe("schemas", () => { it("serialiseSnapshot then deserialiseSnapshot is identity for plain objects", () => { @@ -397,6 +397,62 @@ describe("MollifierBuffer.fail", () => { } }, ); + + redisTest( + "fail DELs the idempotency lookup so a same-key retry goes through to PG", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Symmetric with the ack path: the failMollifierEntry Lua reads the + // idempotencyLookupKey off the hash and DELs it. Without this, a + // post-fail retry with the same idempotency key would hit the + // orphaned dedup record and resolve to a run that no longer exists. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ + runId: "run_fk", + envId: "env_a", + orgId: "org_1", + payload: "{}", + idempotencyKey: "kf", + taskIdentifier: "t", + }); + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_a", + taskIdentifier: "t", + idempotencyKey: "kf", + }); + // Lookup exists before fail. + expect(await buffer["redis"].get(lookupKey)).toBe("run_fk"); + + await buffer.pop("env_a"); + const failed = await buffer.fail("run_fk", { code: "VALIDATION", message: "boom" }); + expect(failed).toBe(true); + + // Lookup is cleared, so the slot is reclaimable: a fresh accept + // with the same tuple succeeds rather than deduping. + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + const reaccept = await buffer.accept({ + runId: "run_fk2", + envId: "env_a", + orgId: "org_1", + payload: "{}", + idempotencyKey: "kf", + taskIdentifier: "t", + }); + expect(reaccept).toEqual({ kind: "accepted" }); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer TTL", () => { @@ -1387,6 +1443,49 @@ describe("MollifierBuffer idempotency lookup", () => { } }, ); + + redisTest( + "resetIdempotency also clears the pre-gate claim slot", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The lookup and the cross-store claim are two pointers for the same + // key. Reset must reopen both — otherwise a resolved/pending claim + // keeps deduping new triggers for the rest of its TTL even though + // the binding was reset. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + const tuple = { envId: "env_rc", taskIdentifier: "t", idempotencyKey: "krc" }; + try { + // A resolved claim is in place... + await buffer.claimIdempotency({ ...tuple, token: "owner", ttlSeconds: 600 }); + await buffer.publishClaim({ ...tuple, token: "owner", runId: "rc1", ttlSeconds: 600 }); + expect(await buffer.readClaim(tuple)).toEqual({ kind: "resolved", runId: "rc1" }); + // ...alongside a buffered run holding the lookup. + await buffer.accept({ + runId: "rc1", + envId: "env_rc", + orgId: "org_1", + payload: serialiseSnapshot({}), + idempotencyKey: "krc", + taskIdentifier: "t", + }); + + await buffer.resetIdempotency(tuple); + + // Both the lookup and the claim are gone. + expect(await buffer.lookupIdempotency(tuple)).toBeNull(); + expect(await buffer.readClaim(tuple)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer.casSetMetadata", () => { @@ -1507,6 +1606,96 @@ describe("MollifierBuffer.casSetMetadata", () => { } }, ); + + redisTest( + "returns busy on a materialised entry (post-ack grace window)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The guard rejects `materialised == 'true'` as well as non-QUEUED + // status. After ack the entry lingers QUEUED-but-materialised for + // the grace TTL; a CAS in that window must not mutate it. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas_mat", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.pop("env_c"); + await buffer.ack("cas_mat"); + + const result = await buffer.casSetMetadata({ + runId: "cas_mat", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "busy" }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "a mutateSnapshot set_metadata bumps metadataVersion so an in-flight CAS conflicts", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // CAS isolation: a reader fetches version N, then a concurrent + // mutateSnapshot(set_metadata) overwrites the metadata. The reader's + // CAS at expectedVersion=N must NOT silently win — both paths write + // payload.metadata, so set_metadata bumps the same counter the CAS + // is gated on. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas_int", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":0}', metadataType: "application/json" }), + }); + // Reader observes version 0. + const before = await buffer.getEntry("cas_int"); + expect(before!.metadataVersion).toBe(0); + + // Concurrent snapshot mutation writes metadata + bumps version. + const mutated = await buffer.mutateSnapshot("cas_int", { + type: "set_metadata", + metadata: '{"v":1}', + metadataType: "application/json", + }); + expect(mutated).toBe("applied_to_snapshot"); + const mid = await buffer.getEntry("cas_int"); + expect(mid!.metadataVersion).toBe(1); + + // The reader's stale CAS conflicts instead of clobbering. + const result = await buffer.casSetMetadata({ + runId: "cas_int", + expectedVersion: 0, + newMetadata: '{"v":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 }); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer.mutateSnapshot", () => { @@ -2045,6 +2234,181 @@ describe("MollifierBuffer.listEntriesForEnv", () => { }); }); +describe("MollifierBuffer.listForEnvWithWatermark", () => { + // Seed a QUEUED entry, then pin its ZSET score and hash `createdAtMicros` + // to a deterministic value so ordering and the watermark cursor don't + // depend on wall-clock timing (Date.now() ties within a millisecond). + async function seed(buffer: MollifierBuffer, envId: string, runId: string, micros: number) { + await buffer.accept({ runId, envId, orgId: "org_1", payload: "{}" }); + await buffer["redis"].zadd(`mollifier:queue:${envId}`, String(micros), runId); + await buffer["redis"].hset(`mollifier:entries:${runId}`, "createdAtMicros", String(micros)); + } + + redisTest("pageSize <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 0 })).toEqual([]); + expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: -3 })).toEqual([]); + } finally { + await buffer.close(); + } + }); + + redisTest( + "page 1 returns newest-first up to pageSize without consuming entries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await seed(buffer, "env_w", "wa", 1000); + await seed(buffer, "env_w", "wb", 2000); + await seed(buffer, "env_w", "wc", 3000); + + const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 }); + // Newest-first by createdAtMicros. + expect(page.map((e) => e.runId)).toEqual(["wc", "wb"]); + + // Non-destructive: drainer still pops oldest-first. + const popped: string[] = []; + for (let i = 0; i < 3; i++) { + const e = await buffer.pop("env_w"); + if (e) popped.push(e.runId); + } + expect(popped).toEqual(["wa", "wb", "wc"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "page N continues strictly below the watermark score", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await seed(buffer, "env_w", "wa", 1000); + await seed(buffer, "env_w", "wb", 2000); + await seed(buffer, "env_w", "wc", 3000); + + const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 }); + expect(page1.map((e) => e.runId)).toEqual(["wc", "wb"]); + + const last = page1[page1.length - 1]!; + const page2 = await buffer.listForEnvWithWatermark({ + envId: "env_w", + pageSize: 2, + watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId }, + }); + // Only the entry strictly below score 2000 remains; no overlap. + expect(page2.map((e) => e.runId)).toEqual(["wa"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "tied-score watermark surfaces lex-smaller members on the next page without dupes", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Three entries share one score; ZSET breaks the tie by member, + // and zrevrangebyscore returns them member-DESC: tc, tb, ta. + await seed(buffer, "env_w", "ta", 2000); + await seed(buffer, "env_w", "tb", 2000); + await seed(buffer, "env_w", "tc", 2000); + + const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 }); + expect(page1.map((e) => e.runId)).toEqual(["tc", "tb"]); + + const last = page1[page1.length - 1]!; + const page2 = await buffer.listForEnvWithWatermark({ + envId: "env_w", + pageSize: 2, + watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId }, + }); + // Same score, lex-smaller than the "tb" anchor — and not "tb" + // itself (no duplicate across the page boundary). + expect(page2.map((e) => e.runId)).toEqual(["ta"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "skips orphan queue references (entry hash gone) during listing", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await seed(buffer, "env_w", "live", 1000); + await seed(buffer, "env_w", "orphan", 2000); + // Drop the orphan's hash but leave its queue ref behind. + await buffer["redis"].del("mollifier:entries:orphan"); + + const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 10 }); + expect(page.map((e) => e.runId)).toEqual(["live"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("returns empty for an env with no queued entries", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + expect(await buffer.listForEnvWithWatermark({ envId: "env_empty_w", pageSize: 10 })).toEqual([]); + } finally { + await buffer.close(); + } + }); +}); + // Composite-key safety. The Redis-key builders concatenate // `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without // per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"` @@ -2285,10 +2649,10 @@ describe("MollifierBuffer pre-gate claim — ownership token safety", () => { expect(a.kind).toBe("claimed"); // Step 2: simulate TTL expiry — DEL the slot directly so the - // test doesn't rely on wall-clock sleeping. - await buffer["redis"].del(`mollifier:claim:${[claimInput.envId, claimInput.taskIdentifier, claimInput.idempotencyKey] - .map((s) => Buffer.from(s, "utf8").toString("base64url")) - .join(":")}`); + // test doesn't rely on wall-clock sleeping. Targets the same key + // the buffer writes via the exported builder, so a key-format + // change can't silently make this DEL miss. + await buffer["redis"].del(makeIdempotencyClaimKey(claimInput)); // Step 3: B claims with token "B". const b = await buffer.claimIdempotency({ diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index 37752d21623..e8e413fd5dd 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -73,7 +73,9 @@ export function idempotencyLookupKeyFor(input: IdempotencyLookupInput): string { // → the winning trigger's resolved runId. const PENDING_PREFIX = "pending:"; -function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string { +// Exported (like `idempotencyLookupKeyFor`) so tests can target the same +// claim key the buffer writes without hard-coding the encoding. +export function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string { return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`; } @@ -303,8 +305,9 @@ export class MollifierBuffer { // the dashboard's "Recently queued" surface — non-destructive, so the // drainer still pops these entries in order. Returns up to `maxCount` // entries newest-first (highest score, which is `createdAtMicros`). - // Each entry hash is fetched separately; a `null` from getEntry (TTL - // expired between ZREVRANGE and HGETALL) is skipped. + // Each entry hash is fetched separately; a `null` from getEntry (entry + // torn down by a concurrent drainer ack/fail between ZREVRANGE and + // HGETALL) is skipped. async listEntriesForEnv(envId: string, maxCount: number): Promise { if (maxCount <= 0) return []; const runIds = await this.redis.zrevrange( @@ -325,8 +328,9 @@ export class MollifierBuffer { // the buffer. Three outcomes: // - "applied_to_snapshot": entry was QUEUED + not materialised; the // drainer will read the patched payload on its next pop. - // - "not_found": no entry hash exists for this runId. - // - "busy": entry is DRAINING / FAILED / materialised. The API + // - "not_found": no entry hash exists for this runId — including a + // FAILED entry, whose hash the drainer-terminal `fail` path DELs. + // - "busy": entry is DRAINING or materialised. The API // wait-and-bounces through PG (Q3 design). async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise { const result = (await this.redis.mutateMollifierSnapshot( @@ -479,9 +483,11 @@ export class MollifierBuffer { // buffered run held this key. async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> { const lookupKey = idempotencyLookupKeyFor(input); + const claimKey = makeIdempotencyClaimKey(input); const clearedRunId = (await this.redis.resetMollifierIdempotency( lookupKey, "mollifier:entries:", + claimKey, )) as string; return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null }; } @@ -508,9 +514,12 @@ export class MollifierBuffer { ); } - // Returns true if the entry transitioned to FAILED; false if the entry no - // longer exists (TTL expired between pop and fail). Caller can use the - // boolean to skip downstream FAILED handling for ghost entries. + // Returns true if a live entry was torn down; false if the entry no + // longer existed (a concurrent ack or manual cleanup removed it between + // pop and fail — there is no accept-time TTL). Note FAILED is not an + // observable state: the Lua marks the hash FAILED then DELs it in the + // same atomic script, so a subsequent getEntry returns null. Caller can + // use the boolean to skip downstream FAILED handling for ghost entries. async fail(runId: string, error: { code: string; message: string }): Promise { const result = await this.redis.failMollifierEntry( `mollifier:entries:${runId}`, @@ -679,10 +688,11 @@ export class MollifierBuffer { end end - -- Loop to skip orphan queue references — runIds whose entry hash has - -- expired (TTL hit). HSET on a missing key would CREATE a partial - -- hash without a TTL, leaking memory. The loop is bounded by queue - -- length; entire Lua script remains atomic. + -- Loop to skip orphan queue references — runIds whose entry hash is + -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry + -- no TTL of their own). HSET on a missing key would CREATE a + -- partial hash without a TTL, leaking memory. The loop is bounded + -- by queue length; entire Lua script remains atomic. while true do -- ZPOPMIN returns {member, score} as a flat array, or {} when empty. local popped = redis.call('ZPOPMIN', queueKey) @@ -710,8 +720,8 @@ export class MollifierBuffer { end return cjson.encode(result) end - -- Orphan queue reference: entry TTL expired while runId was queued. - -- Discard the reference and loop to the next. + -- Orphan queue reference: entry hash gone (evicted) while runId + -- was queued. Discard the reference and loop to the next. end `, }); @@ -774,6 +784,13 @@ export class MollifierBuffer { end local existing = redis.call('GET', claimKey) + if not existing then + -- The slot expired in the race window between the SET NX + -- failing and this GET. It's free now — claim it so we don't + -- string.sub a nil and error out. + redis.call('SET', claimKey, pendingMarker, 'EX', ttl) + return 'claimed' + end -- Any "pending:*" value is a live claim — the caller-supplied -- token differentiates ownership but is opaque to losers. if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then @@ -827,6 +844,15 @@ export class MollifierBuffer { lua: ` local lookupKey = KEYS[1] local entryPrefix = ARGV[1] + local claimKey = ARGV[2] + + -- Reset reopens the key across BOTH the buffer lookup and the + -- cross-store pre-gate claim pointer. Without clearing the claim, + -- a resolved/pending claim would keep deduping new triggers for + -- the rest of its TTL even though the binding was reset. DEL is + -- unconditional — the claim is gone regardless of whether a + -- buffered run currently holds the lookup. + redis.call('DEL', claimKey) local runId = redis.call('GET', lookupKey) if not runId then @@ -905,6 +931,12 @@ export class MollifierBuffer { elseif patch.type == 'set_metadata' then payload.metadata = patch.metadata payload.metadataType = patch.metadataType + -- Bump the denormalised metadataVersion so an in-flight + -- casSetMetadata (optimistic CAS keyed on this counter) sees + -- the concurrent write as a version conflict and retries, + -- instead of clobbering it under a now-stale expectedVersion. + local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0 + redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1)) elseif patch.type == 'set_delay' then payload.delayUntil = patch.delayUntil elseif patch.type == 'mark_cancelled' then @@ -925,8 +957,9 @@ export class MollifierBuffer { local entryKey = KEYS[1] local graceTtlSeconds = tonumber(ARGV[1]) - -- Guard: never create a partial entry. If the hash expired between - -- pop and ack, the run is gone — nothing to mark materialised. + -- Guard: never create a partial entry. If the hash is gone between + -- pop and ack (concurrent fail or eviction — QUEUED entries carry + -- no TTL), the run is gone, nothing to mark materialised. if redis.call('EXISTS', entryKey) == 0 then return 0 end @@ -1048,6 +1081,7 @@ declare module "@internal/redis" { resetMollifierIdempotency( lookupKey: string, entryPrefix: string, + claimKey: string, callback?: Callback, ): Result; claimMollifierIdempotency( diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts index 6e93f105fee..6147b522603 100644 --- a/packages/redis-worker/src/mollifier/index.ts +++ b/packages/redis-worker/src/mollifier/index.ts @@ -2,11 +2,13 @@ export { MollifierBuffer, type MollifierBufferOptions, type SnapshotPatch, + type AcceptResult, type MutateSnapshotResult, type CasSetMetadataResult, type IdempotencyClaimResult, type IdempotencyLookupInput, idempotencyLookupKeyFor, + makeIdempotencyClaimKey, } from "./buffer.js"; export { MollifierDrainer, From 612de3b0f0b659098bddff664be2c4ec4d61e65a Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 15:05:10 +0100 Subject: [PATCH 6/7] refactor(redis-worker): mollifier queue ZSET -> LIST, drop dead watermark listing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The watermark listing (listForEnvWithWatermark) had zero consumers across the whole stack — the dashboard renders buffered runs per-run via getEntry, never a paginated queue list. With it gone, nothing left needs a sorted set: FIFO drain is just insertion order, the stale sweep enumerates without caring about order, and delay is materialised into PG at drain time (never a queue sort key). Revert the per-env queue to a Redis LIST: LPUSH on accept, RPOP on pop (FIFO), RPUSH on requeue (transiently-failed entry pops next), LLEN for the empty check, LRANGE for the stale-sweep enumeration. O(1) instead of O(log N), simpler Lua. createdAtMicros stays a hash field for dwell metrics; it is no longer a sort key. Removes listForEnvWithWatermark + its tests; rewrites the ZSET-storage tests as LIST-storage tests; updates orphan/requeue tests for RPOP/RPUSH. Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/mollifier-buffer-extensions.md | 2 +- .../redis-worker/src/mollifier/buffer.test.ts | 270 +++--------------- packages/redis-worker/src/mollifier/buffer.ts | 132 ++------- 3 files changed, 72 insertions(+), 332 deletions(-) diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md index 2334c6d70b6..109ea5c3d20 100644 --- a/.changeset/mollifier-buffer-extensions.md +++ b/.changeset/mollifier-buffer-extensions.md @@ -2,4 +2,4 @@ "@trigger.dev/redis-worker": minor --- -Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, paginated listing, claim primitives, and a `MollifierSnapshot` type. +Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, claim primitives, and a `MollifierSnapshot` type. diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index e9e004fb851..4af7575e252 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -238,8 +238,8 @@ describe("MollifierBuffer.pop orphan handling", () => { }); try { - // Simulate a TTL-expired orphan: queue ref exists, entry hash does not. - await buffer["redis"].zadd("mollifier:queue:env_a", 1, "run_orphan"); + // Simulate an evicted orphan: queue ref exists, entry hash does not. + await buffer["redis"].rpush("mollifier:queue:env_a", "run_orphan"); const popped = await buffer.pop("env_a"); expect(popped).toBeNull(); @@ -249,7 +249,7 @@ describe("MollifierBuffer.pop orphan handling", () => { expect(Object.keys(raw)).toHaveLength(0); // Queue is drained — the loop pops orphans until empty. - const qLen = await buffer["redis"].zcard("mollifier:queue:env_a"); + const qLen = await buffer["redis"].llen("mollifier:queue:env_a"); expect(qLen).toBe(0); } finally { await buffer.close(); @@ -271,12 +271,13 @@ describe("MollifierBuffer.pop orphan handling", () => { }); try { - // Layout by score (lowest-first, since ZPOPMIN takes the min): - // orphan_a (score 1) → valid (score = its createdAtMicros, large) → orphan_b (score 1e18). - // First pop skips orphan_a, returns valid; orphan_b remains. - await buffer["redis"].zadd("mollifier:queue:env_a", 1, "orphan_a"); + // Build the queue so RPOP (tail-first) yields: orphan_a, valid, + // orphan_b. accept LPUSHes "valid"; RPUSH puts orphan_a at the + // tail (popped first), LPUSH puts orphan_b at the head (popped + // last). First pop skips orphan_a, returns valid; orphan_b remains. await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer["redis"].zadd("mollifier:queue:env_a", 1e18, "orphan_b"); + await buffer["redis"].rpush("mollifier:queue:env_a", "orphan_a"); + await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b"); const popped = await buffer.pop("env_a"); expect(popped).not.toBeNull(); @@ -284,7 +285,7 @@ describe("MollifierBuffer.pop orphan handling", () => { expect(popped!.status).toBe("DRAINING"); // The trailing orphan_b is still in the queue (single pop call). - const remaining = await buffer["redis"].zcard("mollifier:queue:env_a"); + const remaining = await buffer["redis"].llen("mollifier:queue:env_a"); expect(remaining).toBe(1); // A second pop drains the trailing orphan_b. The queue is now @@ -559,13 +560,14 @@ describe("MollifierBuffer.requeue on missing entry", () => { describe("MollifierBuffer.requeue ordering", () => { redisTest( - "requeued entry retains its original createdAt and pops next (oldest-first by createdAt)", + "requeued entry pops next (RPUSH to the RPOP/tail end), preserving FIFO", { timeout: 20_000 }, async ({ redisContainer }) => { - // Score == createdAtMicros; requeue does not bump the score. The - // oldest entry continues to pop first across retries. `maxAttempts` - // in the drainer bounds the retry loop for a persistently failing - // entry (after which it goes to the `fail` path, not requeue). + // LIST FIFO: accept LPUSHes at the head, pop RPOPs from the tail, so + // the first-accepted entry pops first. requeue RPUSHes back to the + // tail, so a transiently failed entry pops next rather than going to + // the back. `maxAttempts` in the drainer bounds the retry loop for a + // persistently failing entry (after which it goes to `fail`, not requeue). const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), @@ -577,9 +579,7 @@ describe("MollifierBuffer.requeue ordering", () => { try { await buffer.accept({ runId: "a", envId: "env_a", orgId: "org_1", payload: "{}" }); - await new Promise((r) => setTimeout(r, 2)); await buffer.accept({ runId: "b", envId: "env_a", orgId: "org_1", payload: "{}" }); - await new Promise((r) => setTimeout(r, 2)); await buffer.accept({ runId: "c", envId: "env_a", orgId: "org_1", payload: "{}" }); const first = await buffer.pop("env_a"); @@ -587,7 +587,7 @@ describe("MollifierBuffer.requeue ordering", () => { await buffer.requeue("a"); - // a still has the smallest createdAtMicros → pops next. + // a was RPUSHed back to the tail → pops next, ahead of b and c. const next = await buffer.pop("env_a"); expect(next!.runId).toBe("a"); const after = await buffer.pop("env_a"); @@ -2045,9 +2045,9 @@ describe("MollifierBuffer.mutateSnapshot", () => { ); }); -describe("MollifierBuffer ZSET storage", () => { +describe("MollifierBuffer LIST storage", () => { redisTest( - "queue key is a ZSET scored by entry's createdAtMicros", + "queue key is a LIST; createdAtMicros is a hash field, not a sort key", { timeout: 20_000 }, async ({ redisContainer }) => { const buffer = new MollifierBuffer({ @@ -2062,24 +2062,21 @@ describe("MollifierBuffer ZSET storage", () => { try { await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" }); - // ZSET-only commands must succeed against the queue key. - const card = await buffer["redis"].zcard("mollifier:queue:env_z"); - expect(card).toBe(1); + // LIST-only commands must succeed against the queue key. + const len = await buffer["redis"].llen("mollifier:queue:env_z"); + expect(len).toBe(1); + const members = await buffer["redis"].lrange("mollifier:queue:env_z", 0, -1); + expect(members).toEqual(["z1"]); - const score = await buffer["redis"].zscore("mollifier:queue:env_z", "z1"); - expect(score).not.toBeNull(); - const scoreNum = Number(score); - expect(Number.isFinite(scoreNum)).toBe(true); + // The queue holds no score — it's not a ZSET. + await expect(buffer["redis"].zscore("mollifier:queue:env_z", "z1")).rejects.toThrow(); - // Score matches the entry hash's createdAtMicros field. - const micros = await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros"); - expect(micros).not.toBeNull(); - expect(Number(micros)).toBe(scoreNum); - - // Score is plausibly recent (within last minute as microseconds). + // createdAtMicros lives on the entry hash (for dwell metrics) and + // is plausibly recent (within the last minute, as microseconds). + const micros = Number(await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros")); const nowMicros = Date.now() * 1000; - expect(scoreNum).toBeGreaterThan(nowMicros - 60_000_000); - expect(scoreNum).toBeLessThanOrEqual(nowMicros + 1_000_000); + expect(micros).toBeGreaterThan(nowMicros - 60_000_000); + expect(micros).toBeLessThanOrEqual(nowMicros + 1_000_000); } finally { await buffer.close(); } @@ -2087,7 +2084,7 @@ describe("MollifierBuffer ZSET storage", () => { ); redisTest( - "pop returns entries in ascending createdAtMicros order (FIFO by time, not by member)", + "pop returns entries in FIFO insertion order (independent of member lex order)", { timeout: 20_000 }, async ({ redisContainer }) => { const buffer = new MollifierBuffer({ @@ -2100,11 +2097,10 @@ describe("MollifierBuffer ZSET storage", () => { }); try { - // Insert runIds in reverse-lex order to prove ordering is by score, not member. + // Accept in reverse-lex order to prove ordering is by insertion + // (LPUSH head / RPOP tail), not by member value. await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" }); - await new Promise((r) => setTimeout(r, 5)); await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" }); - await new Promise((r) => setTimeout(r, 5)); await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" }); const first = await buffer.pop("env_o"); @@ -2120,7 +2116,7 @@ describe("MollifierBuffer ZSET storage", () => { ); redisTest( - "requeue keeps original score; createdAt is immutable across retries", + "requeue re-enqueues to the LIST; createdAt is immutable across retries", { timeout: 20_000 }, async ({ redisContainer }) => { const buffer = new MollifierBuffer({ @@ -2134,24 +2130,17 @@ describe("MollifierBuffer ZSET storage", () => { try { await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" }); - const originalScore = Number( - await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"), - ); - const originalMicros = Number( - await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"), - ); + const originalMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"); await buffer.pop("env_rq"); - await new Promise((r) => setTimeout(r, 5)); + // Queue is empty after the pop. + expect(await buffer["redis"].llen("mollifier:queue:env_rq")).toBe(0); + await buffer.requeue("rq"); - const newScore = Number( - await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"), - ); - const newMicros = Number( - await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"), - ); - expect(newScore).toBe(originalScore); + // Back on the LIST, and createdAtMicros is unchanged. + expect(await buffer["redis"].lrange("mollifier:queue:env_rq", 0, -1)).toEqual(["rq"]); + const newMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"); expect(newMicros).toBe(originalMicros); } finally { await buffer.close(); @@ -2234,181 +2223,6 @@ describe("MollifierBuffer.listEntriesForEnv", () => { }); }); -describe("MollifierBuffer.listForEnvWithWatermark", () => { - // Seed a QUEUED entry, then pin its ZSET score and hash `createdAtMicros` - // to a deterministic value so ordering and the watermark cursor don't - // depend on wall-clock timing (Date.now() ties within a millisecond). - async function seed(buffer: MollifierBuffer, envId: string, runId: string, micros: number) { - await buffer.accept({ runId, envId, orgId: "org_1", payload: "{}" }); - await buffer["redis"].zadd(`mollifier:queue:${envId}`, String(micros), runId); - await buffer["redis"].hset(`mollifier:entries:${runId}`, "createdAtMicros", String(micros)); - } - - redisTest("pageSize <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - logger: new Logger("test", "log"), - }); - try { - expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 0 })).toEqual([]); - expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: -3 })).toEqual([]); - } finally { - await buffer.close(); - } - }); - - redisTest( - "page 1 returns newest-first up to pageSize without consuming entries", - { timeout: 20_000 }, - async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - logger: new Logger("test", "log"), - }); - try { - await seed(buffer, "env_w", "wa", 1000); - await seed(buffer, "env_w", "wb", 2000); - await seed(buffer, "env_w", "wc", 3000); - - const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 }); - // Newest-first by createdAtMicros. - expect(page.map((e) => e.runId)).toEqual(["wc", "wb"]); - - // Non-destructive: drainer still pops oldest-first. - const popped: string[] = []; - for (let i = 0; i < 3; i++) { - const e = await buffer.pop("env_w"); - if (e) popped.push(e.runId); - } - expect(popped).toEqual(["wa", "wb", "wc"]); - } finally { - await buffer.close(); - } - }, - ); - - redisTest( - "page N continues strictly below the watermark score", - { timeout: 20_000 }, - async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - logger: new Logger("test", "log"), - }); - try { - await seed(buffer, "env_w", "wa", 1000); - await seed(buffer, "env_w", "wb", 2000); - await seed(buffer, "env_w", "wc", 3000); - - const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 }); - expect(page1.map((e) => e.runId)).toEqual(["wc", "wb"]); - - const last = page1[page1.length - 1]!; - const page2 = await buffer.listForEnvWithWatermark({ - envId: "env_w", - pageSize: 2, - watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId }, - }); - // Only the entry strictly below score 2000 remains; no overlap. - expect(page2.map((e) => e.runId)).toEqual(["wa"]); - } finally { - await buffer.close(); - } - }, - ); - - redisTest( - "tied-score watermark surfaces lex-smaller members on the next page without dupes", - { timeout: 20_000 }, - async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - logger: new Logger("test", "log"), - }); - try { - // Three entries share one score; ZSET breaks the tie by member, - // and zrevrangebyscore returns them member-DESC: tc, tb, ta. - await seed(buffer, "env_w", "ta", 2000); - await seed(buffer, "env_w", "tb", 2000); - await seed(buffer, "env_w", "tc", 2000); - - const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 }); - expect(page1.map((e) => e.runId)).toEqual(["tc", "tb"]); - - const last = page1[page1.length - 1]!; - const page2 = await buffer.listForEnvWithWatermark({ - envId: "env_w", - pageSize: 2, - watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId }, - }); - // Same score, lex-smaller than the "tb" anchor — and not "tb" - // itself (no duplicate across the page boundary). - expect(page2.map((e) => e.runId)).toEqual(["ta"]); - } finally { - await buffer.close(); - } - }, - ); - - redisTest( - "skips orphan queue references (entry hash gone) during listing", - { timeout: 20_000 }, - async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - logger: new Logger("test", "log"), - }); - try { - await seed(buffer, "env_w", "live", 1000); - await seed(buffer, "env_w", "orphan", 2000); - // Drop the orphan's hash but leave its queue ref behind. - await buffer["redis"].del("mollifier:entries:orphan"); - - const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 10 }); - expect(page.map((e) => e.runId)).toEqual(["live"]); - } finally { - await buffer.close(); - } - }, - ); - - redisTest("returns empty for an env with no queued entries", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - logger: new Logger("test", "log"), - }); - try { - expect(await buffer.listForEnvWithWatermark({ envId: "env_empty_w", pageSize: 10 })).toEqual([]); - } finally { - await buffer.close(); - } - }); -}); - // Composite-key safety. The Redis-key builders concatenate // `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without // per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"` diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index e8e413fd5dd..2c2f0cc5fa7 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -135,12 +135,10 @@ export class MollifierBuffer { const orgsKey = "mollifier:orgs"; const nowMs = Date.now(); const createdAt = new Date(nowMs).toISOString(); - // Microsecond epoch. JS only has millisecond precision, so multiple - // accepts in the same ms share a score; ZSET ties resolve by member - // (runId) lex order, which is deterministic and acceptable for FIFO - // pop. The hash carries the same value as `createdAtMicros` so the - // listing helper (Phase E) can read a stable per-run timestamp - // without re-fetching the score. + // Microsecond epoch, stored as a hash field for dwell-time metrics + // (stale sweep, drainer dwell span). FIFO ordering comes from the + // LIST itself (LPUSH head / RPOP tail), not from this value — it is + // no longer a queue sort key. const createdAtMicros = nowMs * 1000; const idempotencyLookupKey = input.idempotencyKey && input.taskIdentifier @@ -231,86 +229,16 @@ export class MollifierBuffer { return this.redis.smembers(`mollifier:org-envs:${orgId}`); } - // Paginated read of currently-queued entries newest-first, bounded by - // an optional `(createdAtMicros, runId)` watermark. Q1 listing design. - // Returns hydrated `BufferEntry` rows up to `pageSize`. Skips orphans - // (queue ref without an entry hash) silently. Non-destructive — the - // drainer keeps popping these entries in createdAt order regardless. - async listForEnvWithWatermark(input: { - envId: string; - watermark?: { createdAtMicros: number; runId: string }; - pageSize: number; - }): Promise { - if (input.pageSize <= 0) return []; - const queueKey = `mollifier:queue:${input.envId}`; - - let runIds: string[]; - if (!input.watermark) { - // Page 1 — newest first. - runIds = await this.redis.zrevrangebyscore( - queueKey, - "+inf", - "-inf", - "LIMIT", - 0, - input.pageSize, - ); - } else { - // Page N — strictly below the watermark score. - const belowScore = await this.redis.zrevrangebyscore( - queueKey, - `(${input.watermark.createdAtMicros}`, - "-inf", - "LIMIT", - 0, - input.pageSize, - ); - runIds = belowScore; - // Tied-score scan: ZSET ties broken by member-DESC, so entries - // sharing the watermark score with a lex-smaller runId still - // need to surface. Cheap second range over the tied band. - if (belowScore.length < input.pageSize) { - const remaining = input.pageSize - belowScore.length; - const tied = await this.redis.zrangebyscore( - queueKey, - input.watermark.createdAtMicros, - input.watermark.createdAtMicros, - ); - // Filter to runIds lex-less than the watermark anchor, sort - // member-DESC, take `remaining`. - const tiedFiltered = tied - .filter((r) => r < input.watermark!.runId) - .sort((a, b) => (a < b ? 1 : a > b ? -1 : 0)) - .slice(0, remaining); - runIds = [...belowScore, ...tiedFiltered]; - } - } - - if (runIds.length === 0) return []; - - // Parallel HGETALL — one round-trip per entry, all in flight. - const fetched = await Promise.all( - runIds.map((runId) => this.redis.hgetall(`mollifier:entries:${runId}`)), - ); - const entries: BufferEntry[] = []; - for (const value of fetched) { - if (!value || Object.keys(value).length === 0) continue; - const parsed = BufferEntrySchema.safeParse(value); - if (parsed.success) entries.push(parsed.data); - } - return entries; - } - - // Read-only listing of currently-queued entries for a single env. Used by - // the dashboard's "Recently queued" surface — non-destructive, so the - // drainer still pops these entries in order. Returns up to `maxCount` - // entries newest-first (highest score, which is `createdAtMicros`). - // Each entry hash is fetched separately; a `null` from getEntry (entry - // torn down by a concurrent drainer ack/fail between ZREVRANGE and - // HGETALL) is skipped. + // Read-only enumeration of currently-queued entries for a single env. + // Used by the stale-sweep to compute per-entry dwell time, so order is + // immaterial — LRANGE returns them newest-first (LPUSH head) but the + // caller scans the whole window. Non-destructive: the drainer still + // RPOPs these entries in FIFO order. Each entry hash is fetched + // separately; a `null` from getEntry (entry torn down by a concurrent + // drainer ack/fail between LRANGE and HGETALL) is skipped. async listEntriesForEnv(envId: string, maxCount: number): Promise { if (maxCount <= 0) return []; - const runIds = await this.redis.zrevrange( + const runIds = await this.redis.lrange( `mollifier:queue:${envId}`, 0, maxCount - 1, @@ -613,12 +541,11 @@ export class MollifierBuffer { -- eviction would lose runs with no customer-visible signal. -- Memory pressure from an offline drainer is the alertable -- failure mode instead; see _ops/mollifier-ops.md. - -- ZSET keyed by createdAtMicros: ZPOPMIN drains oldest-first - -- (FIFO); listing pagination uses ZREVRANGEBYSCORE with a - -- (createdAt, runId) cursor anchor. Score is stable across the - -- entry's lifecycle — requeue does not bump it (see Phase 3b / - -- Q1 design). - redis.call('ZADD', queueKey, createdAtMicros, runId) + -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so + -- insertion order == drain order (FIFO). createdAtMicros is kept + -- as a hash field for dwell metrics only — it is no longer a sort + -- key now that the buffer has no list/pagination surface. + redis.call('LPUSH', queueKey, runId) -- Org-level membership: maintained atomically with the per-env -- queue so the drainer can walk orgs → envs-for-org and -- schedule one env per org per tick. SADDs are idempotent if the @@ -640,8 +567,7 @@ export class MollifierBuffer { local envId = redis.call('HGET', entryKey, 'envId') local orgId = redis.call('HGET', entryKey, 'orgId') - local createdAtMicros = redis.call('HGET', entryKey, 'createdAtMicros') - if not envId or not createdAtMicros then + if not envId then return 0 end @@ -649,11 +575,12 @@ export class MollifierBuffer { local nextAttempts = tonumber(currentAttempts or '0') + 1 redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts)) - -- Requeue re-adds with the ORIGINAL createdAtMicros score. - -- createdAt is immutable across retries (Phase 3b decision). - -- The drainer's maxAttempts caps the retry loop so a poisoned - -- entry doesn't head-of-line forever. - redis.call('ZADD', queuePrefix .. envId, tonumber(createdAtMicros), runId) + -- Requeue RPUSHes to the tail (the RPOP end) so a transiently + -- failed entry pops next rather than going to the back of the + -- line behind a fresh backlog. createdAt is immutable across + -- retries (Phase 3b decision); the drainer's maxAttempts caps the + -- retry loop so a poisoned entry doesn't head-of-line forever. + redis.call('RPUSH', queuePrefix .. envId, runId) -- Re-track the org/env: pop may have SREM'd them when the queue -- last emptied. SADDs are idempotent if the values are still -- present. @@ -694,9 +621,8 @@ export class MollifierBuffer { -- partial hash without a TTL, leaking memory. The loop is bounded -- by queue length; entire Lua script remains atomic. while true do - -- ZPOPMIN returns {member, score} as a flat array, or {} when empty. - local popped = redis.call('ZPOPMIN', queueKey) - local runId = popped[1] + -- RPOP returns the tail member (oldest, FIFO), or false when empty. + local runId = redis.call('RPOP', queueKey) if not runId then -- Queue is empty AND we have no entry to read orgId from, so -- skip org-level cleanup. Stale org-envs entries are bounded @@ -713,9 +639,9 @@ export class MollifierBuffer { result[raw[i]] = raw[i + 1] end -- Prune org-level membership if this pop drained the queue. - -- Atomic with the ZPOPMIN above — a concurrent accept AFTER - -- this script will SADD both back along with its ZADD. - if redis.call('ZCARD', queueKey) == 0 then + -- Atomic with the RPOP above — a concurrent accept AFTER + -- this script will SADD both back along with its LPUSH. + if redis.call('LLEN', queueKey) == 0 then pruneOrgMembership(result['orgId']) end return cjson.encode(result) From 8bfa7419cda4bfd32866118b90eefae0dc7392af Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 15:42:19 +0100 Subject: [PATCH 7/7] fix(redis-worker): self-heal stale idempotency lookups + default createdAtMicros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address AI review findings on PR #3752: - accept(): if an idempotency lookup survives its entry hash being evicted (maxmemory), the lookup is stale — rebind to the new run instead of returning a dead existingRunId that blocks the key forever. Mirrors the self-heal lookupIdempotency already does. (CodeRabbit) - lookupIdempotency(): clear a stale lookup with a compare-and-delete (delMollifierKeyIfEquals Lua) so a concurrent accept that rebinds the key between our GET and DEL isn't clobbered. (CodeRabbit) - schemas: default createdAtMicros to "0" so an entry written before the field existed (or surviving across the deploy that added it) still parses on pop instead of being silently dropped. (Devin) - rename the requeue-ordering test to "retry priority" — RPUSH-to-tail pops the requeued entry ahead of newer items; that's deliberate retry priority, not FIFO relative to the rest of the queue. (CodeRabbit nit) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../redis-worker/src/mollifier/buffer.test.ts | 102 +++++++++++++++++- packages/redis-worker/src/mollifier/buffer.ts | 39 ++++++- .../redis-worker/src/mollifier/schemas.ts | 9 +- 3 files changed, 138 insertions(+), 12 deletions(-) diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index 4af7575e252..00d618c0498 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -30,6 +30,24 @@ describe("schemas", () => { expect(parsed.createdAtMicros).toBe(1747044000000000); }); + it("BufferEntrySchema defaults createdAtMicros for entries written before the field existed", () => { + // Backward compat: an entry written by an accept Lua predating + // createdAtMicros (only the original 7 fields) must still parse on + // pop rather than being silently dropped. + const raw = { + runId: "run_old", + envId: "env_1", + orgId: "org_1", + payload: serialiseSnapshot({}), + status: "QUEUED", + attempts: "0", + createdAt: "2026-05-11T10:00:00.000Z", + // no createdAtMicros + }; + const parsed = BufferEntrySchema.parse(raw); + expect(parsed.createdAtMicros).toBe(0); + }); + it("BufferEntrySchema parses a FAILED entry with lastError", () => { const raw = { runId: "run_abc", @@ -560,13 +578,15 @@ describe("MollifierBuffer.requeue on missing entry", () => { describe("MollifierBuffer.requeue ordering", () => { redisTest( - "requeued entry pops next (RPUSH to the RPOP/tail end), preserving FIFO", + "requeued entry gets retry priority (RPUSH to the RPOP/tail end), popping ahead of newer items", { timeout: 20_000 }, async ({ redisContainer }) => { - // LIST FIFO: accept LPUSHes at the head, pop RPOPs from the tail, so - // the first-accepted entry pops first. requeue RPUSHes back to the - // tail, so a transiently failed entry pops next rather than going to - // the back. `maxAttempts` in the drainer bounds the retry loop for a + // LIST: accept LPUSHes at the head, pop RPOPs from the tail, so the + // first-accepted entry pops first. requeue RPUSHes back to the tail, + // giving a transiently failed entry *retry priority* — it pops next, + // ahead of newer queued items, rather than going to the back. (This + // is deliberately not FIFO relative to the rest of the queue.) + // `maxAttempts` in the drainer bounds the retry loop for a // persistently failing entry (after which it goes to `fail`, not requeue). const buffer = new MollifierBuffer({ redisOptions: { @@ -1486,6 +1506,78 @@ describe("MollifierBuffer idempotency lookup", () => { } }, ); + + redisTest( + "accept self-heals a stale lookup: a new run rebinds when the bound entry was evicted", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // If an entry hash is evicted (maxmemory) but its idempotency lookup + // survives, a fresh accept with the same key must NOT return the dead + // runId (which would block the key forever) — it should rebind to the + // new run and accept it. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + const idem = { idempotencyKey: "kheal", taskIdentifier: "t" }; + try { + await buffer.accept({ runId: "heal_old", envId: "env_h", orgId: "org_1", payload: "{}", ...idem }); + // Simulate eviction of the entry hash while the lookup survives. + await buffer["redis"].del("mollifier:entries:heal_old"); + const lookupKey = idempotencyLookupKeyFor({ envId: "env_h", ...idem }); + expect(await buffer["redis"].get(lookupKey)).toBe("heal_old"); + + // A fresh accept with the same key rebinds rather than deduping + // onto the dead run. + const result = await buffer.accept({ + runId: "heal_new", + envId: "env_h", + orgId: "org_1", + payload: "{}", + ...idem, + }); + expect(result).toEqual({ kind: "accepted" }); + expect(await buffer["redis"].get(lookupKey)).toBe("heal_new"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "accept still dedups when the bound entry is live", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The self-heal must not weaken normal dedup: a live bound entry + // still wins, and the loser gets its runId back. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + const idem = { idempotencyKey: "klive", taskIdentifier: "t" }; + try { + await buffer.accept({ runId: "live_win", envId: "env_h", orgId: "org_1", payload: "{}", ...idem }); + const result = await buffer.accept({ + runId: "live_lose", + envId: "env_h", + orgId: "org_1", + payload: "{}", + ...idem, + }); + expect(result).toEqual({ kind: "duplicate_idempotency", existingRunId: "live_win" }); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer.casSetMetadata", () => { diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index 2c2f0cc5fa7..d5492e745d5 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -160,6 +160,7 @@ export class MollifierBuffer { String(createdAtMicros), "mollifier:org-envs:", idempotencyLookupKey, + "mollifier:entries:", ); // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId // (duplicate idempotency — value is the existing winner's runId). @@ -391,15 +392,17 @@ export class MollifierBuffer { // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check // misses — same key may belong to a buffered run waiting to drain. The - // lookup self-heals: if the lookup points at an entry hash that's - // expired, we DEL the lookup and report a miss. + // lookup self-heals: if the lookup points at an entry hash that's gone, + // we clear the lookup and report a miss. The clear is a compare-and- + // delete (only if the key still holds the stale runId we observed) so a + // fresh accept that rebinds the key between our GET and DEL isn't wiped. async lookupIdempotency(input: IdempotencyLookupInput): Promise { const lookupKey = idempotencyLookupKeyFor(input); const runId = await this.redis.get(lookupKey); if (!runId) return null; const entry = await this.getEntry(runId); if (!entry) { - await this.redis.del(lookupKey); + await this.redis.delMollifierKeyIfEquals(lookupKey, runId); return null; } return runId; @@ -502,6 +505,7 @@ export class MollifierBuffer { local createdAtMicros = ARGV[6] local orgEnvsPrefix = ARGV[7] local idempotencyLookupKey = ARGV[8] or '' + local entryPrefix = ARGV[9] -- Idempotent: refuse if an entry for this runId already exists in any -- state. Caller-side dedup is also enforced via API idempotency keys, @@ -519,7 +523,14 @@ export class MollifierBuffer { if idempotencyLookupKey ~= '' then local existing = redis.call('GET', idempotencyLookupKey) if existing then - return existing + -- Self-heal: only honour the binding if its entry hash still + -- exists. If the entry was evicted (maxmemory) but the lookup + -- survived, the binding is stale — fall through and rebind to + -- this run rather than returning a dead runId that would block + -- the key indefinitely. Mirrors lookupIdempotency's self-heal. + if redis.call('EXISTS', entryPrefix .. existing) == 1 then + return existing + end end redis.call('SET', idempotencyLookupKey, runId) end @@ -935,6 +946,20 @@ export class MollifierBuffer { `, }); + // Compare-and-delete: DEL the key only if it still holds the expected + // value. Used by lookupIdempotency's stale-lookup self-heal so a + // concurrent accept that rebinds the key between the reader's GET and + // this DEL isn't clobbered. + this.redis.defineCommand("delMollifierKeyIfEquals", { + numberOfKeys: 1, + lua: ` + if redis.call('GET', KEYS[1]) == ARGV[1] then + return redis.call('DEL', KEYS[1]) + end + return 0 + `, + }); + this.redis.defineCommand("mollifierEvaluateTrip", { numberOfKeys: 2, lua: ` @@ -974,6 +999,7 @@ declare module "@internal/redis" { createdAtMicros: string, orgEnvsPrefix: string, idempotencyLookupKey: string, + entryPrefix: string, callback?: Callback, ): Result; popAndMarkDraining( @@ -1039,6 +1065,11 @@ declare module "@internal/redis" { errorPayload: string, callback?: Callback, ): Result; + delMollifierKeyIfEquals( + key: string, + expected: string, + callback?: Callback, + ): Result; mollifierEvaluateTrip( rateKey: string, trippedKey: string, diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts index c5d9915575a..7a2861a63a5 100644 --- a/packages/redis-worker/src/mollifier/schemas.ts +++ b/packages/redis-worker/src/mollifier/schemas.ts @@ -48,9 +48,12 @@ export const BufferEntrySchema = z.object({ status: BufferEntryStatus, attempts: stringToInt, createdAt: stringToDate, - // Microsecond epoch matching the ZSET queue score. Stable across - // requeues — the score never moves once set at accept time. - createdAtMicros: stringToInt, + // Microsecond epoch of accept time, kept as a hash field for dwell + // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted + // so an entry written by an accept Lua predating this field — or one + // surviving across the deploy that introduced it — still parses instead + // of being silently dropped on pop. + createdAtMicros: stringToInt.default("0"), // Drainer-ack flag: `true` once the drainer has materialised this run // into PG. The hash persists for a short grace TTL after ack so direct // reads (retrieve, trace, etc.) still resolve while PG replica lag