From 1cd2da67f204c3e5311199a3d955217a356252b5 Mon Sep 17 00:00:00 2001
From: Dan Sutton <dansutton@trigger.dev>
Date: Tue, 26 May 2026 11:06:11 +0100
Subject: [PATCH 1/7] feat(redis-worker,webapp): mollifier buffer extensions +
 snapshot type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the buffer-side data layer used by phase-3 work:
- buffer.ts gains entry inspection (getEntry), idempotency lookup
  (lookupIdempotency), in-place snapshot mutation (mutateSnapshot),
  and dwell tracking — all atomic via Lua.
- snapshot.server.ts: shared MollifierSnapshot type + (de)serialise.
- Drops the entry-TTL config — the drainer is the recovery mechanism.

Adds methods to the buffer interface; nothing consumes them yet.
Subsequent PRs in the stack wire trigger-time mollify, read-fallback,
and mutation paths against this surface.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .changeset/mollifier-buffer-extensions.md     |    6 +
 apps/webapp/app/env.server.ts                 |    1 -
 .../v3/mollifier/mollifierBuffer.server.ts    |    1 -
 .../v3/mollifier/mollifierSnapshot.server.ts  |   16 +
 .../redis-worker/src/mollifier/buffer.test.ts | 1221 +++++++++++++++--
 packages/redis-worker/src/mollifier/buffer.ts |  646 ++++++++-
 packages/redis-worker/src/mollifier/index.ts  |   11 +-
 .../redis-worker/src/mollifier/schemas.ts     |   22 +
 8 files changed, 1786 insertions(+), 138 deletions(-)
 create mode 100644 .changeset/mollifier-buffer-extensions.md
 create mode 100644 apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts

diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md
new file mode 100644
index 00000000000..b1f38f51ecc
--- /dev/null
+++ b/.changeset/mollifier-buffer-extensions.md
@@ -0,0 +1,6 @@
+---
+"@trigger.dev/redis-worker": minor
+"@trigger.dev/core": patch
+---
+
+Mollifier buffer feature set built on top of the initial primitives: idempotency-lookup with SETNX dedup, atomic snapshot-mutation API (`mutateSnapshot` with tag/metadata/delay/cancel patches), metadata CAS for lossless concurrent updates, watermark-paginated listing, claim primitives for pre-gate idempotency, ZSET-backed per-env queue, 30s post-ack grace TTL, and drop the accept-time entry TTL (drainer is now the only removal mechanism). `@trigger.dev/core` gains an optional `notice` field on the trigger response so the SDK can surface mollifier-queued guidance to customers.
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
index 97b8333e279..9e4743d740f 100644
--- a/apps/webapp/app/env.server.ts
+++ b/apps/webapp/app/env.server.ts
@@ -1095,7 +1095,6 @@ const EnvironmentSchema = z
     TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().positive().default(100),
     TRIGGER_MOLLIFIER_HOLD_MS: z.coerce.number().int().positive().default(500),
     TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY: z.coerce.number().int().positive().default(50),
-    TRIGGER_MOLLIFIER_ENTRY_TTL_S: z.coerce.number().int().positive().default(600),
     TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3),
     TRIGGER_MOLLIFIER_DRAIN_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().positive().default(30_000),
     TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK: z.coerce.number().int().positive().default(500),
diff --git a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts
index 9c8917623e4..09b52aa9da3 100644
--- a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts
+++ b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts
@@ -22,7 +22,6 @@ function initializeMollifierBuffer(): MollifierBuffer {
       enableAutoPipelining: true,
       ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
     },
-    entryTtlSeconds: env.TRIGGER_MOLLIFIER_ENTRY_TTL_S,
   });
 }
 
diff --git a/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts
new file mode 100644
index 00000000000..a0732a3542e
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts
@@ -0,0 +1,16 @@
+import { serialiseSnapshot, deserialiseSnapshot } from "@trigger.dev/redis-worker";
+
+// MollifierSnapshot is the JSON-serialisable shape of the input that would be
+// passed to engine.trigger(). The drainer deserialises and replays it.
+// Kept as Record<string, unknown> at this layer — the engine.trigger call site
+// casts it to the engine's typed input. This keeps the mollifier subdirectory
+// from depending on @internal/run-engine internals.
+export type MollifierSnapshot = Record<string, unknown>;
+
+export function serialiseMollifierSnapshot(input: MollifierSnapshot): string {
+  return serialiseSnapshot(input);
+}
+
+export function deserialiseMollifierSnapshot(serialised: string): MollifierSnapshot {
+  return deserialiseSnapshot<MollifierSnapshot>(serialised);
+}
diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
index c8f7b95c97a..a4c1be35eb3 100644
--- a/packages/redis-worker/src/mollifier/buffer.test.ts
+++ b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -20,12 +20,14 @@ describe("schemas", () => {
       status: "QUEUED",
       attempts: "0",
       createdAt: "2026-05-11T10:00:00.000Z",
+      createdAtMicros: "1747044000000000",
     };
     const parsed = BufferEntrySchema.parse(raw);
     expect(parsed.runId).toBe("run_abc");
     expect(parsed.status).toBe("QUEUED");
     expect(parsed.attempts).toBe(0);
     expect(parsed.createdAt).toBeInstanceOf(Date);
+    expect(parsed.createdAtMicros).toBe(1747044000000000);
   });
 
   it("BufferEntrySchema parses a FAILED entry with lastError", () => {
@@ -37,6 +39,7 @@ describe("schemas", () => {
       status: "FAILED",
       attempts: "3",
       createdAt: "2026-05-11T10:00:00.000Z",
+      createdAtMicros: "1747044000000000",
       lastError: JSON.stringify({ code: "P2024", message: "connection lost" }),
     };
     const parsed = BufferEntrySchema.parse(raw);
@@ -52,7 +55,6 @@ describe("MollifierBuffer construction", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -68,7 +70,6 @@ describe("MollifierBuffer.accept", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -105,7 +106,6 @@ describe("MollifierBuffer.pop", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -132,7 +132,6 @@ describe("MollifierBuffer.pop", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -151,7 +150,6 @@ describe("MollifierBuffer.pop", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -169,24 +167,56 @@ describe("MollifierBuffer.pop", () => {
 });
 
 describe("MollifierBuffer.ack", () => {
-  redisTest("ack deletes the entry", { timeout: 20_000 }, async ({ redisContainer }) => {
+  redisTest(
+    "ack marks entry materialised and applies the grace TTL — entry persists as a read-fallback safety net",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.pop("env_a");
+        await buffer.ack("run_x");
+
+        const after = await buffer.getEntry("run_x");
+        expect(after).not.toBeNull();
+        expect(after!.materialised).toBe(true);
+
+        // ack grace TTL is the only context where an entry hash gets
+        // an EXPIRE — accept no longer sets one. Should be at most 30s.
+        const ttl = await buffer.getEntryTtlSeconds("run_x");
+        expect(ttl).toBeGreaterThan(0);
+        expect(ttl).toBeLessThanOrEqual(30);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest("ack on missing entry is a no-op", { timeout: 20_000 }, async ({ redisContainer }) => {
     const buffer = new MollifierBuffer({
       redisOptions: {
         host: redisContainer.getHost(),
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
     try {
-      await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" });
-      await buffer.pop("env_a");
-      await buffer.ack("run_x");
-
-      const after = await buffer.getEntry("run_x");
-      expect(after).toBeNull();
+      await buffer.ack("run_ghost");
+      const stored = await buffer.getEntry("run_ghost");
+      expect(stored).toBeNull();
+      // Critical: no partial hash created.
+      const raw = await buffer["redis"].hgetall("mollifier:entries:run_ghost");
+      expect(Object.keys(raw)).toHaveLength(0);
     } finally {
       await buffer.close();
     }
@@ -204,13 +234,12 @@ describe("MollifierBuffer.pop orphan handling", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
         // Simulate a TTL-expired orphan: queue ref exists, entry hash does not.
-        await buffer["redis"].lpush("mollifier:queue:env_a", "run_orphan");
+        await buffer["redis"].zadd("mollifier:queue:env_a", 1, "run_orphan");
 
         const popped = await buffer.pop("env_a");
         expect(popped).toBeNull();
@@ -220,7 +249,7 @@ describe("MollifierBuffer.pop orphan handling", () => {
         expect(Object.keys(raw)).toHaveLength(0);
 
         // Queue is drained — the loop pops orphans until empty.
-        const qLen = await buffer["redis"].llen("mollifier:queue:env_a");
+        const qLen = await buffer["redis"].zcard("mollifier:queue:env_a");
         expect(qLen).toBe(0);
       } finally {
         await buffer.close();
@@ -238,17 +267,16 @@ describe("MollifierBuffer.pop orphan handling", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
-        // Layout (oldest-first, since RPOP takes from tail): orphan, valid, orphan.
-        // LPUSH puts items at the head, so to get RPOP order [orphan_a, valid, orphan_b]
-        // we LPUSH in reverse: orphan_b first, then valid, then orphan_a.
-        await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b");
+        // Layout by score (lowest-first, since ZPOPMIN takes the min):
+        // orphan_a (score 1) → valid (score = its createdAtMicros, large) → orphan_b (score 1e18).
+        // First pop skips orphan_a, returns valid; orphan_b remains.
+        await buffer["redis"].zadd("mollifier:queue:env_a", 1, "orphan_a");
         await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" });
-        await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_a");
+        await buffer["redis"].zadd("mollifier:queue:env_a", 1e18, "orphan_b");
 
         const popped = await buffer.pop("env_a");
         expect(popped).not.toBeNull();
@@ -256,7 +284,7 @@ describe("MollifierBuffer.pop orphan handling", () => {
         expect(popped!.status).toBe("DRAINING");
 
         // The trailing orphan_b is still in the queue (single pop call).
-        const remaining = await buffer["redis"].llen("mollifier:queue:env_a");
+        const remaining = await buffer["redis"].zcard("mollifier:queue:env_a");
         expect(remaining).toBe(1);
 
         // A second pop drains the trailing orphan_b. The queue is now
@@ -283,7 +311,6 @@ describe("MollifierBuffer.requeue", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -305,30 +332,43 @@ describe("MollifierBuffer.requeue", () => {
 });
 
 describe("MollifierBuffer.fail", () => {
-  redisTest("fail transitions to FAILED and stores lastError", { timeout: 20_000 }, async ({ redisContainer }) => {
-    const buffer = new MollifierBuffer({
-      redisOptions: {
-        host: redisContainer.getHost(),
-        port: redisContainer.getPort(),
-        password: redisContainer.getPassword(),
-      },
-      entryTtlSeconds: 600,
-      logger: new Logger("test", "log"),
-    });
+  redisTest(
+    "fail returns true and tears the entry down (drainer-terminal cleanup)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Post-TTL-drop design: the drainer's createFailedTaskRun has
+      // already written a SYSTEM_FAILURE PG row by the time we call
+      // fail(), so the entry hash is no longer load-bearing. fail
+      // returns true and removes the entry; without this teardown
+      // failed entries would accrete forever now that there's no
+      // accept-time TTL. The Lua also DELs the idempotency lookup so
+      // future retries with the same key go through to PG instead of
+      // hitting an orphan dedup record.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
 
-    try {
-      await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" });
-      await buffer.pop("env_a");
-      const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" });
-      expect(failed).toBe(true);
+      try {
+        await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.pop("env_a");
+        const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" });
+        expect(failed).toBe(true);
 
-      const entry = await buffer.getEntry("run_f");
-      expect(entry!.status).toBe("FAILED");
-      expect(entry!.lastError).toEqual({ code: "VALIDATION", message: "boom" });
-    } finally {
-      await buffer.close();
-    }
-  });
+        // Entry hash is gone post-fail.
+        const entry = await buffer.getEntry("run_f");
+        expect(entry).toBeNull();
+        const raw = await buffer["redis"].hgetall("mollifier:entries:run_f");
+        expect(Object.keys(raw)).toHaveLength(0);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 
   redisTest(
     "fail on missing entry is a no-op (returns false; no partial hash created)",
@@ -340,7 +380,6 @@ describe("MollifierBuffer.fail", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -361,27 +400,35 @@ describe("MollifierBuffer.fail", () => {
 });
 
 describe("MollifierBuffer TTL", () => {
-  redisTest("entry has TTL applied on accept", { timeout: 20_000 }, async ({ redisContainer }) => {
-    const buffer = new MollifierBuffer({
-      redisOptions: {
-        host: redisContainer.getHost(),
-        port: redisContainer.getPort(),
-        password: redisContainer.getPassword(),
-      },
-      entryTtlSeconds: 600,
-      logger: new Logger("test", "log"),
-    });
+  redisTest(
+    "entry has NO TTL applied on accept — drainer is the only cleanup path",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Regression guard for the design change: buffer entries must
+      // persist until the drainer ACKs or FAILs them. An accept-time
+      // EXPIRE would re-introduce the silent-loss-when-drainer-offline
+      // failure mode that the stale-entry alerting pipeline depends on
+      // *not* happening.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
 
-    try {
-      await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" });
+      try {
+        await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" });
 
-      const ttl = await buffer.getEntryTtlSeconds("run_t");
-      expect(ttl).toBeGreaterThan(0);
-      expect(ttl).toBeLessThanOrEqual(600);
-    } finally {
-      await buffer.close();
-    }
-  });
+        // Redis returns -1 when the key exists but has no TTL set.
+        const ttl = await buffer.getEntryTtlSeconds("run_t");
+        expect(ttl).toBe(-1);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer payload encoding", () => {
@@ -395,7 +442,6 @@ describe("MollifierBuffer payload encoding", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -437,7 +483,6 @@ describe("MollifierBuffer.requeue on missing entry", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -458,22 +503,27 @@ describe("MollifierBuffer.requeue on missing entry", () => {
 
 describe("MollifierBuffer.requeue ordering", () => {
   redisTest(
-    "requeued entry is popped AFTER other queued entries on the same env (FIFO retry)",
+    "requeued entry retains its original createdAt and pops next (oldest-first by createdAt)",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
+      // Score == createdAtMicros; requeue does not bump the score. The
+      // oldest entry continues to pop first across retries. `maxAttempts`
+      // in the drainer bounds the retry loop for a persistently failing
+      // entry (after which it goes to the `fail` path, not requeue).
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
         await buffer.accept({ runId: "a", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await new Promise((r) => setTimeout(r, 2));
         await buffer.accept({ runId: "b", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await new Promise((r) => setTimeout(r, 2));
         await buffer.accept({ runId: "c", envId: "env_a", orgId: "org_1", payload: "{}" });
 
         const first = await buffer.pop("env_a");
@@ -481,12 +531,13 @@ describe("MollifierBuffer.requeue ordering", () => {
 
         await buffer.requeue("a");
 
+        // a still has the smallest createdAtMicros → pops next.
         const next = await buffer.pop("env_a");
-        expect(next!.runId).toBe("b");
+        expect(next!.runId).toBe("a");
         const after = await buffer.pop("env_a");
-        expect(after!.runId).toBe("c");
+        expect(after!.runId).toBe("b");
         const last = await buffer.pop("env_a");
-        expect(last!.runId).toBe("a");
+        expect(last!.runId).toBe("c");
       } finally {
         await buffer.close();
       }
@@ -508,7 +559,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -530,7 +580,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -557,7 +606,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -585,7 +633,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -610,7 +657,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -638,7 +684,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -671,7 +716,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -707,22 +751,21 @@ describe("MollifierBuffer entry lifecycle invariants", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
         await buffer.accept({ runId: "run_ttl", envId: "env_a", orgId: "org_1", payload: "{}" });
         const beforeTtl = await buffer.getEntryTtlSeconds("run_ttl");
-        expect(beforeTtl).toBeGreaterThan(0);
+        expect(beforeTtl).toBe(-1);
 
         await buffer.pop("env_a");
         const afterTtl = await buffer.getEntryTtlSeconds("run_ttl");
 
-        // TTL must still be present (>0). Redis returns -1 if the key has no
-        // TTL — that's the leak shape we're guarding against.
-        expect(afterTtl).toBeGreaterThan(0);
-        expect(afterTtl).toBeLessThanOrEqual(beforeTtl);
+        // No TTL applied at any point during accept/pop — the entry
+        // persists until the drainer ACKs or FAILs. Returning -1 from
+        // Redis here is the expected steady state, not a leak.
+        expect(afterTtl).toBe(-1);
       } finally {
         await buffer.close();
       }
@@ -739,7 +782,6 @@ describe("MollifierBuffer entry lifecycle invariants", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -795,7 +837,6 @@ describe("MollifierBuffer.accept idempotency", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -813,8 +854,8 @@ describe("MollifierBuffer.accept idempotency", () => {
           payload: serialiseSnapshot({ first: false }),
         });
 
-        expect(first).toBe(true);
-        expect(second).toBe(false);
+        expect(first).toEqual({ kind: "accepted" });
+        expect(second).toEqual({ kind: "duplicate_run_id" });
 
         // First payload preserved; second was a no-op.
         const stored = await buffer.getEntry("run_dup");
@@ -844,7 +885,6 @@ describe("MollifierBuffer.accept idempotency", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -855,7 +895,7 @@ describe("MollifierBuffer.accept idempotency", () => {
         expect(stored!.status).toBe("DRAINING");
 
         const dup = await buffer.accept({ runId: "run_dr", envId: "env_a", orgId: "org_1", payload: "{}" });
-        expect(dup).toBe(false);
+        expect(dup).toEqual({ kind: "duplicate_run_id" });
 
         const afterDup = await buffer.getEntry("run_dr");
         expect(afterDup!.status).toBe("DRAINING"); // unchanged
@@ -866,16 +906,21 @@ describe("MollifierBuffer.accept idempotency", () => {
   );
 
   redisTest(
-    "accept refused while existing entry is FAILED",
+    "runId slot is reclaimable after fail tears the entry down",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
+      // Post-TTL-drop design: fail() deletes the entry hash because
+      // the SYSTEM_FAILURE PG row is the canonical record of the
+      // failure. The runId slot is therefore free for a fresh accept
+      // afterwards — runIds are server-generated CUIDs and don't
+      // collide in practice, but the contract pinning here documents
+      // that a re-acceptance does NOT see a phantom "FAILED" entry.
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -883,15 +928,20 @@ describe("MollifierBuffer.accept idempotency", () => {
         await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" });
         await buffer.pop("env_a");
         await buffer.fail("run_fl", { code: "VALIDATION", message: "boom" });
-        const stored = await buffer.getEntry("run_fl");
-        expect(stored!.status).toBe("FAILED");
 
-        const dup = await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" });
-        expect(dup).toBe(false);
+        // Entry hash gone after fail (see "fail returns true and tears
+        // the entry down" — this test pins the accept-side effect).
+        expect(await buffer.getEntry("run_fl")).toBeNull();
 
-        const afterDup = await buffer.getEntry("run_fl");
-        expect(afterDup!.status).toBe("FAILED"); // unchanged
-        expect(afterDup!.lastError).toEqual({ code: "VALIDATION", message: "boom" });
+        const fresh = await buffer.accept({
+          runId: "run_fl",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: '{"fresh":true}',
+        });
+        expect(fresh).toEqual({ kind: "accepted" });
+        const after = await buffer.getEntry("run_fl");
+        expect(after?.status).toBe("QUEUED");
       } finally {
         await buffer.close();
       }
@@ -899,16 +949,21 @@ describe("MollifierBuffer.accept idempotency", () => {
   );
 
   redisTest(
-    "re-accept after ack works (terminal entry can be re-accepted)",
+    "accept refused while a previously-acked (materialised) entry is still inside its grace TTL",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
+      // After ack, the entry hash persists for the grace window as a
+      // read-fallback safety net (Q1 D2). RunIds are server-generated and
+      // never collide in practice, but defense-in-depth: accept refuses
+      // while *any* entry exists for the runId, including materialised
+      // ones. The entry hash's TTL is now ~30s instead of the original
+      // entryTtlSeconds.
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -922,7 +977,6 @@ describe("MollifierBuffer.accept idempotency", () => {
         await buffer.pop("env_a");
         await buffer.ack("run_x");
 
-        // Entry is gone — re-accept should succeed.
         const reAccept = await buffer.accept({
           runId: "run_x",
           envId: "env_a",
@@ -930,8 +984,11 @@ describe("MollifierBuffer.accept idempotency", () => {
           payload: "{}",
         });
 
-        expect(first).toBe(true);
-        expect(reAccept).toBe(true);
+        expect(first).toEqual({ kind: "accepted" });
+        expect(reAccept).toEqual({ kind: "duplicate_run_id" });
+
+        const stored = await buffer.getEntry("run_x");
+        expect(stored!.materialised).toBe(true);
       } finally {
         await buffer.close();
       }
@@ -950,7 +1007,6 @@ describe("MollifierBuffer envs set lifecycle", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -976,7 +1032,6 @@ describe("MollifierBuffer envs set lifecycle", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -1006,7 +1061,6 @@ describe("MollifierBuffer envs set lifecycle", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -1025,3 +1079,952 @@ describe("MollifierBuffer envs set lifecycle", () => {
     },
   );
 });
+
+describe("MollifierBuffer idempotency lookup", () => {
+  redisTest(
+    "accept with idempotencyKey + taskIdentifier writes the lookup with no TTL",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Post-TTL-drop design: the idempotency lookup has no TTL, so it
+      // can never expire ahead of the entry hash (which used to cause
+      // a dedup-drift bug — once the lookup expired but the entry
+      // didn't, a retry with the same key would create a *new*
+      // buffered run for the same key). The drainer's ack and fail
+      // both DEL the lookup as part of teardown.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const result = await buffer.accept({
+          runId: "ri1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ikey-1",
+          taskIdentifier: "my-task",
+        });
+        expect(result).toEqual({ kind: "accepted" });
+
+        const lookupKey = "mollifier:idempotency:env_i:my-task:ikey-1";
+        const stored = await buffer["redis"].get(lookupKey);
+        expect(stored).toBe("ri1");
+        // -1 = key exists with no TTL set.
+        expect(await buffer["redis"].ttl(lookupKey)).toBe(-1);
+
+        const entry = await buffer.getEntry("ri1");
+        expect(entry!.idempotencyLookupKey).toBe(lookupKey);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "second accept with same (env, task, idempotencyKey) returns duplicate_idempotency with the winner's runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const first = await buffer.accept({
+          runId: "ri-a",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ikey-2",
+          taskIdentifier: "my-task",
+        });
+        const second = await buffer.accept({
+          runId: "ri-b",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ikey-2",
+          taskIdentifier: "my-task",
+        });
+
+        expect(first).toEqual({ kind: "accepted" });
+        expect(second).toEqual({
+          kind: "duplicate_idempotency",
+          existingRunId: "ri-a",
+        });
+
+        // The loser's runId entry was never created.
+        const loserEntry = await buffer.getEntry("ri-b");
+        expect(loserEntry).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "lookupIdempotency hits when the run is buffered",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rl1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "k1",
+          taskIdentifier: "t",
+        });
+        const found = await buffer.lookupIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "k1",
+        });
+        expect(found).toBe("rl1");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "lookupIdempotency returns null when no lookup is bound",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const found = await buffer.lookupIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "absent",
+        });
+        expect(found).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "lookupIdempotency self-heals when the lookup points at an expired entry",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Plant a stale lookup pointing at a non-existent entry.
+        const lookupKey = "mollifier:idempotency:env_i:t:stale";
+        await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600);
+        expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale");
+
+        const found = await buffer.lookupIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "stale",
+        });
+        expect(found).toBeNull();
+        // Self-healed.
+        expect(await buffer["redis"].get(lookupKey)).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "ack DELs the idempotency lookup along with marking materialised",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "ra1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ka",
+          taskIdentifier: "t",
+        });
+        await buffer.pop("env_i");
+        await buffer.ack("ra1");
+
+        const lookupKey = "mollifier:idempotency:env_i:t:ka";
+        expect(await buffer["redis"].get(lookupKey)).toBeNull();
+        const entry = await buffer.getEntry("ra1");
+        expect(entry!.materialised).toBe(true);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "resetIdempotency clears snapshot fields + lookup; returns the runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rr1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: serialiseSnapshot({
+            idempotencyKey: "kr",
+            idempotencyKeyExpiresAt: "2026-12-01T00:00:00Z",
+            other: "field",
+          }),
+          idempotencyKey: "kr",
+          taskIdentifier: "t",
+        });
+
+        const result = await buffer.resetIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "kr",
+        });
+        expect(result.clearedRunId).toBe("rr1");
+
+        // Lookup is gone.
+        const lookupKey = "mollifier:idempotency:env_i:t:kr";
+        expect(await buffer["redis"].get(lookupKey)).toBeNull();
+
+        // Snapshot's idempotency fields are nulled, other fields kept.
+        const entry = await buffer.getEntry("rr1");
+        const payload = JSON.parse(entry!.payload) as {
+          idempotencyKey: unknown;
+          idempotencyKeyExpiresAt: unknown;
+          other: string;
+        };
+        expect(payload.idempotencyKey).toBeNull();
+        expect(payload.idempotencyKeyExpiresAt).toBeNull();
+        expect(payload.other).toBe("field");
+        expect(entry!.idempotencyLookupKey).toBe("");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "resetIdempotency returns null when nothing is bound",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const result = await buffer.resetIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "absent",
+        });
+        expect(result.clearedRunId).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer.casSetMetadata", () => {
+  redisTest(
+    "applies when expectedVersion matches; increments version; updates payload",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "cas1",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }),
+        });
+        const result = await buffer.casSetMetadata({
+          runId: "cas1",
+          expectedVersion: 0,
+          newMetadata: '{"v":2}',
+          newMetadataType: "application/json",
+        });
+        expect(result).toEqual({ kind: "applied", newVersion: 1 });
+
+        const entry = await buffer.getEntry("cas1");
+        expect(entry!.metadataVersion).toBe(1);
+        const payload = JSON.parse(entry!.payload) as { metadata: string };
+        expect(payload.metadata).toBe('{"v":2}');
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns version_conflict when expectedVersion is stale",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "cas2",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+        });
+        await buffer.casSetMetadata({
+          runId: "cas2",
+          expectedVersion: 0,
+          newMetadata: '{"a":1}',
+          newMetadataType: "application/json",
+        });
+
+        // Second write with stale expectedVersion = 0 must conflict.
+        const result = await buffer.casSetMetadata({
+          runId: "cas2",
+          expectedVersion: 0,
+          newMetadata: '{"a":2}',
+          newMetadataType: "application/json",
+        });
+        expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns not_found / busy on missing or terminal entries",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const nf = await buffer.casSetMetadata({
+          runId: "absent",
+          expectedVersion: 0,
+          newMetadata: "{}",
+          newMetadataType: "application/json",
+        });
+        expect(nf).toEqual({ kind: "not_found" });
+
+        await buffer.accept({
+          runId: "cas3",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+        });
+        await buffer.pop("env_c");
+        const busy = await buffer.casSetMetadata({
+          runId: "cas3",
+          expectedVersion: 0,
+          newMetadata: "{}",
+          newMetadataType: "application/json",
+        });
+        expect(busy).toEqual({ kind: "busy" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer.mutateSnapshot", () => {
+  redisTest(
+    "returns not_found when no entry exists for the runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const result = await buffer.mutateSnapshot("nope", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(result).toBe("not_found");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "append_tags on QUEUED entry appends and dedupes",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "r1",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: ["existing"] }),
+        });
+        const first = await buffer.mutateSnapshot("r1", {
+          type: "append_tags",
+          tags: ["existing", "new"],
+        });
+        expect(first).toBe("applied_to_snapshot");
+
+        const entry = await buffer.getEntry("r1");
+        const payload = JSON.parse(entry!.payload) as { tags: string[] };
+        expect(payload.tags).toEqual(["existing", "new"]);
+
+        // Second mutation appends without duplicating
+        const second = await buffer.mutateSnapshot("r1", {
+          type: "append_tags",
+          tags: ["new", "third"],
+        });
+        expect(second).toBe("applied_to_snapshot");
+        const e2 = await buffer.getEntry("r1");
+        const p2 = JSON.parse(e2!.payload) as { tags: string[] };
+        expect(p2.tags).toEqual(["existing", "new", "third"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "append_tags creates payload.tags when absent",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "r2",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ taskId: "t" }),
+        });
+        const result = await buffer.mutateSnapshot("r2", {
+          type: "append_tags",
+          tags: ["a", "b"],
+        });
+        expect(result).toBe("applied_to_snapshot");
+        const entry = await buffer.getEntry("r2");
+        const payload = JSON.parse(entry!.payload) as { tags: string[] };
+        expect(payload.tags).toEqual(["a", "b"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "set_metadata replaces metadata + metadataType (last-write-wins)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "r3",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }),
+        });
+        const result = await buffer.mutateSnapshot("r3", {
+          type: "set_metadata",
+          metadata: '{"v":2}',
+          metadataType: "application/json",
+        });
+        expect(result).toBe("applied_to_snapshot");
+        const entry = await buffer.getEntry("r3");
+        const payload = JSON.parse(entry!.payload) as {
+          metadata: string;
+          metadataType: string;
+        };
+        expect(payload.metadata).toBe('{"v":2}');
+        expect(payload.metadataType).toBe("application/json");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "set_delay sets payload.delayUntil",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "r4",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ taskId: "t" }),
+        });
+        const result = await buffer.mutateSnapshot("r4", {
+          type: "set_delay",
+          delayUntil: "2026-06-01T00:00:00.000Z",
+        });
+        expect(result).toBe("applied_to_snapshot");
+        const entry = await buffer.getEntry("r4");
+        const payload = JSON.parse(entry!.payload) as { delayUntil: string };
+        expect(payload.delayUntil).toBe("2026-06-01T00:00:00.000Z");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "mark_cancelled stamps cancelledAt + cancelReason",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "r5",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ taskId: "t" }),
+        });
+        const result = await buffer.mutateSnapshot("r5", {
+          type: "mark_cancelled",
+          cancelledAt: "2026-05-19T12:00:00.000Z",
+          cancelReason: "user-initiated",
+        });
+        expect(result).toBe("applied_to_snapshot");
+        const entry = await buffer.getEntry("r5");
+        const payload = JSON.parse(entry!.payload) as {
+          cancelledAt: string;
+          cancelReason: string;
+        };
+        expect(payload.cancelledAt).toBe("2026-05-19T12:00:00.000Z");
+        expect(payload.cancelReason).toBe("user-initiated");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns busy when entry is DRAINING",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rd",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+        await buffer.pop("env_m");
+        const result = await buffer.mutateSnapshot("rd", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(result).toBe("busy");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns not_found when entry was FAILED (drainer-terminal teardown)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Post-TTL-drop design: fail() DELs the entry hash because the
+      // drainer has already written the canonical SYSTEM_FAILURE PG
+      // row, and without an accept-time TTL we'd otherwise accrete
+      // failed entries in Redis forever. Late mutations against a
+      // failed run therefore see `not_found`, matching the same shape
+      // they'd get for any other already-cleaned-up runId.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rf",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+        await buffer.pop("env_m");
+        await buffer.fail("rf", { code: "X", message: "boom" });
+        const result = await buffer.mutateSnapshot("rf", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(result).toBe("not_found");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns busy when entry is materialised (post-ack grace window)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rm",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+        await buffer.pop("env_m");
+        await buffer.ack("rm");
+        const result = await buffer.mutateSnapshot("rm", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(result).toBe("busy");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "Lua atomicity serialises concurrent mutations per-runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rcc",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+
+        const tagsToAdd = Array.from({ length: 50 }, (_, i) => `t${i}`);
+        await Promise.all(
+          tagsToAdd.map((t) => buffer.mutateSnapshot("rcc", { type: "append_tags", tags: [t] })),
+        );
+
+        const entry = await buffer.getEntry("rcc");
+        const payload = JSON.parse(entry!.payload) as { tags: string[] };
+        expect(payload.tags.sort()).toEqual(tagsToAdd.sort());
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer ZSET storage", () => {
+  redisTest(
+    "queue key is a ZSET scored by entry's createdAtMicros",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" });
+
+        // ZSET-only commands must succeed against the queue key.
+        const card = await buffer["redis"].zcard("mollifier:queue:env_z");
+        expect(card).toBe(1);
+
+        const score = await buffer["redis"].zscore("mollifier:queue:env_z", "z1");
+        expect(score).not.toBeNull();
+        const scoreNum = Number(score);
+        expect(Number.isFinite(scoreNum)).toBe(true);
+
+        // Score matches the entry hash's createdAtMicros field.
+        const micros = await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros");
+        expect(micros).not.toBeNull();
+        expect(Number(micros)).toBe(scoreNum);
+
+        // Score is plausibly recent (within last minute as microseconds).
+        const nowMicros = Date.now() * 1000;
+        expect(scoreNum).toBeGreaterThan(nowMicros - 60_000_000);
+        expect(scoreNum).toBeLessThanOrEqual(nowMicros + 1_000_000);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "pop returns entries in ascending createdAtMicros order (FIFO by time, not by member)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        // Insert runIds in reverse-lex order to prove ordering is by score, not member.
+        await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" });
+        await new Promise((r) => setTimeout(r, 5));
+        await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" });
+        await new Promise((r) => setTimeout(r, 5));
+        await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" });
+
+        const first = await buffer.pop("env_o");
+        expect(first!.runId).toBe("zzz");
+        const second = await buffer.pop("env_o");
+        expect(second!.runId).toBe("mmm");
+        const third = await buffer.pop("env_o");
+        expect(third!.runId).toBe("aaa");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "requeue keeps original score; createdAt is immutable across retries",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" });
+        const originalScore = Number(
+          await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"),
+        );
+        const originalMicros = Number(
+          await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"),
+        );
+
+        await buffer.pop("env_rq");
+        await new Promise((r) => setTimeout(r, 5));
+        await buffer.requeue("rq");
+
+        const newScore = Number(
+          await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"),
+        );
+        const newMicros = Number(
+          await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"),
+        );
+        expect(newScore).toBe(originalScore);
+        expect(newMicros).toBe(originalMicros);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer.listEntriesForEnv", () => {
+  redisTest(
+    "returns up to maxCount entries from the queue without consuming them",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "r1", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "r2", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "r3", envId: "env_a", orgId: "org_1", payload: "{}" });
+
+        const entries = await buffer.listEntriesForEnv("env_a", 2);
+        expect(entries).toHaveLength(2);
+        const runIds = entries.map((e) => e.runId);
+        expect(new Set(runIds).size).toBe(2);
+        for (const id of runIds) expect(["r1", "r2", "r3"]).toContain(id);
+
+        // Non-destructive: the drainer can still pop all three.
+        const popped: string[] = [];
+        for (let i = 0; i < 3; i++) {
+          const entry = await buffer.pop("env_a");
+          if (entry) popped.push(entry.runId);
+        }
+        expect(new Set(popped)).toEqual(new Set(["r1", "r2", "r3"]));
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest("returns empty array when env queue is empty", { timeout: 20_000 }, async ({ redisContainer }) => {
+    const buffer = new MollifierBuffer({
+      redisOptions: {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      },
+      logger: new Logger("test", "log"),
+    });
+
+    try {
+      expect(await buffer.listEntriesForEnv("env_empty", 10)).toEqual([]);
+    } finally {
+      await buffer.close();
+    }
+  });
+
+  redisTest("maxCount <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => {
+    const buffer = new MollifierBuffer({
+      redisOptions: {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      },
+      logger: new Logger("test", "log"),
+    });
+
+    try {
+      expect(await buffer.listEntriesForEnv("env_a", 0)).toEqual([]);
+      expect(await buffer.listEntriesForEnv("env_a", -5)).toEqual([]);
+    } finally {
+      await buffer.close();
+    }
+  });
+});
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
index f739e3ff362..fd53f59efea 100644
--- a/packages/redis-worker/src/mollifier/buffer.ts
+++ b/packages/redis-worker/src/mollifier/buffer.ts
@@ -10,17 +10,66 @@ import { BufferEntry, BufferEntrySchema } from "./schemas.js";
 
 export type MollifierBufferOptions = {
   redisOptions: RedisOptions;
-  entryTtlSeconds: number;
   logger?: Logger;
 };
 
+// Grace TTL applied to the entry hash on drainer ack. The entry survives
+// this long after materialisation so direct reads (retrieve, trace, etc.)
+// have a safety net while PG replica lag settles. Q1 D2.
+const ACK_GRACE_TTL_SECONDS = 30;
+
+export type SnapshotPatch =
+  | { type: "append_tags"; tags: string[] }
+  | { type: "set_metadata"; metadata: string; metadataType: string }
+  | { type: "set_delay"; delayUntil: string }
+  | { type: "mark_cancelled"; cancelledAt: string; cancelReason?: string };
+
+export type MutateSnapshotResult = "applied_to_snapshot" | "not_found" | "busy";
+
+export type CasSetMetadataResult =
+  | { kind: "applied"; newVersion: number }
+  | { kind: "version_conflict"; currentVersion: number }
+  | { kind: "not_found" }
+  | { kind: "busy" };
+
+export type AcceptResult =
+  | { kind: "accepted" }
+  | { kind: "duplicate_run_id" }
+  | { kind: "duplicate_idempotency"; existingRunId: string };
+
+export type IdempotencyLookupInput = {
+  envId: string;
+  taskIdentifier: string;
+  idempotencyKey: string;
+};
+
+function makeIdempotencyLookupKey(input: IdempotencyLookupInput): string {
+  return `mollifier:idempotency:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`;
+}
+
+// Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the
+// existing B6a buffer-side dedup stays isolated. The claim is the
+// authoritative cross-store "this idempotency key is in flight or
+// resolved" pointer used by the trigger hot path
+// (`_plans/2026-05-21-mollifier-idempotency-claim.md`). Values:
+//   "pending"   → a trigger pipeline owns the key and hasn't published yet
+//   <runId>     → the winning trigger's runId (resolved)
+export const IDEMPOTENCY_CLAIM_PENDING = "pending";
+
+function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string {
+  return `mollifier:claim:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`;
+}
+
+export type IdempotencyClaimResult =
+  | { kind: "claimed" }
+  | { kind: "pending" }
+  | { kind: "resolved"; runId: string };
+
 export class MollifierBuffer {
   private readonly redis: Redis;
-  private readonly entryTtlSeconds: number;
   private readonly logger: Logger;
 
   constructor(options: MollifierBufferOptions) {
-    this.entryTtlSeconds = options.entryTtlSeconds;
     this.logger = options.logger ?? new Logger("MollifierBuffer", "debug");
 
     this.redis = createRedisClient(
@@ -41,19 +90,47 @@ export class MollifierBuffer {
     this.#registerCommands();
   }
 
-  // Returns true if the entry was newly written; false if a duplicate runId
-  // was already buffered (idempotent no-op). Callers can use the boolean to
-  // record a duplicate-accept metric without affecting buffer state.
+  // Three outcomes:
+  //   - { kind: "accepted" } — entry was newly written.
+  //   - { kind: "duplicate_run_id" } — runId was already buffered (idempotent
+  //     no-op, same semantic as the previous boolean-false return).
+  //   - { kind: "duplicate_idempotency", existingRunId } — the (env, task,
+  //     idempotencyKey) tuple was already bound to another buffered run.
+  //     The Lua's atomic SETNX is the race-winner; the second caller gets
+  //     the winner's runId so it can return that as the trigger response.
   async accept(input: {
     runId: string;
     envId: string;
     orgId: string;
     payload: string;
-  }): Promise<boolean> {
+    // Optional idempotency-key triple. When all three are present we
+    // SETNX a Redis lookup at `mollifier:idempotency:{env}:{task}:{key}`
+    // pointing at the runId so trigger-time dedup during the buffered
+    // window resolves the same way PG's unique constraint resolves it
+    // post-materialisation (Q5).
+    idempotencyKey?: string;
+    taskIdentifier?: string;
+  }): Promise<AcceptResult> {
     const entryKey = `mollifier:entries:${input.runId}`;
     const queueKey = `mollifier:queue:${input.envId}`;
     const orgsKey = "mollifier:orgs";
-    const createdAt = new Date().toISOString();
+    const nowMs = Date.now();
+    const createdAt = new Date(nowMs).toISOString();
+    // Microsecond epoch. JS only has millisecond precision, so multiple
+    // accepts in the same ms share a score; ZSET ties resolve by member
+    // (runId) lex order, which is deterministic and acceptable for FIFO
+    // pop. The hash carries the same value as `createdAtMicros` so the
+    // listing helper (Phase E) can read a stable per-run timestamp
+    // without re-fetching the score.
+    const createdAtMicros = nowMs * 1000;
+    const idempotencyLookupKey =
+      input.idempotencyKey && input.taskIdentifier
+        ? makeIdempotencyLookupKey({
+            envId: input.envId,
+            taskIdentifier: input.taskIdentifier,
+            idempotencyKey: input.idempotencyKey,
+          })
+        : "";
     const result = await this.redis.acceptMollifierEntry(
       entryKey,
       queueKey,
@@ -63,10 +140,17 @@ export class MollifierBuffer {
       input.orgId,
       input.payload,
       createdAt,
-      String(this.entryTtlSeconds),
+      String(createdAtMicros),
       "mollifier:org-envs:",
+      idempotencyLookupKey,
     );
-    return result === 1;
+    // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId
+    // (duplicate idempotency — value is the existing winner's runId).
+    if (typeof result === "string" && result.length > 0) {
+      return { kind: "duplicate_idempotency", existingRunId: result };
+    }
+    if (result === 1) return { kind: "accepted" };
+    return { kind: "duplicate_run_id" };
   }
 
   async pop(envId: string): Promise<BufferEntry | null> {
@@ -128,8 +212,247 @@ export class MollifierBuffer {
     return this.redis.smembers(`mollifier:org-envs:${orgId}`);
   }
 
+  // Paginated read of currently-queued entries newest-first, bounded by
+  // an optional `(createdAtMicros, runId)` watermark. Q1 listing design.
+  // Returns hydrated `BufferEntry` rows up to `pageSize`. Skips orphans
+  // (queue ref without an entry hash) silently. Non-destructive — the
+  // drainer keeps popping these entries in createdAt order regardless.
+  async listForEnvWithWatermark(input: {
+    envId: string;
+    watermark?: { createdAtMicros: number; runId: string };
+    pageSize: number;
+  }): Promise<BufferEntry[]> {
+    if (input.pageSize <= 0) return [];
+    const queueKey = `mollifier:queue:${input.envId}`;
+
+    let runIds: string[];
+    if (!input.watermark) {
+      // Page 1 — newest first.
+      runIds = await this.redis.zrevrangebyscore(
+        queueKey,
+        "+inf",
+        "-inf",
+        "LIMIT",
+        0,
+        input.pageSize,
+      );
+    } else {
+      // Page N — strictly below the watermark score.
+      const belowScore = await this.redis.zrevrangebyscore(
+        queueKey,
+        `(${input.watermark.createdAtMicros}`,
+        "-inf",
+        "LIMIT",
+        0,
+        input.pageSize,
+      );
+      runIds = belowScore;
+      // Tied-score scan: ZSET ties broken by member-DESC, so entries
+      // sharing the watermark score with a lex-smaller runId still
+      // need to surface. Cheap second range over the tied band.
+      if (belowScore.length < input.pageSize) {
+        const remaining = input.pageSize - belowScore.length;
+        const tied = await this.redis.zrangebyscore(
+          queueKey,
+          input.watermark.createdAtMicros,
+          input.watermark.createdAtMicros,
+        );
+        // Filter to runIds lex-less than the watermark anchor, sort
+        // member-DESC, take `remaining`.
+        const tiedFiltered = tied
+          .filter((r) => r < input.watermark!.runId)
+          .sort((a, b) => (a < b ? 1 : a > b ? -1 : 0))
+          .slice(0, remaining);
+        runIds = [...belowScore, ...tiedFiltered];
+      }
+    }
+
+    if (runIds.length === 0) return [];
+
+    // Parallel HGETALL — one round-trip per entry, all in flight.
+    const fetched = await Promise.all(
+      runIds.map((runId) => this.redis.hgetall(`mollifier:entries:${runId}`)),
+    );
+    const entries: BufferEntry[] = [];
+    for (const value of fetched) {
+      if (!value || Object.keys(value).length === 0) continue;
+      const parsed = BufferEntrySchema.safeParse(value);
+      if (parsed.success) entries.push(parsed.data);
+    }
+    return entries;
+  }
+
+  // Read-only listing of currently-queued entries for a single env. Used by
+  // the dashboard's "Recently queued" surface — non-destructive, so the
+  // drainer still pops these entries in order. Returns up to `maxCount`
+  // entries newest-first (highest score, which is `createdAtMicros`).
+  // Each entry hash is fetched separately; a `null` from getEntry (TTL
+  // expired between ZREVRANGE and HGETALL) is skipped.
+  async listEntriesForEnv(envId: string, maxCount: number): Promise<BufferEntry[]> {
+    if (maxCount <= 0) return [];
+    const runIds = await this.redis.zrevrange(
+      `mollifier:queue:${envId}`,
+      0,
+      maxCount - 1,
+    );
+    const entries: BufferEntry[] = [];
+    for (const runId of runIds) {
+      const entry = await this.getEntry(runId);
+      if (entry) entries.push(entry);
+    }
+    return entries;
+  }
+
+  // Atomic snapshot mutation. Used by customer-mutation API endpoints
+  // (tags, metadata-put, reschedule, cancel) when the run is still in
+  // the buffer. Three outcomes:
+  //   - "applied_to_snapshot": entry was QUEUED + not materialised; the
+  //     drainer will read the patched payload on its next pop.
+  //   - "not_found": no entry hash exists for this runId.
+  //   - "busy": entry is DRAINING / FAILED / materialised. The API
+  //     wait-and-bounces through PG (Q3 design).
+  async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise<MutateSnapshotResult> {
+    const result = (await this.redis.mutateMollifierSnapshot(
+      `mollifier:entries:${runId}`,
+      JSON.stringify(patch),
+    )) as string;
+    if (
+      result === "applied_to_snapshot" ||
+      result === "not_found" ||
+      result === "busy"
+    ) {
+      return result;
+    }
+    throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`);
+  }
+
+  // Optimistic compare-and-swap on the snapshot's metadata. Caller reads
+  // the current metadataVersion via getEntry, applies operations in JS via
+  // `applyMetadataOperations`, then calls this with the new metadata + the
+  // expected version. Lua refuses if the version has moved (caller retries
+  // up to N times). Mirrors the PG-side `UpdateMetadataService` retry
+  // loop so concurrent increment/append operations don't lose deltas.
+  async casSetMetadata(input: {
+    runId: string;
+    expectedVersion: number;
+    newMetadata: string;
+    newMetadataType: string;
+  }): Promise<CasSetMetadataResult> {
+    const entryKey = `mollifier:entries:${input.runId}`;
+    const raw = (await this.redis.casSetMollifierMetadata(
+      entryKey,
+      String(input.expectedVersion),
+      input.newMetadata,
+      input.newMetadataType,
+    )) as string;
+    if (raw === "not_found") return { kind: "not_found" };
+    if (raw === "busy") return { kind: "busy" };
+    if (raw.startsWith("conflict:")) {
+      return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) };
+    }
+    if (raw.startsWith("applied:")) {
+      return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) };
+    }
+    throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`);
+  }
+
+  // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One
+  // call across both PG and buffer paths serialises through this claim;
+  // closes the race the buffer-side B6a SETNX leaves open during the
+  // gate-transition burst window (see
+  // `_plans/2026-05-21-mollifier-idempotency-claim.md`).
+  //
+  // - "claimed": we now own the claim, the caller proceeds with the
+  //   trigger pipeline and must `publishClaim` on success or
+  //   `releaseClaim` on failure.
+  // - "pending": another trigger owns the claim and hasn't published
+  //   yet; the caller should poll.
+  // - "resolved": the claim already holds a runId; the caller can
+  //   return that runId as a cached hit.
+  async claimIdempotency(
+    input: IdempotencyLookupInput & { ttlSeconds: number },
+  ): Promise<IdempotencyClaimResult> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const raw = (await this.redis.claimMollifierIdempotency(
+      claimKey,
+      IDEMPOTENCY_CLAIM_PENDING,
+      String(input.ttlSeconds),
+    )) as string;
+    if (raw === "claimed") return { kind: "claimed" };
+    if (raw === "pending") return { kind: "pending" };
+    if (raw.startsWith("resolved:")) {
+      return { kind: "resolved", runId: raw.slice("resolved:".length) };
+    }
+    throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`);
+  }
+
+  // Publish the winning runId to the claim so subsequent claimants /
+  // waiters see "resolved". TTL bounded by the customer's
+  // `idempotencyKeyExpiresAt` minus now; caller computes.
+  async publishClaim(
+    input: IdempotencyLookupInput & { runId: string; ttlSeconds: number },
+  ): Promise<void> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    await this.redis.set(claimKey, input.runId, "EX", input.ttlSeconds);
+  }
+
+  // Release the claim on pipeline error so waiters can re-claim and
+  // retry. Idempotent.
+  async releaseClaim(input: IdempotencyLookupInput): Promise<void> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    await this.redis.del(claimKey);
+  }
+
+  // Read the current claim value, used by the wait/poll loop on losers
+  // to detect "pending" → "resolved" transitions and timeouts.
+  async readClaim(input: IdempotencyLookupInput): Promise<IdempotencyClaimResult | null> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const value = await this.redis.get(claimKey);
+    if (value === null) return null;
+    if (value === IDEMPOTENCY_CLAIM_PENDING) return { kind: "pending" };
+    return { kind: "resolved", runId: value };
+  }
+
+  // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by
+  // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check
+  // misses — same key may belong to a buffered run waiting to drain. The
+  // lookup self-heals: if the lookup points at an entry hash that's
+  // expired, we DEL the lookup and report a miss.
+  async lookupIdempotency(input: IdempotencyLookupInput): Promise<string | null> {
+    const lookupKey = makeIdempotencyLookupKey(input);
+    const runId = await this.redis.get(lookupKey);
+    if (!runId) return null;
+    const entry = await this.getEntry(runId);
+    if (!entry) {
+      await this.redis.del(lookupKey);
+      return null;
+    }
+    return runId;
+  }
+
+  // Clear the idempotency binding from a buffered run. Used by
+  // `ResetIdempotencyKeyService` alongside the existing PG-side
+  // `updateMany`. Returns the runId that was cleared, or null if no
+  // buffered run held this key.
+  async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> {
+    const lookupKey = makeIdempotencyLookupKey(input);
+    const clearedRunId = (await this.redis.resetMollifierIdempotency(
+      lookupKey,
+      "mollifier:entries:",
+    )) as string;
+    return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null };
+  }
+
+  // Marks the entry as materialised (PG row written) and resets its TTL to
+  // the grace window. Entry hash persists past ack as a read-fallback
+  // safety net for the brief PG replica-lag window between drainer-side
+  // write and reader-side visibility (Q1 D2). Also clears the associated
+  // idempotency lookup if one was set on accept (Q5).
   async ack(runId: string): Promise<void> {
-    await this.redis.del(`mollifier:entries:${runId}`);
+    await this.redis.ackMollifierEntry(
+      `mollifier:entries:${runId}`,
+      String(ACK_GRACE_TTL_SECONDS),
+    );
   }
 
   async requeue(runId: string): Promise<void> {
@@ -153,10 +476,16 @@ export class MollifierBuffer {
     return result === 1;
   }
 
+  // Returns Redis-side TTL on the entry hash. Returns -1 for entries
+  // with no TTL — the steady state under the current design, where
+  // entries persist until drainer ack/fail. The ack grace TTL (30s
+  // post-materialise) is the only context where this returns a
+  // positive value; tests around the grace TTL still rely on it.
   async getEntryTtlSeconds(runId: string): Promise<number> {
     return this.redis.ttl(`mollifier:entries:${runId}`);
   }
 
+
   async evaluateTrip(
     envId: string,
     options: { windowMs: number; threshold: number; holdMs: number },
@@ -190,8 +519,9 @@ export class MollifierBuffer {
         local orgId = ARGV[3]
         local payload = ARGV[4]
         local createdAt = ARGV[5]
-        local ttlSeconds = tonumber(ARGV[6])
+        local createdAtMicros = ARGV[6]
         local orgEnvsPrefix = ARGV[7]
+        local idempotencyLookupKey = ARGV[8] or ''
 
         -- Idempotent: refuse if an entry for this runId already exists in any
         -- state. Caller-side dedup is also enforced via API idempotency keys,
@@ -200,6 +530,20 @@ export class MollifierBuffer {
           return 0
         end
 
+        -- Idempotency-key dedup (Q5). If the caller passed a lookup key
+        -- and it's already bound to another buffered run, return the
+        -- winner's runId so the loser's API response can echo it as a
+        -- cached hit. Otherwise SET the lookup (no TTL — lifecycle is
+        -- paired with the entry hash; drainer ack/fail clear it
+        -- explicitly).
+        if idempotencyLookupKey ~= '' then
+          local existing = redis.call('GET', idempotencyLookupKey)
+          if existing then
+            return existing
+          end
+          redis.call('SET', idempotencyLookupKey, runId)
+        end
+
         redis.call('HSET', entryKey,
           'runId', runId,
           'envId', envId,
@@ -207,9 +551,22 @@ export class MollifierBuffer {
           'payload', payload,
           'status', 'QUEUED',
           'attempts', '0',
-          'createdAt', createdAt)
-        redis.call('EXPIRE', entryKey, ttlSeconds)
-        redis.call('LPUSH', queueKey, runId)
+          'createdAt', createdAt,
+          'createdAtMicros', createdAtMicros,
+          'idempotencyLookupKey', idempotencyLookupKey,
+          'metadataVersion', '0')
+        -- No EXPIRE on the entry hash. Buffer entries persist until the
+        -- drainer ACKs (post-materialise grace) or FAILs them — the
+        -- drainer is the only recovery mechanism, so silent TTL-based
+        -- eviction would lose runs with no customer-visible signal.
+        -- Memory pressure from an offline drainer is the alertable
+        -- failure mode instead; see _ops/mollifier-ops.md.
+        -- ZSET keyed by createdAtMicros: ZPOPMIN drains oldest-first
+        -- (FIFO); listing pagination uses ZREVRANGEBYSCORE with a
+        -- (createdAt, runId) cursor anchor. Score is stable across the
+        -- entry's lifecycle — requeue does not bump it (see Phase 3b /
+        -- Q1 design).
+        redis.call('ZADD', queueKey, createdAtMicros, runId)
         -- Org-level membership: maintained atomically with the per-env
         -- queue so the drainer can walk orgs → envs-for-org and
         -- schedule one env per org per tick. SADDs are idempotent if the
@@ -231,7 +588,8 @@ export class MollifierBuffer {
 
         local envId = redis.call('HGET', entryKey, 'envId')
         local orgId = redis.call('HGET', entryKey, 'orgId')
-        if not envId then
+        local createdAtMicros = redis.call('HGET', entryKey, 'createdAtMicros')
+        if not envId or not createdAtMicros then
           return 0
         end
 
@@ -239,7 +597,11 @@ export class MollifierBuffer {
         local nextAttempts = tonumber(currentAttempts or '0') + 1
 
         redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts))
-        redis.call('LPUSH', queuePrefix .. envId, runId)
+        -- Requeue re-adds with the ORIGINAL createdAtMicros score.
+        -- createdAt is immutable across retries (Phase 3b decision).
+        -- The drainer's maxAttempts caps the retry loop so a poisoned
+        -- entry doesn't head-of-line forever.
+        redis.call('ZADD', queuePrefix .. envId, tonumber(createdAtMicros), runId)
         -- Re-track the org/env: pop may have SREM'd them when the queue
         -- last emptied. SADDs are idempotent if the values are still
         -- present.
@@ -279,7 +641,9 @@ export class MollifierBuffer {
         -- hash without a TTL, leaking memory. The loop is bounded by queue
         -- length; entire Lua script remains atomic.
         while true do
-          local runId = redis.call('RPOP', queueKey)
+          -- ZPOPMIN returns {member, score} as a flat array, or {} when empty.
+          local popped = redis.call('ZPOPMIN', queueKey)
+          local runId = popped[1]
           if not runId then
             -- Queue is empty AND we have no entry to read orgId from, so
             -- skip org-level cleanup. Stale org-envs entries are bounded
@@ -296,9 +660,9 @@ export class MollifierBuffer {
               result[raw[i]] = raw[i + 1]
             end
             -- Prune org-level membership if this pop drained the queue.
-            -- Atomic with the RPOP above — a concurrent accept AFTER this
-            -- script will SADD both back along with its LPUSH.
-            if redis.call('LLEN', queueKey) == 0 then
+            -- Atomic with the ZPOPMIN above — a concurrent accept AFTER
+            -- this script will SADD both back along with its ZADD.
+            if redis.call('ZCARD', queueKey) == 0 then
               pruneOrgMembership(result['orgId'])
             end
             return cjson.encode(result)
@@ -309,19 +673,220 @@ export class MollifierBuffer {
       `,
     });
 
+    this.redis.defineCommand("casSetMollifierMetadata", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local expectedVersion = tonumber(ARGV[1])
+        local newMetadata = ARGV[2]
+        local newMetadataType = ARGV[3]
+
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 'not_found'
+        end
+
+        local status = redis.call('HGET', entryKey, 'status')
+        local materialised = redis.call('HGET', entryKey, 'materialised')
+        if status ~= 'QUEUED' or materialised == 'true' then
+          return 'busy'
+        end
+
+        local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0'
+        local currentVersion = tonumber(currentVersionStr) or 0
+        if currentVersion ~= expectedVersion then
+          return 'conflict:' .. tostring(currentVersion)
+        end
+
+        -- Write the new metadata onto the snapshot's payload JSON. We
+        -- keep the rest of the payload intact — only metadata/metadataType
+        -- change. metadataVersion is denormalised on the hash for cheap
+        -- CAS reads; it's intentionally NOT stored inside the payload
+        -- itself (PG-side metadataVersion is a column, not a JSON field).
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        local ok, payload = pcall(cjson.decode, payloadJson)
+        if not ok then return 'busy' end
+        payload.metadata = newMetadata
+        payload.metadataType = newMetadataType
+
+        local newVersion = currentVersion + 1
+        redis.call('HSET', entryKey,
+          'payload', cjson.encode(payload),
+          'metadataVersion', tostring(newVersion))
+        return 'applied:' .. tostring(newVersion)
+      `,
+    });
+
+    this.redis.defineCommand("claimMollifierIdempotency", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local pending = ARGV[1]
+        local ttl = tonumber(ARGV[2])
+
+        -- SETNX-with-TTL: atomic; only one caller can win.
+        local won = redis.call('SET', claimKey, pending, 'NX', 'EX', ttl)
+        if won then
+          return 'claimed'
+        end
+
+        local existing = redis.call('GET', claimKey)
+        if existing == pending then
+          return 'pending'
+        end
+        return 'resolved:' .. existing
+      `,
+    });
+
+    this.redis.defineCommand("resetMollifierIdempotency", {
+      numberOfKeys: 1,
+      lua: `
+        local lookupKey = KEYS[1]
+        local entryPrefix = ARGV[1]
+
+        local runId = redis.call('GET', lookupKey)
+        if not runId then
+          return ''
+        end
+
+        local entryKey = entryPrefix .. runId
+        if redis.call('EXISTS', entryKey) == 0 then
+          -- Stale lookup. Lazy cleanup.
+          redis.call('DEL', lookupKey)
+          return ''
+        end
+
+        -- Clear the idempotency fields on the snapshot payload so the
+        -- drainer's eventual engine.trigger call inserts a PG row
+        -- without the key set.
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        if payloadJson then
+          local ok, payload = pcall(cjson.decode, payloadJson)
+          if ok then
+            payload.idempotencyKey = cjson.null
+            payload.idempotencyKeyExpiresAt = cjson.null
+            redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
+          end
+        end
+        -- Clear the denormalised lookup pointer on the hash so a later
+        -- ack doesn't try to DEL a key that's already gone.
+        redis.call('HSET', entryKey, 'idempotencyLookupKey', '')
+        redis.call('DEL', lookupKey)
+        return runId
+      `,
+    });
+
+    this.redis.defineCommand("mutateMollifierSnapshot", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local patchJson = ARGV[1]
+
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 'not_found'
+        end
+
+        local status = redis.call('HGET', entryKey, 'status')
+        local materialised = redis.call('HGET', entryKey, 'materialised')
+        if status ~= 'QUEUED' or materialised == 'true' then
+          return 'busy'
+        end
+
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        local ok, payload = pcall(cjson.decode, payloadJson)
+        if not ok then return 'busy' end
+
+        local patch = cjson.decode(patchJson)
+
+        if patch.type == 'append_tags' then
+          -- cjson decode of an absent or empty-array field gives nil or
+          -- an empty table; we rebuild as a dense array. Existing tags
+          -- are preserved; new tags are appended only if not present.
+          local existing = payload.tags or {}
+          local seen = {}
+          local merged = {}
+          for _, t in ipairs(existing) do
+            if not seen[t] then
+              seen[t] = true
+              table.insert(merged, t)
+            end
+          end
+          for _, t in ipairs(patch.tags or {}) do
+            if not seen[t] then
+              seen[t] = true
+              table.insert(merged, t)
+            end
+          end
+          payload.tags = merged
+        elseif patch.type == 'set_metadata' then
+          payload.metadata = patch.metadata
+          payload.metadataType = patch.metadataType
+        elseif patch.type == 'set_delay' then
+          payload.delayUntil = patch.delayUntil
+        elseif patch.type == 'mark_cancelled' then
+          payload.cancelledAt = patch.cancelledAt
+          payload.cancelReason = patch.cancelReason
+        else
+          return 'busy'
+        end
+
+        redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
+        return 'applied_to_snapshot'
+      `,
+    });
+
+    this.redis.defineCommand("ackMollifierEntry", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local graceTtlSeconds = tonumber(ARGV[1])
+
+        -- Guard: never create a partial entry. If the hash expired between
+        -- pop and ack, the run is gone — nothing to mark materialised.
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 0
+        end
+
+        -- If the entry was accepted with an idempotency key, the lookup
+        -- string was stored on the hash at accept time. Clear it now —
+        -- PG becomes canonical for the key post-materialisation (Q5).
+        local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
+        if lookupKey and lookupKey ~= '' then
+          redis.call('DEL', lookupKey)
+        end
+
+        redis.call('HSET', entryKey, 'materialised', 'true')
+        redis.call('EXPIRE', entryKey, graceTtlSeconds)
+        return 1
+      `,
+    });
+
     this.redis.defineCommand("failMollifierEntry", {
       numberOfKeys: 1,
       lua: `
         local entryKey = KEYS[1]
         local errorPayload = ARGV[1]
 
-        -- Guard: never create a partial entry. If the hash expired between
-        -- pop and fail, the run is gone — nothing to mark FAILED.
+        -- Guard: nothing to mark FAILED if the hash is gone (concurrent
+        -- ack/manual cleanup). Returning 0 lets the caller distinguish
+        -- "marked failed" from "no-op".
         if redis.call('EXISTS', entryKey) == 0 then
           return 0
         end
 
         redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload)
+
+        -- The drainer has already written a SYSTEM_FAILURE PG row for
+        -- terminal failures (see mollifierDrainerHandler.server.ts), so
+        -- the buffer entry is no longer load-bearing. Clear the
+        -- idempotency lookup — PG's unique constraint is the canonical
+        -- dedup mechanism post-materialise — and drop the entry hash so
+        -- failed runs don't accrete forever now that there's no
+        -- accept-time TTL.
+        local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
+        if lookupKey and lookupKey ~= '' then
+          redis.call('DEL', lookupKey)
+        end
+        redis.call('DEL', entryKey)
         return 1
       `,
     });
@@ -362,10 +927,11 @@ declare module "@internal/redis" {
       orgId: string,
       payload: string,
       createdAt: string,
-      ttlSeconds: string,
+      createdAtMicros: string,
       orgEnvsPrefix: string,
-      callback?: Callback<number>,
-    ): Result<number, Context>;
+      idempotencyLookupKey: string,
+      callback?: Callback<number | string>,
+    ): Result<number | string, Context>;
     popAndMarkDraining(
       queueKey: string,
       orgsKey: string,
@@ -382,6 +948,34 @@ declare module "@internal/redis" {
       orgEnvsPrefix: string,
       callback?: Callback<number>,
     ): Result<number, Context>;
+    mutateMollifierSnapshot(
+      entryKey: string,
+      patchJson: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    casSetMollifierMetadata(
+      entryKey: string,
+      expectedVersion: string,
+      newMetadata: string,
+      newMetadataType: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    resetMollifierIdempotency(
+      lookupKey: string,
+      entryPrefix: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    claimMollifierIdempotency(
+      claimKey: string,
+      pendingMarker: string,
+      ttlSeconds: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    ackMollifierEntry(
+      entryKey: string,
+      graceTtlSeconds: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
     failMollifierEntry(
       entryKey: string,
       errorPayload: string,
diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts
index 5e6fe202e3d..2751a6615eb 100644
--- a/packages/redis-worker/src/mollifier/index.ts
+++ b/packages/redis-worker/src/mollifier/index.ts
@@ -1,4 +1,13 @@
-export { MollifierBuffer, type MollifierBufferOptions } from "./buffer.js";
+export {
+  MollifierBuffer,
+  type MollifierBufferOptions,
+  type SnapshotPatch,
+  type MutateSnapshotResult,
+  type CasSetMetadataResult,
+  type IdempotencyClaimResult,
+  type IdempotencyLookupInput,
+  IDEMPOTENCY_CLAIM_PENDING,
+} from "./buffer.js";
 export {
   MollifierDrainer,
   type MollifierDrainerOptions,
diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts
index f93b0f0a3c3..c5d9915575a 100644
--- a/packages/redis-worker/src/mollifier/schemas.ts
+++ b/packages/redis-worker/src/mollifier/schemas.ts
@@ -27,6 +27,10 @@ const stringToDate = z.string().transform((v, ctx) => {
   return d;
 });
 
+const stringToBool = z
+  .union([z.literal("true"), z.literal("false")])
+  .transform((v) => v === "true");
+
 const stringToError = z.string().transform((v, ctx) => {
   try {
     return BufferEntryError.parse(JSON.parse(v));
@@ -44,6 +48,24 @@ export const BufferEntrySchema = z.object({
   status: BufferEntryStatus,
   attempts: stringToInt,
   createdAt: stringToDate,
+  // Microsecond epoch matching the ZSET queue score. Stable across
+  // requeues — the score never moves once set at accept time.
+  createdAtMicros: stringToInt,
+  // Drainer-ack flag: `true` once the drainer has materialised this run
+  // into PG. The hash persists for a short grace TTL after ack so direct
+  // reads (retrieve, trace, etc.) still resolve while PG replica lag
+  // settles. Absent on pre-ack entries.
+  materialised: stringToBool.default("false"),
+  // Denormalised pointer to the Redis idempotency lookup key (set when
+  // the run was accepted with an idempotency key, empty otherwise). The
+  // ack Lua reads this to DEL the lookup atomically with marking the
+  // entry materialised (Q5).
+  idempotencyLookupKey: z.string().optional().default(""),
+  // Optimistic-lock counter for the snapshot's `metadata` field.
+  // Incremented atomically by the CAS metadata Lua. Matches the
+  // semantic of `TaskRun.metadataVersion` on the PG side (which the
+  // UpdateMetadataService uses for the same retry-on-conflict pattern).
+  metadataVersion: stringToInt.default("0"),
   lastError: stringToError.optional(),
 });
 

From 51911a150da51caa58781d1837ef65714226f830 Mon Sep 17 00:00:00 2001
From: Dan Sutton <dansutton@trigger.dev>
Date: Tue, 26 May 2026 12:11:51 +0100
Subject: [PATCH 2/7] test(redis-worker): align drainer tests with buffer
 ack/fail semantics

After the buffer extensions in this PR:
- ack() keeps the entry alive with a grace TTL as a read-fallback
  safety net. Test asserts the entry persists with materialised=true.
- fail() deletes the entry once the drainer-handler has written the
  canonical SYSTEM_FAILURE PG row. Tests assert the entry is null and
  use runOnce()'s `failed` counter as the surviving signal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/mollifier/drainer.test.ts             | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/packages/redis-worker/src/mollifier/drainer.test.ts b/packages/redis-worker/src/mollifier/drainer.test.ts
index c8f68977f69..c98c733b0fc 100644
--- a/packages/redis-worker/src/mollifier/drainer.test.ts
+++ b/packages/redis-worker/src/mollifier/drainer.test.ts
@@ -6,7 +6,6 @@ import { MollifierDrainer } from "./drainer.js";
 import { serialiseSnapshot } from "./schemas.js";
 
 const noopOptions = {
-  entryTtlSeconds: 600,
   logger: new Logger("test", "log"),
 };
 
@@ -87,8 +86,11 @@ describe("MollifierDrainer.runOnce", () => {
         payload: { foo: 1 },
       });
 
+      // After ack the entry persists as a read-fallback safety net with
+      // materialised=true and a fresh grace TTL (Q1 D2 / Phase B2).
       const entry = await buffer.getEntry("run_1");
-      expect(entry).toBeNull();
+      expect(entry).not.toBeNull();
+      expect(entry!.materialised).toBe(true);
     } finally {
       await buffer.close();
     }
@@ -167,9 +169,14 @@ describe("MollifierDrainer error handling", () => {
       expect(after2!.status).toBe("QUEUED");
       expect(after2!.attempts).toBe(2);
 
-      await drainer.runOnce();
+      const result3 = await drainer.runOnce();
+      // On attempt 3 the drainer hits maxAttempts and calls fail(),
+      // which deletes the entry — once the drainer-handler has written
+      // the SYSTEM_FAILURE PG row the buffer entry is no longer
+      // load-bearing. The runOnce result is the surviving signal.
       const after3 = await buffer.getEntry("run_r");
-      expect(after3!.status).toBe("FAILED");
+      expect(after3).toBeNull();
+      expect(result3.failed).toBe(1);
       expect(calls).toBe(3);
     } finally {
       await buffer.close();
@@ -202,11 +209,13 @@ describe("MollifierDrainer error handling", () => {
     try {
       await buffer.accept({ runId: "run_nr", envId: "env_a", orgId: "org_1", payload: "{}" });
 
-      await drainer.runOnce();
+      const result = await drainer.runOnce();
 
+      // fail() deletes the entry once the drainer-handler has written
+      // the canonical SYSTEM_FAILURE PG row.
       const entry = await buffer.getEntry("run_nr");
-      expect(entry!.status).toBe("FAILED");
-      expect(entry!.lastError).toEqual({ code: "Error", message: "validation failure" });
+      expect(entry).toBeNull();
+      expect(result.failed).toBe(1);
     } finally {
       await buffer.close();
     }

From 2be2ddd74b3158d5faf8ff1fafeea624926016d3 Mon Sep 17 00:00:00 2001
From: Dan Sutton <dansutton@trigger.dev>
Date: Tue, 26 May 2026 14:10:13 +0100
Subject: [PATCH 3/7] fix(redis-worker): encode mollifier composite-key
 segments + per-claim ownership token

Addresses code-review feedback on the buffer's idempotency keying:

- Encode `envId` / `taskIdentifier` / `idempotencyKey` with base64url
  before concatenation so customer-supplied segments containing `:`
  cannot alias each other onto the same Redis key. Exports
  `idempotencyLookupKeyFor` so tests assert against the same encoding
  the buffer writes.
- Replace the shared `"pending"` claim marker with a caller-supplied
  ownership token (`"pending:<token>"`). `publishClaim` and
  `releaseClaim` become compare-and-set / compare-and-delete via Lua,
  so a late release from a previous claimant whose TTL expired cannot
  erase a new owner's claim.

New buffer tests cover the alias-collision case, the
encoded-key-shape contract, and the token-ownership safety properties
(stale release is a no-op, wrong-token publish is a no-op, fresh
claim survives the post-TTL-expiry stale-release race).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../redis-worker/src/mollifier/buffer.test.ts | 301 +++++++++++++++++-
 packages/redis-worker/src/mollifier/buffer.ts | 148 +++++++--
 packages/redis-worker/src/mollifier/index.ts  |   2 +-
 3 files changed, 420 insertions(+), 31 deletions(-)

diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
index a4c1be35eb3..5ce6be7f09b 100644
--- a/packages/redis-worker/src/mollifier/buffer.test.ts
+++ b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest";
 import { BufferEntrySchema, serialiseSnapshot, deserialiseSnapshot } from "./schemas.js";
 import { redisTest } from "@internal/testcontainers";
 import { Logger } from "@trigger.dev/core/logger";
-import { MollifierBuffer } from "./buffer.js";
+import { MollifierBuffer, idempotencyLookupKeyFor } from "./buffer.js";
 
 describe("schemas", () => {
   it("serialiseSnapshot then deserialiseSnapshot is identity for plain objects", () => {
@@ -1110,7 +1110,11 @@ describe("MollifierBuffer idempotency lookup", () => {
         });
         expect(result).toEqual({ kind: "accepted" });
 
-        const lookupKey = "mollifier:idempotency:env_i:my-task:ikey-1";
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "my-task",
+          idempotencyKey: "ikey-1",
+        });
         const stored = await buffer["redis"].get(lookupKey);
         expect(stored).toBe("ri1");
         // -1 = key exists with no TTL set.
@@ -1241,7 +1245,11 @@ describe("MollifierBuffer idempotency lookup", () => {
       });
       try {
         // Plant a stale lookup pointing at a non-existent entry.
-        const lookupKey = "mollifier:idempotency:env_i:t:stale";
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "stale",
+        });
         await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600);
         expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale");
 
@@ -1283,7 +1291,11 @@ describe("MollifierBuffer idempotency lookup", () => {
         await buffer.pop("env_i");
         await buffer.ack("ra1");
 
-        const lookupKey = "mollifier:idempotency:env_i:t:ka";
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "ka",
+        });
         expect(await buffer["redis"].get(lookupKey)).toBeNull();
         const entry = await buffer.getEntry("ra1");
         expect(entry!.materialised).toBe(true);
@@ -1327,7 +1339,11 @@ describe("MollifierBuffer idempotency lookup", () => {
         expect(result.clearedRunId).toBe("rr1");
 
         // Lookup is gone.
-        const lookupKey = "mollifier:idempotency:env_i:t:kr";
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "kr",
+        });
         expect(await buffer["redis"].get(lookupKey)).toBeNull();
 
         // Snapshot's idempotency fields are nulled, other fields kept.
@@ -2028,3 +2044,278 @@ describe("MollifierBuffer.listEntriesForEnv", () => {
     }
   });
 });
+
+// Composite-key safety. The Redis-key builders concatenate
+// `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without
+// per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"`
+// would map to the same key as `taskIdentifier="a"` and
+// `idempotencyKey="b:x"`. base64url encoding has no `:` in its alphabet,
+// so the encoded keys are unique per tuple.
+describe("MollifierBuffer composite-key encoding (collision resistance)", () => {
+  redisTest(
+    "two accepts whose unencoded keys would alias don't collide on the idempotency lookup",
+    { timeout: 30_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Aliased tuples under raw `:` concatenation:
+        //   env_x : "a:b" : "x"   →   "mollifier:idempotency:env_x:a:b:x"
+        //   env_x : "a"   : "b:x" →   "mollifier:idempotency:env_x:a:b:x"
+        const r1 = await buffer.accept({
+          runId: "ck_run_1",
+          envId: "env_x",
+          orgId: "org_1",
+          payload: "{}",
+          taskIdentifier: "a:b",
+          idempotencyKey: "x",
+        });
+        const r2 = await buffer.accept({
+          runId: "ck_run_2",
+          envId: "env_x",
+          orgId: "org_1",
+          payload: "{}",
+          taskIdentifier: "a",
+          idempotencyKey: "b:x",
+        });
+        // Both accepted — no false-positive collision.
+        expect(r1).toEqual({ kind: "accepted" });
+        expect(r2).toEqual({ kind: "accepted" });
+
+        // Each tuple resolves to its own runId.
+        const hit1 = await buffer.lookupIdempotency({
+          envId: "env_x",
+          taskIdentifier: "a:b",
+          idempotencyKey: "x",
+        });
+        const hit2 = await buffer.lookupIdempotency({
+          envId: "env_x",
+          taskIdentifier: "a",
+          idempotencyKey: "b:x",
+        });
+        expect(hit1).toBe("ck_run_1");
+        expect(hit2).toBe("ck_run_2");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "encoded lookup key contains no ':' separator beyond the namespace",
+    { timeout: 20_000 },
+    async () => {
+      // Pure-function test — verifies the encoding bijection without
+      // needing a live buffer. Re-uses the redisTest fixture for
+      // parallelism with other describe blocks but doesn't touch redis.
+      const key = idempotencyLookupKeyFor({
+        envId: "env_x",
+        taskIdentifier: "a:b",
+        idempotencyKey: "x:y:z",
+      });
+      // namespace prefix is exactly `mollifier:idempotency:` (two `:`),
+      // then three base64url segments separated by two more `:` —
+      // never the customer-supplied colons.
+      const colonCount = key.split(":").length - 1;
+      expect(colonCount).toBe(4);
+      // base64url alphabet has no `:`, `+`, `/`, or `=`.
+      const afterNamespace = key.slice("mollifier:idempotency:".length);
+      expect(afterNamespace).toMatch(/^[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+$/);
+    },
+  );
+});
+
+// Pre-gate claim ownership protection. The claim slot stores
+// `"pending:<token>"` so publish and release compare-and-act on the
+// caller's token — a late release from a previous claimant whose TTL
+// expired cannot erase a new owner's claim.
+describe("MollifierBuffer pre-gate claim — ownership token safety", () => {
+  const claimInput = {
+    envId: "env_c",
+    taskIdentifier: "task_c",
+    idempotencyKey: "key_c",
+  };
+
+  redisTest(
+    "claimIdempotency: first caller gets 'claimed', second concurrent caller gets 'pending'",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const first = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "token-A",
+          ttlSeconds: 30,
+        });
+        expect(first.kind).toBe("claimed");
+
+        // Second concurrent caller with a different token sees pending.
+        const second = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "token-B",
+          ttlSeconds: 30,
+        });
+        expect(second.kind).toBe("pending");
+
+        // readClaim distinguishes pending from resolved without leaking
+        // the token to the loser.
+        const read = await buffer.readClaim(claimInput);
+        expect(read?.kind).toBe("pending");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "releaseClaim with the wrong token is a no-op (compare-and-delete)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 });
+
+        // Pretend a stale claimant fires a release with their old token.
+        await buffer.releaseClaim({ ...claimInput, token: "stale-impostor" });
+
+        // The owner's claim survives.
+        const stillThere = await buffer.readClaim(claimInput);
+        expect(stillThere?.kind).toBe("pending");
+
+        // The owner can still release.
+        await buffer.releaseClaim({ ...claimInput, token: "owner" });
+        expect(await buffer.readClaim(claimInput)).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "publishClaim with the wrong token is a no-op and returns false",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 });
+
+        const wrongTokenPublish = await buffer.publishClaim({
+          ...claimInput,
+          token: "stale-impostor",
+          runId: "imposter-run",
+          ttlSeconds: 60,
+        });
+        expect(wrongTokenPublish).toBe(false);
+
+        // Claim slot unchanged.
+        const stillPending = await buffer.readClaim(claimInput);
+        expect(stillPending?.kind).toBe("pending");
+
+        const goodPublish = await buffer.publishClaim({
+          ...claimInput,
+          token: "owner",
+          runId: "real-run",
+          ttlSeconds: 60,
+        });
+        expect(goodPublish).toBe(true);
+
+        const resolved = await buffer.readClaim(claimInput);
+        expect(resolved).toEqual({ kind: "resolved", runId: "real-run" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "regression: stale release after TTL expiry does NOT erase a fresh claim",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Hazard from CodeRabbit r3290070707:
+      //   1. Claimant A SETNXs the slot with their token, then stalls.
+      //   2. TTL expires, slot vanishes.
+      //   3. Claimant B SETNXs the slot with a DIFFERENT token.
+      //   4. Claimant A finally finishes (or errors) and calls
+      //      releaseClaim with their original token.
+      // Without compare-and-delete, A's release would wipe B's slot and
+      // any concurrent customer of B's idempotency key would see "no
+      // claim" and re-issue, breaking same-key dedup.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Step 1: A claims with token "A".
+        const a = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "A",
+          ttlSeconds: 1, // short TTL to simulate expiry quickly
+        });
+        expect(a.kind).toBe("claimed");
+
+        // Step 2: simulate TTL expiry — DEL the slot directly so the
+        // test doesn't rely on wall-clock sleeping.
+        await buffer["redis"].del(`mollifier:claim:${[claimInput.envId, claimInput.taskIdentifier, claimInput.idempotencyKey]
+          .map((s) => Buffer.from(s, "utf8").toString("base64url"))
+          .join(":")}`);
+
+        // Step 3: B claims with token "B".
+        const b = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "B",
+          ttlSeconds: 30,
+        });
+        expect(b.kind).toBe("claimed");
+
+        // Step 4: A's late release. MUST be a no-op.
+        await buffer.releaseClaim({ ...claimInput, token: "A" });
+
+        // B's claim survives intact.
+        const after = await buffer.readClaim(claimInput);
+        expect(after?.kind).toBe("pending");
+
+        // B can still publish.
+        const published = await buffer.publishClaim({
+          ...claimInput,
+          token: "B",
+          runId: "B-run",
+          ttlSeconds: 60,
+        });
+        expect(published).toBe(true);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
index fd53f59efea..37752d21623 100644
--- a/packages/redis-worker/src/mollifier/buffer.ts
+++ b/packages/redis-worker/src/mollifier/buffer.ts
@@ -43,21 +43,38 @@ export type IdempotencyLookupInput = {
   idempotencyKey: string;
 };
 
-function makeIdempotencyLookupKey(input: IdempotencyLookupInput): string {
-  return `mollifier:idempotency:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`;
+// Reversible encoding for Redis-key segments. The composite-key builders
+// concatenate `envId`, `taskIdentifier`, and `idempotencyKey` with `:`
+// separators; if any segment contains a literal `:` (envId is internal
+// and `:`-free, but taskIdentifier and idempotencyKey are
+// customer-supplied) different tuples would map to the same Redis key
+// and dedupe the wrong run. base64url has no `:` in its alphabet and is
+// bijective on the input string, so the encoded keys are
+// collision-free.
+function encodeKeyPart(value: string): string {
+  return Buffer.from(value, "utf8").toString("base64url");
+}
+
+// Exported so tests can compute the same Redis key the buffer writes
+// without hard-coding the encoding (which is a buffer-internal detail).
+export function idempotencyLookupKeyFor(input: IdempotencyLookupInput): string {
+  return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
 }
 
 // Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the
 // existing B6a buffer-side dedup stays isolated. The claim is the
 // authoritative cross-store "this idempotency key is in flight or
-// resolved" pointer used by the trigger hot path
-// (`_plans/2026-05-21-mollifier-idempotency-claim.md`). Values:
-//   "pending"   → a trigger pipeline owns the key and hasn't published yet
-//   <runId>     → the winning trigger's runId (resolved)
-export const IDEMPOTENCY_CLAIM_PENDING = "pending";
+// resolved" pointer used by the trigger hot path. Values:
+//   "pending:<token>"  → claimed by a trigger pipeline; `<token>` is the
+//                        caller-supplied ownership token. Release and
+//                        publish compare-and-act on this token so a
+//                        late release from a previous claimant whose TTL
+//                        expired cannot erase a new owner's claim.
+//   <runId>            → the winning trigger's resolved runId.
+const PENDING_PREFIX = "pending:";
 
 function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string {
-  return `mollifier:claim:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`;
+  return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
 }
 
 export type IdempotencyClaimResult =
@@ -125,7 +142,7 @@ export class MollifierBuffer {
     const createdAtMicros = nowMs * 1000;
     const idempotencyLookupKey =
       input.idempotencyKey && input.taskIdentifier
-        ? makeIdempotencyLookupKey({
+        ? idempotencyLookupKeyFor({
             envId: input.envId,
             taskIdentifier: input.taskIdentifier,
             idempotencyKey: input.idempotencyKey,
@@ -359,8 +376,12 @@ export class MollifierBuffer {
   // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One
   // call across both PG and buffer paths serialises through this claim;
   // closes the race the buffer-side B6a SETNX leaves open during the
-  // gate-transition burst window (see
-  // `_plans/2026-05-21-mollifier-idempotency-claim.md`).
+  // gate-transition burst window.
+  //
+  // The caller supplies an opaque `token` (UUID) on claim. The same token
+  // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and-
+  // act so a late release from a previous claimant whose TTL expired
+  // cannot erase a new owner's claim.
   //
   // - "claimed": we now own the claim, the caller proceeds with the
   //   trigger pipeline and must `publishClaim` on success or
@@ -370,12 +391,13 @@ export class MollifierBuffer {
   // - "resolved": the claim already holds a runId; the caller can
   //   return that runId as a cached hit.
   async claimIdempotency(
-    input: IdempotencyLookupInput & { ttlSeconds: number },
+    input: IdempotencyLookupInput & { token: string; ttlSeconds: number },
   ): Promise<IdempotencyClaimResult> {
     const claimKey = makeIdempotencyClaimKey(input);
     const raw = (await this.redis.claimMollifierIdempotency(
       claimKey,
-      IDEMPOTENCY_CLAIM_PENDING,
+      `${PENDING_PREFIX}${input.token}`,
+      PENDING_PREFIX,
       String(input.ttlSeconds),
     )) as string;
     if (raw === "claimed") return { kind: "claimed" };
@@ -389,18 +411,39 @@ export class MollifierBuffer {
   // Publish the winning runId to the claim so subsequent claimants /
   // waiters see "resolved". TTL bounded by the customer's
   // `idempotencyKeyExpiresAt` minus now; caller computes.
+  //
+  // Compare-and-set on the caller's token: if the current value isn't
+  // our pending marker (TTL expired and another claimant moved in, or
+  // someone else already published), the publish is a no-op. The caller
+  // can treat any such case as "we lost the claim" and re-read.
+  // Returns true if we published; false if the claim slot was no longer
+  // ours.
   async publishClaim(
-    input: IdempotencyLookupInput & { runId: string; ttlSeconds: number },
-  ): Promise<void> {
+    input: IdempotencyLookupInput & { token: string; runId: string; ttlSeconds: number },
+  ): Promise<boolean> {
     const claimKey = makeIdempotencyClaimKey(input);
-    await this.redis.set(claimKey, input.runId, "EX", input.ttlSeconds);
+    const result = (await this.redis.publishMollifierClaim(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+      input.runId,
+      String(input.ttlSeconds),
+    )) as number;
+    return result === 1;
   }
 
   // Release the claim on pipeline error so waiters can re-claim and
   // retry. Idempotent.
-  async releaseClaim(input: IdempotencyLookupInput): Promise<void> {
+  //
+  // Compare-and-delete on the caller's token: only deletes if the
+  // current value is exactly our pending marker. A late release from a
+  // claimant whose TTL expired is a no-op, so a new owner's claim is
+  // never wiped by a slow predecessor.
+  async releaseClaim(input: IdempotencyLookupInput & { token: string }): Promise<void> {
     const claimKey = makeIdempotencyClaimKey(input);
-    await this.redis.del(claimKey);
+    await this.redis.releaseMollifierClaim(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+    );
   }
 
   // Read the current claim value, used by the wait/poll loop on losers
@@ -409,7 +452,7 @@ export class MollifierBuffer {
     const claimKey = makeIdempotencyClaimKey(input);
     const value = await this.redis.get(claimKey);
     if (value === null) return null;
-    if (value === IDEMPOTENCY_CLAIM_PENDING) return { kind: "pending" };
+    if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" };
     return { kind: "resolved", runId: value };
   }
 
@@ -419,7 +462,7 @@ export class MollifierBuffer {
   // lookup self-heals: if the lookup points at an entry hash that's
   // expired, we DEL the lookup and report a miss.
   async lookupIdempotency(input: IdempotencyLookupInput): Promise<string | null> {
-    const lookupKey = makeIdempotencyLookupKey(input);
+    const lookupKey = idempotencyLookupKeyFor(input);
     const runId = await this.redis.get(lookupKey);
     if (!runId) return null;
     const entry = await this.getEntry(runId);
@@ -435,7 +478,7 @@ export class MollifierBuffer {
   // `updateMany`. Returns the runId that was cleared, or null if no
   // buffered run held this key.
   async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> {
-    const lookupKey = makeIdempotencyLookupKey(input);
+    const lookupKey = idempotencyLookupKeyFor(input);
     const clearedRunId = (await this.redis.resetMollifierIdempotency(
       lookupKey,
       "mollifier:entries:",
@@ -720,23 +763,65 @@ export class MollifierBuffer {
       numberOfKeys: 1,
       lua: `
         local claimKey = KEYS[1]
-        local pending = ARGV[1]
-        local ttl = tonumber(ARGV[2])
+        local pendingMarker = ARGV[1]   -- "pending:<caller-token>"
+        local pendingPrefix = ARGV[2]   -- "pending:"
+        local ttl = tonumber(ARGV[3])
 
         -- SETNX-with-TTL: atomic; only one caller can win.
-        local won = redis.call('SET', claimKey, pending, 'NX', 'EX', ttl)
+        local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl)
         if won then
           return 'claimed'
         end
 
         local existing = redis.call('GET', claimKey)
-        if existing == pending then
+        -- Any "pending:*" value is a live claim — the caller-supplied
+        -- token differentiates ownership but is opaque to losers.
+        if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then
           return 'pending'
         end
         return 'resolved:' .. existing
       `,
     });
 
+    // Publish a winning runId to a claim slot we own. Compare-and-set on
+    // the caller's pending marker: if the slot is no longer ours (TTL
+    // expired and another claimant moved in, or already resolved by
+    // someone else), we no-op. Returns 1 on publish, 0 on no-op.
+    this.redis.defineCommand("publishMollifierClaim", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local ownerMarker = ARGV[1]   -- "pending:<our-token>"
+        local runId = ARGV[2]
+        local ttl = tonumber(ARGV[3])
+
+        local existing = redis.call('GET', claimKey)
+        if existing == ownerMarker then
+          redis.call('SET', claimKey, runId, 'EX', ttl)
+          return 1
+        end
+        return 0
+      `,
+    });
+
+    // Release a claim slot we own. Compare-and-delete on the caller's
+    // pending marker: a late release from a previous claimant whose TTL
+    // expired is a no-op, so a new owner's claim is never wiped.
+    this.redis.defineCommand("releaseMollifierClaim", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local ownerMarker = ARGV[1]   -- "pending:<our-token>"
+
+        local existing = redis.call('GET', claimKey)
+        if existing == ownerMarker then
+          redis.call('DEL', claimKey)
+          return 1
+        end
+        return 0
+      `,
+    });
+
     this.redis.defineCommand("resetMollifierIdempotency", {
       numberOfKeys: 1,
       lua: `
@@ -968,9 +1053,22 @@ declare module "@internal/redis" {
     claimMollifierIdempotency(
       claimKey: string,
       pendingMarker: string,
+      pendingPrefix: string,
       ttlSeconds: string,
       callback?: Callback<string>,
     ): Result<string, Context>;
+    publishMollifierClaim(
+      claimKey: string,
+      ownerMarker: string,
+      runId: string,
+      ttlSeconds: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
+    releaseMollifierClaim(
+      claimKey: string,
+      ownerMarker: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
     ackMollifierEntry(
       entryKey: string,
       graceTtlSeconds: string,
diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts
index 2751a6615eb..6e93f105fee 100644
--- a/packages/redis-worker/src/mollifier/index.ts
+++ b/packages/redis-worker/src/mollifier/index.ts
@@ -6,7 +6,7 @@ export {
   type CasSetMetadataResult,
   type IdempotencyClaimResult,
   type IdempotencyLookupInput,
-  IDEMPOTENCY_CLAIM_PENDING,
+  idempotencyLookupKeyFor,
 } from "./buffer.js";
 export {
   MollifierDrainer,

From 0a22252aaeab8c00fff68392d4dfecd0f4ac6f26 Mon Sep 17 00:00:00 2001
From: Daniel Sutton <dansutton@trigger.dev>
Date: Wed, 27 May 2026 12:58:34 +0100
Subject: [PATCH 4/7] docs(changeset): scope mollifier buffer-extensions
 changeset to redis-worker

Drop the @trigger.dev/core entry and notice-field claim (no core change
in this PR) and trim drainer-replay wording that belongs to a later PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .changeset/mollifier-buffer-extensions.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md
index b1f38f51ecc..2334c6d70b6 100644
--- a/.changeset/mollifier-buffer-extensions.md
+++ b/.changeset/mollifier-buffer-extensions.md
@@ -1,6 +1,5 @@
 ---
 "@trigger.dev/redis-worker": minor
-"@trigger.dev/core": patch
 ---
 
-Mollifier buffer feature set built on top of the initial primitives: idempotency-lookup with SETNX dedup, atomic snapshot-mutation API (`mutateSnapshot` with tag/metadata/delay/cancel patches), metadata CAS for lossless concurrent updates, watermark-paginated listing, claim primitives for pre-gate idempotency, ZSET-backed per-env queue, 30s post-ack grace TTL, and drop the accept-time entry TTL (drainer is now the only removal mechanism). `@trigger.dev/core` gains an optional `notice` field on the trigger response so the SDK can surface mollifier-queued guidance to customers.
+Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, paginated listing, claim primitives, and a `MollifierSnapshot` type.

From 8febda21470c392a7d2483446ced7637c77d8ce8 Mon Sep 17 00:00:00 2001
From: Daniel Sutton <dansutton@trigger.dev>
Date: Wed, 27 May 2026 13:51:38 +0100
Subject: [PATCH 5/7] fix(redis-worker): mollifier CAS isolation, reset claim
 cleanup + test gaps

Close review findings on the mollifier buffer extensions:

- mutateSnapshot(set_metadata) now bumps metadataVersion so a concurrent
  casSetMetadata conflicts instead of clobbering under a stale version
- resetIdempotency clears the pre-gate claim slot, not just the lookup,
  so a resolved/pending claim can't keep deduping past reset
- claimMollifierIdempotency guards a falsy GET before string.sub
- re-export AcceptResult (accept's public return type) and
  makeIdempotencyClaimKey from the package index
- correct stale "TTL expired" comments (no accept-time TTL anymore) and
  the mutateSnapshot doc (FAILED returns not_found, not busy)

Tests: listForEnvWithWatermark coverage (page-1, page-N, tied-score
continuation, orphan skip, empty/guard), fail clears idempotency lookup,
casSetMetadata busy on materialised entry, set_metadata/CAS version
interleave, resetIdempotency clears claim slot.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../redis-worker/src/mollifier/buffer.test.ts | 374 +++++++++++++++++-
 packages/redis-worker/src/mollifier/buffer.ts |  66 +++-
 packages/redis-worker/src/mollifier/index.ts  |   2 +
 3 files changed, 421 insertions(+), 21 deletions(-)

diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
index 5ce6be7f09b..e9e004fb851 100644
--- a/packages/redis-worker/src/mollifier/buffer.test.ts
+++ b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest";
 import { BufferEntrySchema, serialiseSnapshot, deserialiseSnapshot } from "./schemas.js";
 import { redisTest } from "@internal/testcontainers";
 import { Logger } from "@trigger.dev/core/logger";
-import { MollifierBuffer, idempotencyLookupKeyFor } from "./buffer.js";
+import { MollifierBuffer, idempotencyLookupKeyFor, makeIdempotencyClaimKey } from "./buffer.js";
 
 describe("schemas", () => {
   it("serialiseSnapshot then deserialiseSnapshot is identity for plain objects", () => {
@@ -397,6 +397,62 @@ describe("MollifierBuffer.fail", () => {
       }
     },
   );
+
+  redisTest(
+    "fail DELs the idempotency lookup so a same-key retry goes through to PG",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Symmetric with the ack path: the failMollifierEntry Lua reads the
+      // idempotencyLookupKey off the hash and DELs it. Without this, a
+      // post-fail retry with the same idempotency key would hit the
+      // orphaned dedup record and resolve to a run that no longer exists.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({
+          runId: "run_fk",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "kf",
+          taskIdentifier: "t",
+        });
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_a",
+          taskIdentifier: "t",
+          idempotencyKey: "kf",
+        });
+        // Lookup exists before fail.
+        expect(await buffer["redis"].get(lookupKey)).toBe("run_fk");
+
+        await buffer.pop("env_a");
+        const failed = await buffer.fail("run_fk", { code: "VALIDATION", message: "boom" });
+        expect(failed).toBe(true);
+
+        // Lookup is cleared, so the slot is reclaimable: a fresh accept
+        // with the same tuple succeeds rather than deduping.
+        expect(await buffer["redis"].get(lookupKey)).toBeNull();
+        const reaccept = await buffer.accept({
+          runId: "run_fk2",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "kf",
+          taskIdentifier: "t",
+        });
+        expect(reaccept).toEqual({ kind: "accepted" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer TTL", () => {
@@ -1387,6 +1443,49 @@ describe("MollifierBuffer idempotency lookup", () => {
       }
     },
   );
+
+  redisTest(
+    "resetIdempotency also clears the pre-gate claim slot",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The lookup and the cross-store claim are two pointers for the same
+      // key. Reset must reopen both — otherwise a resolved/pending claim
+      // keeps deduping new triggers for the rest of its TTL even though
+      // the binding was reset.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      const tuple = { envId: "env_rc", taskIdentifier: "t", idempotencyKey: "krc" };
+      try {
+        // A resolved claim is in place...
+        await buffer.claimIdempotency({ ...tuple, token: "owner", ttlSeconds: 600 });
+        await buffer.publishClaim({ ...tuple, token: "owner", runId: "rc1", ttlSeconds: 600 });
+        expect(await buffer.readClaim(tuple)).toEqual({ kind: "resolved", runId: "rc1" });
+        // ...alongside a buffered run holding the lookup.
+        await buffer.accept({
+          runId: "rc1",
+          envId: "env_rc",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+          idempotencyKey: "krc",
+          taskIdentifier: "t",
+        });
+
+        await buffer.resetIdempotency(tuple);
+
+        // Both the lookup and the claim are gone.
+        expect(await buffer.lookupIdempotency(tuple)).toBeNull();
+        expect(await buffer.readClaim(tuple)).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer.casSetMetadata", () => {
@@ -1507,6 +1606,96 @@ describe("MollifierBuffer.casSetMetadata", () => {
       }
     },
   );
+
+  redisTest(
+    "returns busy on a materialised entry (post-ack grace window)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The guard rejects `materialised == 'true'` as well as non-QUEUED
+      // status. After ack the entry lingers QUEUED-but-materialised for
+      // the grace TTL; a CAS in that window must not mutate it.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "cas_mat",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+        });
+        await buffer.pop("env_c");
+        await buffer.ack("cas_mat");
+
+        const result = await buffer.casSetMetadata({
+          runId: "cas_mat",
+          expectedVersion: 0,
+          newMetadata: "{}",
+          newMetadataType: "application/json",
+        });
+        expect(result).toEqual({ kind: "busy" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "a mutateSnapshot set_metadata bumps metadataVersion so an in-flight CAS conflicts",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // CAS isolation: a reader fetches version N, then a concurrent
+      // mutateSnapshot(set_metadata) overwrites the metadata. The reader's
+      // CAS at expectedVersion=N must NOT silently win — both paths write
+      // payload.metadata, so set_metadata bumps the same counter the CAS
+      // is gated on.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "cas_int",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ metadata: '{"v":0}', metadataType: "application/json" }),
+        });
+        // Reader observes version 0.
+        const before = await buffer.getEntry("cas_int");
+        expect(before!.metadataVersion).toBe(0);
+
+        // Concurrent snapshot mutation writes metadata + bumps version.
+        const mutated = await buffer.mutateSnapshot("cas_int", {
+          type: "set_metadata",
+          metadata: '{"v":1}',
+          metadataType: "application/json",
+        });
+        expect(mutated).toBe("applied_to_snapshot");
+        const mid = await buffer.getEntry("cas_int");
+        expect(mid!.metadataVersion).toBe(1);
+
+        // The reader's stale CAS conflicts instead of clobbering.
+        const result = await buffer.casSetMetadata({
+          runId: "cas_int",
+          expectedVersion: 0,
+          newMetadata: '{"v":2}',
+          newMetadataType: "application/json",
+        });
+        expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer.mutateSnapshot", () => {
@@ -2045,6 +2234,181 @@ describe("MollifierBuffer.listEntriesForEnv", () => {
   });
 });
 
+describe("MollifierBuffer.listForEnvWithWatermark", () => {
+  // Seed a QUEUED entry, then pin its ZSET score and hash `createdAtMicros`
+  // to a deterministic value so ordering and the watermark cursor don't
+  // depend on wall-clock timing (Date.now() ties within a millisecond).
+  async function seed(buffer: MollifierBuffer, envId: string, runId: string, micros: number) {
+    await buffer.accept({ runId, envId, orgId: "org_1", payload: "{}" });
+    await buffer["redis"].zadd(`mollifier:queue:${envId}`, String(micros), runId);
+    await buffer["redis"].hset(`mollifier:entries:${runId}`, "createdAtMicros", String(micros));
+  }
+
+  redisTest("pageSize <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => {
+    const buffer = new MollifierBuffer({
+      redisOptions: {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      },
+      logger: new Logger("test", "log"),
+    });
+    try {
+      expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 0 })).toEqual([]);
+      expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: -3 })).toEqual([]);
+    } finally {
+      await buffer.close();
+    }
+  });
+
+  redisTest(
+    "page 1 returns newest-first up to pageSize without consuming entries",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await seed(buffer, "env_w", "wa", 1000);
+        await seed(buffer, "env_w", "wb", 2000);
+        await seed(buffer, "env_w", "wc", 3000);
+
+        const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 });
+        // Newest-first by createdAtMicros.
+        expect(page.map((e) => e.runId)).toEqual(["wc", "wb"]);
+
+        // Non-destructive: drainer still pops oldest-first.
+        const popped: string[] = [];
+        for (let i = 0; i < 3; i++) {
+          const e = await buffer.pop("env_w");
+          if (e) popped.push(e.runId);
+        }
+        expect(popped).toEqual(["wa", "wb", "wc"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "page N continues strictly below the watermark score",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await seed(buffer, "env_w", "wa", 1000);
+        await seed(buffer, "env_w", "wb", 2000);
+        await seed(buffer, "env_w", "wc", 3000);
+
+        const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 });
+        expect(page1.map((e) => e.runId)).toEqual(["wc", "wb"]);
+
+        const last = page1[page1.length - 1]!;
+        const page2 = await buffer.listForEnvWithWatermark({
+          envId: "env_w",
+          pageSize: 2,
+          watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId },
+        });
+        // Only the entry strictly below score 2000 remains; no overlap.
+        expect(page2.map((e) => e.runId)).toEqual(["wa"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "tied-score watermark surfaces lex-smaller members on the next page without dupes",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Three entries share one score; ZSET breaks the tie by member,
+        // and zrevrangebyscore returns them member-DESC: tc, tb, ta.
+        await seed(buffer, "env_w", "ta", 2000);
+        await seed(buffer, "env_w", "tb", 2000);
+        await seed(buffer, "env_w", "tc", 2000);
+
+        const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 });
+        expect(page1.map((e) => e.runId)).toEqual(["tc", "tb"]);
+
+        const last = page1[page1.length - 1]!;
+        const page2 = await buffer.listForEnvWithWatermark({
+          envId: "env_w",
+          pageSize: 2,
+          watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId },
+        });
+        // Same score, lex-smaller than the "tb" anchor — and not "tb"
+        // itself (no duplicate across the page boundary).
+        expect(page2.map((e) => e.runId)).toEqual(["ta"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "skips orphan queue references (entry hash gone) during listing",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await seed(buffer, "env_w", "live", 1000);
+        await seed(buffer, "env_w", "orphan", 2000);
+        // Drop the orphan's hash but leave its queue ref behind.
+        await buffer["redis"].del("mollifier:entries:orphan");
+
+        const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 10 });
+        expect(page.map((e) => e.runId)).toEqual(["live"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest("returns empty for an env with no queued entries", { timeout: 20_000 }, async ({ redisContainer }) => {
+    const buffer = new MollifierBuffer({
+      redisOptions: {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      },
+      logger: new Logger("test", "log"),
+    });
+    try {
+      expect(await buffer.listForEnvWithWatermark({ envId: "env_empty_w", pageSize: 10 })).toEqual([]);
+    } finally {
+      await buffer.close();
+    }
+  });
+});
+
 // Composite-key safety. The Redis-key builders concatenate
 // `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without
 // per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"`
@@ -2285,10 +2649,10 @@ describe("MollifierBuffer pre-gate claim — ownership token safety", () => {
         expect(a.kind).toBe("claimed");
 
         // Step 2: simulate TTL expiry — DEL the slot directly so the
-        // test doesn't rely on wall-clock sleeping.
-        await buffer["redis"].del(`mollifier:claim:${[claimInput.envId, claimInput.taskIdentifier, claimInput.idempotencyKey]
-          .map((s) => Buffer.from(s, "utf8").toString("base64url"))
-          .join(":")}`);
+        // test doesn't rely on wall-clock sleeping. Targets the same key
+        // the buffer writes via the exported builder, so a key-format
+        // change can't silently make this DEL miss.
+        await buffer["redis"].del(makeIdempotencyClaimKey(claimInput));
 
         // Step 3: B claims with token "B".
         const b = await buffer.claimIdempotency({
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
index 37752d21623..e8e413fd5dd 100644
--- a/packages/redis-worker/src/mollifier/buffer.ts
+++ b/packages/redis-worker/src/mollifier/buffer.ts
@@ -73,7 +73,9 @@ export function idempotencyLookupKeyFor(input: IdempotencyLookupInput): string {
 //   <runId>            → the winning trigger's resolved runId.
 const PENDING_PREFIX = "pending:";
 
-function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string {
+// Exported (like `idempotencyLookupKeyFor`) so tests can target the same
+// claim key the buffer writes without hard-coding the encoding.
+export function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string {
   return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
 }
 
@@ -303,8 +305,9 @@ export class MollifierBuffer {
   // the dashboard's "Recently queued" surface — non-destructive, so the
   // drainer still pops these entries in order. Returns up to `maxCount`
   // entries newest-first (highest score, which is `createdAtMicros`).
-  // Each entry hash is fetched separately; a `null` from getEntry (TTL
-  // expired between ZREVRANGE and HGETALL) is skipped.
+  // Each entry hash is fetched separately; a `null` from getEntry (entry
+  // torn down by a concurrent drainer ack/fail between ZREVRANGE and
+  // HGETALL) is skipped.
   async listEntriesForEnv(envId: string, maxCount: number): Promise<BufferEntry[]> {
     if (maxCount <= 0) return [];
     const runIds = await this.redis.zrevrange(
@@ -325,8 +328,9 @@ export class MollifierBuffer {
   // the buffer. Three outcomes:
   //   - "applied_to_snapshot": entry was QUEUED + not materialised; the
   //     drainer will read the patched payload on its next pop.
-  //   - "not_found": no entry hash exists for this runId.
-  //   - "busy": entry is DRAINING / FAILED / materialised. The API
+  //   - "not_found": no entry hash exists for this runId — including a
+  //     FAILED entry, whose hash the drainer-terminal `fail` path DELs.
+  //   - "busy": entry is DRAINING or materialised. The API
   //     wait-and-bounces through PG (Q3 design).
   async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise<MutateSnapshotResult> {
     const result = (await this.redis.mutateMollifierSnapshot(
@@ -479,9 +483,11 @@ export class MollifierBuffer {
   // buffered run held this key.
   async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> {
     const lookupKey = idempotencyLookupKeyFor(input);
+    const claimKey = makeIdempotencyClaimKey(input);
     const clearedRunId = (await this.redis.resetMollifierIdempotency(
       lookupKey,
       "mollifier:entries:",
+      claimKey,
     )) as string;
     return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null };
   }
@@ -508,9 +514,12 @@ export class MollifierBuffer {
     );
   }
 
-  // Returns true if the entry transitioned to FAILED; false if the entry no
-  // longer exists (TTL expired between pop and fail). Caller can use the
-  // boolean to skip downstream FAILED handling for ghost entries.
+  // Returns true if a live entry was torn down; false if the entry no
+  // longer existed (a concurrent ack or manual cleanup removed it between
+  // pop and fail — there is no accept-time TTL). Note FAILED is not an
+  // observable state: the Lua marks the hash FAILED then DELs it in the
+  // same atomic script, so a subsequent getEntry returns null. Caller can
+  // use the boolean to skip downstream FAILED handling for ghost entries.
   async fail(runId: string, error: { code: string; message: string }): Promise<boolean> {
     const result = await this.redis.failMollifierEntry(
       `mollifier:entries:${runId}`,
@@ -679,10 +688,11 @@ export class MollifierBuffer {
           end
         end
 
-        -- Loop to skip orphan queue references — runIds whose entry hash has
-        -- expired (TTL hit). HSET on a missing key would CREATE a partial
-        -- hash without a TTL, leaking memory. The loop is bounded by queue
-        -- length; entire Lua script remains atomic.
+        -- Loop to skip orphan queue references — runIds whose entry hash is
+        -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry
+        -- no TTL of their own). HSET on a missing key would CREATE a
+        -- partial hash without a TTL, leaking memory. The loop is bounded
+        -- by queue length; entire Lua script remains atomic.
         while true do
           -- ZPOPMIN returns {member, score} as a flat array, or {} when empty.
           local popped = redis.call('ZPOPMIN', queueKey)
@@ -710,8 +720,8 @@ export class MollifierBuffer {
             end
             return cjson.encode(result)
           end
-          -- Orphan queue reference: entry TTL expired while runId was queued.
-          -- Discard the reference and loop to the next.
+          -- Orphan queue reference: entry hash gone (evicted) while runId
+          -- was queued. Discard the reference and loop to the next.
         end
       `,
     });
@@ -774,6 +784,13 @@ export class MollifierBuffer {
         end
 
         local existing = redis.call('GET', claimKey)
+        if not existing then
+          -- The slot expired in the race window between the SET NX
+          -- failing and this GET. It's free now — claim it so we don't
+          -- string.sub a nil and error out.
+          redis.call('SET', claimKey, pendingMarker, 'EX', ttl)
+          return 'claimed'
+        end
         -- Any "pending:*" value is a live claim — the caller-supplied
         -- token differentiates ownership but is opaque to losers.
         if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then
@@ -827,6 +844,15 @@ export class MollifierBuffer {
       lua: `
         local lookupKey = KEYS[1]
         local entryPrefix = ARGV[1]
+        local claimKey = ARGV[2]
+
+        -- Reset reopens the key across BOTH the buffer lookup and the
+        -- cross-store pre-gate claim pointer. Without clearing the claim,
+        -- a resolved/pending claim would keep deduping new triggers for
+        -- the rest of its TTL even though the binding was reset. DEL is
+        -- unconditional — the claim is gone regardless of whether a
+        -- buffered run currently holds the lookup.
+        redis.call('DEL', claimKey)
 
         local runId = redis.call('GET', lookupKey)
         if not runId then
@@ -905,6 +931,12 @@ export class MollifierBuffer {
         elseif patch.type == 'set_metadata' then
           payload.metadata = patch.metadata
           payload.metadataType = patch.metadataType
+          -- Bump the denormalised metadataVersion so an in-flight
+          -- casSetMetadata (optimistic CAS keyed on this counter) sees
+          -- the concurrent write as a version conflict and retries,
+          -- instead of clobbering it under a now-stale expectedVersion.
+          local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0
+          redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1))
         elseif patch.type == 'set_delay' then
           payload.delayUntil = patch.delayUntil
         elseif patch.type == 'mark_cancelled' then
@@ -925,8 +957,9 @@ export class MollifierBuffer {
         local entryKey = KEYS[1]
         local graceTtlSeconds = tonumber(ARGV[1])
 
-        -- Guard: never create a partial entry. If the hash expired between
-        -- pop and ack, the run is gone — nothing to mark materialised.
+        -- Guard: never create a partial entry. If the hash is gone between
+        -- pop and ack (concurrent fail or eviction — QUEUED entries carry
+        -- no TTL), the run is gone, nothing to mark materialised.
         if redis.call('EXISTS', entryKey) == 0 then
           return 0
         end
@@ -1048,6 +1081,7 @@ declare module "@internal/redis" {
     resetMollifierIdempotency(
       lookupKey: string,
       entryPrefix: string,
+      claimKey: string,
       callback?: Callback<string>,
     ): Result<string, Context>;
     claimMollifierIdempotency(
diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts
index 6e93f105fee..6147b522603 100644
--- a/packages/redis-worker/src/mollifier/index.ts
+++ b/packages/redis-worker/src/mollifier/index.ts
@@ -2,11 +2,13 @@ export {
   MollifierBuffer,
   type MollifierBufferOptions,
   type SnapshotPatch,
+  type AcceptResult,
   type MutateSnapshotResult,
   type CasSetMetadataResult,
   type IdempotencyClaimResult,
   type IdempotencyLookupInput,
   idempotencyLookupKeyFor,
+  makeIdempotencyClaimKey,
 } from "./buffer.js";
 export {
   MollifierDrainer,

From 612de3b0f0b659098bddff664be2c4ec4d61e65a Mon Sep 17 00:00:00 2001
From: Daniel Sutton <dansutton@trigger.dev>
Date: Wed, 27 May 2026 15:05:10 +0100
Subject: [PATCH 6/7] refactor(redis-worker): mollifier queue ZSET -> LIST,
 drop dead watermark listing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The watermark listing (listForEnvWithWatermark) had zero consumers across
the whole stack — the dashboard renders buffered runs per-run via getEntry,
never a paginated queue list. With it gone, nothing left needs a sorted
set: FIFO drain is just insertion order, the stale sweep enumerates
without caring about order, and delay is materialised into PG at drain
time (never a queue sort key).

Revert the per-env queue to a Redis LIST: LPUSH on accept, RPOP on pop
(FIFO), RPUSH on requeue (transiently-failed entry pops next), LLEN for
the empty check, LRANGE for the stale-sweep enumeration. O(1) instead of
O(log N), simpler Lua. createdAtMicros stays a hash field for dwell
metrics; it is no longer a sort key.

Removes listForEnvWithWatermark + its tests; rewrites the ZSET-storage
tests as LIST-storage tests; updates orphan/requeue tests for RPOP/RPUSH.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .changeset/mollifier-buffer-extensions.md     |   2 +-
 .../redis-worker/src/mollifier/buffer.test.ts | 270 +++---------------
 packages/redis-worker/src/mollifier/buffer.ts | 132 ++-------
 3 files changed, 72 insertions(+), 332 deletions(-)

diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md
index 2334c6d70b6..109ea5c3d20 100644
--- a/.changeset/mollifier-buffer-extensions.md
+++ b/.changeset/mollifier-buffer-extensions.md
@@ -2,4 +2,4 @@
 "@trigger.dev/redis-worker": minor
 ---
 
-Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, paginated listing, claim primitives, and a `MollifierSnapshot` type.
+Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, claim primitives, and a `MollifierSnapshot` type.
diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
index e9e004fb851..4af7575e252 100644
--- a/packages/redis-worker/src/mollifier/buffer.test.ts
+++ b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -238,8 +238,8 @@ describe("MollifierBuffer.pop orphan handling", () => {
       });
 
       try {
-        // Simulate a TTL-expired orphan: queue ref exists, entry hash does not.
-        await buffer["redis"].zadd("mollifier:queue:env_a", 1, "run_orphan");
+        // Simulate an evicted orphan: queue ref exists, entry hash does not.
+        await buffer["redis"].rpush("mollifier:queue:env_a", "run_orphan");
 
         const popped = await buffer.pop("env_a");
         expect(popped).toBeNull();
@@ -249,7 +249,7 @@ describe("MollifierBuffer.pop orphan handling", () => {
         expect(Object.keys(raw)).toHaveLength(0);
 
         // Queue is drained — the loop pops orphans until empty.
-        const qLen = await buffer["redis"].zcard("mollifier:queue:env_a");
+        const qLen = await buffer["redis"].llen("mollifier:queue:env_a");
         expect(qLen).toBe(0);
       } finally {
         await buffer.close();
@@ -271,12 +271,13 @@ describe("MollifierBuffer.pop orphan handling", () => {
       });
 
       try {
-        // Layout by score (lowest-first, since ZPOPMIN takes the min):
-        // orphan_a (score 1) → valid (score = its createdAtMicros, large) → orphan_b (score 1e18).
-        // First pop skips orphan_a, returns valid; orphan_b remains.
-        await buffer["redis"].zadd("mollifier:queue:env_a", 1, "orphan_a");
+        // Build the queue so RPOP (tail-first) yields: orphan_a, valid,
+        // orphan_b. accept LPUSHes "valid"; RPUSH puts orphan_a at the
+        // tail (popped first), LPUSH puts orphan_b at the head (popped
+        // last). First pop skips orphan_a, returns valid; orphan_b remains.
         await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" });
-        await buffer["redis"].zadd("mollifier:queue:env_a", 1e18, "orphan_b");
+        await buffer["redis"].rpush("mollifier:queue:env_a", "orphan_a");
+        await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b");
 
         const popped = await buffer.pop("env_a");
         expect(popped).not.toBeNull();
@@ -284,7 +285,7 @@ describe("MollifierBuffer.pop orphan handling", () => {
         expect(popped!.status).toBe("DRAINING");
 
         // The trailing orphan_b is still in the queue (single pop call).
-        const remaining = await buffer["redis"].zcard("mollifier:queue:env_a");
+        const remaining = await buffer["redis"].llen("mollifier:queue:env_a");
         expect(remaining).toBe(1);
 
         // A second pop drains the trailing orphan_b. The queue is now
@@ -559,13 +560,14 @@ describe("MollifierBuffer.requeue on missing entry", () => {
 
 describe("MollifierBuffer.requeue ordering", () => {
   redisTest(
-    "requeued entry retains its original createdAt and pops next (oldest-first by createdAt)",
+    "requeued entry pops next (RPUSH to the RPOP/tail end), preserving FIFO",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
-      // Score == createdAtMicros; requeue does not bump the score. The
-      // oldest entry continues to pop first across retries. `maxAttempts`
-      // in the drainer bounds the retry loop for a persistently failing
-      // entry (after which it goes to the `fail` path, not requeue).
+      // LIST FIFO: accept LPUSHes at the head, pop RPOPs from the tail, so
+      // the first-accepted entry pops first. requeue RPUSHes back to the
+      // tail, so a transiently failed entry pops next rather than going to
+      // the back. `maxAttempts` in the drainer bounds the retry loop for a
+      // persistently failing entry (after which it goes to `fail`, not requeue).
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
@@ -577,9 +579,7 @@ describe("MollifierBuffer.requeue ordering", () => {
 
       try {
         await buffer.accept({ runId: "a", envId: "env_a", orgId: "org_1", payload: "{}" });
-        await new Promise((r) => setTimeout(r, 2));
         await buffer.accept({ runId: "b", envId: "env_a", orgId: "org_1", payload: "{}" });
-        await new Promise((r) => setTimeout(r, 2));
         await buffer.accept({ runId: "c", envId: "env_a", orgId: "org_1", payload: "{}" });
 
         const first = await buffer.pop("env_a");
@@ -587,7 +587,7 @@ describe("MollifierBuffer.requeue ordering", () => {
 
         await buffer.requeue("a");
 
-        // a still has the smallest createdAtMicros → pops next.
+        // a was RPUSHed back to the tail → pops next, ahead of b and c.
         const next = await buffer.pop("env_a");
         expect(next!.runId).toBe("a");
         const after = await buffer.pop("env_a");
@@ -2045,9 +2045,9 @@ describe("MollifierBuffer.mutateSnapshot", () => {
   );
 });
 
-describe("MollifierBuffer ZSET storage", () => {
+describe("MollifierBuffer LIST storage", () => {
   redisTest(
-    "queue key is a ZSET scored by entry's createdAtMicros",
+    "queue key is a LIST; createdAtMicros is a hash field, not a sort key",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
       const buffer = new MollifierBuffer({
@@ -2062,24 +2062,21 @@ describe("MollifierBuffer ZSET storage", () => {
       try {
         await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" });
 
-        // ZSET-only commands must succeed against the queue key.
-        const card = await buffer["redis"].zcard("mollifier:queue:env_z");
-        expect(card).toBe(1);
+        // LIST-only commands must succeed against the queue key.
+        const len = await buffer["redis"].llen("mollifier:queue:env_z");
+        expect(len).toBe(1);
+        const members = await buffer["redis"].lrange("mollifier:queue:env_z", 0, -1);
+        expect(members).toEqual(["z1"]);
 
-        const score = await buffer["redis"].zscore("mollifier:queue:env_z", "z1");
-        expect(score).not.toBeNull();
-        const scoreNum = Number(score);
-        expect(Number.isFinite(scoreNum)).toBe(true);
+        // The queue holds no score — it's not a ZSET.
+        await expect(buffer["redis"].zscore("mollifier:queue:env_z", "z1")).rejects.toThrow();
 
-        // Score matches the entry hash's createdAtMicros field.
-        const micros = await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros");
-        expect(micros).not.toBeNull();
-        expect(Number(micros)).toBe(scoreNum);
-
-        // Score is plausibly recent (within last minute as microseconds).
+        // createdAtMicros lives on the entry hash (for dwell metrics) and
+        // is plausibly recent (within the last minute, as microseconds).
+        const micros = Number(await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros"));
         const nowMicros = Date.now() * 1000;
-        expect(scoreNum).toBeGreaterThan(nowMicros - 60_000_000);
-        expect(scoreNum).toBeLessThanOrEqual(nowMicros + 1_000_000);
+        expect(micros).toBeGreaterThan(nowMicros - 60_000_000);
+        expect(micros).toBeLessThanOrEqual(nowMicros + 1_000_000);
       } finally {
         await buffer.close();
       }
@@ -2087,7 +2084,7 @@ describe("MollifierBuffer ZSET storage", () => {
   );
 
   redisTest(
-    "pop returns entries in ascending createdAtMicros order (FIFO by time, not by member)",
+    "pop returns entries in FIFO insertion order (independent of member lex order)",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
       const buffer = new MollifierBuffer({
@@ -2100,11 +2097,10 @@ describe("MollifierBuffer ZSET storage", () => {
       });
 
       try {
-        // Insert runIds in reverse-lex order to prove ordering is by score, not member.
+        // Accept in reverse-lex order to prove ordering is by insertion
+        // (LPUSH head / RPOP tail), not by member value.
         await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" });
-        await new Promise((r) => setTimeout(r, 5));
         await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" });
-        await new Promise((r) => setTimeout(r, 5));
         await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" });
 
         const first = await buffer.pop("env_o");
@@ -2120,7 +2116,7 @@ describe("MollifierBuffer ZSET storage", () => {
   );
 
   redisTest(
-    "requeue keeps original score; createdAt is immutable across retries",
+    "requeue re-enqueues to the LIST; createdAt is immutable across retries",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
       const buffer = new MollifierBuffer({
@@ -2134,24 +2130,17 @@ describe("MollifierBuffer ZSET storage", () => {
 
       try {
         await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" });
-        const originalScore = Number(
-          await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"),
-        );
-        const originalMicros = Number(
-          await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"),
-        );
+        const originalMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros");
 
         await buffer.pop("env_rq");
-        await new Promise((r) => setTimeout(r, 5));
+        // Queue is empty after the pop.
+        expect(await buffer["redis"].llen("mollifier:queue:env_rq")).toBe(0);
+
         await buffer.requeue("rq");
 
-        const newScore = Number(
-          await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"),
-        );
-        const newMicros = Number(
-          await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"),
-        );
-        expect(newScore).toBe(originalScore);
+        // Back on the LIST, and createdAtMicros is unchanged.
+        expect(await buffer["redis"].lrange("mollifier:queue:env_rq", 0, -1)).toEqual(["rq"]);
+        const newMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros");
         expect(newMicros).toBe(originalMicros);
       } finally {
         await buffer.close();
@@ -2234,181 +2223,6 @@ describe("MollifierBuffer.listEntriesForEnv", () => {
   });
 });
 
-describe("MollifierBuffer.listForEnvWithWatermark", () => {
-  // Seed a QUEUED entry, then pin its ZSET score and hash `createdAtMicros`
-  // to a deterministic value so ordering and the watermark cursor don't
-  // depend on wall-clock timing (Date.now() ties within a millisecond).
-  async function seed(buffer: MollifierBuffer, envId: string, runId: string, micros: number) {
-    await buffer.accept({ runId, envId, orgId: "org_1", payload: "{}" });
-    await buffer["redis"].zadd(`mollifier:queue:${envId}`, String(micros), runId);
-    await buffer["redis"].hset(`mollifier:entries:${runId}`, "createdAtMicros", String(micros));
-  }
-
-  redisTest("pageSize <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => {
-    const buffer = new MollifierBuffer({
-      redisOptions: {
-        host: redisContainer.getHost(),
-        port: redisContainer.getPort(),
-        password: redisContainer.getPassword(),
-      },
-      logger: new Logger("test", "log"),
-    });
-    try {
-      expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 0 })).toEqual([]);
-      expect(await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: -3 })).toEqual([]);
-    } finally {
-      await buffer.close();
-    }
-  });
-
-  redisTest(
-    "page 1 returns newest-first up to pageSize without consuming entries",
-    { timeout: 20_000 },
-    async ({ redisContainer }) => {
-      const buffer = new MollifierBuffer({
-        redisOptions: {
-          host: redisContainer.getHost(),
-          port: redisContainer.getPort(),
-          password: redisContainer.getPassword(),
-        },
-        logger: new Logger("test", "log"),
-      });
-      try {
-        await seed(buffer, "env_w", "wa", 1000);
-        await seed(buffer, "env_w", "wb", 2000);
-        await seed(buffer, "env_w", "wc", 3000);
-
-        const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 });
-        // Newest-first by createdAtMicros.
-        expect(page.map((e) => e.runId)).toEqual(["wc", "wb"]);
-
-        // Non-destructive: drainer still pops oldest-first.
-        const popped: string[] = [];
-        for (let i = 0; i < 3; i++) {
-          const e = await buffer.pop("env_w");
-          if (e) popped.push(e.runId);
-        }
-        expect(popped).toEqual(["wa", "wb", "wc"]);
-      } finally {
-        await buffer.close();
-      }
-    },
-  );
-
-  redisTest(
-    "page N continues strictly below the watermark score",
-    { timeout: 20_000 },
-    async ({ redisContainer }) => {
-      const buffer = new MollifierBuffer({
-        redisOptions: {
-          host: redisContainer.getHost(),
-          port: redisContainer.getPort(),
-          password: redisContainer.getPassword(),
-        },
-        logger: new Logger("test", "log"),
-      });
-      try {
-        await seed(buffer, "env_w", "wa", 1000);
-        await seed(buffer, "env_w", "wb", 2000);
-        await seed(buffer, "env_w", "wc", 3000);
-
-        const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 });
-        expect(page1.map((e) => e.runId)).toEqual(["wc", "wb"]);
-
-        const last = page1[page1.length - 1]!;
-        const page2 = await buffer.listForEnvWithWatermark({
-          envId: "env_w",
-          pageSize: 2,
-          watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId },
-        });
-        // Only the entry strictly below score 2000 remains; no overlap.
-        expect(page2.map((e) => e.runId)).toEqual(["wa"]);
-      } finally {
-        await buffer.close();
-      }
-    },
-  );
-
-  redisTest(
-    "tied-score watermark surfaces lex-smaller members on the next page without dupes",
-    { timeout: 20_000 },
-    async ({ redisContainer }) => {
-      const buffer = new MollifierBuffer({
-        redisOptions: {
-          host: redisContainer.getHost(),
-          port: redisContainer.getPort(),
-          password: redisContainer.getPassword(),
-        },
-        logger: new Logger("test", "log"),
-      });
-      try {
-        // Three entries share one score; ZSET breaks the tie by member,
-        // and zrevrangebyscore returns them member-DESC: tc, tb, ta.
-        await seed(buffer, "env_w", "ta", 2000);
-        await seed(buffer, "env_w", "tb", 2000);
-        await seed(buffer, "env_w", "tc", 2000);
-
-        const page1 = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 2 });
-        expect(page1.map((e) => e.runId)).toEqual(["tc", "tb"]);
-
-        const last = page1[page1.length - 1]!;
-        const page2 = await buffer.listForEnvWithWatermark({
-          envId: "env_w",
-          pageSize: 2,
-          watermark: { createdAtMicros: last.createdAtMicros, runId: last.runId },
-        });
-        // Same score, lex-smaller than the "tb" anchor — and not "tb"
-        // itself (no duplicate across the page boundary).
-        expect(page2.map((e) => e.runId)).toEqual(["ta"]);
-      } finally {
-        await buffer.close();
-      }
-    },
-  );
-
-  redisTest(
-    "skips orphan queue references (entry hash gone) during listing",
-    { timeout: 20_000 },
-    async ({ redisContainer }) => {
-      const buffer = new MollifierBuffer({
-        redisOptions: {
-          host: redisContainer.getHost(),
-          port: redisContainer.getPort(),
-          password: redisContainer.getPassword(),
-        },
-        logger: new Logger("test", "log"),
-      });
-      try {
-        await seed(buffer, "env_w", "live", 1000);
-        await seed(buffer, "env_w", "orphan", 2000);
-        // Drop the orphan's hash but leave its queue ref behind.
-        await buffer["redis"].del("mollifier:entries:orphan");
-
-        const page = await buffer.listForEnvWithWatermark({ envId: "env_w", pageSize: 10 });
-        expect(page.map((e) => e.runId)).toEqual(["live"]);
-      } finally {
-        await buffer.close();
-      }
-    },
-  );
-
-  redisTest("returns empty for an env with no queued entries", { timeout: 20_000 }, async ({ redisContainer }) => {
-    const buffer = new MollifierBuffer({
-      redisOptions: {
-        host: redisContainer.getHost(),
-        port: redisContainer.getPort(),
-        password: redisContainer.getPassword(),
-      },
-      logger: new Logger("test", "log"),
-    });
-    try {
-      expect(await buffer.listForEnvWithWatermark({ envId: "env_empty_w", pageSize: 10 })).toEqual([]);
-    } finally {
-      await buffer.close();
-    }
-  });
-});
-
 // Composite-key safety. The Redis-key builders concatenate
 // `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without
 // per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"`
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
index e8e413fd5dd..2c2f0cc5fa7 100644
--- a/packages/redis-worker/src/mollifier/buffer.ts
+++ b/packages/redis-worker/src/mollifier/buffer.ts
@@ -135,12 +135,10 @@ export class MollifierBuffer {
     const orgsKey = "mollifier:orgs";
     const nowMs = Date.now();
     const createdAt = new Date(nowMs).toISOString();
-    // Microsecond epoch. JS only has millisecond precision, so multiple
-    // accepts in the same ms share a score; ZSET ties resolve by member
-    // (runId) lex order, which is deterministic and acceptable for FIFO
-    // pop. The hash carries the same value as `createdAtMicros` so the
-    // listing helper (Phase E) can read a stable per-run timestamp
-    // without re-fetching the score.
+    // Microsecond epoch, stored as a hash field for dwell-time metrics
+    // (stale sweep, drainer dwell span). FIFO ordering comes from the
+    // LIST itself (LPUSH head / RPOP tail), not from this value — it is
+    // no longer a queue sort key.
     const createdAtMicros = nowMs * 1000;
     const idempotencyLookupKey =
       input.idempotencyKey && input.taskIdentifier
@@ -231,86 +229,16 @@ export class MollifierBuffer {
     return this.redis.smembers(`mollifier:org-envs:${orgId}`);
   }
 
-  // Paginated read of currently-queued entries newest-first, bounded by
-  // an optional `(createdAtMicros, runId)` watermark. Q1 listing design.
-  // Returns hydrated `BufferEntry` rows up to `pageSize`. Skips orphans
-  // (queue ref without an entry hash) silently. Non-destructive — the
-  // drainer keeps popping these entries in createdAt order regardless.
-  async listForEnvWithWatermark(input: {
-    envId: string;
-    watermark?: { createdAtMicros: number; runId: string };
-    pageSize: number;
-  }): Promise<BufferEntry[]> {
-    if (input.pageSize <= 0) return [];
-    const queueKey = `mollifier:queue:${input.envId}`;
-
-    let runIds: string[];
-    if (!input.watermark) {
-      // Page 1 — newest first.
-      runIds = await this.redis.zrevrangebyscore(
-        queueKey,
-        "+inf",
-        "-inf",
-        "LIMIT",
-        0,
-        input.pageSize,
-      );
-    } else {
-      // Page N — strictly below the watermark score.
-      const belowScore = await this.redis.zrevrangebyscore(
-        queueKey,
-        `(${input.watermark.createdAtMicros}`,
-        "-inf",
-        "LIMIT",
-        0,
-        input.pageSize,
-      );
-      runIds = belowScore;
-      // Tied-score scan: ZSET ties broken by member-DESC, so entries
-      // sharing the watermark score with a lex-smaller runId still
-      // need to surface. Cheap second range over the tied band.
-      if (belowScore.length < input.pageSize) {
-        const remaining = input.pageSize - belowScore.length;
-        const tied = await this.redis.zrangebyscore(
-          queueKey,
-          input.watermark.createdAtMicros,
-          input.watermark.createdAtMicros,
-        );
-        // Filter to runIds lex-less than the watermark anchor, sort
-        // member-DESC, take `remaining`.
-        const tiedFiltered = tied
-          .filter((r) => r < input.watermark!.runId)
-          .sort((a, b) => (a < b ? 1 : a > b ? -1 : 0))
-          .slice(0, remaining);
-        runIds = [...belowScore, ...tiedFiltered];
-      }
-    }
-
-    if (runIds.length === 0) return [];
-
-    // Parallel HGETALL — one round-trip per entry, all in flight.
-    const fetched = await Promise.all(
-      runIds.map((runId) => this.redis.hgetall(`mollifier:entries:${runId}`)),
-    );
-    const entries: BufferEntry[] = [];
-    for (const value of fetched) {
-      if (!value || Object.keys(value).length === 0) continue;
-      const parsed = BufferEntrySchema.safeParse(value);
-      if (parsed.success) entries.push(parsed.data);
-    }
-    return entries;
-  }
-
-  // Read-only listing of currently-queued entries for a single env. Used by
-  // the dashboard's "Recently queued" surface — non-destructive, so the
-  // drainer still pops these entries in order. Returns up to `maxCount`
-  // entries newest-first (highest score, which is `createdAtMicros`).
-  // Each entry hash is fetched separately; a `null` from getEntry (entry
-  // torn down by a concurrent drainer ack/fail between ZREVRANGE and
-  // HGETALL) is skipped.
+  // Read-only enumeration of currently-queued entries for a single env.
+  // Used by the stale-sweep to compute per-entry dwell time, so order is
+  // immaterial — LRANGE returns them newest-first (LPUSH head) but the
+  // caller scans the whole window. Non-destructive: the drainer still
+  // RPOPs these entries in FIFO order. Each entry hash is fetched
+  // separately; a `null` from getEntry (entry torn down by a concurrent
+  // drainer ack/fail between LRANGE and HGETALL) is skipped.
   async listEntriesForEnv(envId: string, maxCount: number): Promise<BufferEntry[]> {
     if (maxCount <= 0) return [];
-    const runIds = await this.redis.zrevrange(
+    const runIds = await this.redis.lrange(
       `mollifier:queue:${envId}`,
       0,
       maxCount - 1,
@@ -613,12 +541,11 @@ export class MollifierBuffer {
         -- eviction would lose runs with no customer-visible signal.
         -- Memory pressure from an offline drainer is the alertable
         -- failure mode instead; see _ops/mollifier-ops.md.
-        -- ZSET keyed by createdAtMicros: ZPOPMIN drains oldest-first
-        -- (FIFO); listing pagination uses ZREVRANGEBYSCORE with a
-        -- (createdAt, runId) cursor anchor. Score is stable across the
-        -- entry's lifecycle — requeue does not bump it (see Phase 3b /
-        -- Q1 design).
-        redis.call('ZADD', queueKey, createdAtMicros, runId)
+        -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so
+        -- insertion order == drain order (FIFO). createdAtMicros is kept
+        -- as a hash field for dwell metrics only — it is no longer a sort
+        -- key now that the buffer has no list/pagination surface.
+        redis.call('LPUSH', queueKey, runId)
         -- Org-level membership: maintained atomically with the per-env
         -- queue so the drainer can walk orgs → envs-for-org and
         -- schedule one env per org per tick. SADDs are idempotent if the
@@ -640,8 +567,7 @@ export class MollifierBuffer {
 
         local envId = redis.call('HGET', entryKey, 'envId')
         local orgId = redis.call('HGET', entryKey, 'orgId')
-        local createdAtMicros = redis.call('HGET', entryKey, 'createdAtMicros')
-        if not envId or not createdAtMicros then
+        if not envId then
           return 0
         end
 
@@ -649,11 +575,12 @@ export class MollifierBuffer {
         local nextAttempts = tonumber(currentAttempts or '0') + 1
 
         redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts))
-        -- Requeue re-adds with the ORIGINAL createdAtMicros score.
-        -- createdAt is immutable across retries (Phase 3b decision).
-        -- The drainer's maxAttempts caps the retry loop so a poisoned
-        -- entry doesn't head-of-line forever.
-        redis.call('ZADD', queuePrefix .. envId, tonumber(createdAtMicros), runId)
+        -- Requeue RPUSHes to the tail (the RPOP end) so a transiently
+        -- failed entry pops next rather than going to the back of the
+        -- line behind a fresh backlog. createdAt is immutable across
+        -- retries (Phase 3b decision); the drainer's maxAttempts caps the
+        -- retry loop so a poisoned entry doesn't head-of-line forever.
+        redis.call('RPUSH', queuePrefix .. envId, runId)
         -- Re-track the org/env: pop may have SREM'd them when the queue
         -- last emptied. SADDs are idempotent if the values are still
         -- present.
@@ -694,9 +621,8 @@ export class MollifierBuffer {
         -- partial hash without a TTL, leaking memory. The loop is bounded
         -- by queue length; entire Lua script remains atomic.
         while true do
-          -- ZPOPMIN returns {member, score} as a flat array, or {} when empty.
-          local popped = redis.call('ZPOPMIN', queueKey)
-          local runId = popped[1]
+          -- RPOP returns the tail member (oldest, FIFO), or false when empty.
+          local runId = redis.call('RPOP', queueKey)
           if not runId then
             -- Queue is empty AND we have no entry to read orgId from, so
             -- skip org-level cleanup. Stale org-envs entries are bounded
@@ -713,9 +639,9 @@ export class MollifierBuffer {
               result[raw[i]] = raw[i + 1]
             end
             -- Prune org-level membership if this pop drained the queue.
-            -- Atomic with the ZPOPMIN above — a concurrent accept AFTER
-            -- this script will SADD both back along with its ZADD.
-            if redis.call('ZCARD', queueKey) == 0 then
+            -- Atomic with the RPOP above — a concurrent accept AFTER
+            -- this script will SADD both back along with its LPUSH.
+            if redis.call('LLEN', queueKey) == 0 then
               pruneOrgMembership(result['orgId'])
             end
             return cjson.encode(result)

From 8bfa7419cda4bfd32866118b90eefae0dc7392af Mon Sep 17 00:00:00 2001
From: Daniel Sutton <dansutton@trigger.dev>
Date: Wed, 27 May 2026 15:42:19 +0100
Subject: [PATCH 7/7] fix(redis-worker): self-heal stale idempotency lookups +
 default createdAtMicros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address AI review findings on PR #3752:

- accept(): if an idempotency lookup survives its entry hash being evicted
  (maxmemory), the lookup is stale — rebind to the new run instead of
  returning a dead existingRunId that blocks the key forever. Mirrors the
  self-heal lookupIdempotency already does. (CodeRabbit)
- lookupIdempotency(): clear a stale lookup with a compare-and-delete
  (delMollifierKeyIfEquals Lua) so a concurrent accept that rebinds the
  key between our GET and DEL isn't clobbered. (CodeRabbit)
- schemas: default createdAtMicros to "0" so an entry written before the
  field existed (or surviving across the deploy that added it) still
  parses on pop instead of being silently dropped. (Devin)
- rename the requeue-ordering test to "retry priority" — RPUSH-to-tail
  pops the requeued entry ahead of newer items; that's deliberate retry
  priority, not FIFO relative to the rest of the queue. (CodeRabbit nit)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../redis-worker/src/mollifier/buffer.test.ts | 102 +++++++++++++++++-
 packages/redis-worker/src/mollifier/buffer.ts |  39 ++++++-
 .../redis-worker/src/mollifier/schemas.ts     |   9 +-
 3 files changed, 138 insertions(+), 12 deletions(-)

diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
index 4af7575e252..00d618c0498 100644
--- a/packages/redis-worker/src/mollifier/buffer.test.ts
+++ b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -30,6 +30,24 @@ describe("schemas", () => {
     expect(parsed.createdAtMicros).toBe(1747044000000000);
   });
 
+  it("BufferEntrySchema defaults createdAtMicros for entries written before the field existed", () => {
+    // Backward compat: an entry written by an accept Lua predating
+    // createdAtMicros (only the original 7 fields) must still parse on
+    // pop rather than being silently dropped.
+    const raw = {
+      runId: "run_old",
+      envId: "env_1",
+      orgId: "org_1",
+      payload: serialiseSnapshot({}),
+      status: "QUEUED",
+      attempts: "0",
+      createdAt: "2026-05-11T10:00:00.000Z",
+      // no createdAtMicros
+    };
+    const parsed = BufferEntrySchema.parse(raw);
+    expect(parsed.createdAtMicros).toBe(0);
+  });
+
   it("BufferEntrySchema parses a FAILED entry with lastError", () => {
     const raw = {
       runId: "run_abc",
@@ -560,13 +578,15 @@ describe("MollifierBuffer.requeue on missing entry", () => {
 
 describe("MollifierBuffer.requeue ordering", () => {
   redisTest(
-    "requeued entry pops next (RPUSH to the RPOP/tail end), preserving FIFO",
+    "requeued entry gets retry priority (RPUSH to the RPOP/tail end), popping ahead of newer items",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
-      // LIST FIFO: accept LPUSHes at the head, pop RPOPs from the tail, so
-      // the first-accepted entry pops first. requeue RPUSHes back to the
-      // tail, so a transiently failed entry pops next rather than going to
-      // the back. `maxAttempts` in the drainer bounds the retry loop for a
+      // LIST: accept LPUSHes at the head, pop RPOPs from the tail, so the
+      // first-accepted entry pops first. requeue RPUSHes back to the tail,
+      // giving a transiently failed entry *retry priority* — it pops next,
+      // ahead of newer queued items, rather than going to the back. (This
+      // is deliberately not FIFO relative to the rest of the queue.)
+      // `maxAttempts` in the drainer bounds the retry loop for a
       // persistently failing entry (after which it goes to `fail`, not requeue).
       const buffer = new MollifierBuffer({
         redisOptions: {
@@ -1486,6 +1506,78 @@ describe("MollifierBuffer idempotency lookup", () => {
       }
     },
   );
+
+  redisTest(
+    "accept self-heals a stale lookup: a new run rebinds when the bound entry was evicted",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // If an entry hash is evicted (maxmemory) but its idempotency lookup
+      // survives, a fresh accept with the same key must NOT return the dead
+      // runId (which would block the key forever) — it should rebind to the
+      // new run and accept it.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      const idem = { idempotencyKey: "kheal", taskIdentifier: "t" };
+      try {
+        await buffer.accept({ runId: "heal_old", envId: "env_h", orgId: "org_1", payload: "{}", ...idem });
+        // Simulate eviction of the entry hash while the lookup survives.
+        await buffer["redis"].del("mollifier:entries:heal_old");
+        const lookupKey = idempotencyLookupKeyFor({ envId: "env_h", ...idem });
+        expect(await buffer["redis"].get(lookupKey)).toBe("heal_old");
+
+        // A fresh accept with the same key rebinds rather than deduping
+        // onto the dead run.
+        const result = await buffer.accept({
+          runId: "heal_new",
+          envId: "env_h",
+          orgId: "org_1",
+          payload: "{}",
+          ...idem,
+        });
+        expect(result).toEqual({ kind: "accepted" });
+        expect(await buffer["redis"].get(lookupKey)).toBe("heal_new");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "accept still dedups when the bound entry is live",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The self-heal must not weaken normal dedup: a live bound entry
+      // still wins, and the loser gets its runId back.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      const idem = { idempotencyKey: "klive", taskIdentifier: "t" };
+      try {
+        await buffer.accept({ runId: "live_win", envId: "env_h", orgId: "org_1", payload: "{}", ...idem });
+        const result = await buffer.accept({
+          runId: "live_lose",
+          envId: "env_h",
+          orgId: "org_1",
+          payload: "{}",
+          ...idem,
+        });
+        expect(result).toEqual({ kind: "duplicate_idempotency", existingRunId: "live_win" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer.casSetMetadata", () => {
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
index 2c2f0cc5fa7..d5492e745d5 100644
--- a/packages/redis-worker/src/mollifier/buffer.ts
+++ b/packages/redis-worker/src/mollifier/buffer.ts
@@ -160,6 +160,7 @@ export class MollifierBuffer {
       String(createdAtMicros),
       "mollifier:org-envs:",
       idempotencyLookupKey,
+      "mollifier:entries:",
     );
     // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId
     // (duplicate idempotency — value is the existing winner's runId).
@@ -391,15 +392,17 @@ export class MollifierBuffer {
   // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by
   // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check
   // misses — same key may belong to a buffered run waiting to drain. The
-  // lookup self-heals: if the lookup points at an entry hash that's
-  // expired, we DEL the lookup and report a miss.
+  // lookup self-heals: if the lookup points at an entry hash that's gone,
+  // we clear the lookup and report a miss. The clear is a compare-and-
+  // delete (only if the key still holds the stale runId we observed) so a
+  // fresh accept that rebinds the key between our GET and DEL isn't wiped.
   async lookupIdempotency(input: IdempotencyLookupInput): Promise<string | null> {
     const lookupKey = idempotencyLookupKeyFor(input);
     const runId = await this.redis.get(lookupKey);
     if (!runId) return null;
     const entry = await this.getEntry(runId);
     if (!entry) {
-      await this.redis.del(lookupKey);
+      await this.redis.delMollifierKeyIfEquals(lookupKey, runId);
       return null;
     }
     return runId;
@@ -502,6 +505,7 @@ export class MollifierBuffer {
         local createdAtMicros = ARGV[6]
         local orgEnvsPrefix = ARGV[7]
         local idempotencyLookupKey = ARGV[8] or ''
+        local entryPrefix = ARGV[9]
 
         -- Idempotent: refuse if an entry for this runId already exists in any
         -- state. Caller-side dedup is also enforced via API idempotency keys,
@@ -519,7 +523,14 @@ export class MollifierBuffer {
         if idempotencyLookupKey ~= '' then
           local existing = redis.call('GET', idempotencyLookupKey)
           if existing then
-            return existing
+            -- Self-heal: only honour the binding if its entry hash still
+            -- exists. If the entry was evicted (maxmemory) but the lookup
+            -- survived, the binding is stale — fall through and rebind to
+            -- this run rather than returning a dead runId that would block
+            -- the key indefinitely. Mirrors lookupIdempotency's self-heal.
+            if redis.call('EXISTS', entryPrefix .. existing) == 1 then
+              return existing
+            end
           end
           redis.call('SET', idempotencyLookupKey, runId)
         end
@@ -935,6 +946,20 @@ export class MollifierBuffer {
       `,
     });
 
+    // Compare-and-delete: DEL the key only if it still holds the expected
+    // value. Used by lookupIdempotency's stale-lookup self-heal so a
+    // concurrent accept that rebinds the key between the reader's GET and
+    // this DEL isn't clobbered.
+    this.redis.defineCommand("delMollifierKeyIfEquals", {
+      numberOfKeys: 1,
+      lua: `
+        if redis.call('GET', KEYS[1]) == ARGV[1] then
+          return redis.call('DEL', KEYS[1])
+        end
+        return 0
+      `,
+    });
+
     this.redis.defineCommand("mollifierEvaluateTrip", {
       numberOfKeys: 2,
       lua: `
@@ -974,6 +999,7 @@ declare module "@internal/redis" {
       createdAtMicros: string,
       orgEnvsPrefix: string,
       idempotencyLookupKey: string,
+      entryPrefix: string,
       callback?: Callback<number | string>,
     ): Result<number | string, Context>;
     popAndMarkDraining(
@@ -1039,6 +1065,11 @@ declare module "@internal/redis" {
       errorPayload: string,
       callback?: Callback<number>,
     ): Result<number, Context>;
+    delMollifierKeyIfEquals(
+      key: string,
+      expected: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
     mollifierEvaluateTrip(
       rateKey: string,
       trippedKey: string,
diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts
index c5d9915575a..7a2861a63a5 100644
--- a/packages/redis-worker/src/mollifier/schemas.ts
+++ b/packages/redis-worker/src/mollifier/schemas.ts
@@ -48,9 +48,12 @@ export const BufferEntrySchema = z.object({
   status: BufferEntryStatus,
   attempts: stringToInt,
   createdAt: stringToDate,
-  // Microsecond epoch matching the ZSET queue score. Stable across
-  // requeues — the score never moves once set at accept time.
-  createdAtMicros: stringToInt,
+  // Microsecond epoch of accept time, kept as a hash field for dwell
+  // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted
+  // so an entry written by an accept Lua predating this field — or one
+  // surviving across the deploy that introduced it — still parses instead
+  // of being silently dropped on pop.
+  createdAtMicros: stringToInt.default("0"),
   // Drainer-ack flag: `true` once the drainer has materialised this run
   // into PG. The hash persists for a short grace TTL after ack so direct
   // reads (retrieve, trace, etc.) still resolve while PG replica lag