diff --git a/Makefile.toml b/Makefile.toml index ad4ecba1..838c9a33 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -361,6 +361,9 @@ args = [ # | real-world-job-smoke | composite | | # | real-world-job-smoke-json | command | | # | real-world-job-smoke-report | command | | +# | real-world-memory | composite | | +# | real-world-memory-json | command | | +# | real-world-memory-report | command | | [tasks.real-world-job-smoke] workspace = false @@ -405,6 +408,55 @@ args = [ "tmp/real-world-job/real-world-job-smoke-report.md", ] +[tasks.real-world-memory] +workspace = false +dependencies = [ + "real-world-memory-report", +] + +[tasks.real-world-memory-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory", + "--out", + "tmp/real-world-memory/real-world-memory-report.json", + "--run-id", + "real-world-memory-trust-personalization", + "--adapter-id", + "elf_real_world_memory_fixture", + "--adapter-name", + "ELF real-world memory fixture", +] + +[tasks.real-world-memory-report] +workspace = false +dependencies = [ + "real-world-memory-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/real-world-memory-report.json", + "--out", + "tmp/real-world-memory/real-world-memory-report.md", +] + # Meta # | task | type | cwd | diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json new file mode 100644 index 00000000..1d06cb0a --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json @@ -0,0 +1,193 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-redaction-exclusion-001", + "suite": "capture_integration", + "title": "Capture a 
public decision while excluding private text", + "corpus": { + "corpus_id": "real-world-memory-capture-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "public-captured-decision", + "kind": "message", + "text": "Public memory candidate: benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "redaction_exclusion", + "evidence_id": "public-captured-decision" + }, + "locator": { + "quote": "benchmark reports should include evidence coverage" + } + }, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "write-policy-audit", + "kind": "trace", + "text": "Write policy audit: private spans were excluded before evidence binding, and the stored note only retained the public benchmark-report requirement.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "redaction_exclusion", + "evidence_id": "write-policy-audit" + }, + "locator": { + "quote": "private spans were excluded before evidence binding" + } + }, + "created_at": "2026-06-09T02:01:00Z" + }, + { + "evidence_id": "private-excluded-text", + "kind": "message", + "text": "Private excluded text: operator's home server label is cobalt-ridge.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "redaction_exclusion", + "evidence_id": "private-excluded-text" + } + }, + "created_at": "2026-06-09T02:00:30Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "Remember the public benchmark-report requirement: reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count. 
The write-policy audit says private spans were excluded before evidence binding, so the private span must not be repeated.", + "claims": [ + { + "claim_id": "public_requirement_remembered", + "text": "Benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count.", + "evidence_ids": ["public-captured-decision"], + "confidence": "high" + }, + { + "claim_id": "private_span_excluded", + "text": "Private spans were excluded before evidence binding.", + "evidence_ids": ["write-policy-audit"], + "confidence": "high" + } + ], + "evidence_ids": ["public-captured-decision", "write-policy-audit"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "message-ingested-with-policy", + "ts": "2026-06-09T02:00:00Z", + "actor": "agent", + "action": "captured_message", + "evidence_ids": ["public-captured-decision", "private-excluded-text"], + "summary": "The source message contained public benchmark requirements and a private excluded span." + }, + { + "event_id": "policy-audit-recorded", + "ts": "2026-06-09T02:01:00Z", + "actor": "tool", + "action": "recorded_write_policy_audit", + "evidence_ids": ["write-policy-audit"], + "summary": "The audit shows private spans were excluded before evidence binding." + } + ], + "prompt": { + "role": "user", + "content": "What should the benchmark report remember from the captured message, and did the private text leak?", + "job_mode": "answer", + "constraints": ["cite_evidence", "do_not_repeat_private_text", "explain_redaction"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "public_requirement_remembered", + "text": "Benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count." 
+ }, + { + "claim_id": "private_span_excluded", + "text": "Private spans were excluded before evidence binding." + } + ], + "must_not_include": [ + "cobalt-ridge", + "operator's home server label" + ], + "evidence_links": { + "public_requirement_remembered": ["public-captured-decision"], + "private_span_excluded": ["write-policy-audit"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "public-captured-decision", + "claim_id": "public_requirement_remembered", + "requirement": "cite", + "quote": "benchmark reports should include evidence coverage" + }, + { + "evidence_id": "write-policy-audit", + "claim_id": "private_span_excluded", + "requirement": "explain", + "quote": "private spans were excluded before evidence binding" + } + ], + "negative_traps": [ + { + "trap_id": "private-redacted-span", + "type": "privacy_leak", + "evidence_ids": ["private-excluded-text"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the public memory requirement." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites both public capture and write-policy audit evidence." + }, + "trap_avoidance": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Does not cite or repeat private excluded text." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Honors the redaction/private exclusion write policy." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "redaction", "write_policy", "privacy_leak", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json b/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json new file mode 100644 index 00000000..dee33e2b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json @@ -0,0 +1,200 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-delete-ttl-001", + "suite": "memory_evolution", + "title": "Suppress a deleted temporary fact and answer with the current memory", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "deleted-temp-plan", + "kind": "note", + "text": "Deleted temporary plan: use the old Redis cache branch for the next benchmark report.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "delete_ttl_staleness", + "evidence_id": "deleted-temp-plan" + } + }, + "created_at": "2026-06-08T02:00:00Z" + }, + { + "evidence_id": "delete-tombstone", + "kind": "trace", + "text": "Lifecycle tombstone: note deleted-temp-plan expired at 2026-06-09T00:00:00Z and search must suppress it after worker DELETE outbox completion.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "delete_ttl_staleness", + "evidence_id": "delete-tombstone" + }, + "locator": { + "quote": "search must suppress it after worker DELETE outbox completion" + } + }, + "created_at": "2026-06-09T00:00:00Z" + 
}, + { + "evidence_id": "current-benchmark-plan", + "kind": "decision", + "text": "Current plan: add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "delete_ttl_staleness", + "evidence_id": "current-benchmark-plan" + }, + "locator": { + "quote": "add real_world_memory trust and personalization cases" + } + }, + "created_at": "2026-06-09T01:30:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "Do not use the expired Redis branch plan. The tombstone says deleted-temp-plan expired and search must suppress it after the worker DELETE outbox completes. The current plan is to add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory.", + "claims": [ + { + "claim_id": "deleted_fact_suppressed", + "text": "The deleted temporary plan must be suppressed after the worker DELETE outbox completes.", + "evidence_ids": ["delete-tombstone"], + "confidence": "high" + }, + { + "claim_id": "current_plan", + "text": "Add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory.", + "evidence_ids": ["current-benchmark-plan"], + "confidence": "high" + } + ], + "evidence_ids": ["delete-tombstone", "current-benchmark-plan"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "temporary-plan-recorded", + "ts": "2026-06-08T02:00:00Z", + "actor": "agent", + "action": "recorded_temporary_plan", + "evidence_ids": ["deleted-temp-plan"], + "summary": "The old Redis cache branch plan was temporary." 
+ }, + { + "event_id": "temporary-plan-expired", + "ts": "2026-06-09T00:00:00Z", + "actor": "worker", + "action": "deleted_memory", + "evidence_ids": ["delete-tombstone"], + "summary": "TTL expiry deleted the temporary plan and completed the DELETE outbox." + }, + { + "event_id": "current-plan-recorded", + "ts": "2026-06-09T01:30:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["current-benchmark-plan"], + "summary": "The current benchmark implementation plan superseded the deleted temporary plan." + } + ], + "prompt": { + "role": "user", + "content": "What is the current benchmark plan, and should the old Redis branch memory still be used?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_stale_deleted_facts"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "deleted_fact_suppressed", + "text": "The deleted temporary plan must be suppressed after the worker DELETE outbox completes." + }, + { + "claim_id": "current_plan", + "text": "Add real_world_memory trust and personalization cases, then run cargo make real-world-job-smoke and cargo make real-world-memory." 
+ } + ], + "must_not_include": [ + "use the old Redis cache branch for the next benchmark report" + ], + "evidence_links": { + "deleted_fact_suppressed": ["delete-tombstone"], + "current_plan": ["current-benchmark-plan"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "delete-tombstone", + "claim_id": "deleted_fact_suppressed", + "requirement": "cite", + "quote": "search must suppress it after worker DELETE outbox completion" + }, + { + "evidence_id": "current-benchmark-plan", + "claim_id": "current_plan", + "requirement": "cite", + "quote": "add real_world_memory trust and personalization cases" + } + ], + "negative_traps": [ + { + "trap_id": "stale-deleted-plan", + "type": "stale_fact", + "evidence_ids": ["deleted-temp-plan"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Suppresses deleted or TTL-expired memory." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the current plan." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites tombstone and replacement evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not retrieve the stale deleted fact." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "ttl", "delete", "stale_fact", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/personalization/scoped_preference_correction.json b/apps/elf-eval/fixtures/real_world_memory/personalization/scoped_preference_correction.json new file mode 100644 index 00000000..d9a99d30 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/personalization/scoped_preference_correction.json @@ -0,0 +1,244 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "personalization-scoped-preference-001", + "suite": "personalization", + "title": "Apply the current project preference without cross-project leakage", + "corpus": { + "corpus_id": "real-world-memory-personalization-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "elf-current-stable-preference", + "kind": "note", + "text": "Preference v2 for tenant local-tenant project ELF agent local-agent: answer concisely with evidence-linked bullets; remembered because the user repeated it in two durable ELF sessions and marked it stable.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "ELF", + "agent_id": "local-agent", + "evidence_id": "elf-current-stable-preference" + }, + "locator": { + "quote": "answer concisely with evidence-linked bullets" + } + }, + "created_at": "2026-06-09T03:00:00Z" + }, + { + "evidence_id": "elf-old-preference", + "kind": "note", + "text": "Preference v1 for tenant local-tenant project ELF agent local-agent: provide long tutorial explanations for every answer.", + 
"source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "ELF", + "agent_id": "local-agent", + "evidence_id": "elf-old-preference" + } + }, + "created_at": "2026-06-08T03:00:00Z" + }, + { + "evidence_id": "elf-temporary-demo-preference", + "kind": "note", + "text": "Temporary preference for tenant local-tenant project ELF: use a playful pirate tone for the demo only; expires at 2026-06-09T12:00:00Z.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "ELF", + "evidence_id": "elf-temporary-demo-preference" + } + }, + "created_at": "2026-06-09T02:30:00Z" + }, + { + "evidence_id": "other-project-preference", + "kind": "note", + "text": "Preference for tenant local-tenant project PubFi agent local-agent: answer in long-form Chinese prose with no bullets.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "PubFi", + "agent_id": "local-agent", + "evidence_id": "other-project-preference" + } + }, + "created_at": "2026-06-09T03:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "For tenant local-tenant / project ELF / agent local-agent, use concise evidence-linked bullets. This preference is remembered because the user repeated it in two durable ELF sessions and marked it stable. 
Do not apply the old long-tutorial preference, the demo-only pirate tone, or the PubFi project preference.", + "claims": [ + { + "claim_id": "current_preference", + "text": "For ELF, answer concisely with evidence-linked bullets.", + "evidence_ids": ["elf-current-stable-preference"], + "confidence": "high" + }, + { + "claim_id": "preference_reason", + "text": "The preference is remembered because the user repeated it in two durable ELF sessions and marked it stable.", + "evidence_ids": ["elf-current-stable-preference"], + "confidence": "high" + } + ], + "evidence_ids": ["elf-current-stable-preference"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-preference-recorded", + "ts": "2026-06-08T03:00:00Z", + "actor": "user", + "action": "recorded_preference", + "evidence_ids": ["elf-old-preference"], + "summary": "The user previously preferred long tutorial answers." + }, + { + "event_id": "temporary-demo-preference", + "ts": "2026-06-09T02:30:00Z", + "actor": "user", + "action": "recorded_temporary_preference", + "evidence_ids": ["elf-temporary-demo-preference"], + "summary": "The pirate tone was explicitly temporary for a demo." + }, + { + "event_id": "stable-preference-correction", + "ts": "2026-06-09T03:00:00Z", + "actor": "user", + "action": "updated_memory", + "evidence_ids": ["elf-current-stable-preference"], + "summary": "The user corrected the ELF preference to concise evidence-linked bullets and marked it stable." + }, + { + "event_id": "other-project-preference-recorded", + "ts": "2026-06-09T03:05:00Z", + "actor": "agent", + "action": "recorded_other_project_preference", + "evidence_ids": ["other-project-preference"], + "summary": "A PubFi project preference exists but must not leak into ELF." 
+ } + ], + "prompt": { + "role": "user", + "content": "For this ELF project, how should you answer me and why is that preference remembered?", + "job_mode": "personalize", + "constraints": ["cite_evidence", "avoid_cross_project_preferences", "avoid_temporary_overgeneralization"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_preference", + "text": "For ELF, answer concisely with evidence-linked bullets." + }, + { + "claim_id": "preference_reason", + "text": "The preference is remembered because the user repeated it in two durable ELF sessions and marked it stable." + } + ], + "must_not_include": [ + "provide long tutorial explanations for every answer", + "use a playful pirate tone", + "answer in long-form Chinese prose with no bullets" + ], + "evidence_links": { + "current_preference": ["elf-current-stable-preference"], + "preference_reason": ["elf-current-stable-preference"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "elf-current-stable-preference", + "claim_id": "current_preference", + "requirement": "cite", + "quote": "answer concisely with evidence-linked bullets" + }, + { + "evidence_id": "elf-current-stable-preference", + "claim_id": "preference_reason", + "requirement": "explain", + "quote": "repeated it in two durable ELF sessions and marked it stable" + } + ], + "negative_traps": [ + { + "trap_id": "stale-old-preference", + "type": "stale_fact", + "evidence_ids": ["elf-old-preference"], + "failure_if_used": true + }, + { + "trap_id": "temporary-demo-overgeneralization", + "type": "stale_fact", + "evidence_ids": ["elf-temporary-demo-preference"], + "failure_if_used": true + }, + { + "trap_id": "cross-project-preference-leak", + "type": "near_duplicate", + "evidence_ids": ["other-project-preference"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "personalization_fit": 
{ + "weight": 0.35, + "max_points": 1.0, + "criteria": "Applies the current stable ELF preference." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Explains why the preference is remembered with evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids stale, temporary, and cross-project preferences." + }, + "answer_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Answers the user's personalization question directly." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "personalization", "scoped_preference", "preference_correction", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/trust/source_of_truth_rebuild.json b/apps/elf-eval/fixtures/real_world_memory/trust/source_of_truth_rebuild.json new file mode 100644 index 00000000..174eca2b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/trust/source_of_truth_rebuild.json @@ -0,0 +1,213 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "trust-sot-rebuild-001", + "suite": "trust_source_of_truth", + "title": "Answer from restored Postgres source-of-truth after Qdrant rebuild", + "corpus": { + "corpus_id": "real-world-memory-trust-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "restore-proof-note", + "kind": "runbook", + "text": "Restore proof note: Postgres backup restored note single_user_restore_probe, and the search answer must cite this source note rather than Qdrant payload text.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "restore-proof-note" + 
}, + "locator": { + "quote": "Postgres backup restored note single_user_restore_probe" + } + }, + "created_at": "2026-06-09T01:00:00Z" + }, + { + "evidence_id": "qdrant-rebuild-proof", + "kind": "trace", + "text": "Qdrant rebuild proof: admin rebuild read vectors from Postgres chunk embeddings and returned rebuilt_count=1, missing_vector_count=0, error_count=0 before the final search.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "qdrant-rebuild-proof" + }, + "locator": { + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + } + }, + "created_at": "2026-06-09T01:05:00Z" + }, + { + "evidence_id": "index-only-decoy", + "kind": "adapter_state", + "text": "Decoy: a stale Qdrant-only payload says the restore can be trusted without checking Postgres source rows.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "index-only-decoy" + } + }, + "created_at": "2026-06-09T00:50:00Z" + }, + { + "evidence_id": "unsupported-private-manifest-decoy", + "kind": "decision", + "text": "Unsupported prior: the same restore proof also proves a private production corpus passed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "unsupported-private-manifest-decoy" + } + }, + "created_at": "2026-06-09T00:55:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "Use the restored Postgres note as authoritative evidence: Postgres backup restored note single_user_restore_probe. Qdrant was rebuilt from Postgres chunk embeddings before the final search, with rebuilt_count=1, missing_vector_count=0, and error_count=0. 
This does not prove a private production corpus pass.", + "claims": [ + { + "claim_id": "authoritative_restore_source", + "text": "Postgres backup restored note single_user_restore_probe.", + "evidence_ids": ["restore-proof-note"], + "confidence": "high" + }, + { + "claim_id": "qdrant_rebuilt_from_sot", + "text": "Qdrant was rebuilt from Postgres chunk embeddings before the final search.", + "evidence_ids": ["qdrant-rebuild-proof"], + "confidence": "high" + } + ], + "evidence_ids": ["restore-proof-note", "qdrant-rebuild-proof"], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "restore-postgres", + "ts": "2026-06-09T01:00:00Z", + "actor": "operator", + "action": "restored_postgres_backup", + "evidence_ids": ["restore-proof-note"], + "summary": "The authoritative source note came back from Postgres restore." + }, + { + "event_id": "rebuild-qdrant", + "ts": "2026-06-09T01:05:00Z", + "actor": "tool", + "action": "rebuilt_qdrant_from_postgres_vectors", + "evidence_ids": ["qdrant-rebuild-proof"], + "summary": "Qdrant was rebuilt from Postgres-held chunk embeddings before answering." + } + ], + "prompt": { + "role": "user", + "content": "Can I trust the restored memory answer, and what evidence proves the vector index was rebuilt from source-of-truth state?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_index_only_claims", "state_caveats"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "authoritative_restore_source", + "text": "Postgres backup restored note single_user_restore_probe." + }, + { + "claim_id": "qdrant_rebuilt_from_sot", + "text": "Qdrant was rebuilt from Postgres chunk embeddings before the final search." 
+ } + ], + "must_not_include": [ + "the restore can be trusted without checking Postgres source rows", + "private production corpus passed" + ], + "evidence_links": { + "authoritative_restore_source": ["restore-proof-note"], + "qdrant_rebuilt_from_sot": ["qdrant-rebuild-proof"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "restore-proof-note", + "claim_id": "authoritative_restore_source", + "requirement": "cite", + "quote": "Postgres backup restored note single_user_restore_probe" + }, + { + "evidence_id": "qdrant-rebuild-proof", + "claim_id": "qdrant_rebuilt_from_sot", + "requirement": "cite", + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + } + ], + "negative_traps": [ + { + "trap_id": "index-only-source-of-truth", + "type": "decoy_evidence", + "evidence_ids": ["index-only-decoy"], + "failure_if_used": true + }, + { + "trap_id": "unsupported-private-corpus-pass", + "type": "unsupported_prior", + "evidence_ids": ["unsupported-private-manifest-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies Postgres as source of truth and Qdrant as derived." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites restore and rebuild evidence with source refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not trust index-only or unsupported private-corpus decoys." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Demonstrates rebuild from source-of-truth state before answering." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "trust", "source_ref", "qdrant_rebuild", "no_live_claim"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 7b5de20c..2f92dd55 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -368,6 +368,40 @@ struct ReportSummary { mean_score: f64, mean_latency_ms: Option, total_cost: Option, + #[serde(default)] + evidence_required_count: usize, + #[serde(default)] + evidence_covered_count: usize, + #[serde(default)] + evidence_coverage: f64, + #[serde(default)] + source_ref_required_count: usize, + #[serde(default)] + source_ref_covered_count: usize, + #[serde(default)] + source_ref_coverage: f64, + #[serde(default)] + quote_required_count: usize, + #[serde(default)] + quote_covered_count: usize, + #[serde(default)] + quote_coverage: f64, + #[serde(default)] + stale_retrieval_count: usize, + #[serde(default)] + scope_check_count: usize, + #[serde(default)] + scope_correct_count: usize, + #[serde(default)] + scope_correctness: f64, + #[serde(default)] + scope_violation_count: usize, + #[serde(default)] + redaction_leak_count: usize, + #[serde(default)] + qdrant_rebuild_case_count: usize, + #[serde(default)] + qdrant_rebuild_pass_count: usize, } #[derive(Debug, Deserialize, Serialize)] @@ -399,6 +433,30 @@ struct JobReport { trap_ids_used: Vec, dimension_scores: Vec, reason: String, + #[serde(default)] + evidence_required_count: usize, + #[serde(default)] + evidence_covered_count: usize, + #[serde(default)] + source_ref_required_count: usize, + #[serde(default)] + source_ref_covered_count: 
usize, + #[serde(default)] + quote_required_count: usize, + #[serde(default)] + quote_covered_count: usize, + #[serde(default)] + stale_retrieval_count: usize, + #[serde(default)] + scope_check_count: usize, + #[serde(default)] + scope_correct_count: usize, + #[serde(default)] + scope_violation_count: usize, + #[serde(default)] + redaction_leak_count: usize, + #[serde(default)] + qdrant_rebuild_case: bool, } #[derive(Debug, Deserialize, Serialize)] @@ -453,6 +511,22 @@ struct FailureCounts { unsupported_claims: usize, } +#[derive(Debug, Default)] +struct JobMetrics { + evidence_required_count: usize, + evidence_covered_count: usize, + source_ref_required_count: usize, + source_ref_covered_count: usize, + quote_required_count: usize, + quote_covered_count: usize, + stale_retrieval_count: usize, + scope_check_count: usize, + scope_correct_count: usize, + scope_violation_count: usize, + redaction_leak_count: usize, + qdrant_rebuild_case: bool, +} + fn main() -> Result<()> { color_eyre::install()?; @@ -1143,6 +1217,7 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { let answer = produced_answer(job); + let metrics = job_metrics(job, answer); JobReport { suite_id: job.suite.clone(), @@ -1161,7 +1236,119 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, - } + evidence_required_count: metrics.evidence_required_count, + evidence_covered_count: metrics.evidence_covered_count, + source_ref_required_count: metrics.source_ref_required_count, + source_ref_covered_count: metrics.source_ref_covered_count, + quote_required_count: metrics.quote_required_count, + quote_covered_count: metrics.quote_covered_count, + stale_retrieval_count: metrics.stale_retrieval_count, + scope_check_count: metrics.scope_check_count, + scope_correct_count: 
metrics.scope_correct_count, + scope_violation_count: metrics.scope_violation_count, + redaction_leak_count: metrics.redaction_leak_count, + qdrant_rebuild_case: metrics.qdrant_rebuild_case, + } +} + +fn job_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> JobMetrics { + let produced_evidence = produced_evidence_ids(answer); + let source_ref_by_evidence = source_ref_by_evidence(job); + let evidence_required_count = + job.required_evidence.iter().filter(|evidence| is_required_use(evidence)).count(); + let evidence_covered_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence)) + .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) + .count(); + let source_ref_required_count = evidence_required_count; + let source_ref_covered_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence)) + .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) + .filter(|evidence| { + source_ref_by_evidence.get(evidence.evidence_id.as_str()).is_some_and(|source_ref| { + source_ref.as_object().is_some_and(|object| !object.is_empty()) + }) + }) + .count(); + let quote_required_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence) && evidence.quote.is_some()) + .count(); + let quote_covered_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence) && evidence.quote.is_some()) + .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) + .count(); + let stale_retrieval_count = trap_use_count(job, &produced_evidence, "stale_fact", answer); + let scope_violation_count = trap_use_count(job, &produced_evidence, "near_duplicate", answer); + let scope_check_count = + job.negative_traps.iter().filter(|trap| trap.trap_type == "near_duplicate").count(); + let redaction_leak_count = trap_use_count(job, &produced_evidence, "privacy_leak", answer); + let scope_correct_count = 
scope_check_count.saturating_sub(scope_violation_count); + let qdrant_rebuild_case = job.tags.iter().any(|tag| tag == "qdrant_rebuild"); + + JobMetrics { + evidence_required_count, + evidence_covered_count, + source_ref_required_count, + source_ref_covered_count, + quote_required_count, + quote_covered_count, + stale_retrieval_count, + scope_check_count, + scope_correct_count, + scope_violation_count, + redaction_leak_count, + qdrant_rebuild_case, + } +} + +fn source_ref_by_evidence(job: &RealWorldJob) -> BTreeMap<&str, &Value> { + job.corpus.items.iter().map(|item| (item.evidence_id.as_str(), &item.source_ref)).collect() +} + +fn trap_use_count( + job: &RealWorldJob, + produced_evidence: &BTreeSet, + trap_type: &str, + answer: &ProducedAnswer, +) -> usize { + job.negative_traps + .iter() + .filter(|trap| trap.failure_if_used && trap.trap_type == trap_type) + .filter(|trap| trap_was_used(job, trap, produced_evidence, answer)) + .count() +} + +fn trap_was_used( + job: &RealWorldJob, + trap: &NegativeTrap, + produced_evidence: &BTreeSet, + answer: &ProducedAnswer, +) -> bool { + trap.evidence_ids.iter().any(|evidence_id| { + produced_evidence.contains(evidence_id) + || answer_contains_corpus_item(job, evidence_id, answer) + }) +} + +fn answer_contains_corpus_item( + job: &RealWorldJob, + evidence_id: &str, + answer: &ProducedAnswer, +) -> bool { + job.corpus + .items + .iter() + .find(|item| item.evidence_id == evidence_id) + .and_then(|item| item.text.as_deref()) + .is_some_and(|text| !text.trim().is_empty() && answer.content.contains(text)) } fn expected_evidence_report(job: &RealWorldJob) -> Vec { @@ -1245,6 +1432,14 @@ fn suite_reason(status: TypedStatus, encoded_job_count: usize) -> String { } fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { + let evidence_required_count = jobs.iter().map(|job| job.evidence_required_count).sum(); + let evidence_covered_count = jobs.iter().map(|job| job.evidence_covered_count).sum(); + let 
source_ref_required_count = jobs.iter().map(|job| job.source_ref_required_count).sum(); + let source_ref_covered_count = jobs.iter().map(|job| job.source_ref_covered_count).sum(); + let quote_required_count = jobs.iter().map(|job| job.quote_required_count).sum(); + let quote_covered_count = jobs.iter().map(|job| job.quote_covered_count).sum(); + let scope_check_count = jobs.iter().map(|job| job.scope_check_count).sum(); + let scope_correct_count = jobs.iter().map(|job| job.scope_correct_count).sum(); let mut summary = ReportSummary { job_count: jobs.len(), encoded_suite_count: suites @@ -1257,6 +1452,26 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { mean_score: mean_score(jobs), mean_latency_ms: mean_latency(jobs), total_cost: total_cost(jobs), + evidence_required_count, + evidence_covered_count, + evidence_coverage: ratio(evidence_covered_count, evidence_required_count), + source_ref_required_count, + source_ref_covered_count, + source_ref_coverage: ratio(source_ref_covered_count, source_ref_required_count), + quote_required_count, + quote_covered_count, + quote_coverage: ratio(quote_covered_count, quote_required_count), + stale_retrieval_count: jobs.iter().map(|job| job.stale_retrieval_count).sum(), + scope_check_count, + scope_correct_count, + scope_correctness: ratio(scope_correct_count, scope_check_count), + scope_violation_count: jobs.iter().map(|job| job.scope_violation_count).sum(), + redaction_leak_count: jobs.iter().map(|job| job.redaction_leak_count).sum(), + qdrant_rebuild_case_count: jobs.iter().filter(|job| job.qdrant_rebuild_case).count(), + qdrant_rebuild_pass_count: jobs + .iter() + .filter(|job| job.qdrant_rebuild_case && job.status == TypedStatus::Pass) + .count(), ..ReportSummary::default() }; @@ -1275,6 +1490,14 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { summary } +fn ratio(numerator: usize, denominator: usize) -> f64 { + if denominator == 0 { + return 0.0; + } + + 
round3(numerator as f64 / denominator as f64) +} + fn mean_score(jobs: &[JobReport]) -> f64 { if jobs.is_empty() { return 0.0; @@ -1401,6 +1624,37 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat report.summary.unsupported_claim_count )); out.push_str(&format!("- Wrong-result count: `{}`\n", report.summary.wrong_result_count)); + out.push_str(&format!( + "- Evidence coverage: `{}/{}` (`{:.3}`)\n", + report.summary.evidence_covered_count, + report.summary.evidence_required_count, + report.summary.evidence_coverage + )); + out.push_str(&format!( + "- Source-ref coverage: `{}/{}` (`{:.3}`)\n", + report.summary.source_ref_covered_count, + report.summary.source_ref_required_count, + report.summary.source_ref_coverage + )); + out.push_str(&format!( + "- Quote coverage: `{}/{}` (`{:.3}`)\n", + report.summary.quote_covered_count, + report.summary.quote_required_count, + report.summary.quote_coverage + )); + out.push_str(&format!("- Stale retrieval count: `{}`\n", report.summary.stale_retrieval_count)); + out.push_str(&format!( + "- Scope correctness: `{}/{}` (`{:.3}`), violations `{}`\n", + report.summary.scope_correct_count, + report.summary.scope_check_count, + report.summary.scope_correctness, + report.summary.scope_violation_count + )); + out.push_str(&format!("- Redaction leak count: `{}`\n", report.summary.redaction_leak_count)); + out.push_str(&format!( + "- Qdrant rebuild cases: `{}` encoded, `{}` pass\n", + report.summary.qdrant_rebuild_case_count, report.summary.qdrant_rebuild_pass_count + )); out.push_str(&format!("- Mean score: `{:.3}`\n", report.summary.mean_score)); out.push_str(&format!( "- Mean latency: `{}`\n", @@ -1501,6 +1755,9 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { ); out.push_str("It is a real-world job fixture report, not a Docker live-baseline report.\n"); out.push_str("Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not 
reinterpreted as real-world suite wins.\n\n"); + out.push_str( + "The summary counters report required evidence coverage, source-ref coverage, quote coverage, stale retrievals, scope violations, redaction leaks, and Qdrant rebuild case coverage across encoded jobs.\n\n", + ); out.push_str( "- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule.\n", ); diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 5020ed77..512da9f1 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -19,6 +19,10 @@ fn fixture_root() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_job") } +fn real_world_memory_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory") +} + fn run_json_report_from(fixtures: PathBuf) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -135,3 +139,52 @@ fn generated_json_report_renders_markdown() -> Result<()> { Ok(()) } + +#[test] +fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Result<()> { + let report = run_json_report_from(real_world_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); + 
assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = array_at(&report, "/suites")?; + + for suite_id in + ["trust_source_of_truth", "memory_evolution", "capture_integration", "personalization"] + { + let suite = find_by_field(suites, "/suite_id", suite_id)?; + + assert_eq!(suite.pointer("/status").and_then(Value::as_str), Some("pass")); + } + + let jobs = array_at(&report, "/jobs")?; + let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; + let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; + let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?; + + assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 9717c2de..6f1a606a 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -43,6 +43,8 
@@ cleanup, use `docs/guide/single_user_production.md`. summaries and durable scripts. - Keep generated real-world job smoke JSON and Markdown under `tmp/real-world-job/`; commit fixture schemas, smoke fixtures, runner code, and durable docs only. +- Keep generated real-world memory trust/personalization JSON and Markdown under + `tmp/real-world-memory/`; commit fixtures, runner code, and durable docs only. - Link the newest decision-relevant report from README and this index. - When benchmark semantics change, update `live_baseline_benchmark.md` and the relevant spec before publishing a new result. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index c29f6125..b44c2cdf 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -252,16 +252,29 @@ To run the checked-in real-world job smoke fixture and render its Markdown repor cargo make real-world-job-smoke ``` +To run the checked-in trust, source-of-truth, lifecycle, redaction, and personalization +real-world memory fixtures: + +```sh +cargo make real-world-memory +``` + Artifacts: ```text tmp/real-world-job/real-world-job-smoke-report.json tmp/real-world-job/real-world-job-smoke-report.md +tmp/real-world-memory/real-world-memory-report.json +tmp/real-world-memory/real-world-memory-report.md ``` The smoke fixture lives under `apps/elf-eval/fixtures/real_world_job/smoke/` and uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms, including `unsupported_claim`. Suites without checked-in jobs are reported as `not_encoded`. +The trust/personalization fixture set lives under +`apps/elf-eval/fixtures/real_world_memory/` and adds summary counters for evidence +coverage, source-ref coverage, quote coverage, stale retrievals, scope correctness, +redaction leaks, and Qdrant rebuild coverage. 
## Clean Up diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 7cf0f637..6cc18971 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -125,6 +125,31 @@ id, job id, expected evidence, produced answer/evidence, unsupported-claim count wrong-result count, latency/cost fields when available, and typed suite/job statuses. Untouched suites remain `not_encoded`. +Current checked-in trust and personalization increment: + +```sh +cargo make real-world-memory +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/`, writes +`tmp/real-world-memory/real-world-memory-report.json`, and renders +`tmp/real-world-memory/real-world-memory-report.md`. + +The fixture set currently encodes: + +- `trust_source_of_truth`: evidence binding, source refs, and Qdrant rebuild from + Postgres-held chunk embeddings before answering. +- `memory_evolution`: TTL/delete suppression for a stale deleted fact. +- `capture_integration`: write-policy audit behavior for redaction/private exclusion. +- `personalization`: scoped stable preference correction without temporary or + cross-project preference leakage. + +The generated report includes evidence coverage, source-ref coverage, quote coverage, +unsupported-claim count, stale retrieval count, scope correctness, redaction leak +count, and Qdrant rebuild case/pass counts. The fixtures include negative traps for +unsupported prior claims, stale deleted facts, cross-project preference leakage, and +private/redacted text leakage. + Do not generate large fixtures or update production-adoption verdicts while adding the contract. The current adoption gate remains an existing benchmark decision until new real-world job reports are implemented and published.