From f82834dacd970ee904b712b429f907979fbd70c4 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Sat, 20 Jun 2026 02:09:45 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Build Source Library ingest and benchmark gate","authority":"XY-1017"} --- Makefile.toml | 49 +++ .../long_document_source_library.json | 221 +++++++++++ .../social_thread_source_library.json | 206 ++++++++++ .../src/bin/real_world_job_benchmark.rs | 2 + .../tests/real_world_job_benchmark.rs | 57 ++- docs/spec/system_doc_source_ref_v1.md | 97 ++++- docs/spec/system_source_ref_doc_pointer_v1.md | 25 +- docs/spec/system_version_registry.md | 9 +- packages/elf-service/src/docs.rs | 370 +++++++++++++++++- .../tests/acceptance/docs_extension_v1.rs | 56 +++ 10 files changed, 1080 insertions(+), 12 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/source_library/long_document_source_library.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/source_library/social_thread_source_library.json diff --git a/Makefile.toml b/Makefile.toml index 59c8ed47..17fa3b7e 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -585,6 +585,55 @@ args = [ "tmp/real-world-memory/knowledge-report.md", ] +[tasks.real-world-memory-source-library] +workspace = false +dependencies = [ + "real-world-memory-source-library-report", +] + +[tasks.real-world-memory-source-library-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/source_library", + "--out", + "tmp/real-world-memory/source-library-report.json", + "--run-id", + "real-world-memory-source-library", + "--adapter-id", + "fixture_source_library", + "--adapter-name", + "ELF source library fixture", +] + +[tasks.real-world-memory-source-library-report] +workspace = false +dependencies = [ + "real-world-memory-source-library-json", +] +command = "cargo" +args = [ + "run", + "-p", + 
"elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/source-library-report.json", + "--out", + "tmp/real-world-memory/source-library-report.md", +] + [tasks.real-world-memory-live-adapters] workspace = false command = "bash" diff --git a/apps/elf-eval/fixtures/real_world_memory/source_library/long_document_source_library.json b/apps/elf-eval/fixtures/real_world_memory/source_library/long_document_source_library.json new file mode 100644 index 00000000..c46ddaf5 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/source_library/long_document_source_library.json @@ -0,0 +1,221 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "source-library-long-doc-001", + "suite": "source_library", + "title": "Saved long-form article keeps stable source metadata and hydrates an excerpt pointer", + "corpus": { + "corpus_id": "real-world-memory-source-library-2026-06-20", + "profile": "synthetic", + "items": [ + { + "evidence_id": "article-source-record", + "kind": "source_library_record", + "text": "Source Library record: canonical_uri=https://example.com/research/agent-memory-os, source_kind=article, author=Example Research Group, captured_at=2026-06-20T01:10:00Z, source_created_at=2026-06-19T21:00:00Z, trust_label=public_web. 
The record is stored as a Doc Extension document, not as a durable Memory Note.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "11111111-1111-4111-8111-111111111111", + "chunk_id": "22222222-2222-4222-8222-222222222222" + }, + "state": { + "content_hash": "long-doc-content-hash", + "chunk_hash": "long-doc-chunk-hash" + }, + "hashes": { + "content_hash": "long-doc-content-hash", + "chunk_hash": "long-doc-chunk-hash" + }, + "locator": { + "position": { + "start": 0, + "end": 128 + } + } + }, + "created_at": "2026-06-20T01:10:00Z" + }, + { + "evidence_id": "article-hydrated-excerpt", + "kind": "hydrated_excerpt", + "text": "Hydrated excerpt: The article says source libraries preserve long-form evidence while agents promote only selected facts into memory. Verification: verified=true, content_hash=long-doc-content-hash, excerpt_hash=long-doc-excerpt-hash.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "11111111-1111-4111-8111-111111111111", + "chunk_id": "22222222-2222-4222-8222-222222222222" + }, + "locator": { + "quote": { + "exact": "source libraries preserve long-form evidence" + } + }, + "hashes": { + "content_hash": "long-doc-content-hash", + "excerpt_hash": "long-doc-excerpt-hash" + } + }, + "created_at": "2026-06-20T01:11:00Z" + }, + { + "evidence_id": "auto-memory-decoy", + "kind": "decoy", + "text": "Decoy: saving the article automatically created a durable Memory Note and made the whole article top-of-mind.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_library", + "evidence_id": "auto-memory-decoy" + } + }, + "created_at": "2026-06-20T01:12:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "The saved article is a Source Library document with canonical_uri=https://example.com/research/agent-memory-os, 
source_kind=article, author=Example Research Group, captured_at=2026-06-20T01:10:00Z, source_created_at=2026-06-19T21:00:00Z, and trust_label=public_web. Hydration uses the source_ref/v1 pointer with resolver elf_doc_ext/v1 and the verified excerpt says source libraries preserve long-form evidence. This source-only ingest does not automatically create a durable Memory Note.", + "claims": [ + { + "claim_id": "long_doc_metadata", + "text": "The saved article keeps canonical URI, source kind, author, captured/source-created timestamps, and trust label metadata.", + "evidence_ids": ["article-source-record"], + "confidence": "high" + }, + { + "claim_id": "long_doc_hydration", + "text": "The hydrated excerpt is verified through a source_ref/v1 pointer resolved by elf_doc_ext/v1.", + "evidence_ids": ["article-hydrated-excerpt"], + "confidence": "high" + }, + { + "claim_id": "source_not_memory", + "text": "Source-only ingest does not automatically create a durable Memory Note.", + "evidence_ids": ["article-source-record"], + "confidence": "high" + } + ], + "evidence_ids": ["article-source-record", "article-hydrated-excerpt"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "article-captured", + "ts": "2026-06-20T01:10:00Z", + "actor": "agent", + "action": "docs_put_source_library_record", + "evidence_ids": ["article-source-record"], + "summary": "The long article was captured as a Source Library document." + }, + { + "event_id": "article-hydrated", + "ts": "2026-06-20T01:11:00Z", + "actor": "agent", + "action": "docs_excerpts_get_verified", + "evidence_ids": ["article-hydrated-excerpt"], + "summary": "The article pointer hydrated a verified excerpt." 
+ } + ], + "prompt": { + "role": "user", + "content": "What did we save for the agent memory OS article, how can it be cited later, and did saving it create a Memory Note?", + "job_mode": "answer", + "constraints": ["cite_evidence", "preserve_source_metadata", "do_not_promote_source_to_memory"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "long_doc_metadata", + "text": "The saved article keeps canonical URI, source kind, author, captured/source-created timestamps, and trust label metadata." + }, + { + "claim_id": "long_doc_hydration", + "text": "The hydrated excerpt is verified through a source_ref/v1 pointer resolved by elf_doc_ext/v1." + }, + { + "claim_id": "source_not_memory", + "text": "Source-only ingest does not automatically create a durable Memory Note." + } + ], + "must_not_include": [ + "automatically created a durable Memory Note", + "made the whole article top-of-mind" + ], + "evidence_links": { + "long_doc_metadata": ["article-source-record"], + "long_doc_hydration": ["article-hydrated-excerpt"], + "source_not_memory": ["article-source-record"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "article-source-record", + "claim_id": "long_doc_metadata", + "requirement": "cite", + "quote": "canonical_uri=https://example.com/research/agent-memory-os" + }, + { + "evidence_id": "article-hydrated-excerpt", + "claim_id": "long_doc_hydration", + "requirement": "cite", + "quote": "verified=true" + } + ], + "negative_traps": [ + { + "trap_id": "source-auto-memory", + "type": "decoy_evidence", + "evidence_ids": ["auto-memory-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Reports source metadata and the source-only memory boundary." 
+ }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both source record and hydrated excerpt evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not use the auto-memory decoy." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Separates Source Library ingest from explicit memory promotion." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "claiming a source-only ingest automatically created durable memory", + "missing source_ref hydration evidence" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "source_library", "long_document", "source_ref", "no_memory_autopromotion"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/source_library/social_thread_source_library.json b/apps/elf-eval/fixtures/real_world_memory/source_library/social_thread_source_library.json new file mode 100644 index 00000000..4d25b221 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/source_library/social_thread_source_library.json @@ -0,0 +1,206 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "source-library-social-thread-001", + "suite": "source_library", + "title": "Saved social thread keeps handle metadata and remains source-only until promoted", + "corpus": { + "corpus_id": "real-world-memory-source-library-2026-06-20", + "profile": "synthetic", + "items": [ + { + "evidence_id": "thread-source-record", + "kind": "source_library_record", + "text": "Thread capture: canonical_uri=https://example.com/thread/agent-knowledge-os, source_kind=social_thread, author=Example Builder, handle=example-builder, captured_at=2026-06-20T02:00:00Z, source_created_at=2026-06-20T01:45:00Z, trust_label=public_web, doc_type=chat, thread_id=thread-agent-knowledge-os.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": 
{ + "doc_id": "33333333-3333-4333-8333-333333333333", + "chunk_id": "44444444-4444-4444-8444-444444444444" + }, + "state": { + "content_hash": "thread-content-hash", + "chunk_hash": "thread-chunk-hash" + }, + "hashes": { + "content_hash": "thread-content-hash", + "chunk_hash": "thread-chunk-hash" + }, + "locator": { + "position": { + "start": 42, + "end": 140 + } + } + }, + "created_at": "2026-06-20T02:00:00Z" + }, + { + "evidence_id": "thread-promotion-boundary", + "kind": "policy_boundary", + "text": "Promotion boundary: The thread remains a Source Library record. Durable Memory Notes are created only through explicit add_note or reviewed promotion; background organization may propose tags or summaries without mutating memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_library", + "evidence_id": "thread-promotion-boundary" + }, + "locator": { + "quote": { + "exact": "Durable Memory Notes are created only through explicit add_note or reviewed promotion" + } + } + }, + "created_at": "2026-06-20T02:01:00Z" + }, + { + "evidence_id": "silent-promotion-decoy", + "kind": "decoy", + "text": "Decoy: the background organizer silently promoted every post in the thread into permanent memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_library", + "evidence_id": "silent-promotion-decoy" + } + }, + "created_at": "2026-06-20T02:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "The social thread was saved as a Source Library chat document with canonical_uri=https://example.com/thread/agent-knowledge-os, source_kind=social_thread, author=Example Builder, handle=example-builder, captured/source-created timestamps, trust_label=public_web, and thread_id=thread-agent-knowledge-os. 
It stays source-only until an explicit add_note or reviewed promotion path creates Memory Notes; background organization may propose tags or summaries without mutating memory.", + "claims": [ + { + "claim_id": "thread_metadata", + "text": "The saved thread keeps canonical URI, source kind, author, handle, timestamps, trust label, and thread id metadata.", + "evidence_ids": ["thread-source-record"], + "confidence": "high" + }, + { + "claim_id": "thread_promotion_boundary", + "text": "The thread remains source-only until explicit add_note or reviewed promotion creates Memory Notes.", + "evidence_ids": ["thread-promotion-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["thread-source-record", "thread-promotion-boundary"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "thread-captured", + "ts": "2026-06-20T02:00:00Z", + "actor": "agent", + "action": "docs_put_social_thread", + "evidence_ids": ["thread-source-record"], + "summary": "The social thread was captured as a Source Library chat document." + }, + { + "event_id": "promotion-boundary-checked", + "ts": "2026-06-20T02:01:00Z", + "actor": "agent", + "action": "checked_memory_promotion_boundary", + "evidence_ids": ["thread-promotion-boundary"], + "summary": "The memory promotion boundary was preserved." + } + ], + "prompt": { + "role": "user", + "content": "What did we capture from this thread, and can the agent remember it without silently turning every post into Memory Notes?", + "job_mode": "answer", + "constraints": ["cite_evidence", "preserve_social_source_metadata", "require_explicit_memory_promotion"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "thread_metadata", + "text": "The saved thread keeps canonical URI, source kind, author, handle, timestamps, trust label, and thread id metadata." 
+ }, + { + "claim_id": "thread_promotion_boundary", + "text": "The thread remains source-only until explicit add_note or reviewed promotion creates Memory Notes." + } + ], + "must_not_include": [ + "silently promoted every post", + "permanent memory without review" + ], + "evidence_links": { + "thread_metadata": ["thread-source-record"], + "thread_promotion_boundary": ["thread-promotion-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "thread-source-record", + "claim_id": "thread_metadata", + "requirement": "cite", + "quote": "source_kind=social_thread" + }, + { + "evidence_id": "thread-promotion-boundary", + "claim_id": "thread_promotion_boundary", + "requirement": "cite", + "quote": "explicit add_note or reviewed promotion" + } + ], + "negative_traps": [ + { + "trap_id": "silent-background-promotion", + "type": "decoy_evidence", + "evidence_ids": ["silent-promotion-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Reports social-thread metadata and the explicit memory promotion boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites thread metadata and promotion-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Rejects silent background memory promotion." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps source capture, review proposals, and durable memory writes separate." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "claiming background organization can silently mutate durable memory", + "missing source metadata evidence" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "source_library", "social_thread", "source_ref", "no_silent_memory_mutation"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index eae9659f..796e767e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -53,6 +53,7 @@ const SUITES: &[&str] = &[ "proactive_brief", "scheduled_memory", "knowledge_compilation", + "source_library", "operator_debugging_ux", "capture_integration", "production_ops", @@ -7514,6 +7515,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n"); out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); + out.push_str("For `source_library` jobs, saved long-form material and social/thread captures are source records, not durable Memory Notes. Source records must preserve canonical source metadata, source_ref hydration pointers, and explicit promotion boundaries before any memory write is claimed.\n\n"); out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. 
Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n"); out.push_str("For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations.\n\n"); out.push_str("For `scheduled_memory` jobs, task artifacts are deterministic fixture-scored stand-ins for asynchronous work. Every output must carry evidence refs, freshness/currentness metadata, action rationale, and execution trace/readback evidence; scheduled tasks must not mutate source notes silently or claim hosted scheduler/private-provider parity from fixture-only output.\n\n"); diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index be370dce..28d6d0e4 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -72,6 +72,10 @@ fn knowledge_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("knowledge") } +fn source_library_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("source_library") +} + fn production_ops_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("production_ops") } @@ -599,6 +603,42 @@ fn capture_integration_fixtures_score_redaction_and_source_ids() -> Result<()> { Ok(()) } +#[test] +fn source_library_fixtures_score_saved_sources_without_memory_promotion() -> Result<()> { + let report = run_json_report_from(source_library_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(2)); + 
assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = array_at(&report, "/suites")?; + let source_library = find_by_field(suites, "/suite_id", "source_library")?; + + assert_eq!(source_library.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(source_library.pointer("/encoded_job_count").and_then(Value::as_u64), Some(2)); + + let jobs = array_at(&report, "/jobs")?; + let long_doc = find_by_field(jobs, "/job_id", "source-library-long-doc-001")?; + let thread = find_by_field(jobs, "/job_id", "source-library-social-thread-001")?; + + assert!(array_contains_str(long_doc, "/produced_evidence", "article-source-record")?); + assert!(array_contains_str(long_doc, "/produced_evidence", "article-hydrated-excerpt")?); + assert!(array_contains_str(thread, "/produced_evidence", "thread-source-record")?); + assert!(array_contains_str(thread, "/produced_evidence", "thread-promotion-boundary")?); + assert!(long_doc.pointer("/produced_answer").and_then(Value::as_str).is_some_and(|answer| { + answer.contains("does not automatically create a durable Memory Note") + })); + assert!( + thread + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|answer| answer.contains("explicit add_note or reviewed promotion")) + ); + + Ok(()) +} + #[test] fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")) @@ -2367,7 +2407,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(60)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(62)); Ok(()) } @@ -6408,9 +6448,9 @@ fn 
assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(60)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(16)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(53)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(62)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(17)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(55)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); @@ -6453,11 +6493,11 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(133) + Some(137) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(133) + Some(137) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -6642,6 +6682,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + let source_library = find_by_field(suites, "/suite_id", "source_library")?; + + assert_eq!(source_library.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(source_library.pointer("/encoded_job_count").and_then(Value::as_u64), Some(2)); + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; 
assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); diff --git a/docs/spec/system_doc_source_ref_v1.md b/docs/spec/system_doc_source_ref_v1.md index a695b40f..0249b7ac 100644 --- a/docs/spec/system_doc_source_ref_v1.md +++ b/docs/spec/system_doc_source_ref_v1.md @@ -6,7 +6,7 @@ resource: docs/spec/system_doc_source_ref_v1.md status: active authority: normative owner: spec -last_verified: 2026-06-18 +last_verified: 2026-06-20 tags: - docs - spec @@ -141,7 +141,56 @@ Backward compatibility: mappings are not performed. ================================================== -5) Examples +5) Source Library profile +================================================== + +`doc_source_ref/v1` also defines a first-class Source Library profile for +saved long-form material. This profile is opt-in: a payload enters the profile +when it provides any Source Library field below. Once it enters the profile, +the required profile keys below MUST be present and valid. + +Required Source Library profile keys: + +- `source_kind` (string): one of `article`, `social_thread`, `pdf`, + `text_export`, `repo_file`, `chat_excerpt`, or `web_page`. +- `canonical_uri` (string): stable URL, URN, file URI, repo URI, or source + identifier that can be used for deduplication and operator inspection. +- `captured_at` (string): timezone-aware RFC3339 timestamp for when ELF + captured the source. +- `trust_label` (string): one of `trusted`, `user_captured`, `public_web`, + `third_party`, or `unverified`. + +Optional Source Library profile keys: + +- `source_created_at` (string): timezone-aware RFC3339 source publication or + creation time when available. +- `author` (string): author or source display name when available. +- `handle` (string): stable social/repository/source handle when available. +- `source_content_hash` (string): producer-supplied source hash when available. 
+ ELF also stores and returns its own canonical `content_hash` for the persisted + document bytes. +- `excerpt_locator` (object): selector hints for the saved source. It MAY + include: + - `quote`: object with required `exact` and optional `prefix`/`suffix`. + - `position`: object with integer `start` and `end` byte offsets, where + `start < end`. + +Compatibility with `doc_type`: + +- `source_kind = "social_thread"` and `source_kind = "chat_excerpt"` require + `doc_type = "chat"`. +- `source_kind = "repo_file"` requires `doc_type = "dev"`. +- Other source kinds may use the normal `knowledge` or `search` document + classes based on caller workflow. + +Boundary: + +- Source Library ingest stores a document and document chunks. It MUST NOT + create or mutate durable Memory Notes unless the caller separately invokes an + explicit memory-write or reviewed promotion path. + +================================================== +6) Examples ================================================== Chat: @@ -206,3 +255,47 @@ Knowledge: "uri": "docs://kb/architecture/2026/02/overview" } ``` + +Source Library article: + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-06-20T01:10:00Z", + "source_kind": "article", + "canonical_uri": "https://example.com/research/agent-memory-os", + "captured_at": "2026-06-20T01:10:00Z", + "source_created_at": "2026-06-19T21:00:00Z", + "trust_label": "public_web", + "author": "Example Research Group", + "excerpt_locator": { + "quote": { + "exact": "source libraries preserve long-form evidence" + }, + "position": { + "start": 0, + "end": 128 + } + } +} +``` + +Source Library social thread: + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": "2026-06-20T02:00:00Z", + "thread_id": "thread-agent-knowledge-os", + "role": "user", + "source_kind": "social_thread", + "canonical_uri": "https://example.com/thread/agent-knowledge-os", + "captured_at": "2026-06-20T02:00:00Z", + "source_created_at": 
"2026-06-20T01:45:00Z", + "trust_label": "public_web", + "author": "Example Builder", + "handle": "example-builder" +} +``` diff --git a/docs/spec/system_source_ref_doc_pointer_v1.md b/docs/spec/system_source_ref_doc_pointer_v1.md index c76be322..fcf85e4b 100644 --- a/docs/spec/system_source_ref_doc_pointer_v1.md +++ b/docs/spec/system_source_ref_doc_pointer_v1.md @@ -6,7 +6,7 @@ resource: docs/spec/system_source_ref_doc_pointer_v1.md status: active authority: normative owner: spec -last_verified: 2026-06-18 +last_verified: 2026-06-20 tags: - docs - spec @@ -101,6 +101,15 @@ Notes: If provided, these fields allow agents to detect drift and to report stronger provenance. +`docs_search_l0` returns both `state` and `hashes` in its pointer payload: + +- `state.content_hash` and `hashes.content_hash` are the authoritative document + content hash for the stored bytes. +- `state.chunk_hash` and `hashes.chunk_hash` are the authoritative chunk hash + for the returned hit. +- `state.doc_updated_at` is the document update timestamp used for cache and + staleness checks. + ### 3.4 `locator` (optional) `locator` carries excerpt selector hints. The canonical selector vocabulary is: @@ -120,6 +129,10 @@ Rules: Optional fields: - `level` (string): `"L0"`, `"L1"` or `"L2"` as a suggested excerpt size tier for hydration. If omitted, agents should choose based on context budget. +`docs_search_l0` returns a `locator.position` selector for the hit chunk. Agents +may pass this selector, the returned `ref.chunk_id`, or their own quote selector +to `docs_excerpts_get` for verified hydration. 
+ ### 3.5 `hashes` (optional) `hashes` MAY include: @@ -200,6 +213,16 @@ The agent SHOULD: "state": { "content_hash": "baf7cfd2d5b71f5b0f5d5a08a3c38d7b43cf7a2e5a4f75d5c1b4a9072f6dd3b8", "chunk_hash": "bd85b0e07464bde3a7f3a2b2f3c2d5d4c1c9f0d0c1a2b3c4d5e6f7a8b9c0d1e2" + }, + "hashes": { + "content_hash": "baf7cfd2d5b71f5b0f5d5a08a3c38d7b43cf7a2e5a4f75d5c1b4a9072f6dd3b8", + "chunk_hash": "bd85b0e07464bde3a7f3a2b2f3c2d5d4c1c9f0d0c1a2b3c4d5e6f7a8b9c0d1e2" + }, + "locator": { + "position": { + "start": 128, + "end": 384 + } } } ``` diff --git a/docs/spec/system_version_registry.md b/docs/spec/system_version_registry.md index d2f9fc0b..51e0e52a 100644 --- a/docs/spec/system_version_registry.md +++ b/docs/spec/system_version_registry.md @@ -6,7 +6,7 @@ resource: docs/spec/system_version_registry.md status: active authority: normative owner: spec -last_verified: 2026-06-18 +last_verified: 2026-06-20 tags: - docs - spec @@ -50,6 +50,10 @@ This document is normative. When a new versioned identifier is introduced, it mu - Type: `docs_put.source_ref` JSON envelope schema identifier. - Defined in: `docs/spec/system_doc_source_ref_v1.md`. - Consumers: Docs ingestion (`POST /v2/docs`, MCP `elf_docs_put`) and any doc evidence consumers that need durable source provenance. +- Source Library profile: optional profile fields in `doc_source_ref/v1` cover + saved articles, social threads, PDFs, text exports, repository files, chat + excerpts, and web pages with canonical URI, source kind, timestamps, trust + label, and excerpt locator metadata. - Bump rule: Introduce `doc_source_ref/v2` only when the required/optional key contract becomes incompatible with v1. Keep older identifiers immutable. ### source_ref resolver: Doc Extension v1 doc pointer @@ -58,6 +62,9 @@ This document is normative. When a new versioned identifier is introduced, it mu - Type: `source_ref.resolver` identifier for Doc Extension v1 pointers. - Defined in: `docs/spec/system_source_ref_doc_pointer_v1.md`. 
- Consumers: Agents that hydrate doc excerpts and build evidence-linked facts; Doc Extension v1 excerpt endpoints. +- Pointer payloads returned by `docs_search_l0` include document/chunk ids, + state hashes, hash aliases, and a position locator for verified excerpt + hydration. - Bump rule: Introduce `elf_doc_ext/v2` only when the dereference contract (required fields, semantics, or verification surface) becomes incompatible. ### Note provenance bundle schema diff --git a/packages/elf-service/src/docs.rs b/packages/elf-service/src/docs.rs index ec9b652b..11d968cf 100644 --- a/packages/elf-service/src/docs.rs +++ b/packages/elf-service/src/docs.rs @@ -46,6 +46,21 @@ const DOC_RETRIEVAL_TRAJECTORY_SCHEMA_V1: &str = "doc_retrieval_trajectory/v1"; const DOC_SOURCE_REF_SCHEMA_V1: &str = "source_ref/v1"; const DOC_SOURCE_REF_RESOLVER_V1: &str = "elf_doc_ext/v1"; const DOC_STATUSES: [&str; 2] = ["active", "deleted"]; +const SOURCE_LIBRARY_FIELD_KEYS: [&str; 9] = [ + "source_kind", + "canonical_uri", + "captured_at", + "source_created_at", + "trust_label", + "author", + "handle", + "excerpt_locator", + "source_content_hash", +]; +const SOURCE_LIBRARY_KINDS: [&str; 7] = + ["article", "social_thread", "pdf", "text_export", "repo_file", "chat_excerpt", "web_page"]; +const SOURCE_LIBRARY_TRUST_LABELS: [&str; 5] = + ["trusted", "user_captured", "public_web", "third_party", "unverified"]; /// Document classification used for persistence and retrieval filters. #[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] @@ -269,6 +284,10 @@ pub struct DocsSearchL0ItemPointer { pub reference: DocsSearchL0ItemReference, /// Freshness guard for the pointer target. pub state: DocsSearchL0ItemState, + /// Hash aliases for simpler pointer consumers. + pub hashes: DocsSearchL0ItemHashes, + /// Selector hints that can hydrate this chunk through `docs_excerpts_get`. + pub locator: DocsSearchL0ItemLocator, } /// Logical identifiers for a document-search hit. 
@@ -292,6 +311,22 @@ pub struct DocsSearchL0ItemState { pub doc_updated_at: OffsetDateTime, } +/// Hash values carried with a document-search pointer. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0ItemHashes { + /// Whole-document BLAKE3 hash. + pub content_hash: String, + /// Chunk-level BLAKE3 hash. + pub chunk_hash: String, +} + +/// Locator hints carried with a document-search pointer. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0ItemLocator { + /// Chunk byte position in the authoritative document content. + pub position: TextPositionSelector, +} + /// Explain payload for a document retrieval run. #[derive(Clone, Debug, Serialize)] pub struct DocRetrievalTrajectory { @@ -512,6 +547,8 @@ struct DocSearchRow { updated_at: OffsetDateTime, content_hash: String, chunk_hash: String, + start_offset: i32, + end_offset: i32, chunk_text: String, } @@ -1211,15 +1248,27 @@ fn docs_excerpt_locator( } fn build_docs_l0_pointer(row: &DocSearchRow, chunk_id: Uuid) -> DocsSearchL0ItemPointer { + let hashes = DocsSearchL0ItemHashes { + content_hash: row.content_hash.clone(), + chunk_hash: row.chunk_hash.clone(), + }; + DocsSearchL0ItemPointer { schema: DOC_SOURCE_REF_SCHEMA_V1.to_string(), resolver: DOC_SOURCE_REF_RESOLVER_V1.to_string(), reference: DocsSearchL0ItemReference { doc_id: row.doc_id, chunk_id }, state: DocsSearchL0ItemState { - content_hash: row.content_hash.clone(), - chunk_hash: row.chunk_hash.clone(), + content_hash: hashes.content_hash.clone(), + chunk_hash: hashes.chunk_hash.clone(), doc_updated_at: row.updated_at, }, + hashes, + locator: DocsSearchL0ItemLocator { + position: TextPositionSelector { + start: row.start_offset.max(0) as usize, + end: row.end_offset.max(0) as usize, + }, + }, } } @@ -1337,6 +1386,7 @@ fn validate_docs_put(req: &DocsPutRequest) -> Result { }; validate_doc_source_ref_requirements(source_ref_doc_type.as_str(), source_ref)?; + validate_source_library_metadata(source_ref_doc_type.as_str(), source_ref)?; let 
write_policy = writegate::apply_write_policy(req.content.as_str(), req.write_policy.as_ref()).map_err( @@ -1433,6 +1483,194 @@ fn validate_doc_source_ref_requirements( Ok(()) } +fn validate_source_library_metadata( + source_doc_type: &str, + source_ref: &Map, +) -> Result<()> { + if !source_library_metadata_present(source_ref) { + return Ok(()); + } + + let source_kind = + extract_source_ref_string(source_ref, "source_kind", "$.source_ref[\"source_kind\"]")?; + + if !SOURCE_LIBRARY_KINDS.contains(&source_kind.as_str()) { + return Err(Error::InvalidRequest { + message: format!( + "$.source_ref[\"source_kind\"] must be one of: {}.", + SOURCE_LIBRARY_KINDS.join("|") + ), + }); + } + + validate_source_kind_doc_type(source_kind.as_str(), source_doc_type)?; + extract_source_ref_string(source_ref, "canonical_uri", "$.source_ref[\"canonical_uri\"]")?; + validate_source_ref_rfc3339(source_ref, "captured_at")?; + + if source_ref.contains_key("source_created_at") { + validate_source_ref_rfc3339(source_ref, "source_created_at")?; + } + + let trust_label = + extract_source_ref_string(source_ref, "trust_label", "$.source_ref[\"trust_label\"]")?; + + if !SOURCE_LIBRARY_TRUST_LABELS.contains(&trust_label.as_str()) { + return Err(Error::InvalidRequest { + message: format!( + "$.source_ref[\"trust_label\"] must be one of: {}.", + SOURCE_LIBRARY_TRUST_LABELS.join("|") + ), + }); + } + + validate_optional_source_ref_string(source_ref, "author")?; + validate_optional_source_ref_string(source_ref, "handle")?; + validate_optional_source_ref_string(source_ref, "source_content_hash")?; + + if let Some(locator) = source_ref.get("excerpt_locator") { + validate_source_library_excerpt_locator(locator)?; + } + + Ok(()) +} + +fn source_library_metadata_present(source_ref: &Map) -> bool { + SOURCE_LIBRARY_FIELD_KEYS.iter().any(|key| source_ref.contains_key(*key)) +} + +fn validate_source_kind_doc_type(source_kind: &str, source_doc_type: &str) -> Result<()> { + let expected_doc_type = match 
source_kind { + "social_thread" | "chat_excerpt" => Some("chat"), + "repo_file" => Some("dev"), + _ => None, + }; + + if let Some(expected_doc_type) = expected_doc_type + && source_doc_type != expected_doc_type + { + return Err(Error::InvalidRequest { + message: format!( + "$.source_ref[\"source_kind\"]={source_kind} requires doc_type={expected_doc_type}." + ), + }); + } + + Ok(()) +} + +fn validate_source_ref_rfc3339(source_ref: &Map, key: &str) -> Result<()> { + let path = format!("$.source_ref[\"{key}\"]"); + let value = extract_source_ref_string(source_ref, key, path.as_str())?; + + OffsetDateTime::parse(value.as_str(), &Rfc3339).map_err(|_| Error::InvalidRequest { + message: format!("{path} must be an RFC3339 datetime string."), + })?; + + Ok(()) +} + +fn validate_optional_source_ref_string(source_ref: &Map, key: &str) -> Result<()> { + let path = format!("$.source_ref[\"{key}\"]"); + + validate_optional_source_ref_string_at(source_ref, key, path.as_str()) +} + +fn validate_optional_source_ref_string_at( + source_ref: &Map, + key: &str, + path: &str, +) -> Result<()> { + let Some(value) = source_ref.get(key) else { + return Ok(()); + }; + + value.as_str().map(str::trim).filter(|value| !value.is_empty()).ok_or_else(|| { + Error::InvalidRequest { message: format!("{path} must be a non-empty string.") } + })?; + + Ok(()) +} + +fn validate_source_library_excerpt_locator(locator: &Value) -> Result<()> { + let locator = locator.as_object().ok_or_else(|| Error::InvalidRequest { + message: "$.source_ref[\"excerpt_locator\"] must be a JSON object.".to_string(), + })?; + let has_quote = locator.contains_key("quote"); + let has_position = locator.contains_key("position"); + + if !has_quote && !has_position { + return Err(Error::InvalidRequest { + message: "$.source_ref[\"excerpt_locator\"] requires quote or position.".to_string(), + }); + } + + if let Some(quote) = locator.get("quote") { + validate_source_library_quote_locator(quote)?; + } + if let Some(position) = 
locator.get("position") { + validate_source_library_position_locator(position)?; + } + + Ok(()) +} + +fn validate_source_library_quote_locator(quote: &Value) -> Result<()> { + let quote = quote.as_object().ok_or_else(|| Error::InvalidRequest { + message: "$.source_ref[\"excerpt_locator\"][\"quote\"] must be a JSON object.".to_string(), + })?; + + extract_source_ref_string( + quote, + "exact", + "$.source_ref[\"excerpt_locator\"][\"quote\"][\"exact\"]", + )?; + validate_optional_source_ref_string_at( + quote, + "prefix", + "$.source_ref[\"excerpt_locator\"][\"quote\"][\"prefix\"]", + )?; + validate_optional_source_ref_string_at( + quote, + "suffix", + "$.source_ref[\"excerpt_locator\"][\"quote\"][\"suffix\"]", + )?; + + Ok(()) +} + +fn validate_source_library_position_locator(position: &Value) -> Result<()> { + let position = position.as_object().ok_or_else(|| Error::InvalidRequest { + message: "$.source_ref[\"excerpt_locator\"][\"position\"] must be a JSON object." + .to_string(), + })?; + let start = source_ref_u64( + position, + "start", + "$.source_ref[\"excerpt_locator\"][\"position\"][\"start\"]", + )?; + let end = source_ref_u64( + position, + "end", + "$.source_ref[\"excerpt_locator\"][\"position\"][\"end\"]", + )?; + + if start >= end { + return Err(Error::InvalidRequest { + message: "$.source_ref[\"excerpt_locator\"][\"position\"] start must be before end." 
+ .to_string(), + }); + } + + Ok(()) +} + +fn source_ref_u64(source_ref: &Map, key: &str, path: &str) -> Result { + source_ref + .get(key) + .and_then(Value::as_u64) + .ok_or_else(|| Error::InvalidRequest { message: format!("{path} must be an integer.") }) +} + fn validate_docs_search_l0(req: &DocsSearchL0Request) -> Result { validate_docs_search_l0_query(req)?; @@ -2285,6 +2523,8 @@ SELECT d.updated_at, d.content_hash, c.chunk_hash, + c.start_offset, + c.end_offset, c.chunk_text FROM doc_chunks c JOIN doc_documents d ON d.doc_id = c.doc_id @@ -2322,6 +2562,7 @@ mod tests { use tokenizers::{ Tokenizer, models::wordlevel::WordLevel, pre_tokenizers::whitespace::Whitespace, }; + use uuid::Uuid; use crate::docs::{ self, DocType, DocsPutRequest, DocsSearchL0Filters, DocsSearchL0Request, DocsSparseMode, @@ -3051,6 +3292,131 @@ mod tests { assert_eq!(resolved_doc_type.doc_type, DocType::Chat); } + #[test] + fn validate_docs_put_accepts_source_library_article_metadata() { + let validated = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Knowledge.as_str().to_string()), + title: Some("Saved article".to_string()), + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + "source_kind": "article", + "canonical_uri": "https://example.com/research/source-library", + "captured_at": "2026-02-25T12:10:00Z", + "source_created_at": "2026-02-24T09:00:00Z", + "trust_label": "public_web", + "author": "Example Author", + "handle": "example-author", + "excerpt_locator": { + "quote": { + "exact": "Source libraries preserve long-form evidence." + }, + "position": { + "start": 0, + "end": 48 + } + } + }), + content: "Source libraries preserve long-form evidence. 
/// Source Library metadata that is incomplete or paired with the wrong doc type is rejected.
#[test]
fn validate_docs_put_rejects_incomplete_source_library_metadata() {
    // Validates a knowledge-doc request and returns the `InvalidRequest` message it produces.
    fn rejection_message(
        title: &str,
        source_ref: serde_json::Value,
        content: &str,
        expectation: &str,
    ) -> String {
        let request = DocsPutRequest {
            tenant_id: "t".to_string(),
            project_id: "p".to_string(),
            agent_id: "a".to_string(),
            scope: "project_shared".to_string(),
            doc_type: Some(DocType::Knowledge.as_str().to_string()),
            title: Some(title.to_string()),
            write_policy: None,
            source_ref,
            content: content.to_string(),
        };

        match docs::validate_docs_put(&request).expect_err(expectation) {
            Error::InvalidRequest { message } => message,
            other => panic!("Unexpected error: {other:?}"),
        }
    }

    // Profile fields are present, but the mandatory `canonical_uri` is missing.
    let missing_uri = rejection_message(
        "Saved article",
        serde_json::json!({
            "schema": "doc_source_ref/v1",
            "doc_type": "knowledge",
            "ts": "2026-02-25T12:00:00Z",
            "source_kind": "article",
            "captured_at": "2026-02-25T12:10:00Z",
            "trust_label": "public_web"
        }),
        "Source libraries preserve long-form evidence.",
        "Expected canonical_uri to be required for source library metadata.",
    );

    assert!(missing_uri.contains("canonical_uri"));

    // A social thread must be stored as a chat document, not a knowledge document.
    let wrong_doc_type = rejection_message(
        "Saved thread",
        serde_json::json!({
            "schema": "doc_source_ref/v1",
            "doc_type": "knowledge",
            "ts": "2026-02-25T12:00:00Z",
            "source_kind": "social_thread",
            "canonical_uri": "https://example.com/thread/123",
            "captured_at": "2026-02-25T12:10:00Z",
            "trust_label": "public_web"
        }),
        "The thread says source libraries need social captures.",
        "Expected social_thread source_kind to require chat doc_type.",
    );

    assert!(wrong_doc_type.contains("requires doc_type=chat"));
}

/// The search pointer mirrors the row hashes and exposes the chunk offsets as a locator.
#[test]
fn docs_l0_pointer_carries_hashes_and_position_locator() {
    let chunk_id = Uuid::parse_str("11111111-1111-4111-8111-111111111111")
        .expect("Expected chunk UUID.");
    let doc_id =
        Uuid::parse_str("22222222-2222-4222-8222-222222222222").expect("Expected doc UUID.");
    let updated_at = OffsetDateTime::parse("2026-02-25T12:00:00Z", &Rfc3339)
        .expect("Expected test timestamp to parse.");
    let row = super::DocSearchRow {
        chunk_id,
        doc_id,
        scope: "project_shared".to_string(),
        doc_type: "knowledge".to_string(),
        project_id: "project".to_string(),
        agent_id: "agent".to_string(),
        updated_at,
        content_hash: "doc-hash".to_string(),
        chunk_hash: "chunk-hash".to_string(),
        start_offset: 12,
        end_offset: 64,
        chunk_text: "Source libraries preserve long-form evidence.".to_string(),
    };

    let pointer = super::build_docs_l0_pointer(&row, row.chunk_id);

    // Identifiers of the pointer contract.
    assert_eq!(pointer.schema, "source_ref/v1");
    assert_eq!(pointer.resolver, "elf_doc_ext/v1");

    // Hash aliases come straight from the row.
    assert_eq!(pointer.hashes.content_hash, "doc-hash");
    assert_eq!(pointer.hashes.chunk_hash, "chunk-hash");

    // The locator reflects the chunk offsets.
    assert_eq!(pointer.locator.position.start, 12);
    assert_eq!(pointer.locator.position.end, 64);

    // `state` and `hashes` must never disagree.
    assert_eq!(pointer.state.content_hash, pointer.hashes.content_hash);
    assert_eq!(pointer.state.chunk_hash, pointer.hashes.chunk_hash);
}
and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_put_source_library_records_do_not_create_memory_notes() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let before: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM memory_notes") + .fetch_one(&service.db.pool) + .await + .expect("Failed to count notes before docs_put."); + let put = put_test_doc_with( + &service, + "owner", + "project_shared", + Some("chat"), + "Captured thread", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": "2026-02-25T12:00:00Z", + "thread_id": "thread-source-library-1", + "role": "user", + "source_kind": "social_thread", + "canonical_uri": "https://example.com/thread/source-library-1", + "captured_at": "2026-02-25T12:10:00Z", + "source_created_at": "2026-02-25T11:55:00Z", + "trust_label": "public_web", + "author": "Example Researcher", + "handle": "example-researcher", + "excerpt_locator": { + "quote": { + "exact": "Source libraries should preserve thread context." + } + } + }), + "Source libraries should preserve thread context. Agents can later promote only selected facts.", + ) + .await; + let after: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM memory_notes") + .fetch_one(&service.db.pool) + .await + .expect("Failed to count notes after docs_put."); + let doc_exists: bool = + sqlx::query_scalar("SELECT EXISTS(SELECT 1 FROM doc_documents WHERE doc_id = $1)") + .bind(put.doc_id) + .fetch_one(&service.db.pool) + .await + .expect("Failed to verify doc row."); + + assert!(doc_exists); + assert_eq!(after, before, "docs_put must not create durable Memory Notes."); + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. 
Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] async fn docs_search_l0_respects_scope_doc_type_agent_id_and_updated_after_filters() {