saagpatel · saagpatel · Jun 3, 2026 · Jun 3, 2026 · chatgpt-codex-connector · Jun 3, 2026
diff --git a/src/portfolio_context_contract.py b/src/portfolio_context_contract.py
@@ -153,12 +153,12 @@ def analyze_project_context(
             readme_text = _read_small_text(readme_path)
     has_readme = bool(readme_text.strip())
 
-    primary_sections = _split_markdown_sections(primary_text)
-    readme_sections = _split_markdown_sections(readme_text) if has_readme else {}
+    primary_blocks = _section_blocks(primary_text)
+    readme_blocks = _section_blocks(readme_text) if has_readme else []
     section_presence = {
         field: (
-            _section_has_meaningful_content(primary_sections, aliases)
-            or _section_has_meaningful_content(readme_sections, aliases)
+            _section_has_meaningful_content(primary_blocks, aliases)
+            or _section_has_meaningful_content(readme_blocks, aliases)
         )
         for field, aliases in CONTEXT_SECTION_ALIASES.items()
     }
@@ -260,34 +260,78 @@ def _read_small_text(path: Path) -> str:
     return path.read_text(errors="replace")
 
 
-def _split_markdown_sections(text: str) -> dict[str, str]:
-    sections: dict[str, list[str]] = {}
-    current = "__preamble__"
-    sections[current] = []
+def _section_blocks(text: str) -> list[tuple[int, str, str]]:
+    """Ordered (level, normalized_heading, direct_body) for each markdown heading.
+
+    direct_body is the text under a heading up to the *next heading of any level*,
+    so a parent's subsections are separate, deeper-level blocks. Callers roll
+    descendant content up via the level (see _aggregated_block_text) — this keeps
+    a parent like "## Quick Start" whose content lives entirely under
+    "### Installation" from reading as empty.
+    """
+    blocks: list[tuple[int, str, list[str]]] = []
     in_fenced_code = False
     for line in text.splitlines():
         if re.match(r"^\s{0,3}```", line):
             in_fenced_code = not in_fenced_code
-            sections.setdefault(current, []).append(line)
+            if blocks:
+                blocks[-1][2].append(line)
             continue
-        match = None if in_fenced_code else re.match(r"^\s{0,3}#{1,6}\s+(.+?)\s*$", line)
+        match = None if in_fenced_code else re.match(r"^\s{0,3}(#{1,6})\s+(.+?)\s*$", line)
         if match:
-            current = _normalize_heading(match.group(1))
-            sections.setdefault(current, [])
+            blocks.append((len(match.group(1)), _normalize_heading(match.group(2)), []))
             continue
-        sections.setdefault(current, []).append(line)
-    return {heading: "\n".join(lines).strip() for heading, lines in sections.items()}
+        if blocks:
+            blocks[-1][2].append(line)
+    return [(level, heading, "\n".join(lines).strip()) for level, heading, lines in blocks]
 
 
 def _normalize_heading(value: str) -> str:
-    normalized = re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()
-    return normalized
+    return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()
+
+
+def _heading_starts_with_alias(heading_words: list[str], alias_words: list[str]) -> bool:
+    """True if the heading begins with the alias phrase (prefix-anchored match).
 
+    Anchoring at the start keeps decorative trailing words ("Commands By Mode" ->
+    "commands") while rejecting an alias buried mid-heading ("Memory Usage
+    Statistics" must not match "usage").
+    """
+    return bool(alias_words) and heading_words[: len(alias_words)] == alias_words
+
+
+def _aggregated_block_text(blocks: list[tuple[int, str, str]], index: int) -> str:
+    """Body of blocks[index] plus all its descendant subsections (deeper level)."""
+    level = blocks[index][0]
+    parts = [blocks[index][2]]
+    for sub_level, _heading, body in blocks[index + 1 :]:
+        if sub_level <= level:
+            break
+        if body:
+            parts.append(body)
+    return _strip_badges_and_links("\n".join(part for part in parts if part)).strip()
 
-def _section_has_meaningful_content(sections: dict[str, str], aliases: tuple[str, ...]) -> bool:
-    for alias in aliases:
-        content = sections.get(_normalize_heading(alias), "")
-        if _is_nontrivial_text(content):
+
+def _section_has_meaningful_content(
+    blocks: list[tuple[int, str, str]], aliases: tuple[str, ...]
+) -> bool:
+    """True if a content heading (level >= 2) starting with an alias phrase has
+    non-trivial rolled-up content.
+
+    Matching is prefix-anchored (see _heading_starts_with_alias), so "commands"
+    matches "Commands By Mode" but not "Memory Usage Statistics". The H1 title
+    (level 1) is skipped — it is the document title, not a content section, and
+    matching it would roll the entire file up as that section's body.
+    """
+    alias_word_lists = [_normalize_heading(alias).split() for alias in aliases]
+    for index, (level, heading, _body) in enumerate(blocks):
+        if level < 2:
+            continue
+        heading_words = heading.split()
+        matched = any(
+            _heading_starts_with_alias(heading_words, words) for words in alias_word_lists
+        )
+        if matched and _is_nontrivial_text(_aggregated_block_text(blocks, index)):
             return True
     return False
 
@@ -297,7 +341,13 @@ def _is_nontrivial_text(text: str) -> bool:
     if not compact:
         return False
     words = re.findall(r"[A-Za-z0-9][A-Za-z0-9+./:_-]*", compact)
-    return len(words) >= 4 and len(compact) >= 24
+    return len(words) >= 2 and len(compact) >= 12
+
+
+def _strip_badges_and_links(text: str) -> str:
+    """Drop image/badge markdown and keep link text (dropping URLs)."""
+    text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", text)  # images/badges
+    return re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text)  # keep link text, drop URLs
 
 
 def _lead_paragraph_text(text: str) -> str:
@@ -313,10 +363,7 @@ def _lead_paragraph_text(text: str) -> str:
         if re.match(r"^#\s", line):  # the H1 title line itself
             continue
         lead_lines.append(line)
-    lead = "\n".join(lead_lines)
-    lead = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", lead)  # drop images/badges
-    lead = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", lead)  # keep link text, drop URLs
-    return lead
+    return _strip_badges_and_links("\n".join(lead_lines))
 
 
 def _has_lead_summary(text: str) -> bool:

diff --git a/tests/test_portfolio_context_contract.py b/tests/test_portfolio_context_contract.py
@@ -114,3 +114,98 @@ def test_lead_paragraph_in_primary_file_counts(tmp_path):
     )
     result = analyze_project_context(tmp_path, ["CLAUDE.md"])
     assert result.project_summary_present is True
+
+
+# --- Layer-3: hierarchy-aware + prefix-anchored heading matching, relaxed threshold ---
+
+
+def test_run_instructions_in_subsections_of_quick_start(tmp_path):
+    # "## Quick Start" has an empty direct body; the run commands live one level
+    # down in "### Installation" — content must roll up to the matched parent.
+    _write(
+        tmp_path,
+        "README.md",
+        "# Proj\n\nA tool that does a thing.\n\n"
+        "## Quick Start\n\n### Installation\n\n```bash\nuv tool install proj\n```\n",
+    )
+    result = analyze_project_context(tmp_path, ["README.md"])
+    assert result.run_instructions_present is True
+
+
+def test_heading_containing_alias_term_matches(tmp_path):
+    # "## Commands By Mode" starts with the "commands" alias but is not an exact match.
+    _write(
+        tmp_path,
+        "README.md",
+        "# Proj\n\nA tool.\n\n"
+        "## Commands By Mode\n\nRun `proj audit` to start the daily flow today.\n",
+    )
+    result = analyze_project_context(tmp_path, ["README.md"])
+    assert result.run_instructions_present is True
+
+
+def test_terse_managed_stack_value_counts(tmp_path):
+    # The managed-context block writes a short-but-valid stack ("- Primary stack:
+    # Python", 3 words) — the analyzer must not reject its own generated content.
+    _write(
+        tmp_path,
+        "AGENTS.md",
+        "# proj\n\n## Portfolio Context\n\n"
+        "### What this project is\n\nA small useful local tool for the operator.\n\n"
+        "### Stack\n\n- Primary stack: Python\n",
+    )
+    result = analyze_project_context(tmp_path, ["AGENTS.md"])
+    assert result.stack_present is True
+
+
+def test_badge_only_section_does_not_count(tmp_path):
+    # A "## Status" section that is only a CI badge must NOT count as current state
+    # (guards the relaxed threshold against badge false positives).
+    _write(
+        tmp_path,
+        "README.md",
+        "# Proj\n\nA tool that does a thing for people.\n\n## Status\n\n![CI](https://x/ci.svg)\n",
+    )
+    result = analyze_project_context(tmp_path, ["README.md"])
+    assert result.current_state_present is False
+
+
+# --- Guard against over-matching: a single-word alias as a non-leading word in
+# an unrelated heading must NOT satisfy the section (prefix-anchored matching). ---
+
+
+def test_single_word_alias_does_not_overmatch_unrelated_heading(tmp_path):
+    # "## Memory Usage Statistics" contains "usage" but is not run guidance.
+    _write(
+        tmp_path,
+        "README.md",
+        "# P\n\nA tool that does a clear thing for people.\n\n"
+        "## Memory Usage Statistics\n\nWe used 4GB of RAM during the load test run.\n",
+    )
+    result = analyze_project_context(tmp_path, ["README.md"])
+    assert result.run_instructions_present is False
+
+
+def test_status_codes_heading_is_not_current_state(tmp_path):
+    # "## HTTP Status Codes" contains "status" but is not project state.
+    _write(
+        tmp_path,
+        "README.md",
+        "# P\n\nA tool that does a clear thing for people.\n\n"
+        "## HTTP Status Codes\n\nThe API returns 200, 404, and 500 in these cases.\n",
+    )
+    result = analyze_project_context(tmp_path, ["README.md"])
+    assert result.current_state_present is False
+
+
+def test_h1_title_alias_does_not_eat_whole_document(tmp_path):
+    # The H1 title is not a content section: an alias word in the title must not
+    # roll up the entire document as that section's content.
+    _write(
+        tmp_path,
+        "README.md",
+        "# Technology Stack\n\nThis project is a thing for users.\n\n"
+        "## Overview\n\nIt does stuff for people who need it done.\n",
+    )
+    result = analyze_project_context(tmp_path, ["README.md"])
+    assert result.stack_present is False