diff --git a/src/portfolio_context_contract.py b/src/portfolio_context_contract.py index 4138069..1eb51ee 100644 --- a/src/portfolio_context_contract.py +++ b/src/portfolio_context_contract.py @@ -153,12 +153,12 @@ def analyze_project_context( readme_text = _read_small_text(readme_path) has_readme = bool(readme_text.strip()) - primary_sections = _split_markdown_sections(primary_text) - readme_sections = _split_markdown_sections(readme_text) if has_readme else {} + primary_blocks = _section_blocks(primary_text) + readme_blocks = _section_blocks(readme_text) if has_readme else [] section_presence = { field: ( - _section_has_meaningful_content(primary_sections, aliases) - or _section_has_meaningful_content(readme_sections, aliases) + _section_has_meaningful_content(primary_blocks, aliases) + or _section_has_meaningful_content(readme_blocks, aliases) ) for field, aliases in CONTEXT_SECTION_ALIASES.items() } @@ -260,34 +260,78 @@ def _read_small_text(path: Path) -> str: return path.read_text(errors="replace") -def _split_markdown_sections(text: str) -> dict[str, str]: - sections: dict[str, list[str]] = {} - current = "__preamble__" - sections[current] = [] +def _section_blocks(text: str) -> list[tuple[int, str, str]]: + """Ordered (level, normalized_heading, direct_body) for each markdown heading. + + direct_body is the text under a heading up to the *next heading of any level*, + so a parent's subsections are separate, deeper-level blocks. Callers roll + descendant content up via the level (see _aggregated_block_text) — this keeps + a parent like "## Quick Start" whose content lives entirely under + "### Installation" from reading as empty. + """ + blocks: list[tuple[int, str, list[str]]] = [] in_fenced_code = False for line in text.splitlines(): if re.match(r"^\s{0,3}```", line): in_fenced_code = not in_fenced_code - sections.setdefault(current, []).append(line) + if blocks: + blocks[-1][2].append(line) continue - match = None if in_fenced_code else re.match(r"^\s{0,3}#{1,6}\s+(.+?)\s*$", line) + match = None if in_fenced_code else re.match(r"^\s{0,3}(#{1,6})\s+(.+?)\s*$", line) if match: - current = _normalize_heading(match.group(1)) - sections.setdefault(current, []) + blocks.append((len(match.group(1)), _normalize_heading(match.group(2)), [])) continue - sections.setdefault(current, []).append(line) - return {heading: "\n".join(lines).strip() for heading, lines in sections.items()} + if blocks: + blocks[-1][2].append(line) + return [(level, heading, "\n".join(lines).strip()) for level, heading, lines in blocks] def _normalize_heading(value: str) -> str: - normalized = re.sub(r"[^a-z0-9]+", " ", value.lower()).strip() - return normalized + return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip() + + +def _heading_starts_with_alias(heading_words: list[str], alias_words: list[str]) -> bool: + """True if the heading begins with the alias phrase (prefix-anchored match). + Anchoring at the start keeps decorative trailing words ("Commands By Mode" -> + "commands") while rejecting an alias buried mid-heading ("Memory Usage + Statistics" must not match "usage"). + """ + return bool(alias_words) and heading_words[: len(alias_words)] == alias_words + + +def _aggregated_block_text(blocks: list[tuple[int, str, str]], index: int) -> str: + """Body of blocks[index] plus all its descendant subsections (deeper level).""" + level = blocks[index][0] + parts = [blocks[index][2]] + for sub_level, _heading, body in blocks[index + 1 :]: + if sub_level <= level: + break + if body: + parts.append(body) + return _strip_badges_and_links("\n".join(part for part in parts if part)).strip() -def _section_has_meaningful_content(sections: dict[str, str], aliases: tuple[str, ...]) -> bool: - for alias in aliases: - content = sections.get(_normalize_heading(alias), "") - if _is_nontrivial_text(content): + +def _section_has_meaningful_content( + blocks: list[tuple[int, str, str]], aliases: tuple[str, ...] +) -> bool: + """True if a content heading (level >= 2) starting with an alias phrase has + non-trivial rolled-up content. + + Matching is prefix-anchored (see _heading_starts_with_alias), so "commands" + matches "Commands By Mode" but not "Memory Usage Statistics". The H1 title + (level 1) is skipped — it is the document title, not a content section, and + matching it would roll the entire file up as that section's body. + """ + alias_word_lists = [_normalize_heading(alias).split() for alias in aliases] + for index, (level, heading, _body) in enumerate(blocks): + if level < 2: + continue + heading_words = heading.split() + matched = any( + _heading_starts_with_alias(heading_words, words) for words in alias_word_lists + ) + if matched and _is_nontrivial_text(_aggregated_block_text(blocks, index)): return True return False @@ -297,7 +341,13 @@ def _is_nontrivial_text(text: str) -> bool: if not compact: return False words = re.findall(r"[A-Za-z0-9][A-Za-z0-9+./:_-]*", compact) - return len(words) >= 4 and len(compact) >= 24 + return len(words) >= 2 and len(compact) >= 12 + + +def _strip_badges_and_links(text: str) -> str: + """Drop image/badge markdown and keep link text (dropping URLs).""" + text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", text) # images/badges + return re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text) # keep link text, drop URLs def _lead_paragraph_text(text: str) -> str: @@ -313,10 +363,7 @@ def _lead_paragraph_text(text: str) -> str: if re.match(r"^#\s", line): # the H1 title line itself continue lead_lines.append(line) - lead = "\n".join(lead_lines) - lead = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", lead) # drop images/badges - lead = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", lead) # keep link text, drop URLs - return lead + return _strip_badges_and_links("\n".join(lead_lines)) def _has_lead_summary(text: str) -> bool: diff --git a/tests/test_portfolio_context_contract.py b/tests/test_portfolio_context_contract.py index 66d6357..f6ff48e 100644 --- a/tests/test_portfolio_context_contract.py +++ b/tests/test_portfolio_context_contract.py @@ -114,3 +114,98 @@ def test_lead_paragraph_in_primary_file_counts(tmp_path): ) result = analyze_project_context(tmp_path, ["CLAUDE.md"]) assert result.project_summary_present is True + + +# --- Layer-3: hierarchy-aware + substring heading matching, relaxed threshold --- + + +def test_run_instructions_in_subsections_of_quick_start(tmp_path): + # "## Quick Start" has an empty direct body; the run commands live one level + # down in "### Installation" — content must roll up to the matched parent. + _write( + tmp_path, + "README.md", + "# Proj\n\nA tool that does a thing.\n\n" + "## Quick Start\n\n### Installation\n\n```bash\nuv tool install proj\n```\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.run_instructions_present is True + + +def test_heading_containing_alias_term_matches(tmp_path): + # "## Commands By Mode" contains the "commands" alias but is not an exact match. + _write( + tmp_path, + "README.md", + "# Proj\n\nA tool.\n\n" + "## Commands By Mode\n\nRun `proj audit` to start the daily flow today.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.run_instructions_present is True + + +def test_terse_managed_stack_value_counts(tmp_path): + # The managed-context block writes a short-but-valid stack ("- Primary stack: + # Python", 3 words) — the analyzer must not reject its own generated content. + _write( + tmp_path, + "AGENTS.md", + "# proj\n\n## Portfolio Context\n\n" + "### What this project is\n\nA small useful local tool for the operator.\n\n" + "### Stack\n\n- Primary stack: Python\n", + ) + result = analyze_project_context(tmp_path, ["AGENTS.md"]) + assert result.stack_present is True + + +def test_badge_only_section_does_not_count(tmp_path): + # A "## Status" section that is only a CI badge must NOT count as current state + # (guards the relaxed threshold against badge false positives). + _write( + tmp_path, + "README.md", + "# Proj\n\nA tool that does a thing for people.\n\n## Status\n\n![CI](https://x/ci.svg)\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.current_state_present is False + + +# --- Guard against over-matching: a single-word alias as a non-leading word in +# an unrelated heading must NOT satisfy the section (prefix-anchored matching). --- + + +def test_single_word_alias_does_not_overmatch_unrelated_heading(tmp_path): + # "## Memory Usage Statistics" contains "usage" but is not run guidance. + _write( + tmp_path, + "README.md", + "# P\n\nA tool that does a clear thing for people.\n\n" + "## Memory Usage Statistics\n\nWe used 4GB of RAM during the load test run.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.run_instructions_present is False + + +def test_status_codes_heading_is_not_current_state(tmp_path): + # "## HTTP Status Codes" contains "status" but is not project state. + _write( + tmp_path, + "README.md", + "# P\n\nA tool that does a clear thing for people.\n\n" + "## HTTP Status Codes\n\nThe API returns 200, 404, and 500 in these cases.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.current_state_present is False + + +def test_h1_title_alias_does_not_eat_whole_document(tmp_path): + # The H1 title is not a content section: an alias word in the title must not + # roll up the entire document as that section's content. + _write( + tmp_path, + "README.md", + "# Technology Stack\n\nThis project is a thing for users.\n\n" + "## Overview\n\nIt does stuff for people who need it done.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.stack_present is False