From 4035908de9e4b809e424eaf1e9cf1e0d0d955108 Mon Sep 17 00:00:00 2001 From: saagpatel Date: Wed, 3 Jun 2026 01:27:30 -0700 Subject: [PATCH] fix(context): correct three false-negative bugs in the context-quality analyzer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The analyzer flagged repos as boilerplate even when their docs were rich, because section detection had three defects: 1. exact-key alias lookup — '## Commands By Mode' never matched the 'commands' alias; only headings exactly equal to an alias counted. 2. empty-parent headings — '## Quick Start' followed by '### Installation' had an empty direct body, so its run instructions (one level down) were invisible. 3. over-strict nontrivial threshold (>=4 words / >=24 chars) rejected valid terse sections — including the managed-context block's own generated '- Primary stack: Python' (3 words / 23 chars). Fix: parse ordered heading blocks with levels, roll descendant subsections up to their parent, prefix-anchor alias matching, strip badges/links before the content check, and relax the threshold to >=2 words / >=12 chars. Prefix-anchored matching (not substring) and skipping the H1 title avoid the inverse hazard surfaced in review: '## Memory Usage Statistics' must not match 'usage', and an alias word in the document title must not roll the whole file up as that section. Both pinned with regression tests. Live impact: 2 managed-block repos (cost-tracker, portfolio-health) clear, and run-detection false-negatives on 3 more are corrected. context-flag 10 -> 8. 
--- src/portfolio_context_contract.py | 97 ++++++++++++++++++------ tests/test_portfolio_context_contract.py | 95 +++++++++++++++++++++++ 2 files changed, 167 insertions(+), 25 deletions(-) diff --git a/src/portfolio_context_contract.py b/src/portfolio_context_contract.py index 4138069..1eb51ee 100644 --- a/src/portfolio_context_contract.py +++ b/src/portfolio_context_contract.py @@ -153,12 +153,12 @@ def analyze_project_context( readme_text = _read_small_text(readme_path) has_readme = bool(readme_text.strip()) - primary_sections = _split_markdown_sections(primary_text) - readme_sections = _split_markdown_sections(readme_text) if has_readme else {} + primary_blocks = _section_blocks(primary_text) + readme_blocks = _section_blocks(readme_text) if has_readme else [] section_presence = { field: ( - _section_has_meaningful_content(primary_sections, aliases) - or _section_has_meaningful_content(readme_sections, aliases) + _section_has_meaningful_content(primary_blocks, aliases) + or _section_has_meaningful_content(readme_blocks, aliases) ) for field, aliases in CONTEXT_SECTION_ALIASES.items() } @@ -260,34 +260,78 @@ def _read_small_text(path: Path) -> str: return path.read_text(errors="replace") -def _split_markdown_sections(text: str) -> dict[str, str]: - sections: dict[str, list[str]] = {} - current = "__preamble__" - sections[current] = [] +def _section_blocks(text: str) -> list[tuple[int, str, str]]: + """Ordered (level, normalized_heading, direct_body) for each markdown heading. + + direct_body is the text under a heading up to the *next heading of any level*, + so a parent's subsections are separate, deeper-level blocks. Callers roll + descendant content up via the level (see _aggregated_block_text) — this keeps + a parent like "## Quick Start" whose content lives entirely under + "### Installation" from reading as empty. 
+ """ + blocks: list[tuple[int, str, list[str]]] = [] in_fenced_code = False for line in text.splitlines(): if re.match(r"^\s{0,3}```", line): in_fenced_code = not in_fenced_code - sections.setdefault(current, []).append(line) + if blocks: + blocks[-1][2].append(line) continue - match = None if in_fenced_code else re.match(r"^\s{0,3}#{1,6}\s+(.+?)\s*$", line) + match = None if in_fenced_code else re.match(r"^\s{0,3}(#{1,6})\s+(.+?)\s*$", line) if match: - current = _normalize_heading(match.group(1)) - sections.setdefault(current, []) + blocks.append((len(match.group(1)), _normalize_heading(match.group(2)), [])) continue - sections.setdefault(current, []).append(line) - return {heading: "\n".join(lines).strip() for heading, lines in sections.items()} + if blocks: + blocks[-1][2].append(line) + return [(level, heading, "\n".join(lines).strip()) for level, heading, lines in blocks] def _normalize_heading(value: str) -> str: - normalized = re.sub(r"[^a-z0-9]+", " ", value.lower()).strip() - return normalized + return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip() + + +def _heading_starts_with_alias(heading_words: list[str], alias_words: list[str]) -> bool: + """True if the heading begins with the alias phrase (prefix-anchored match). + Anchoring at the start keeps decorative trailing words ("Commands By Mode" -> + "commands") while rejecting an alias buried mid-heading ("Memory Usage + Statistics" must not match "usage"). 
+ """ + return bool(alias_words) and heading_words[: len(alias_words)] == alias_words + + +def _aggregated_block_text(blocks: list[tuple[int, str, str]], index: int) -> str: + """Body of blocks[index] plus all its descendant subsections (deeper level).""" + level = blocks[index][0] + parts = [blocks[index][2]] + for sub_level, _heading, body in blocks[index + 1 :]: + if sub_level <= level: + break + if body: + parts.append(body) + return _strip_badges_and_links("\n".join(part for part in parts if part)).strip() -def _section_has_meaningful_content(sections: dict[str, str], aliases: tuple[str, ...]) -> bool: - for alias in aliases: - content = sections.get(_normalize_heading(alias), "") - if _is_nontrivial_text(content): + +def _section_has_meaningful_content( + blocks: list[tuple[int, str, str]], aliases: tuple[str, ...] +) -> bool: + """True if a content heading (level >= 2) starting with an alias phrase has + non-trivial rolled-up content. + + Matching is prefix-anchored (see _heading_starts_with_alias), so "commands" + matches "Commands By Mode" but not "Memory Usage Statistics". The H1 title + (level 1) is skipped — it is the document title, not a content section, and + matching it would roll the entire file up as that section's body. 
+ """ + alias_word_lists = [_normalize_heading(alias).split() for alias in aliases] + for index, (level, heading, _body) in enumerate(blocks): + if level < 2: + continue + heading_words = heading.split() + matched = any( + _heading_starts_with_alias(heading_words, words) for words in alias_word_lists + ) + if matched and _is_nontrivial_text(_aggregated_block_text(blocks, index)): return True return False @@ -297,7 +341,13 @@ def _is_nontrivial_text(text: str) -> bool: if not compact: return False words = re.findall(r"[A-Za-z0-9][A-Za-z0-9+./:_-]*", compact) - return len(words) >= 4 and len(compact) >= 24 + return len(words) >= 2 and len(compact) >= 12 + + +def _strip_badges_and_links(text: str) -> str: + """Drop image/badge markdown and keep link text (dropping URLs).""" + text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", text) # images/badges + return re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text) # keep link text, drop URLs def _lead_paragraph_text(text: str) -> str: @@ -313,10 +363,7 @@ def _lead_paragraph_text(text: str) -> str: if re.match(r"^#\s", line): # the H1 title line itself continue lead_lines.append(line) - lead = "\n".join(lead_lines) - lead = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", lead) # drop images/badges - lead = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", lead) # keep link text, drop URLs - return lead + return _strip_badges_and_links("\n".join(lead_lines)) def _has_lead_summary(text: str) -> bool: diff --git a/tests/test_portfolio_context_contract.py b/tests/test_portfolio_context_contract.py index 66d6357..f6ff48e 100644 --- a/tests/test_portfolio_context_contract.py +++ b/tests/test_portfolio_context_contract.py @@ -114,3 +114,98 @@ def test_lead_paragraph_in_primary_file_counts(tmp_path): ) result = analyze_project_context(tmp_path, ["CLAUDE.md"]) assert result.project_summary_present is True + + +# --- Layer-3: hierarchy-aware + prefix-anchored heading matching, relaxed threshold --- + + +def test_run_instructions_in_subsections_of_quick_start(tmp_path): + # 
"## Quick Start" has an empty direct body; the run commands live one level + # down in "### Installation" — content must roll up to the matched parent. + _write( + tmp_path, + "README.md", + "# Proj\n\nA tool that does a thing.\n\n" + "## Quick Start\n\n### Installation\n\n```bash\nuv tool install proj\n```\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.run_instructions_present is True + + +def test_heading_containing_alias_term_matches(tmp_path): + # "## Commands By Mode" contains the "commands" alias but is not an exact match. + _write( + tmp_path, + "README.md", + "# Proj\n\nA tool.\n\n" + "## Commands By Mode\n\nRun `proj audit` to start the daily flow today.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.run_instructions_present is True + + +def test_terse_managed_stack_value_counts(tmp_path): + # The managed-context block writes a short-but-valid stack ("- Primary stack: + # Python", 3 words) — the analyzer must not reject its own generated content. + _write( + tmp_path, + "AGENTS.md", + "# proj\n\n## Portfolio Context\n\n" + "### What this project is\n\nA small useful local tool for the operator.\n\n" + "### Stack\n\n- Primary stack: Python\n", + ) + result = analyze_project_context(tmp_path, ["AGENTS.md"]) + assert result.stack_present is True + + +def test_badge_only_section_does_not_count(tmp_path): + # A "## Status" section that is only a CI badge must NOT count as current state + # (guards the relaxed threshold against badge false positives). + _write( + tmp_path, + "README.md", + "# Proj\n\nA tool that does a thing for people.\n\n## Status\n\n![CI](https://x/ci.svg)\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.current_state_present is False + + +# --- Guard against over-matching: a single-word alias as a non-leading word in +# an unrelated heading must NOT satisfy the section (prefix-anchored matching). 
--- + + +def test_single_word_alias_does_not_overmatch_unrelated_heading(tmp_path): + # "## Memory Usage Statistics" contains "usage" but is not run guidance. + _write( + tmp_path, + "README.md", + "# P\n\nA tool that does a clear thing for people.\n\n" + "## Memory Usage Statistics\n\nWe used 4GB of RAM during the load test run.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.run_instructions_present is False + + +def test_status_codes_heading_is_not_current_state(tmp_path): + # "## HTTP Status Codes" contains "status" but is not project state. + _write( + tmp_path, + "README.md", + "# P\n\nA tool that does a clear thing for people.\n\n" + "## HTTP Status Codes\n\nThe API returns 200, 404, and 500 in these cases.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.current_state_present is False + + +def test_h1_title_alias_does_not_eat_whole_document(tmp_path): + # The H1 title is not a content section: an alias word in the title must not + # roll up the entire document as that section's content. + _write( + tmp_path, + "README.md", + "# Technology Stack\n\nThis project is a thing for users.\n\n" + "## Overview\n\nIt does stuff for people who need it done.\n", + ) + result = analyze_project_context(tmp_path, ["README.md"]) + assert result.stack_present is False