Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 72 additions & 25 deletions src/portfolio_context_contract.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,12 @@ def analyze_project_context(
readme_text = _read_small_text(readme_path)
has_readme = bool(readme_text.strip())

primary_sections = _split_markdown_sections(primary_text)
readme_sections = _split_markdown_sections(readme_text) if has_readme else {}
primary_blocks = _section_blocks(primary_text)
readme_blocks = _section_blocks(readme_text) if has_readme else []
section_presence = {
field: (
_section_has_meaningful_content(primary_sections, aliases)
or _section_has_meaningful_content(readme_sections, aliases)
_section_has_meaningful_content(primary_blocks, aliases)
or _section_has_meaningful_content(readme_blocks, aliases)
)
for field, aliases in CONTEXT_SECTION_ALIASES.items()
}
Expand Down Expand Up @@ -260,34 +260,78 @@ def _read_small_text(path: Path) -> str:
return path.read_text(errors="replace")


def _split_markdown_sections(text: str) -> dict[str, str]:
sections: dict[str, list[str]] = {}
current = "__preamble__"
sections[current] = []
def _section_blocks(text: str) -> list[tuple[int, str, str]]:
"""Ordered (level, normalized_heading, direct_body) for each markdown heading.

direct_body is the text under a heading up to the *next heading of any level*,
so a parent's subsections are separate, deeper-level blocks. Callers roll
descendant content up via the level (see _aggregated_block_text) — this keeps
a parent like "## Quick Start" whose content lives entirely under
"### Installation" from reading as empty.
"""
blocks: list[tuple[int, str, list[str]]] = []
in_fenced_code = False
for line in text.splitlines():
if re.match(r"^\s{0,3}```", line):
in_fenced_code = not in_fenced_code
sections.setdefault(current, []).append(line)
if blocks:
blocks[-1][2].append(line)
continue
match = None if in_fenced_code else re.match(r"^\s{0,3}#{1,6}\s+(.+?)\s*$", line)
match = None if in_fenced_code else re.match(r"^\s{0,3}(#{1,6})\s+(.+?)\s*$", line)
if match:
current = _normalize_heading(match.group(1))
sections.setdefault(current, [])
blocks.append((len(match.group(1)), _normalize_heading(match.group(2)), []))
continue
sections.setdefault(current, []).append(line)
return {heading: "\n".join(lines).strip() for heading, lines in sections.items()}
if blocks:
blocks[-1][2].append(line)
return [(level, heading, "\n".join(lines).strip()) for level, heading, lines in blocks]


def _normalize_heading(value: str) -> str:
normalized = re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()
return normalized
return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip()


def _heading_starts_with_alias(heading_words: list[str], alias_words: list[str]) -> bool:
    """Report whether the heading opens with the full alias phrase.

    The comparison is anchored at the first word of the heading: extra words
    after the alias are tolerated ("Commands By Mode" matches "commands"),
    but an alias that only appears later in the heading is rejected
    ("Memory Usage Statistics" does not match "usage"). An empty alias never
    matches.
    """
    if not alias_words:
        return False
    prefix_length = len(alias_words)
    return heading_words[:prefix_length] == alias_words


def _aggregated_block_text(blocks: list[tuple[int, str, str]], index: int) -> str:
    """Return the body of ``blocks[index]`` merged with its subsection bodies.

    Blocks after ``index`` are consumed while their heading level is strictly
    deeper than the starting block's level; the first block at the same or a
    shallower level ends the scan. Empty bodies are left out, the remaining
    bodies are joined with newlines, and the result is passed through
    ``_strip_badges_and_links`` and stripped of surrounding whitespace.
    """
    own_level, _own_heading, own_body = blocks[index]
    collected = [own_body] if own_body else []
    for candidate_level, _candidate_heading, candidate_body in blocks[index + 1 :]:
        if candidate_level <= own_level:
            # A sibling or ancestor heading closes this section.
            break
        if candidate_body:
            collected.append(candidate_body)
    return _strip_badges_and_links("\n".join(collected)).strip()

def _section_has_meaningful_content(sections: dict[str, str], aliases: tuple[str, ...]) -> bool:
for alias in aliases:
content = sections.get(_normalize_heading(alias), "")
if _is_nontrivial_text(content):

def _section_has_meaningful_content(
blocks: list[tuple[int, str, str]], aliases: tuple[str, ...]
) -> bool:
"""True if a content heading (level >= 2) starting with an alias phrase has
non-trivial rolled-up content.

Matching is prefix-anchored (see _heading_starts_with_alias), so "commands"
matches "Commands By Mode" but not "Memory Usage Statistics". The H1 title
(level 1) is skipped — it is the document title, not a content section, and
matching it would roll the entire file up as that section's body.
"""
alias_word_lists = [_normalize_heading(alias).split() for alias in aliases]
for index, (level, heading, _body) in enumerate(blocks):
if level < 2:
continue
Comment on lines +328 to +329
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Do not skip every H1 section

When a README or context file uses H1-style section headings after the document title, such as # Project followed by # Usage or # Stack, this guard now ignores those sections entirely. The previous parser matched aliases at any heading depth, so those files lose run/stack/etc. detection and can be downgraded to boilerplate; only the title H1 needs to be excluded, not all level-1 blocks.

Useful? React with 👍 / 👎.

heading_words = heading.split()
matched = any(
_heading_starts_with_alias(heading_words, words) for words in alias_word_lists
)
if matched and _is_nontrivial_text(_aggregated_block_text(blocks, index)):
return True
return False

Expand All @@ -297,7 +341,13 @@ def _is_nontrivial_text(text: str) -> bool:
if not compact:
return False
words = re.findall(r"[A-Za-z0-9][A-Za-z0-9+./:_-]*", compact)
return len(words) >= 4 and len(compact) >= 24
return len(words) >= 2 and len(compact) >= 12


def _strip_badges_and_links(text: str) -> str:
    """Remove markdown images and reduce markdown links to their link text.

    Images/badges (``![alt](url)``) are deleted outright first; afterwards
    each remaining ``[label](url)`` is replaced by ``label`` alone.
    """
    # Images go first so that what is left of a badge wrapped in a link
    # ("[![alt](img)](url)" -> "[](url)") is then removed by the link pass.
    without_images = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", text)
    labels_only = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", without_images)
    return labels_only


def _lead_paragraph_text(text: str) -> str:
Expand All @@ -313,10 +363,7 @@ def _lead_paragraph_text(text: str) -> str:
if re.match(r"^#\s", line): # the H1 title line itself
continue
lead_lines.append(line)
lead = "\n".join(lead_lines)
lead = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", lead) # drop images/badges
lead = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", lead) # keep link text, drop URLs
return lead
return _strip_badges_and_links("\n".join(lead_lines))


def _has_lead_summary(text: str) -> bool:
Expand Down
95 changes: 95 additions & 0 deletions tests/test_portfolio_context_contract.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,98 @@ def test_lead_paragraph_in_primary_file_counts(tmp_path):
)
result = analyze_project_context(tmp_path, ["CLAUDE.md"])
assert result.project_summary_present is True


# --- Layer-3: hierarchy-aware + prefix-anchored heading matching, relaxed threshold ---


def test_run_instructions_in_subsections_of_quick_start(tmp_path):
    """Run commands nested one level below "## Quick Start" roll up to it."""
    # "## Quick Start" has no direct body; the only content is the fenced
    # command under "### Installation".
    readme = (
        "# Proj\n\nA tool that does a thing.\n\n"
        "## Quick Start\n\n### Installation\n\n```bash\nuv tool install proj\n```\n"
    )
    _write(tmp_path, "README.md", readme)

    analysis = analyze_project_context(tmp_path, ["README.md"])

    assert analysis.run_instructions_present is True


def test_heading_containing_alias_term_matches(tmp_path):
    """A heading that starts with an alias but has trailing words still matches."""
    # "## Commands By Mode" begins with the "commands" alias without being an
    # exact match for it.
    readme = (
        "# Proj\n\nA tool.\n\n"
        "## Commands By Mode\n\nRun `proj audit` to start the daily flow today.\n"
    )
    _write(tmp_path, "README.md", readme)

    analysis = analyze_project_context(tmp_path, ["README.md"])

    assert analysis.run_instructions_present is True


def test_terse_managed_stack_value_counts(tmp_path):
    """A three-word stack entry is enough for the stack section to count."""
    # The managed-context block emits a short-but-valid stack line
    # ("- Primary stack: Python"); the analyzer must accept its own output.
    agents_doc = (
        "# proj\n\n## Portfolio Context\n\n"
        "### What this project is\n\nA small useful local tool for the operator.\n\n"
        "### Stack\n\n- Primary stack: Python\n"
    )
    _write(tmp_path, "AGENTS.md", agents_doc)

    analysis = analyze_project_context(tmp_path, ["AGENTS.md"])

    assert analysis.stack_present is True


def test_badge_only_section_does_not_count(tmp_path):
    """A "## Status" section holding only a CI badge is not current state."""
    # Guards the relaxed content threshold against badge-only false positives.
    readme = (
        "# Proj\n\nA tool that does a thing for people.\n\n## Status\n\n![CI](https://x/ci.svg)\n"
    )
    _write(tmp_path, "README.md", readme)

    analysis = analyze_project_context(tmp_path, ["README.md"])

    assert analysis.current_state_present is False


# --- Guard against over-matching: a single-word alias as a non-leading word in
# an unrelated heading must NOT satisfy the section (prefix-anchored matching). ---


def test_single_word_alias_does_not_overmatch_unrelated_heading(tmp_path):
    """An alias word buried mid-heading must not satisfy run instructions."""
    # "## Memory Usage Statistics" contains "usage" but is not run guidance.
    readme = (
        "# P\n\nA tool that does a clear thing for people.\n\n"
        "## Memory Usage Statistics\n\nWe used 4GB of RAM during the load test run.\n"
    )
    _write(tmp_path, "README.md", readme)

    analysis = analyze_project_context(tmp_path, ["README.md"])

    assert analysis.run_instructions_present is False


def test_status_codes_heading_is_not_current_state(tmp_path):
    """"## HTTP Status Codes" mentions "status" but is not project state."""
    readme = (
        "# P\n\nA tool that does a clear thing for people.\n\n"
        "## HTTP Status Codes\n\nThe API returns 200, 404, and 500 in these cases.\n"
    )
    _write(tmp_path, "README.md", readme)

    analysis = analyze_project_context(tmp_path, ["README.md"])

    assert analysis.current_state_present is False


def test_h1_title_alias_does_not_eat_whole_document(tmp_path):
    """An alias word in the H1 title must not turn the whole file into that section."""
    # The H1 is the document title, not a content section, so
    # "# Technology Stack" must not report a stack section.
    readme = (
        "# Technology Stack\n\nThis project is a thing for users.\n\n"
        "## Overview\n\nIt does stuff for people who need it done.\n"
    )
    _write(tmp_path, "README.md", readme)

    analysis = analyze_project_context(tmp_path, ["README.md"])

    assert analysis.stack_present is False