Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 66 additions & 1 deletion arch/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,11 @@ def _row_semantic_evidence_issues(
)
continue
for matched_value in matched_values:
if not _values_equal(matched_value, value):
if not _filter_value_matches_source_value(
variable,
value,
matched_value,
):
issues.append(
AgentAcceptanceIssue(
code="row_filter_value_mismatch",
Expand Down Expand Up @@ -1067,6 +1071,67 @@ def _source_row_value(row: SourceRow, variable: str) -> tuple[bool, Any]:
return False, None


def _filter_value_matches_source_value(
    variable: str,
    expected: Any,
    source_value: Any,
) -> bool:
    """Return whether a source row value satisfies a fact's filter value.

    A value that ``_values_equal`` considers equal to ``expected`` always
    matches. Any other value can only match when the variable's normalized
    semantic name is ``"incomerange"``; in that case the decision is
    delegated to ``_income_range_contains(expected, source_value)``.

    Args:
        variable: Name of the filter variable being checked.
        expected: Filter value declared on the fact.
        source_value: Value found on the matched source row.

    Returns:
        True when the source value is acceptable evidence for ``expected``.
    """
    if not _values_equal(source_value, expected):
        # Only income-range filters get the range-containment fallback;
        # every other variable requires an exact match.
        if _normalize_semantic_name(variable) == "incomerange":
            return _income_range_contains(expected, source_value)
        return False
    return True


def _income_range_contains(expected: Any, source_value: Any) -> bool:
    """Return whether the source income range lies inside the expected one.

    Both arguments are parsed with ``_income_range_bounds``. If either one
    cannot be parsed the answer is False. A ``None`` bound on the expected
    side places no restriction on that side; a ``None`` bound on the source
    side fails any restriction the expected range does impose there.

    Args:
        expected: Income-range value declared on the fact filter.
        source_value: Income-range value found on the source row.

    Returns:
        True when every bound the expected range imposes is respected by
        the source range.
    """
    outer = _income_range_bounds(expected)
    inner = _income_range_bounds(source_value)
    if outer is None or inner is None:
        return False

    outer_low, outer_high = outer
    inner_low, inner_high = inner

    # Each side passes when the expected range is open there, or the source
    # range has a bound on that side that does not fall outside it.
    low_ok = outer_low is None or (
        inner_low is not None and not inner_low < outer_low
    )
    high_ok = outer_high is None or (
        inner_high is not None and not inner_high > outer_high
    )
    return low_ok and high_ok


def _income_range_bounds(value: Any) -> tuple[float | None, float | None] | None:
    """Parse an income-range label into ``(lower, upper)`` numeric bounds.

    The label is ``str(value)``, stripped and lower-cased. Recognised forms,
    checked in this order:

    * ``"all"``        -> ``(None, None)``
    * ``"under_<n>"``  -> ``(None, n)``
    * ``"<n>_plus"``   -> ``(n, None)``
    * ``"<a>_to_<b>"`` -> ``(a, b)``

    A ``None`` bound means that side of the range is open. Amounts are
    parsed by ``_income_range_number``.

    Args:
        value: Any value whose string form may be an income-range label.

    Returns:
        The ``(lower, upper)`` pair, or None when the label has an
        unrecognised shape, contains an amount that cannot be parsed, or
        describes a closed range whose lower bound exceeds its upper bound.
    """
    text = str(value).strip().lower()
    if text == "all":
        return (None, None)
    if text.startswith("under_"):
        upper = _income_range_number(text.removeprefix("under_"))
        return (None, upper) if upper is not None else None
    if text.endswith("_plus"):
        lower = _income_range_number(text.removesuffix("_plus"))
        return (lower, None) if lower is not None else None
    if "_to_" in text:
        lower_text, _, upper_text = text.partition("_to_")
        lower = _income_range_number(lower_text)
        upper = _income_range_number(upper_text)
        if lower is None or upper is None:
            return None
        if lower > upper:
            # An inverted range (e.g. "1m_to_500k") is malformed; report it
            # as unparseable instead of handing callers an interval that
            # containment checks would treat as meaningful.
            return None
        return (lower, upper)
    return None


def _income_range_number(text: str) -> float | None:
match = re.fullmatch(r"(\d+(?:\.\d+)?)([km])?", text)
if match is None:
return None
value = float(match.group(1))
suffix = match.group(2)
if suffix == "k":
return value * 1_000
if suffix == "m":
return value * 1_000_000
return value


def _constraint_evidenced_by_source_rows(
rows: list[SourceRow],
constraint: Any,
Expand Down
12 changes: 11 additions & 1 deletion tests/test_arch_source_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
)
from arch.sources.cells import build_source_cell_key, validate_source_cells
from arch.sources.rows import validate_source_rows
from arch.suite import build_source_suite

REPO_ROOT = Path(__file__).resolve().parents[1]

Expand Down Expand Up @@ -2134,7 +2135,9 @@ def test_soi_historic_table_2_state_broad_package_builds_2022_state_facts():
assert ca_partnership.layout.source_column_id == "A26270"


def test_soi_historic_table_2_state_agi_package_builds_taxable_interest_facts():
def test_soi_historic_table_2_state_agi_package_builds_taxable_interest_facts(
tmp_path,
):
package = load_source_package("soi-historic-table-2-state-agi-2022")
rows = package.build_source_rows(2023)
cells = package.build_source_cells(2023, source_rows=rows)
Expand Down Expand Up @@ -2177,6 +2180,13 @@ def test_soi_historic_table_2_state_agi_package_builds_taxable_interest_facts():
assert {constraint.variable for constraint in ca_interest_amount.constraints} == {
"us:statutes/26/62#adjusted_gross_income"
}
suite = build_source_suite(
"soi-historic-table-2-state-agi-2022",
tmp_path / "soi-historic-table-2-state-agi-2022",
year=2023,
)
assert suite.agent_acceptance.valid
assert suite.agent_acceptance.counts["row_semantic_error_count"] == 0


def test_soi_historic_table_2_state_eitc_package_builds_child_count_facts():
Expand Down
125 changes: 125 additions & 0 deletions tests/test_arch_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,131 @@ def test_build_source_suite_supports_soi_table_1_4(tmp_path):
assert (output_dir / "arch.db").exists()


def test_agent_acceptance_accepts_aggregate_income_range_source_rows():
    """An aggregate ``500k_plus`` fact is accepted on two-row evidence.

    The fact filters on ``income_range == "500k_plus"`` and cites two source
    rows with ``AGI_STUB`` values 9 and 10. The acceptance report must be
    valid and must count zero row-semantic errors.
    """
    sheet = "in55cmcsv"
    artifact = SourceArtifactMetadata(
        source_name="irs_soi",
        source_table="Historic Table 2 state AGI facts",
        source_file="test.csv",
        url="https://example.test/test.csv",
        vintage="tax_year_2022",
        sha256="abc123",
        size_bytes=10,
        extracted_at="2026-05-06",
        extraction_method="test",
        raw_r2_bucket="arch-raw",
        raw_r2_key="raw/irs_soi/test.csv",
        raw_r2_uri="r2://arch-raw/raw/irs_soi/test.csv",
    )

    # (row_number, AGI_STUB) for the two evidence rows.
    row_specs = ((10, 9), (11, 10))
    evidence_rows = [
        SourceRow(
            artifact=artifact,
            sheet_name=sheet,
            row_number=row_number,
            values={"AGI_STUB": agi_stub},
        )
        for row_number, agi_stub in row_specs
    ]
    row_keys = tuple(build_source_row_key(row) for row in evidence_rows)

    # (row_number, address, raw_value, display_value), one cell per row.
    cell_specs = (
        (10, "A10", 1, "1"),
        (11, "A11", 2, "2"),
    )
    cells = [
        SourceCell(
            artifact=artifact,
            sheet_name=sheet,
            row_number=row_number,
            column_number=1,
            address=address,
            cell_type="number",
            raw_value=raw_value,
            display_value=display_value,
            source_row_key=row_key,
        )
        for (row_number, address, raw_value, display_value), row_key in zip(
            cell_specs, row_keys
        )
    ]

    provenance = SourceProvenance(
        source_name="irs_soi",
        source_table="test",
        source_file="test.csv",
        url="https://example.test/test.csv",
        vintage="test",
        extracted_at="2026-05-06",
        extraction_method="test",
    )
    agi_floor = AggregateConstraint(
        variable="us:statutes/26/62#adjusted_gross_income",
        operator=">=",
        value=500_000,
        unit="usd",
    )
    fact = AggregateFact(
        value=3,
        period=PeriodDimension(type="tax_year", value=2022),
        geography=GeographyDimension(
            level="country",
            id="0100000US",
            vintage="current",
            name="United States",
        ),
        entity=EntityDimension(name="tax_unit"),
        measure=Measure(concept="irs_soi.taxable_interest", unit="usd"),
        aggregation=Aggregation(method="sum"),
        source=provenance,
        filters={"income_range": "500k_plus"},
        source_record_id="irs_soi.test.500k_plus.taxable_interest",
        source_cell_keys=tuple(build_source_cell_key(cell) for cell in cells),
        source_row_keys=row_keys,
        constraints=(agi_floor,),
        layout=SourceRecordLayout(
            groupby_dimension="income_range",
            groupby_value_id="500k_plus",
            table_record_kind="detail",
        ),
    )

    facts = [fact]
    report = build_agent_acceptance_report(
        facts,
        evidence_rows,
        cells,
        # Validate a separate list so the rows passed above stay untouched.
        source_rows=validate_source_rows(list(evidence_rows)),
        source_cells=validate_source_cells(cells),
        source_regions=SourceRegionSuiteReport(
            region_count=0,
            covered_cell_count=0,
            errors=(),
        ),
        source_records=SourceRecordSuiteReport(
            spec_count=1,
            resolved_count=1,
            lineaged_count=1,
            errors=(),
        ),
        fact_report=validate_facts([fact]),
        concept_alignments=ConceptAlignmentReport(
            alignment_count=0,
            checked_count=0,
            alignments=(),
            errors=(),
        ),
    )

    assert report.valid
    assert report.counts["row_semantic_error_count"] == 0


def test_agent_acceptance_rejects_row_constraints_without_source_evidence():
artifact = SourceArtifactMetadata(
source_name="bea",
Expand Down
Loading