From b995fcc5a1c7d12eecde1e5e59c2db6029efeb29 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 23 Jun 2026 15:53:38 -0500 Subject: [PATCH] Accept contained income-range source evidence --- arch/suite.py | 67 +++++++++++++++- tests/test_arch_source_package.py | 12 ++- tests/test_arch_suite.py | 125 ++++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 2 deletions(-) diff --git a/arch/suite.py b/arch/suite.py index 7c0931f..14a02ed 100644 --- a/arch/suite.py +++ b/arch/suite.py @@ -963,7 +963,11 @@ def _row_semantic_evidence_issues( ) continue for matched_value in matched_values: - if not _values_equal(matched_value, value): + if not _filter_value_matches_source_value( + variable, + value, + matched_value, + ): issues.append( AgentAcceptanceIssue( code="row_filter_value_mismatch", @@ -1067,6 +1071,67 @@ def _source_row_value(row: SourceRow, variable: str) -> tuple[bool, Any]: return False, None +def _filter_value_matches_source_value( + variable: str, + expected: Any, + source_value: Any, +) -> bool: + if _values_equal(source_value, expected): + return True + if _normalize_semantic_name(variable) != "incomerange": + return False + return _income_range_contains(expected, source_value) + + +def _income_range_contains(expected: Any, source_value: Any) -> bool: + expected_bounds = _income_range_bounds(expected) + source_bounds = _income_range_bounds(source_value) + if expected_bounds is None or source_bounds is None: + return False + expected_lower, expected_upper = expected_bounds + source_lower, source_upper = source_bounds + if expected_lower is not None: + if source_lower is None or source_lower < expected_lower: + return False + if expected_upper is not None: + if source_upper is None or source_upper > expected_upper: + return False + return True + + +def _income_range_bounds(value: Any) -> tuple[float | None, float | None] | None: + text = str(value).strip().lower() + if text == "all": + return (None, None) + if text.startswith("under_"): + upper = 
_income_range_number(text.removeprefix("under_")) + return (None, upper) if upper is not None else None + if text.endswith("_plus"): + lower = _income_range_number(text.removesuffix("_plus")) + return (lower, None) if lower is not None else None + if "_to_" in text: + lower_text, upper_text = text.split("_to_", 1) + lower = _income_range_number(lower_text) + upper = _income_range_number(upper_text) + if lower is None or upper is None: + return None + return (lower, upper) + return None + + +def _income_range_number(text: str) -> float | None: + match = re.fullmatch(r"(\d+(?:\.\d+)?)([km])?", text) + if match is None: + return None + value = float(match.group(1)) + suffix = match.group(2) + if suffix == "k": + return value * 1_000 + if suffix == "m": + return value * 1_000_000 + return value + + def _constraint_evidenced_by_source_rows( rows: list[SourceRow], constraint: Any, diff --git a/tests/test_arch_source_package.py b/tests/test_arch_source_package.py index fa138bf..70ac01b 100644 --- a/tests/test_arch_source_package.py +++ b/tests/test_arch_source_package.py @@ -28,6 +28,7 @@ ) from arch.sources.cells import build_source_cell_key, validate_source_cells from arch.sources.rows import validate_source_rows +from arch.suite import build_source_suite REPO_ROOT = Path(__file__).resolve().parents[1] @@ -2134,7 +2135,9 @@ def test_soi_historic_table_2_state_broad_package_builds_2022_state_facts(): assert ca_partnership.layout.source_column_id == "A26270" -def test_soi_historic_table_2_state_agi_package_builds_taxable_interest_facts(): +def test_soi_historic_table_2_state_agi_package_builds_taxable_interest_facts( + tmp_path, +): package = load_source_package("soi-historic-table-2-state-agi-2022") rows = package.build_source_rows(2023) cells = package.build_source_cells(2023, source_rows=rows) @@ -2177,6 +2180,13 @@ def test_soi_historic_table_2_state_agi_package_builds_taxable_interest_facts(): assert {constraint.variable for constraint in 
# From tests/test_arch_suite.py: test added alongside the income-range
# containment check in arch/suite.py.
def test_agent_acceptance_accepts_aggregate_income_range_source_rows():
    """Accept a "500k_plus" fact that is backed by two separate source rows.

    Builds one AggregateFact filtered to income_range "500k_plus" whose
    lineage points at two source rows (AGI_STUB 9 and 10), then checks that
    the agent acceptance report is valid and reports no row semantic errors.
    """
    artifact = SourceArtifactMetadata(
        source_name="irs_soi",
        source_table="Historic Table 2 state AGI facts",
        source_file="test.csv",
        url="https://example.test/test.csv",
        vintage="tax_year_2022",
        sha256="abc123",
        size_bytes=10,
        extracted_at="2026-05-06",
        extraction_method="test",
        raw_r2_bucket="arch-raw",
        raw_r2_key="raw/irs_soi/test.csv",
        raw_r2_uri="r2://arch-raw/raw/irs_soi/test.csv",
    )
    sheet = "in55cmcsv"
    # One entry per source row:
    # (row_number, AGI_STUB, cell address, cell raw value, cell display value)
    row_specs = (
        (10, 9, "A10", 1, "1"),
        (11, 10, "A11", 2, "2"),
    )

    rows = [
        SourceRow(
            artifact=artifact,
            sheet_name=sheet,
            row_number=row_number,
            values={"AGI_STUB": agi_stub},
        )
        for row_number, agi_stub, _, _, _ in row_specs
    ]
    row_keys = tuple(build_source_row_key(row) for row in rows)
    cells = [
        SourceCell(
            artifact=artifact,
            sheet_name=sheet,
            row_number=row_number,
            column_number=1,
            address=address,
            cell_type="number",
            raw_value=raw_value,
            display_value=display_value,
            source_row_key=row_key,
        )
        for (row_number, _, address, raw_value, display_value), row_key in zip(
            row_specs, row_keys
        )
    ]

    # Name the fact's dimensions first so the AggregateFact call reads flat.
    period = PeriodDimension(type="tax_year", value=2022)
    geography = GeographyDimension(
        level="country",
        id="0100000US",
        vintage="current",
        name="United States",
    )
    entity = EntityDimension(name="tax_unit")
    measure = Measure(concept="irs_soi.taxable_interest", unit="usd")
    aggregation = Aggregation(method="sum")
    provenance = SourceProvenance(
        source_name="irs_soi",
        source_table="test",
        source_file="test.csv",
        url="https://example.test/test.csv",
        vintage="test",
        extracted_at="2026-05-06",
        extraction_method="test",
    )
    cell_keys = tuple(build_source_cell_key(cell) for cell in cells)
    agi_floor = AggregateConstraint(
        variable="us:statutes/26/62#adjusted_gross_income",
        operator=">=",
        value=500_000,
        unit="usd",
    )
    record_layout = SourceRecordLayout(
        groupby_dimension="income_range",
        groupby_value_id="500k_plus",
        table_record_kind="detail",
    )
    fact = AggregateFact(
        value=3,
        period=period,
        geography=geography,
        entity=entity,
        measure=measure,
        aggregation=aggregation,
        source=provenance,
        filters={"income_range": "500k_plus"},
        source_record_id="irs_soi.test.500k_plus.taxable_interest",
        source_cell_keys=cell_keys,
        source_row_keys=row_keys,
        constraints=(agi_floor,),
        layout=record_layout,
    )

    # Pass fresh list objects to the report builder and to the row validator,
    # as the separate list literals in the original call did.
    report = build_agent_acceptance_report(
        [fact],
        list(rows),
        cells,
        source_rows=validate_source_rows(list(rows)),
        source_cells=validate_source_cells(cells),
        source_regions=SourceRegionSuiteReport(
            region_count=0,
            covered_cell_count=0,
            errors=(),
        ),
        source_records=SourceRecordSuiteReport(
            spec_count=1,
            resolved_count=1,
            lineaged_count=1,
            errors=(),
        ),
        fact_report=validate_facts([fact]),
        concept_alignments=ConceptAlignmentReport(
            alignment_count=0,
            checked_count=0,
            alignments=(),
            errors=(),
        ),
    )

    assert report.valid
    assert report.counts["row_semantic_error_count"] == 0