diff --git a/arch/core.py b/arch/core.py index 581a935..924158c 100644 --- a/arch/core.py +++ b/arch/core.py @@ -12,12 +12,37 @@ import re from collections import Counter from dataclasses import asdict, dataclass, field +from datetime import date from decimal import Decimal from typing import Any Scalar = str | int | float | bool | None ALLOWED_PERIOD_TYPES = {"calendar_year", "tax_year", "fiscal_year", "month"} +ALLOWED_PERIOD_BASES = { + "calendar_year", + "tax_year", + "fiscal_year", + "us_federal_fiscal_year", + "uk_fiscal_year", + "state_fiscal_year", + "reference_month", + "benefit_month", + "payment_month", + "payment_date_fiscal_year", + "statistical_annual", + "projection_year", +} +ALLOWED_ACCOUNTING_BASES = { + "accrual", + "cash", + "cash_outlay", + "cash_payment", + "benefit_month", + "payment_date", + "statistical_total", + "projection", +} ALLOWED_GEOGRAPHY_LEVELS = { "country", "region", @@ -70,6 +95,12 @@ class PeriodDimension: type: str value: int | str + start_date: str | None = None + end_date: str | None = None + basis: str | None = None + authority: str | None = None + source_label: str | None = None + accounting_basis: str | None = None @dataclass(frozen=True) @@ -247,7 +278,9 @@ def build_label(fact: AggregateFact) -> str: concept = _humanize(fact.measure.concept) aggregation = _humanize(fact.aggregation.method) entity = _humanize(fact.entity.name) - period = f"{fact.period.value} {_humanize(fact.period.type)}" + period = fact.period.source_label or ( + f"{fact.period.value} {_humanize(fact.period.type)}" + ) geography = fact.geography.name or fact.geography.id source = _source_label(fact.source) @@ -373,6 +406,7 @@ def validate_fact(fact: AggregateFact) -> tuple[ValidationIssue, ...]: errors.append( _issue("missing_period", "Period value is required", "period.value") ) + _validate_period_semantics(errors, fact.period) if fact.geography.level not in ALLOWED_GEOGRAPHY_LEVELS: errors.append( @@ -500,7 +534,7 @@ def fact_counts(facts: list[AggregateFact]) -> dict[str, dict[str, int]]: def _canonical_key_payload(fact: AggregateFact) -> dict[str, Any]: payload = { - "period": asdict(fact.period), + "period": _period_key_payload(fact.period), "geography": { "level": fact.geography.level, "id": fact.geography.id, @@ -527,6 +561,95 @@ def _canonical_key_payload(fact: AggregateFact) -> dict[str, Any]: return payload +def _period_key_payload(period: PeriodDimension) -> dict[str, Any]: + return {key: value for key, value in asdict(period).items() if value is not None} + + +def _validate_period_semantics( + errors: list[ValidationIssue], + period: PeriodDimension, +) -> None: + if period.basis is not None and period.basis not in ALLOWED_PERIOD_BASES: + errors.append( + _issue( + "malformed_period", + f"Unsupported period basis: {period.basis!r}", + "period.basis", + ) + ) + if ( + period.accounting_basis is not None + and period.accounting_basis not in ALLOWED_ACCOUNTING_BASES + ): + errors.append( + _issue( + "malformed_period", + f"Unsupported accounting basis: {period.accounting_basis!r}", + "period.accounting_basis", + ) + ) + if period.authority is not None and not period.authority.strip(): + errors.append( + _issue( + "missing_period", + "Period authority must be nonempty when provided", + "period.authority", + ) + ) + if period.source_label is not None and not period.source_label.strip(): + errors.append( + _issue( + "missing_period", + "Period source label must be nonempty when provided", + "period.source_label", + ) + ) + + parsed_start = _parse_iso_date(errors, period.start_date, "period.start_date") + parsed_end = _parse_iso_date(errors, period.end_date, "period.end_date") + if ( + parsed_start is not None + and parsed_end is not None + and parsed_start > parsed_end + ): + errors.append( + _issue( + "malformed_period", + "Period start_date must be on or before end_date", + "period.start_date", + ) + ) + + +def _parse_iso_date( + errors: list[ValidationIssue], + value: str | None, + field_name: str, +) -> date | None: + if value is None: + return None + if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", value): + errors.append( + _issue( + "malformed_period", + f"Period date must use ISO YYYY-MM-DD format: {value!r}", + field_name, + ) + ) + return None + try: + return date.fromisoformat(value) + except ValueError: + errors.append( + _issue( + "malformed_period", + f"Period date must use ISO YYYY-MM-DD format: {value!r}", + field_name, + ) + ) + return None + + def _validate_value(errors: list[ValidationIssue], value: Any) -> None: if isinstance(value, Decimal): return diff --git a/arch/database.py b/arch/database.py index d426e5a..8fef3a4 100644 --- a/arch/database.py +++ b/arch/database.py @@ -33,7 +33,7 @@ source_row_to_mapping, ) -ARCH_DB_SCHEMA_VERSION = "arch.relational.v1" +ARCH_DB_SCHEMA_VERSION = "arch.relational.v2" @dataclass(frozen=True) @@ -77,9 +77,7 @@ def build_arch_db( columns = source_columns_from_source_rows(rows) source_row_values_count = sum(len(row.values) for row in rows) resolved_build_id = build_id or _build_id(facts, cells, rows) - fact_constraints = [ - (fact, build_aggregate_constraints(fact)) for fact in facts - ] + fact_constraints = [(fact, build_aggregate_constraints(fact)) for fact in facts] source_record_ids = { fact.source_record_id for fact in facts if fact.source_record_id is not None } @@ -117,9 +115,7 @@ def build_arch_db( return ArchDbBuildReport( build_id=resolved_build_id, facts_count=len(facts), - constraints_count=sum( - len(constraints) for _, constraints in fact_constraints - ), + constraints_count=sum(len(constraints) for _, constraints in fact_constraints), source_records_count=len(source_record_ids), source_rows_count=len(rows), source_columns_count=len(columns), @@ -244,13 +240,25 @@ def _create_schema(connection: sqlite3.Connection) -> None: legal_vintage TEXT, period_type TEXT, period_value TEXT, + period_start_date TEXT, + period_end_date TEXT, + period_basis TEXT, + period_authority TEXT, + period_source_label TEXT, + period_accounting_basis TEXT, PRIMARY KEY ( source_concept, canonical_concept, relation, legal_vintage, period_type, - period_value + period_value, + period_start_date, + period_end_date, + period_basis, + period_authority, + period_source_label, + period_accounting_basis ) ); @@ -278,6 +286,12 @@ def _create_schema(connection: sqlite3.Connection) -> None: value_numeric REAL, period_type TEXT NOT NULL, period_value TEXT NOT NULL, + period_start_date TEXT, + period_end_date TEXT, + period_basis TEXT, + period_authority TEXT, + period_source_label TEXT, + period_accounting_basis TEXT, geography_level TEXT NOT NULL, geography_id TEXT NOT NULL, geography_vintage TEXT, @@ -740,7 +754,7 @@ def _insert_concept_alignments( facts: list[AggregateFact], build_id: str, ) -> None: - seen: set[tuple[str, str, str, str | None, str, str]] = set() + seen: set[tuple[Any, ...]] = set() for fact in facts: measure = fact.measure if not measure.source_concept or not measure.concept_relation: @@ -752,6 +766,12 @@ def _insert_concept_alignments( measure.legal_vintage, fact.period.type, str(fact.period.value), + fact.period.start_date, + fact.period.end_date, + fact.period.basis, + fact.period.authority, + fact.period.source_label, + fact.period.accounting_basis, ) if key in seen: continue @@ -768,9 +788,15 @@ def _insert_concept_alignments( evidence_notes, legal_vintage, period_type, - period_value + period_value, + period_start_date, + period_end_date, + period_basis, + period_authority, + period_source_label, + period_accounting_basis ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( measure.source_concept, @@ -783,6 +809,12 @@ def _insert_concept_alignments( measure.legal_vintage, fact.period.type, str(fact.period.value), + fact.period.start_date, + fact.period.end_date, + fact.period.basis, + fact.period.authority, + fact.period.source_label, + fact.period.accounting_basis, ), ) @@ -820,6 +852,12 @@ def _insert_aggregate_fact( value_numeric, period_type, period_value, + period_start_date, + period_end_date, + period_basis, + period_authority, + period_source_label, + period_accounting_basis, geography_level, geography_id, geography_vintage, @@ -849,7 +887,7 @@ def _insert_aggregate_fact( source_extraction_method, source_method_notes ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( fact_key, @@ -875,6 +913,12 @@ def _insert_aggregate_fact( _numeric_value(fact.value), fact.period.type, str(fact.period.value), + fact.period.start_date, + fact.period.end_date, + fact.period.basis, + fact.period.authority, + fact.period.source_label, + fact.period.accounting_basis, fact.geography.level, fact.geography.id, fact.geography.vintage, diff --git a/arch/source_package.py b/arch/source_package.py index f2ce1c1..78f5d59 100644 --- a/arch/source_package.py +++ b/arch/source_package.py @@ -85,17 +85,11 @@ "census-acs-s2201-congressional-district-snap-2024": Path( "census/acs_s2201_district_2024" ), - "census-b01001-female-age-2023": Path( - "census/b01001_female_15_44_2023" - ), + "census-b01001-female-age-2023": Path("census/b01001_female_15_44_2023"), "census-pep-2024-national-age-sex": Path("census/pep_2024_national_age_sex"), "census-pep-2024-state-age-sex": Path("census/pep_2024_state_age_sex"), - "census-population-projections-2023": Path( - "census/population_projections_2023" - ), - "census-stc-individual-income-tax": Path( - "census/stc_individual_income_tax" - ), + "census-population-projections-2023": Path("census/population_projections_2023"), + "census-stc-individual-income-tax": Path("census/stc_individual_income_tax"), "cms-medicaid-chip-monthly-enrollment-december-2024": Path( "cms_medicaid/chip_monthly_enrollment_december_2024" ), @@ -105,9 +99,7 @@ "cms-aca-oep-state-level": Path("cms_aca/oep_state_level"), "cms-aca-oep-state-level-2022": Path("cms_aca/oep_state_level_2022"), "cms-aca-oep-state-level-2025": Path("cms_aca/oep_state_level_2025"), - "cms-aca-effectuated-enrollment-2022": Path( - "cms_aca/effectuated_enrollment_2022" - ), + "cms-aca-effectuated-enrollment-2022": Path("cms_aca/effectuated_enrollment_2022"), "cms-medicare-trustees-report-2025-part-b-premium-income": Path( "cms_medicare/medicare_trustees_report_2025" ), @@ -131,9 +123,7 @@ ), "soi-table-4-3": Path("irs_soi/table_4_3"), "soi-state-2022": Path("irs_soi/state_2022"), - "soi-congressional-district-2022": Path( - "irs_soi/congressional_district_2022" - ), + "soi-congressional-district-2022": Path("irs_soi/congressional_district_2022"), "soi-historic-table-2": Path("irs_soi/historic_table_2"), "soi-historic-table-2-state-agi-2022": Path( "irs_soi/historic_table_2_state_agi_2022" @@ -711,6 +701,30 @@ def to_record_set_spec(self, year: int) -> SourceRecordSetSpec: ), period_type=_required(self.payload, "period_type", "record_set"), period=_record_set_period_from_mapping(self.payload, year=year), + period_start_date=_optional_rendered_string( + self.payload.get("period_start_date"), + year=year, + ), + period_end_date=_optional_rendered_string( + self.payload.get("period_end_date"), + year=year, + ), + period_basis=_optional_rendered_string( + self.payload.get("period_basis"), + year=year, + ), + period_authority=_optional_rendered_string( + self.payload.get("period_authority"), + year=year, + ), + period_source_label=_optional_rendered_string( + self.payload.get("period_source_label"), + year=year, + ), + accounting_basis=_optional_rendered_string( + self.payload.get("accounting_basis"), + year=year, + ), geography_id=_required(self.payload, "geography_id", "record_set"), geography_level=_required( self.payload, @@ -1536,7 +1550,16 @@ def _fact_from_source_record( spec = record.spec return AggregateFact( value=record.value, - period=PeriodDimension(type=spec.period_type, value=spec.period), + period=PeriodDimension( + type=spec.period_type, + value=spec.period, + start_date=spec.period_start_date, + end_date=spec.period_end_date, + basis=spec.period_basis, + authority=spec.period_authority, + source_label=spec.period_source_label, + accounting_basis=spec.accounting_basis, + ), geography=GeographyDimension( level=spec.geography_level, id=spec.geography_id, @@ -1761,6 +1784,13 @@ def _scaffold_template( sheet_name: TODO period_type: tax_year period: "{{year}}" + # Optional period metadata: + # period_start_date: "{{year}}-01-01" + # period_end_date: "{{year}}-12-31" + # period_basis: tax_year + # period_authority: TODO + # period_source_label: Tax year {{year}} + # accounting_basis: cash geography_id: TODO geography_level: country geography_name: TODO diff --git a/arch/sources/specs.py b/arch/sources/specs.py index 3dab333..b371109 100644 --- a/arch/sources/specs.py +++ b/arch/sources/specs.py @@ -81,6 +81,12 @@ class SourceRecordSpec: entity_role: str | None aggregation: str domain: str + period_start_date: str | None = None + period_end_date: str | None = None + period_basis: str | None = None + period_authority: str | None = None + period_source_label: str | None = None + accounting_basis: str | None = None filters: dict[str, Scalar] = field(default_factory=dict) constraints: tuple[AggregateConstraint, ...] = () value_scale: int | float = 1 @@ -182,6 +188,12 @@ class SourceRecordSetSpec: groupby_dimension: str rows: tuple[SourceRecordSetRow, ...] measures: tuple[SourceRecordSetMeasure, ...] + period_start_date: str | None = None + period_end_date: str | None = None + period_basis: str | None = None + period_authority: str | None = None + period_source_label: str | None = None + accounting_basis: str | None = None shared_filters: dict[str, Scalar] = field(default_factory=dict) shared_constraints: tuple[AggregateConstraint, ...] = () @@ -244,6 +256,12 @@ def compile_source_record_set_specs( unit=measure.unit, period_type=spec.period_type, period=spec.period, + period_start_date=spec.period_start_date, + period_end_date=spec.period_end_date, + period_basis=spec.period_basis, + period_authority=spec.period_authority, + period_source_label=spec.period_source_label, + accounting_basis=spec.accounting_basis, geography_id=row.geography_id or spec.geography_id, geography_level=row.geography_level or spec.geography_level, geography_name=( @@ -713,6 +731,16 @@ def _record_set_spec_hash(spec: SourceRecordSetSpec) -> str: payload = asdict(spec) if not payload.get("shared_constraints"): payload.pop("shared_constraints", None) + for key in ( + "period_start_date", + "period_end_date", + "period_basis", + "period_authority", + "period_source_label", + "accounting_basis", + ): + if payload.get(key) is None: + payload.pop(key, None) for row in payload["rows"]: if row.get("row_end_number") is None: row.pop("row_end_number", None) diff --git a/tests/test_arch_core.py b/tests/test_arch_core.py index b73cb39..d675efb 100644 --- a/tests/test_arch_core.py +++ b/tests/test_arch_core.py @@ -74,6 +74,63 @@ def test_valid_fact_passes_validation(): assert validate_facts([_fact()]).valid +def test_period_metadata_passes_validation_and_updates_label(): + fact = _fact( + label=None, + period=PeriodDimension( + type="tax_year", + value=2023, + start_date="2023-01-01", + end_date="2023-12-31", + basis="tax_year", + authority="26 USC 441", + source_label="IRS tax year 2023", + accounting_basis="cash", + ), + ) + + assert validate_fact(fact) == () + assert "IRS tax year 2023" in build_label(fact) + + +def test_period_metadata_validation_reports_bad_values(): + fact = _fact( + period=PeriodDimension( + type="tax_year", + value=2023, + start_date="2023-12-31", + end_date="2023-01-01", + basis="publication_year", + authority=" ", + source_label=" ", + accounting_basis="magic", + ), + ) + + errors = validate_fact(fact) + fields_by_code = {(error.code, error.field) for error in errors} + + assert ("malformed_period", "period.basis") in fields_by_code + assert ("malformed_period", "period.accounting_basis") in fields_by_code + assert ("missing_period", "period.authority") in fields_by_code + assert ("missing_period", "period.source_label") in fields_by_code + assert ("malformed_period", "period.start_date") in fields_by_code + + +def test_period_date_validation_requires_iso_yyyy_mm_dd(): + fact = _fact( + period=PeriodDimension( + type="tax_year", + value=2023, + start_date="2023/01/01", + ) + ) + + assert ("malformed_period", "period.start_date") in { + (error.code, error.field) for error in validate_fact(fact) + } + + def test_stable_key_ignores_human_label(): fact = _fact() relabeled = _fact(label="A different display label") diff --git a/tests/test_arch_database.py b/tests/test_arch_database.py index 7ce218b..3fbe9c5 100644 --- a/tests/test_arch_database.py +++ b/tests/test_arch_database.py @@ -8,6 +8,7 @@ import pytest from arch.core import build_aggregate_constraints +from arch.core import PeriodDimension from arch.jurisdictions.us.soi import AXIOM_IRC_AGI_CONCEPT from arch.database import build_arch_db from arch.harness import build_arch_db_file @@ -21,13 +22,14 @@ def test_build_aggregate_constraints_lifts_agi_filters(): fact = next( fact for fact in build_soi_table_1_1_facts(2023) - if fact.source_record_id - == "irs_soi.ty2023.table_1_1.100k_to_200k.return_count" + if fact.source_record_id == "irs_soi.ty2023.table_1_1.100k_to_200k.return_count" ) constraints = build_aggregate_constraints(fact) - assert [(item.variable, item.operator, item.value, item.unit) for item in constraints] == [ + assert [ + (item.variable, item.operator, item.value, item.unit) for item in constraints + ] == [ (AXIOM_IRC_AGI_CONCEPT, ">=", 100_000, "usd"), (AXIOM_IRC_AGI_CONCEPT, "<", 200_000, "usd"), ] @@ -36,6 +38,22 @@ def test_build_aggregate_constraints_lifts_agi_filters(): def test_build_arch_db_writes_aggregate_fact_constraints_and_lineage(tmp_path): db_path = tmp_path / "arch-fixture.db" facts = build_soi_table_1_1_facts(2023) + facts = [ + replace( + fact, + period=PeriodDimension( + type=fact.period.type, + value=fact.period.value, + start_date="2023-01-01", + end_date="2023-12-31", + basis="tax_year", + authority="26 USC 441", + source_label="IRS tax year 2023", + accounting_basis="cash", + ), + ) + for fact in facts + ] cells = build_soi_table_1_1_source_cells(2023) report = build_arch_db(facts, db_path, source_cells=cells) @@ -73,9 +91,7 @@ def test_build_arch_db_writes_aggregate_fact_constraints_and_lineage(tmp_path): assert all_returns["value_numeric"] == 160_602_107 assert all_returns["domain"] == "all_individual_income_tax_returns" assert artifact["raw_r2_bucket"] == "arch-raw" - assert artifact["raw_r2_key"].startswith( - "raw/irs_soi/soi-table-1-1/2023/" - ) + assert artifact["raw_r2_key"].startswith("raw/irs_soi/soi-table-1-1/2023/") assert artifact["raw_r2_uri"].startswith("r2://arch-raw/") assert build_artifact_count == 0 @@ -118,12 +134,36 @@ def test_build_arch_db_writes_aggregate_fact_constraints_and_lineage(tmp_path): ).fetchone() alignment = connection.execute( """ - SELECT source_concept, canonical_concept, relation, authority + SELECT + source_concept, + canonical_concept, + relation, + authority, + period_start_date, + period_end_date, + period_basis, + period_authority, + period_source_label, + period_accounting_basis FROM concept_alignments WHERE source_concept = ? """, ("irs_soi.adjusted_gross_income",), ).fetchone() + period_metadata = connection.execute( + """ + SELECT + period_start_date, + period_end_date, + period_basis, + period_authority, + period_source_label, + period_accounting_basis + FROM aggregate_facts + WHERE source_record_id = ? + """, + ("irs_soi.ty2023.table_1_1.all.adjusted_gross_income",), + ).fetchone() assert tuple(agi_fact) == ( AXIOM_IRC_AGI_CONCEPT, @@ -138,6 +178,20 @@ def test_build_arch_db_writes_aggregate_fact_constraints_and_lineage(tmp_path): AXIOM_IRC_AGI_CONCEPT, "exact", "arch-us", + "2023-01-01", + "2023-12-31", + "tax_year", + "26 USC 441", + "IRS tax year 2023", + "cash", + ) + assert tuple(period_metadata) == ( + "2023-01-01", + "2023-12-31", + "tax_year", + "26 USC 441", + "IRS tax year 2023", + "cash", ) lineage = connection.execute( @@ -196,7 +250,9 @@ def test_build_arch_db_file_uses_fixture_facts_and_cells(tmp_path): facts_count = connection.execute( "SELECT COUNT(*) FROM aggregate_facts" ).fetchone()[0] - cells_count = connection.execute("SELECT COUNT(*) FROM source_cells").fetchone()[0] + cells_count = connection.execute( + "SELECT COUNT(*) FROM source_cells" + ).fetchone()[0] assert facts_count == 80 assert cells_count == 1932 diff --git a/tests/test_arch_source_package.py b/tests/test_arch_source_package.py index 4f6d9d0..c6594eb 100644 --- a/tests/test_arch_source_package.py +++ b/tests/test_arch_source_package.py @@ -115,6 +115,34 @@ def test_empty_guard_cells_do_not_change_legacy_single_cell_hash(tmp_path): assert specs[0].layout.record_set_spec_hash == "d606c87f11948c197386dfa4" +def test_record_set_period_metadata_compiles_to_facts(tmp_path): + source_path = REPO_ROOT / "packages" / "irs_soi" / "table_1_1" + payload = yaml.safe_load((source_path / "source_package.yaml").read_text()) + record_set = payload["record_sets"][0] + record_set["period_start_date"] = "{year}-01-01" + record_set["period_end_date"] = "{year}-12-31" + record_set["period_basis"] = "tax_year" + record_set["period_authority"] = "26 USC 441" + record_set["period_source_label"] = "IRS tax year {year}" + record_set["accounting_basis"] = "cash" + + package_dir = tmp_path / "soi-with-period-metadata" + package_dir.mkdir() + (package_dir / "source_package.yaml").write_text(yaml.safe_dump(payload)) + + package = load_source_package(package_dir) + facts = package.build_facts(2023) + + assert validate_facts(facts).valid + assert facts[0].period.start_date == "2023-01-01" + assert facts[0].period.end_date == "2023-12-31" + assert facts[0].period.basis == "tax_year" + assert facts[0].period.authority == "26 USC 441" + assert facts[0].period.source_label == "IRS tax year 2023" + assert facts[0].period.accounting_basis == "cash" + assert "IRS tax year 2023" in facts[0].label + + def test_guard_cell_order_does_not_change_record_set_spec_hash(tmp_path): source_path = REPO_ROOT / "packages" / "irs_soi" / "table_1_1" payload = yaml.safe_load((source_path / "source_package.yaml").read_text()) @@ -153,9 +181,9 @@ def test_guard_cell_expected_value_changes_record_set_spec_hash(tmp_path): {"column": "A", "expected_value": "All returns"}, ] changed = deepcopy(payload) - changed["record_sets"][0]["rows"][0]["guard_cells"][0][ - "expected_value" - ] = "Different label" + changed["record_sets"][0]["rows"][0]["guard_cells"][0]["expected_value"] = ( + "Different label" + ) package_dir = tmp_path / "soi-original" changed_dir = tmp_path / "soi-changed-guard" @@ -253,9 +281,7 @@ def test_source_package_path_builds_valid_soi_table_1_4_facts(): assert validate_facts(facts).valid assert facts[0].source.source_table == "Publication 1304 Table 1.4" assert ( - values_by_record[ - "irs_soi.ty2023.table_1_4.all.alimony_received_amount" - ].value + values_by_record["irs_soi.ty2023.table_1_4.all.alimony_received_amount"].value == 6_686_429_000 ) assert ( @@ -477,12 +503,18 @@ def test_cbo_income_by_source_package_preserves_cbo_projection_concepts(): row["concept_alignment"]["source_concept"]: row for row in consumer_fact_rows(facts) } - assert consumer_rows_by_source_concept["cbo.net_capital_gain"][ - "concept_alignment" - ]["canonical_concept"] == "cbo.net_capital_gain_projection" - assert consumer_rows_by_source_concept["cbo.net_business_income"][ - "concept_alignment" - ]["canonical_concept"] == "cbo.net_business_income_projection" + assert ( + consumer_rows_by_source_concept["cbo.net_capital_gain"]["concept_alignment"][ + "canonical_concept" + ] + == "cbo.net_capital_gain_projection" + ) + assert ( + consumer_rows_by_source_concept["cbo.net_business_income"]["concept_alignment"][ + "canonical_concept" + ] + == "cbo.net_business_income_projection" + ) def test_validate_source_package_reports_fixture_counts(): @@ -615,8 +647,7 @@ def test_soi_table_2_1_package_builds_itemized_deduction_details(): "home_mortgage_personal_seller_amount" ] deductible_points = values_by_record[ - "irs_soi.ty2023.table_2_1.itemized_all_returns.all." - "deductible_points_amount" + "irs_soi.ty2023.table_2_1.itemized_all_returns.all.deductible_points_amount" ] limited_salt = values_by_record[ "irs_soi.ty2023.table_2_1.itemized_all_returns.all." @@ -631,8 +662,7 @@ def test_soi_table_2_1_package_builds_itemized_deduction_details(): "state_local_income_or_sales_tax_amount" ] real_estate_taxes = values_by_record[ - "irs_soi.ty2023.table_2_1.itemized_all_returns.all." - "real_estate_taxes_amount" + "irs_soi.ty2023.table_2_1.itemized_all_returns.all.real_estate_taxes_amount" ] assert charitable.value == 211_975_123_000 @@ -972,8 +1002,7 @@ def test_cms_medicare_trustees_package_builds_part_b_premium_fact(): values_by_record = {fact.source_record_id: fact for fact in facts} assert ( - package.package_id - == "cms-medicare-trustees-report-2025-part-b-premium-income" + package.package_id == "cms-medicare-trustees-report-2025-part-b-premium-income" ) assert len(cells) == 93_486 assert validate_source_cells(cells).valid @@ -1048,8 +1077,7 @@ def test_hhs_acf_liheap_package_builds_household_count_fact( assert all(fact.source.raw_r2_uri for fact in facts) record_id = ( - f"hhs_acf_liheap.fy{year}.national_profile." - "state_programs.households_served" + f"hhs_acf_liheap.fy{year}.national_profile.state_programs.households_served" ) assert records[0].source_cell_addresses == addresses assert values_by_record[record_id].value == households @@ -1095,12 +1123,12 @@ def test_soi_table_2_5_eitc_child_totals_build_2022_facts(): assert three_child_amount.value == 14_000_930_000 assert no_child_returns.filters == {"income_range": "all"} assert no_child_returns.layout.table_record_kind == "total" - assert { - constraint.variable for constraint in no_child_returns.constraints - } == {"us.tax.earned_income_credit_qualifying_children"} - assert { - constraint.operator for constraint in three_child_amount.constraints - } == {">="} + assert {constraint.variable for constraint in no_child_returns.constraints} == { + "us.tax.earned_income_credit_qualifying_children" + } + assert {constraint.operator for constraint in three_child_amount.constraints} == { + ">=" + } def test_ssa_supplement_source_package_alias_validates_fixture_counts(): @@ -1240,9 +1268,7 @@ def test_source_package_alias_builds_census_population_projection_age_facts(): } age_0 = "census.popproj2023.cy2025.national_population.age_0.population" - age_85_plus = ( - "census.popproj2023.cy2025.national_population.age_85_plus.population" - ) + age_85_plus = "census.popproj2023.cy2025.national_population.age_85_plus.population" assert records_by_id[age_0].source_cell_addresses == ( "F2", "F3", @@ -1314,9 +1340,7 @@ def test_census_acs_s0101_source_package_aliases_validate_fixture_counts(): def test_census_acs_s0101_congressional_district_package_builds_age_facts(): - package = load_source_package( - "census-acs-s0101-congressional-district-age-2024" - ) + package = load_source_package("census-acs-s0101-congressional-district-age-2024") rows = package.build_source_rows(2024) cells = package.build_source_cells(2024, source_rows=rows) facts = package.build_facts(2024, cells=cells, source_rows=rows) @@ -1368,16 +1392,13 @@ def test_census_b01001_female_age_source_package_builds_state_facts(): assert all(fact.source.raw_r2_uri for fact in facts) al_age_15_to_17 = values_by_record[ - "census_acs.acs1_2023.b01001.female_age.01." - "age_15_to_17.female_population" + "census_acs.acs1_2023.b01001.female_age.01.age_15_to_17.female_population" ] ca_age_40_to_44 = values_by_record[ - "census_acs.acs1_2023.b01001.female_age.06." - "age_40_to_44.female_population" + "census_acs.acs1_2023.b01001.female_age.06.age_40_to_44.female_population" ] pr_age_40_to_44 = values_by_record[ - "census_acs.acs1_2023.b01001.female_age.72." - "age_40_to_44.female_population" + "census_acs.acs1_2023.b01001.female_age.72.age_40_to_44.female_population" ] assert al_age_15_to_17.value == 100_354 @@ -1417,9 +1438,7 @@ def test_census_acs_s2201_source_package_alias_validates_fixture_counts(): def test_census_acs_s2201_congressional_district_package_builds_snap_facts(): - package = load_source_package( - "census-acs-s2201-congressional-district-snap-2024" - ) + package = load_source_package("census-acs-s2201-congressional-district-snap-2024") rows = package.build_source_rows(2024) cells = package.build_source_cells(2024, source_rows=rows) facts = package.build_facts(2024, cells=cells, source_rows=rows) @@ -1538,9 +1557,7 @@ def test_validate_source_package_reports_cms_aca_effectuated_enrollment_2022_cou def test_cms_medicaid_package_builds_december_2024_state_enrollment_facts(): - package = load_source_package( - "cms-medicaid-chip-monthly-enrollment-december-2024" - ) + package = load_source_package("cms-medicaid-chip-monthly-enrollment-december-2024") cells = package.build_source_cells(2023) records = package.build_source_records(2023, cells=cells) facts = package.build_facts(2023, cells=cells) @@ -1555,24 +1572,19 @@ def test_cms_medicaid_package_builds_december_2024_state_enrollment_facts(): assert all(fact.source.raw_r2_uri for fact in facts) us_medicaid = ( - "cms_medicaid.month2024_12.state_enrollment.us." - "total_medicaid_enrollment" + "cms_medicaid.month2024_12.state_enrollment.us.total_medicaid_enrollment" ) ca_medicaid = ( - "cms_medicaid.month2024_12.state_enrollment.ca." - "total_medicaid_enrollment" + "cms_medicaid.month2024_12.state_enrollment.ca.total_medicaid_enrollment" ) tx_medicaid_chip = ( - "cms_medicaid.month2024_12.state_enrollment.tx." - "total_medicaid_chip_enrollment" + "cms_medicaid.month2024_12.state_enrollment.tx.total_medicaid_chip_enrollment" ) ny_adult = ( - "cms_medicaid.month2024_12.state_enrollment.ny." - "total_adult_medicaid_enrollment" + "cms_medicaid.month2024_12.state_enrollment.ny.total_adult_medicaid_enrollment" ) fl_child = ( - "cms_medicaid.month2024_12.state_enrollment.fl." - "medicaid_chip_child_enrollment" + "cms_medicaid.month2024_12.state_enrollment.fl.medicaid_chip_child_enrollment" ) assert records_by_id[us_medicaid].source_cell_addresses[:3] == ( @@ -1636,8 +1648,7 @@ def test_cms_medicaid_monthly_dataset_builds_december_2025_state_enrollment_fact assert all(fact.source_row_keys for fact in facts) assert all(fact.source.source_name == "cms_medicaid" for fact in facts) assert all( - fact.source.source_file == "pi-dataset-april-2026-release.csv" - for fact in facts + fact.source.source_file == "pi-dataset-april-2026-release.csv" for fact in facts ) assert all(fact.source.raw_r2_uri for fact in facts) assert {f"{fact.period.type}:{fact.period.value}" for fact in facts} == { @@ -1645,31 +1656,23 @@ def test_cms_medicaid_monthly_dataset_builds_december_2025_state_enrollment_fact } ca_total = ( - "cms_medicaid.month2025_12.state_enrollment.ca." - "total_medicaid_chip_enrollment" + "cms_medicaid.month2025_12.state_enrollment.ca.total_medicaid_chip_enrollment" ) ca_medicaid = ( - "cms_medicaid.month2025_12.state_enrollment.ca." - "total_medicaid_enrollment" - ) - ca_chip = ( - "cms_medicaid.month2025_12.state_enrollment.ca.total_chip_enrollment" + "cms_medicaid.month2025_12.state_enrollment.ca.total_medicaid_enrollment" ) + ca_chip = "cms_medicaid.month2025_12.state_enrollment.ca.total_chip_enrollment" ca_child = ( - "cms_medicaid.month2025_12.state_enrollment.ca." - "medicaid_chip_child_enrollment" + "cms_medicaid.month2025_12.state_enrollment.ca.medicaid_chip_child_enrollment" ) ca_adult = ( - "cms_medicaid.month2025_12.state_enrollment.ca." - "total_adult_medicaid_enrollment" + "cms_medicaid.month2025_12.state_enrollment.ca.total_adult_medicaid_enrollment" ) tx_total = ( - "cms_medicaid.month2025_12.state_enrollment.tx." - "total_medicaid_chip_enrollment" + "cms_medicaid.month2025_12.state_enrollment.tx.total_medicaid_chip_enrollment" ) ny_total = ( - "cms_medicaid.month2025_12.state_enrollment.ny." - "total_medicaid_chip_enrollment" + "cms_medicaid.month2025_12.state_enrollment.ny.total_medicaid_chip_enrollment" ) assert records_by_id[ca_total].source_cell_addresses == ( @@ -1812,8 +1815,7 @@ def test_soi_congressional_district_2022_builds_all_return_facts(): "al_01.adjusted_gross_income" ] ca_53_returns = values_by_record[ - "irs_soi.ty2022.congressional_district_2022.all_returns." - "ca_53.return_count" + "irs_soi.ty2022.congressional_district_2022.all_returns.ca_53.return_count" ] assert al_01_agi.value == 22_915_824_000 @@ -1847,9 +1849,7 @@ def test_soi_historic_table_2_package_builds_2022_national_facts(): ptc_returns = values_by_record[ "irs_soi.ty2022.historic_table_2.us.all.premium_tax_credit_returns" ] - eitc_amount = values_by_record[ - "irs_soi.ty2022.historic_table_2.us.all.eitc_amount" - ] + eitc_amount = values_by_record["irs_soi.ty2022.historic_table_2.us.all.eitc_amount"] real_estate_taxes = values_by_record[ "irs_soi.ty2022.historic_table_2.us.all.real_estate_taxes_amount" ] @@ -1865,18 +1865,12 @@ def test_soi_historic_table_2_package_builds_2022_national_facts(): medical_dental = values_by_record[ "irs_soi.ty2022.historic_table_2.us.all.medical_dental_expense_amount" ] - qbi = values_by_record[ - "irs_soi.ty2022.historic_table_2.us.all.qbi_amount" - ] + qbi = values_by_record["irs_soi.ty2022.historic_table_2.us.all.qbi_amount"] rental = values_by_record[ "irs_soi.ty2022.historic_table_2.us.all.rental_royalty_income_amount" ] - ctc = values_by_record[ - "irs_soi.ty2022.historic_table_2.us.all.ctc_amount" - ] - actc = values_by_record[ - "irs_soi.ty2022.historic_table_2.us.all.actc_amount" - ] + ctc = values_by_record["irs_soi.ty2022.historic_table_2.us.all.ctc_amount"] + actc = values_by_record["irs_soi.ty2022.historic_table_2.us.all.actc_amount"] agi_bracket_eitc_claims = values_by_record[ "irs_soi.ty2022.historic_table_2.us.1_to_10k.eitc_claims" ] @@ -1894,7 +1888,9 @@ def test_soi_historic_table_2_package_builds_2022_national_facts(): assert ctc.value == 82_862_736_000 assert actc.value == 33_857_987_000 assert agi_bracket_eitc_claims.value == 5_013_220 - assert {constraint.operator for constraint in agi_bracket_eitc_claims.constraints} == { + assert { + constraint.operator for constraint in agi_bracket_eitc_claims.constraints + } == { "<", ">=", } @@ -1970,12 +1966,10 @@ def test_soi_historic_table_2_state_eitc_package_builds_child_count_facts(): assert len(facts) == 510 ca_one_child_amount = values_by_record[ - "irs_soi.ty2022.historic_table_2.state_eitc.ca.ca." - "eitc_one_child_amount" + "irs_soi.ty2022.historic_table_2.state_eitc.ca.ca.eitc_one_child_amount" ] ca_two_children_claims = values_by_record[ - "irs_soi.ty2022.historic_table_2.state_eitc.ca.ca." - "eitc_two_children_claims" + "irs_soi.ty2022.historic_table_2.state_eitc.ca.ca.eitc_two_children_claims" ] ca_three_or_more_amount = values_by_record[ "irs_soi.ty2022.historic_table_2.state_eitc.ca.ca." @@ -1986,9 +1980,9 @@ def test_soi_historic_table_2_state_eitc_package_builds_child_count_facts(): assert ca_two_children_claims.value == 550_910 assert ca_three_or_more_amount.value == 1_266_651_000 assert ca_one_child_amount.filters["eitc_child_count"] == 1 - assert { - constraint.variable for constraint in ca_one_child_amount.constraints - } == {"us.tax.earned_income_credit_qualifying_children"} + assert {constraint.variable for constraint in ca_one_child_amount.constraints} == { + "us.tax.earned_income_credit_qualifying_children" + } assert { constraint.operator for constraint in ca_three_or_more_amount.constraints } == {">="}