From 4d5b62b74ad4d6202205641ebf63131fb3039cdf Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 17 Jun 2026 16:43:58 +0100 Subject: [PATCH 1/6] Add end-to-end regression test for bus_fare_spending in dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit generate_lcfs_table is unit-tested to compute bus_fare_spending, but nothing checked it survives the QRF predict + enhanced-dataset assembly/save into the published dataset — and it currently doesn't (issue #430): every other consumption output lands, bus_fare_spending is dropped downstream. Add an end-to-end test asserting the enhanced dataset carries a populated bus_fare_spending column. Marked xfail so it is mergeable and documents the gap; it will XPASS once the pipeline is fixed. Refs #430. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../test_bus_fare_spending_in_dataset.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py diff --git a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py new file mode 100644 index 00000000..b7a7c0d2 --- /dev/null +++ b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py @@ -0,0 +1,36 @@ +"""End-to-end regression test: bus_fare_spending must survive the full build. + +`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column +(test_lcfs_consumption_ingestion), but nothing checks that it survives the +QRF train/predict and enhanced-dataset assembly/save into the published +dataset. It currently does not (see issue #430) — every other consumption +output lands, but bus_fare_spending is dropped somewhere downstream. + +This test is marked xfail so it is mergeable and documents the known gap; it +will XPASS once the pipeline is fixed, prompting removal of the marker and +conversion to a hard assertion. 
+""" + +import pytest + + +@pytest.mark.xfail( + reason=( + "bus_fare_spending is imputed but dropped downstream of " + "generate_lcfs_table before reaching the enhanced dataset (issue #430). " + "Remove this marker once the dataset carries the column." + ), + strict=False, +) +def test_enhanced_dataset_contains_bus_fare_spending(baseline): + assert "bus_fare_spending" in baseline.input_variables, ( + "bus_fare_spending is not present in the enhanced dataset." + ) + total = baseline.calculate( + "bus_fare_spending", map_to="household", period=2025 + ).sum() + # UK household bus/coach fare spend is ~£2.7bn; guard against an all-zero + # column slipping through as 'present'. + assert total > 1e9, ( + f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn" + ) From 4493f097ee379d0a761d184f1e1039a67e8a9789 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 17 Jun 2026 16:44:23 +0100 Subject: [PATCH 2/6] Add changelog entry for bus_fare_spending dataset regression test (#431) Co-Authored-By: Claude Opus 4.8 (1M context) --- changelog.d/431.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/431.md diff --git a/changelog.d/431.md b/changelog.d/431.md new file mode 100644 index 00000000..ad3fcb58 --- /dev/null +++ b/changelog.d/431.md @@ -0,0 +1 @@ +- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset. 
From 0f8ca266bef20d7acfa0e40799142b141ab71b56 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Thu, 18 Jun 2026 10:51:40 +0100 Subject: [PATCH 3/6] Calibrate bus_fare_spending and bus_subsidy_spending to DfT totals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Anchor both bus variables to official DfT Annual Bus Statistics (y/e March 2025, England): passenger fare receipts £3.4bn (BUS05aii) and net government support £3.0bn (BUS05bii). Adds calibrate_bus_fare_spending (consumption) and calibrate_bus_subsidy_spending (services), mirroring calibrate_rail_subsidy_spending, called after weight calibration in create_datasets. Unanchored, imputed bus fare inherited the broader transport-consumption over-estimate (~£10bn, ~3x) and bus subsidy drifted low (~£1.5bn). Updates the bus_subsidy_spending smoke target to the official £3.0bn and de-xfails the end-to-end bus_fare_spending dataset test (the column is present in the current release; the earlier "drop" was a stale-file misread, not a pipeline bug). Co-Authored-By: Claude Opus 4.8 (1M context) --- changelog.d/431.md | 2 +- .../datasets/create_datasets.py | 10 +++++ .../datasets/imputations/consumption.py | 41 +++++++++++++++++++ .../datasets/imputations/services/services.py | 41 +++++++++++++++++++ policyengine_uk_data/tests/test_aggregates.py | 16 ++++---- .../test_bus_fare_spending_in_dataset.py | 29 ++++--------- 6 files changed, 108 insertions(+), 31 deletions(-) diff --git a/changelog.d/431.md b/changelog.d/431.md index ad3fcb58..b861c62c 100644 --- a/changelog.d/431.md +++ b/changelog.d/431.md @@ -1 +1 @@ -- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset.
+- Calibrate `bus_fare_spending` and `bus_subsidy_spending` to DfT Annual Bus Statistics (year ending March 2025, England) totals — passenger fare receipts £3.4bn (table BUS05aii) and net government support £3.0bn (table BUS05bii) — via post-calibration scaling steps mirroring the rail subsidy calibration. Without anchoring, imputed bus fare inherited the broader transport-consumption over-estimate (~£10bn, ~3× too high) and bus subsidy drifted low (~£1.5bn). Updates the `bus_subsidy_spending` smoke target to the official £3.0bn and adds an end-to-end test that `bus_fare_spending` reaches the enhanced dataset. diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 391896db..bebdc5d7 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -292,20 +292,30 @@ def main(): update_dataset("Calibrate public service aggregates", "processing") from policyengine_uk_data.datasets.imputations.services.services import ( calibrate_rail_subsidy_spending, + calibrate_bus_subsidy_spending, ) calibrate_rail_subsidy_spending( frs_calibrated, frs_release.calibration_year, ) + calibrate_bus_subsidy_spending( + frs_calibrated, + frs_release.calibration_year, + ) update_dataset("Calibrate public service aggregates", "completed") update_dataset("Calibrate fuel litres", "processing") from policyengine_uk_data.datasets.imputations.consumption import ( calibrate_dataset_fuel_litre_proxies_to_road_fuel, + calibrate_bus_fare_spending, ) calibrate_dataset_fuel_litre_proxies_to_road_fuel(frs_calibrated) + calibrate_bus_fare_spending( + frs_calibrated, + frs_release.calibration_year, + ) update_dataset("Calibrate fuel litres", "completed") update_dataset("Save final dataset", "processing") diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index 3742bb04..9c23fbdb 100644 --- 
a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -781,6 +781,47 @@ def calibrate_dataset_fuel_litre_proxies_to_road_fuel( ) +BUS_FARE_TARGETS = { + # DfT Annual Bus Statistics, year ending March 2025 (England), table + # BUS05aii: passenger fare receipts on local bus services were GBP 3.4bn + # (52% of GBP 6.6bn total operating revenue). + # https://www.gov.uk/government/statistics/annual-bus-statistics-year-ending-march-2025/annual-bus-statistics-year-ending-march-2025 + # England-coverage figure used as the UK anchor: DfT publishes no single + # GB/UK total and GB/UK would be ~10-20% higher. Without this the imputed + # aggregate inherits the broader transport-consumption over-estimate + # (~GBP 10bn, ~3x too high). + 2025: 3.4e9, +} + + +def calibrate_bus_fare_spending( + dataset: UKSingleYearDataset, + time_period: int, +) -> float | None: + """Scale bus_fare_spending to the DfT passenger-fare total (BUS_FARE_TARGETS).""" + target = BUS_FARE_TARGETS.get(time_period) + if target is None or "bus_fare_spending" not in dataset.household: + return None + + original_time_period = dataset.time_period + dataset.time_period = str(original_time_period) + try: + simulation = Microsimulation(dataset=dataset) + actual = simulation.calculate( + "bus_fare_spending", + period=time_period, + map_to="household", + ).sum() + finally: + dataset.time_period = original_time_period + if actual <= 0: + raise ValueError(f"Cannot calibrate bus_fare_spending: aggregate is {actual}.") + + scale = target / actual + dataset.household["bus_fare_spending"] *= scale + return scale + + def save_imputation_models(): from policyengine_uk_data.utils.qrf import QRF diff --git a/policyengine_uk_data/datasets/imputations/services/services.py b/policyengine_uk_data/datasets/imputations/services/services.py index 43ab2639..a052f997 100644 --- a/policyengine_uk_data/datasets/imputations/services/services.py +++ 
b/policyengine_uk_data/datasets/imputations/services/services.py @@ -21,6 +21,17 @@ 2025: 21.6e9, } +BUS_SUBSIDY_TARGETS = { + # DfT Annual Bus Statistics, year ending March 2025 (England), table + # BUS05bii: total net government support for local bus services was + # GBP 3.0bn (of which GBP 0.8bn concessionary travel reimbursement). + # https://www.gov.uk/government/statistics/annual-bus-statistics-year-ending-march-2025/annual-bus-statistics-year-ending-march-2025 + # England-coverage figure used as the UK anchor: DfT publishes no single + # GB/UK total and GB/UK would be ~10-20% higher, but this is far better + # than the unanchored aggregate, which drifts well below the true total. + 2025: 3.0e9, +} + def get_fare_index_survey_year() -> float: """ @@ -66,6 +77,36 @@ def calibrate_rail_subsidy_spending( return scale +def calibrate_bus_subsidy_spending( + dataset: UKSingleYearDataset, + time_period: int, +) -> float | None: + """Scale bus_subsidy_spending to the DfT net-support total (BUS_SUBSIDY_TARGETS).""" + target = BUS_SUBSIDY_TARGETS.get(time_period) + if target is None or "bus_subsidy_spending" not in dataset.household: + return None + + original_time_period = dataset.time_period + dataset.time_period = str(original_time_period) + try: + simulation = Microsimulation(dataset=dataset) + actual = simulation.calculate( + "bus_subsidy_spending", + period=time_period, + map_to="household", + ).sum() + finally: + dataset.time_period = original_time_period + if actual <= 0: + raise ValueError( + f"Cannot calibrate bus_subsidy_spending: aggregate is {actual}." 
+ ) + + scale = target / actual + dataset.household["bus_subsidy_spending"] *= scale + return scale + + def impute_services( dataset: UKSingleYearDataset, ) -> UKSingleYearDataset: diff --git a/policyengine_uk_data/tests/test_aggregates.py b/policyengine_uk_data/tests/test_aggregates.py index 6a63c2be..24b3a5b6 100644 --- a/policyengine_uk_data/tests/test_aggregates.py +++ b/policyengine_uk_data/tests/test_aggregates.py @@ -6,14 +6,14 @@ # ORR/GOV.UK rail finance statistics report GBP 21.6bn of government # support to the rail industry in 2024-25. "rail_subsidy_spending": 21.6e9, - # Approximate public support for local bus services; kept as a loose - # smoke-test target because source coverage and dataset coverage differ. - "bus_subsidy_spending": 2.5e9, - # DfT Annual Bus Statistics (year ending March 2025) report GBP 3.4bn - # passenger fare receipts for local bus services in England. The LCFS input - # is UK household bus/coach fare spending, so this is an order-of-magnitude - # target. Enable once a dataset built with the bus_fare_spending imputation - # is published — the column is absent from the currently-released dataset. + # DfT Annual Bus Statistics (year ending March 2025, England), table + # BUS05bii: total net government support for local bus services ~GBP 3.0bn. + # bus_subsidy_spending is calibrated to this in the build. + "bus_subsidy_spending": 3.0e9, + # DfT Annual Bus Statistics (year ending March 2025, England), table + # BUS05aii: passenger fare receipts ~GBP 3.4bn. bus_fare_spending is + # calibrated to this in the build. Enable once a dataset built with that + # calibration is published (the released dataset predates it). 
# "bus_fare_spending": 3.4e9, } diff --git a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py index b7a7c0d2..97d6f11f 100644 --- a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py +++ b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py @@ -1,27 +1,13 @@ -"""End-to-end regression test: bus_fare_spending must survive the full build. +"""End-to-end regression test: bus_fare_spending must reach the enhanced dataset. -`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column -(test_lcfs_consumption_ingestion), but nothing checks that it survives the -QRF train/predict and enhanced-dataset assembly/save into the published -dataset. It currently does not (see issue #430) — every other consumption -output lands, but bus_fare_spending is dropped somewhere downstream. - -This test is marked xfail so it is mergeable and documents the known gap; it -will XPASS once the pipeline is fixed, prompting removal of the marker and -conversion to a hard assertion. +`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column; +this guards the other half — that it survives the QRF predict and the +enhanced-dataset assembly/save into the dataset the model loads. It is present +in the current release (enhanced_frs_2024_25.h5) and calibrated to the DfT +passenger-fare total in the build. """ -import pytest - -@pytest.mark.xfail( - reason=( - "bus_fare_spending is imputed but dropped downstream of " - "generate_lcfs_table before reaching the enhanced dataset (issue #430). " - "Remove this marker once the dataset carries the column." - ), - strict=False, -) def test_enhanced_dataset_contains_bus_fare_spending(baseline): assert "bus_fare_spending" in baseline.input_variables, ( "bus_fare_spending is not present in the enhanced dataset." 
@@ -29,8 +15,7 @@ def test_enhanced_dataset_contains_bus_fare_spending(baseline): total = baseline.calculate( "bus_fare_spending", map_to="household", period=2025 ).sum() - # UK household bus/coach fare spend is ~£2.7bn; guard against an all-zero - # column slipping through as 'present'. + # Guard against an all-zero column slipping through as 'present'. assert total > 1e9, ( f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn" ) From 4da192ebd5a0600facfbc901b97b6d85cc35dba5 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Thu, 18 Jun 2026 10:55:46 +0100 Subject: [PATCH 4/6] Uplift bus calibration targets from England to UK by population MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DfT bus-finance figures are England-only; scale to UK by the ONS mid-2023 population ratio (UK 68.3M / England 57.7M ≈ 1.18) as a documented best approximation. Targets: bus fare £3.4bn→~£4.0bn, bus subsidy £3.0bn→~£3.5bn. Indicative (bus use per head varies by nation); refine with Scotland/Wales/NI sources if a direct UK figure is needed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../datasets/imputations/consumption.py | 16 ++++++++++------ .../datasets/imputations/services/services.py | 17 ++++++++++++----- policyengine_uk_data/tests/test_aggregates.py | 14 +++++++------- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index 9c23fbdb..fd24227d 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -781,16 +781,20 @@ def calibrate_dataset_fuel_litre_proxies_to_road_fuel( ) +# England → UK uplift for England-only DfT bus figures: ONS mid-2023 population +# ratio (UK 68.3M / England 57.7M ≈ 1.18), a best approximation since DfT +# publishes no single GB/UK bus-finance total. 
Indicative — bus use per head +# varies by nation. https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates +ENGLAND_TO_UK_POPULATION_UPLIFT = 68.3 / 57.7 # ≈ 1.18 + BUS_FARE_TARGETS = { # DfT Annual Bus Statistics, year ending March 2025 (England), table # BUS05aii: passenger fare receipts on local bus services were GBP 3.4bn - # (52% of GBP 6.6bn total operating revenue). + # (52% of GBP 6.6bn total operating revenue), uplifted England → UK by + # population (≈ GBP 4.0bn UK). Without anchoring, the imputed aggregate + # inherits the broader transport-consumption over-estimate (~GBP 10bn). # https://www.gov.uk/government/statistics/annual-bus-statistics-year-ending-march-2025/annual-bus-statistics-year-ending-march-2025 - # England-coverage figure used as the UK anchor: DfT publishes no single - # GB/UK total and GB/UK would be ~10-20% higher. Without this the imputed - # aggregate inherits the broader transport-consumption over-estimate - # (~GBP 10bn, ~3x too high). - 2025: 3.4e9, + 2025: 3.4e9 * ENGLAND_TO_UK_POPULATION_UPLIFT, } diff --git a/policyengine_uk_data/datasets/imputations/services/services.py b/policyengine_uk_data/datasets/imputations/services/services.py index a052f997..47d983fc 100644 --- a/policyengine_uk_data/datasets/imputations/services/services.py +++ b/policyengine_uk_data/datasets/imputations/services/services.py @@ -21,15 +21,22 @@ 2025: 21.6e9, } +# England → UK uplift for England-only DfT bus figures. DfT publishes no single +# GB/UK bus-finance total, so we scale by the ONS mid-2023 population ratio +# (UK 68.3M / England 57.7M ≈ 1.18) as a best approximation. This is indicative: +# bus use per head varies by nation (London lifts England's per-capita use), so +# the true UK factor is likely a little below the population ratio. 
+# ONS mid-year population estimates: +# https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates +ENGLAND_TO_UK_POPULATION_UPLIFT = 68.3 / 57.7 # ≈ 1.18 + BUS_SUBSIDY_TARGETS = { # DfT Annual Bus Statistics, year ending March 2025 (England), table # BUS05bii: total net government support for local bus services was - # GBP 3.0bn (of which GBP 0.8bn concessionary travel reimbursement). + # GBP 3.0bn (of which GBP 0.8bn concessionary travel reimbursement), + # uplifted England → UK by population (≈ GBP 3.5bn UK). # https://www.gov.uk/government/statistics/annual-bus-statistics-year-ending-march-2025/annual-bus-statistics-year-ending-march-2025 - # England-coverage figure used as the UK anchor: DfT publishes no single - # GB/UK total and GB/UK would be ~10-20% higher, but this is far better - # than the unanchored aggregate, which drifts well below the true total. - 2025: 3.0e9, + 2025: 3.0e9 * ENGLAND_TO_UK_POPULATION_UPLIFT, } diff --git a/policyengine_uk_data/tests/test_aggregates.py b/policyengine_uk_data/tests/test_aggregates.py index 24b3a5b6..0da69a04 100644 --- a/policyengine_uk_data/tests/test_aggregates.py +++ b/policyengine_uk_data/tests/test_aggregates.py @@ -7,14 +7,14 @@ # support to the rail industry in 2024-25. "rail_subsidy_spending": 21.6e9, # DfT Annual Bus Statistics (year ending March 2025, England), table - # BUS05bii: total net government support for local bus services ~GBP 3.0bn. - # bus_subsidy_spending is calibrated to this in the build. - "bus_subsidy_spending": 3.0e9, + # BUS05bii: net government support ~GBP 3.0bn, uplifted England→UK by + # population (~GBP 3.5bn). bus_subsidy_spending is calibrated to this. + "bus_subsidy_spending": 3.0e9 * 68.3 / 57.7, # DfT Annual Bus Statistics (year ending March 2025, England), table - # BUS05aii: passenger fare receipts ~GBP 3.4bn. bus_fare_spending is - # calibrated to this in the build. 
Enable once a dataset built with that - # calibration is published (the released dataset predates it). - # "bus_fare_spending": 3.4e9, + # BUS05aii: passenger fare receipts ~GBP 3.4bn, uplifted England→UK by + # population (~GBP 4.0bn). bus_fare_spending is calibrated to this. Enable + # once a dataset built with that calibration is published (release predates it). + # "bus_fare_spending": 3.4e9 * 68.3 / 57.7, } From 9c8968bc3d828a75c5ac3e6837465a7717e96a90 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Thu, 18 Jun 2026 12:15:47 +0100 Subject: [PATCH 5/6] Test bus fare/subsidy totals match DfT targets in the built dataset Replace the presence-only check with an active 20% total test for both bus_fare_spending and bus_subsidy_spending against the DfT Annual Bus Statistics targets (England, population-uplifted to UK). Uses the enhanced FRS dataset, which `make data` builds but `make download` does not fetch, so the baseline fixture skips it in PR CI and runs it on the post-merge build against the freshly calibrated data (same pattern as test_energy_calibration). Co-Authored-By: Claude Opus 4.8 (1M context) --- changelog.d/431.md | 2 +- .../test_bus_fare_spending_in_dataset.py | 41 +++++++++++-------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/changelog.d/431.md b/changelog.d/431.md index b861c62c..a8180a96 100644 --- a/changelog.d/431.md +++ b/changelog.d/431.md @@ -1 +1 @@ -- Calibrate `bus_fare_spending` and `bus_subsidy_spending` to DfT Annual Bus Statistics (year ending March 2025, England) totals — passenger fare receipts £3.4bn (table BUS05aii) and net government support £3.0bn (table BUS05bii) — via post-calibration scaling steps mirroring the rail subsidy calibration. Without anchoring, imputed bus fare inherited the broader transport-consumption over-estimate (~£10bn, ~3× too high) and bus subsidy drifted low (~£1.5bn).
Updates the `bus_subsidy_spending` smoke target to the official £3.0bn and adds an end-to-end test that `bus_fare_spending` reaches the enhanced dataset. +- Calibrate `bus_fare_spending` and `bus_subsidy_spending` to DfT Annual Bus Statistics (year ending March 2025, England) totals — passenger fare receipts £3.4bn (table BUS05aii) and net government support £3.0bn (table BUS05bii), uplifted England→UK by population — via post-calibration scaling steps mirroring the rail subsidy calibration. Without anchoring, imputed bus fare inherited the broader transport-consumption over-estimate (~£10bn, ~3× too high) and bus subsidy drifted low (~£1.5bn). Adds tests asserting both bus totals match the DfT targets within 20% in the built dataset (skipped in PR CI where no dataset is built, active on the post-merge build, like test_energy_calibration). diff --git a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py index 97d6f11f..bfbb7bad 100644 --- a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py +++ b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py @@ -1,21 +1,30 @@ -"""End-to-end regression test: bus_fare_spending must reach the enhanced dataset. +"""Bus fare / subsidy totals in the built dataset must match the DfT targets. -`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column; -this guards the other half — that it survives the QRF predict and the -enhanced-dataset assembly/save into the dataset the model loads. It is present -in the current release (enhanced_frs_2024_25.h5) and calibrated to the DfT -passenger-fare total in the build. +These use the enhanced FRS dataset, which is produced by ``make data`` (the +build / push CI / local generation) and is *not* fetched by ``make download``. 
+So the `baseline` fixture skips them in PR CI (no built dataset) and runs them +after a build, against the freshly calibrated data — the same pattern as +test_energy_calibration. Both bus variables are calibrated to the official DfT +totals in the build, so the totals should match closely; a 20% band is allowed. """ +import pytest -def test_enhanced_dataset_contains_bus_fare_spending(baseline): - assert "bus_fare_spending" in baseline.input_variables, ( - "bus_fare_spending is not present in the enhanced dataset." - ) - total = baseline.calculate( - "bus_fare_spending", map_to="household", period=2025 - ).sum() - # Guard against an all-zero column slipping through as 'present'. - assert total > 1e9, ( - f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn" +# DfT Annual Bus Statistics, year ending March 2025 (England), uplifted +# England -> UK by ONS mid-2023 population (x 68.3 / 57.7): +# bus_fare_spending -> BUS05aii passenger fare receipts £3.4bn (~£4.0bn UK) +# bus_subsidy_spending -> BUS05bii net government support £3.0bn (~£3.5bn UK) +# https://www.gov.uk/government/statistics/annual-bus-statistics-year-ending-march-2025/annual-bus-statistics-year-ending-march-2025 +BUS_TARGETS = { + "bus_fare_spending": 3.4e9 * 68.3 / 57.7, + "bus_subsidy_spending": 3.0e9 * 68.3 / 57.7, +} + + +@pytest.mark.parametrize("variable,target", sorted(BUS_TARGETS.items())) +def test_bus_total_matches_dft_target(baseline, variable: str, target: float): + total = baseline.calculate(variable, map_to="household", period=2025).sum() + assert abs(total / target - 1) < 0.2, ( + f"{variable}: £{total / 1e9:.2f}bn vs DfT target £{target / 1e9:.2f}bn " + f"(relative error {abs(total / target - 1):.1%})." 
 ) From fb53ccae210a08eb3cdfc16046728b83a80064ba Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Thu, 18 Jun 2026 12:26:43 +0100 Subject: [PATCH 6/6] Remove column-presence fallback from bus calibration Drop the 'or "{column}" not in dataset.household' guard from calibrate_bus_fare_spending / calibrate_bus_subsidy_spending so they match the rail calibration (if target is None: return None) and fail loudly if the imputed column is unexpectedly absent, rather than silently skipping. Co-Authored-By: Claude Opus 4.8 (1M context) --- policyengine_uk_data/datasets/imputations/consumption.py | 2 +- policyengine_uk_data/datasets/imputations/services/services.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index fd24227d..e41914fe 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -804,7 +804,7 @@ def calibrate_bus_fare_spending( ) -> float | None: """Scale bus_fare_spending to the DfT passenger-fare total (BUS_FARE_TARGETS).""" target = BUS_FARE_TARGETS.get(time_period) - if target is None or "bus_fare_spending" not in dataset.household: + if target is None: return None original_time_period = dataset.time_period diff --git a/policyengine_uk_data/datasets/imputations/services/services.py b/policyengine_uk_data/datasets/imputations/services/services.py index 47d983fc..9c92862c 100644 --- a/policyengine_uk_data/datasets/imputations/services/services.py +++ b/policyengine_uk_data/datasets/imputations/services/services.py @@ -90,7 +90,7 @@ def calibrate_bus_subsidy_spending( ) -> float | None: """Scale bus_subsidy_spending to the DfT net-support total (BUS_SUBSIDY_TARGETS).""" target = BUS_SUBSIDY_TARGETS.get(time_period) - if target is None or "bus_subsidy_spending" not in dataset.household: + if target is None: return None original_time_period =
dataset.time_period