From e79f87e23e6e41ec07555dd6642035de63a69624 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Tue, 2 Jun 2026 05:03:40 +0200 Subject: [PATCH] Parse read_nsrdb_psm4 header with csv module to keep quoted commas read_nsrdb_psm4 split the three header lines with a naive str.split(','), which broke spectral-on-demand files whose column names are quoted fields containing commas (e.g. '"GaAs (Bauhuis et al., 2009)"'). Such names were split into spurious columns, raising on read. Parse the header lines with the csv module so quoted fields are kept intact. Fixes #2736 --- docs/sphinx/source/whatsnew/v0.15.2.rst | 5 +++++ pvlib/iotools/psm4.py | 12 +++++++++--- tests/iotools/test_psm4.py | 25 +++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/source/whatsnew/v0.15.2.rst b/docs/sphinx/source/whatsnew/v0.15.2.rst index 327b36c1ae..5d52960900 100644 --- a/docs/sphinx/source/whatsnew/v0.15.2.rst +++ b/docs/sphinx/source/whatsnew/v0.15.2.rst @@ -22,6 +22,10 @@ Bug fixes introduced in v0.15.1 (:pull:`2702`) that caused a broadcasting ``ValueError`` when ``tracker_theta`` was a 2-D (or higher rank) array. (:issue:`2747`, :pull:`2749`) +* :py:func:`pvlib.iotools.read_nsrdb_psm4` now parses the file header with the + :py:mod:`csv` module instead of a naive ``str.split(',')``, so quoted column + names containing commas (e.g. the material names in spectral-on-demand files) + are no longer split into spurious columns. 
(:issue:`2736`, :pull:`2771`) Enhancements ~~~~~~~~~~~~ @@ -63,6 +67,7 @@ Maintenance Contributors ~~~~~~~~~~~~ * :ghuser:`Omesh37` +* :ghuser:`gaoflow` * Cliff Hansen (:ghuser:`cwhanse`) * :ghuser:`shethkajal7` * Arthur Onno (:ghuser:`ArthurOnnoTerabase`) diff --git a/pvlib/iotools/psm4.py b/pvlib/iotools/psm4.py index 9eb760f382..fc8d098a09 100644 --- a/pvlib/iotools/psm4.py +++ b/pvlib/iotools/psm4.py @@ -6,6 +6,7 @@ https://developer.nlr.gov/docs/solar/nsrdb/nsrdb-GOES-full-disc-v4-0-0-download/ """ +import csv import io from urllib.parse import urljoin import requests @@ -723,11 +724,16 @@ def read_nsrdb_psm4(filename, map_variables=True): `_ """ with tools._file_context_manager(filename) as fbuf: + # The first 3 header lines are parsed with the csv module rather than a + # naive str.split(',') so that quoted fields containing commas are kept + # intact. Spectral-on-demand files, for instance, have column names + # like '"GaAs (Bauhuis et al., 2009)"' that would otherwise be split + # at their commas into spurious columns (see GH #2736). # The first 2 lines of the response are headers with metadata - metadata_fields = fbuf.readline().split(',') - metadata_values = fbuf.readline().split(',') + metadata_fields = next(csv.reader([fbuf.readline()])) + metadata_values = next(csv.reader([fbuf.readline()])) # get the column names so we can set the dtypes - columns = fbuf.readline().split(',') + columns = next(csv.reader([fbuf.readline()])) columns[-1] = columns[-1].strip() # strip trailing newline # Since the header has so many columns, excel saves blank cols in the # data below the header lines. 
diff --git a/tests/iotools/test_psm4.py b/tests/iotools/test_psm4.py index 3b4313b070..c16a714aa3 100644 --- a/tests/iotools/test_psm4.py +++ b/tests/iotools/test_psm4.py @@ -185,6 +185,31 @@ def test_read_nsrdb_psm4_map_variables(): assert_index_equal(data.columns, pd.Index(columns_mapped)) +def test_read_nsrdb_psm4_quoted_columns_with_commas(): + """spectral-on-demand files have quoted column names containing commas; + these must not be split into spurious columns (GH #2736)""" + # Minimal NSRDB file whose column header (3rd line) has quoted material + # names with embedded commas, which is valid CSV. A naive str.split(',') + # would break these into extra columns and raise on read. + content = ( + "Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone," + "Elevation,Local Time Zone,Version\n" + "NSRDB,1,-,-,-,40.0,-105.0,-7,1600,-7,4.0.1\n" + 'Year,Month,Day,Hour,Minute,GHI,"GaAs (Bauhuis et al., 2009)",' + '"InGaP (Gray, 2008)"\n' + "2023,1,1,0,0,0,0.1,0.2\n" + "2023,1,1,1,0,5,0.3,0.4\n" + ) + data, metadata = psm4.read_nsrdb_psm4(StringIO(content), + map_variables=False) + assert list(data.columns) == [ + 'Year', 'Month', 'Day', 'Hour', 'Minute', 'GHI', + 'GaAs (Bauhuis et al., 2009)', 'InGaP (Gray, 2008)'] + assert data.shape == (2, 8) + # the embedded-comma data columns round-trip as floats + assert data['GaAs (Bauhuis et al., 2009)'].tolist() == [0.1, 0.3] + + @pytest.mark.remote_data @pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY) def test_get_nsrdb_psm4_aggregated_parameter_mapping(nlr_api_key):