diff --git a/NEWS.md b/NEWS.md index 7519c62f..755ae58a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**06/03/2026:** The request-error hierarchy is now unified. Every module (`nwis`, `wqp`, `nldi`, `waterdata`, `nadp`, `streamstats`) raises a subclass of `dataretrieval.DataRetrievalError` on a failed request, so a single `except dataretrieval.DataRetrievalError` spans them all. An HTTP error status surfaces as an `HTTPError` carrying `.status_code` (inspect it to branch on a specific code); the retryable 429/5xx subset is `TransientError` (`RateLimited` / `ServiceUnavailable`, carrying `.retry_after`); and a request too large to satisfy is a `RequestTooLarge` (`URLTooLong` for an over-long single request, `Unchunkable` when the Water Data chunker cannot split a call small enough). Connection-level failures (timeouts, DNS, refused connections) are wrapped as a `NetworkError`, with the underlying `httpx` exception on `__cause__`. Every `DataRetrievalError` also exposes `.status_code` (`None` when there is no HTTP status), `.retry_after`, and `.retryable`, so a single `except dataretrieval.DataRetrievalError as e` clause can branch on the status or retry transient failures without knowing the concrete subclass. **Breaking change:** these exceptions no longer multiply-inherit a built-in — code that caught request failures with `except ValueError` or `except RuntimeError` should switch to `except dataretrieval.DataRetrievalError` (or a specific subclass). A no-data result is **not** an error: the modern getters (`waterdata`, `wqp`, `nldi`) return an empty DataFrame when nothing matches. Only the deprecated `nwis` (waterservices) path still raises `NoSitesError` on no data. + **05/17/2026:** The OGC `waterdata` getters (`get_daily`, `get_continuous`, `get_field_measurements`, and the rest of the multi-value-capable functions) now transparently chunk requests whose URLs would otherwise exceed the server's ~8 KB byte limit. **05/16/2026:** Fixed silent truncation in the paginated `waterdata` request loops (`_walk_pages` and `get_stats_data`). Mid-pagination failures (HTTP 429, 5xx, network error) were previously swallowed — pagination would quietly stop and the function would return whatever rows it had collected, leaving callers with truncated DataFrames they had no way to detect. The loops now status-check every page like the initial request and raise `RuntimeError` on any failure, with the upstream exception chained as `__cause__` and a short menu of recovery actions (wait and retry, reduce the request, or obtain an API token) in the message. **Behavior change**: callers that previously consumed partial DataFrames on transient upstream blips will now see an exception; retry the call (possibly with a smaller `limit` or narrower query). diff --git a/dataretrieval/__init__.py b/dataretrieval/__init__.py index 4b58247e..9dfb1991 100644 --- a/dataretrieval/__init__.py +++ b/dataretrieval/__init__.py @@ -17,8 +17,9 @@ ``nldi`` requires geopandas (``pip install dataretrieval[nldi]``) and is imported on demand: ``from dataretrieval import nldi``. -Every request failure raises a subclass of :class:`dataretrieval.DataRetrievalError`; -the taxonomy lives in ``dataretrieval.exceptions``. +A failed request raises a subclass of :class:`dataretrieval.DataRetrievalError` +(the taxonomy lives in ``dataretrieval.exceptions``); connection-level failures +(timeouts, DNS) are wrapped as :class:`dataretrieval.NetworkError`. """ from importlib.metadata import PackageNotFoundError, version @@ -29,10 +30,10 @@ __version__ = "version-unknown" from dataretrieval.exceptions import ( - BadRequestError, DataRetrievalError, + HTTPError, + NetworkError, NoSitesError, - NotFoundError, RateLimited, RequestTooLarge, ServiceUnavailable, @@ -64,10 +65,10 @@ # error taxonomy (canonical home: ``dataretrieval.exceptions``), re-exported # so callers can ``except dataretrieval.DataRetrievalError`` "exceptions", - "BadRequestError", "DataRetrievalError", + "HTTPError", + "NetworkError", "NoSitesError", - "NotFoundError", "RateLimited", "RequestTooLarge", "ServiceUnavailable", diff --git a/dataretrieval/exceptions.py b/dataretrieval/exceptions.py index 2bb955c3..294621f2 100644 --- a/dataretrieval/exceptions.py +++ b/dataretrieval/exceptions.py @@ -1,89 +1,214 @@ """Exception taxonomy for ``dataretrieval``. -A failed request from any service module (``nwis``, ``wqp``, ``waterdata``, -``nldi``, ...) raises a subclass of :class:`DataRetrievalError`, so a caller can -handle any request failure with a single ``except dataretrieval.DataRetrievalError``. - -The tree has two intermediate bases a caller can catch to span a whole family: -:class:`RequestTooLarge` (the request can't fit, however it was issued) and -:class:`TransientError` (a temporary failure worth retrying). - -This module deliberately has no third-party dependencies, so any module can -import it without pulling in pandas/httpx. +Every service module (``nwis``, ``wqp``, ``nldi``, ``waterdata``, ``nadp``, +``streamstats``) raises a subclass of :class:`DataRetrievalError` when a request +fails, so one ``except dataretrieval.DataRetrievalError`` catches them all -- +including connection-level failures (timeouts, DNS, refused connections), which +are wrapped as :class:`NetworkError` with the underlying ``httpx`` exception on +``__cause__``. + +Most failures are an :class:`HTTPError` carrying the response ``.status_code``, +of which :class:`TransientError` (429 / 5xx) is the retryable subset. The rest +aren't a plain status: :class:`RequestTooLarge` (with :class:`URLTooLong` / +:class:`Unchunkable`), :class:`NetworkError` (a failed connection, per above), +and :class:`NoSitesError`. :func:`error_for_status` maps a status to its type. + +This module has no third-party runtime dependencies -- ``httpx`` is imported only +for type checking -- so any module can import it without pulling in pandas / httpx +and without risking an import cycle. """ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, ClassVar if TYPE_CHECKING: import httpx __all__ = [ "DataRetrievalError", - "BadRequestError", - "NotFoundError", + "HTTPError", + "TransientError", + "RateLimited", + "ServiceUnavailable", "RequestTooLarge", "URLTooLong", "Unchunkable", + "NetworkError", "NoSitesError", - "TransientError", - "RateLimited", - "ServiceUnavailable", + "error_for_status", ] class DataRetrievalError(Exception): - """Base class for errors raised when a request to a USGS or EPA web - service fails. + """Base class for every failed-request error in ``dataretrieval``. - Every service module (``nwis``, ``wqp``, ``waterdata``, ``nldi``, ...) - raises a subclass of this when a request fails, so a caller can handle any - request failure uniformly:: + Catch it to handle any USGS or EPA service failure uniformly, and branch on + the read-anywhere fields below without needing the concrete subclass:: try: - df, md = dataretrieval.wqp.get_results(...) - except dataretrieval.DataRetrievalError: - ... - - Subclasses also inherit from the built-in exception this package has - historically raised for the condition's *kind* -- :class:`ValueError` for a - request that can't succeed as written (bad params, too large), and - :class:`RuntimeError` for a transient transport failure -- so existing - ``except ValueError`` / ``except RuntimeError`` handlers keep working. + df, md = dataretrieval.waterdata.get_daily(...) + except dataretrieval.DataRetrievalError as e: + if e.retryable: # 429 / 5xx / connection failure + time.sleep(e.retry_after or backoff) + ... # re-issue the request + elif e.status_code == 404: # ``None`` unless an HTTP status error + ... + else: + raise + + Connection-level failures (timeouts, DNS) are wrapped as + :class:`NetworkError`, so this single clause covers them too. + """ + + #: HTTP status that triggered the error, or ``None`` for errors without one + #: (connection failure, too-long URL, no data). Set by :class:`HTTPError`. + status_code: int | None = None + #: Seconds the server asked us to wait before retrying (its ``Retry-After`` + #: header), or ``None`` when it gave no hint. Set by :class:`TransientError`. + retry_after: float | None = None + #: Whether re-issuing the same request might succeed -- ``True`` for the + #: transient HTTP statuses (429 / 5xx, :class:`TransientError`) and for + #: connection failures (:class:`NetworkError`); ``False`` otherwise. + retryable: ClassVar[bool] = False + + # These errors get pickled back across process boundaries (a lithops / + # multiprocessing worker returns whatever it raises). Default ``BaseException`` + # pickling rebuilds via ``cls(*args)``, which these subclasses can't survive -- + # keyword-only constructor fields, and ``ChunkInterrupted`` builds its message + # internally. So reconstruct via ``__new__`` + the standard getstate/setstate + # protocol, bypassing ``__init__``; a subclass drops unpicklable state by + # overriding ``__getstate__`` (see ``ChunkInterrupted``). + def __reduce__(self) -> tuple[Any, ...]: + return (_new_error, (self.__class__,), self.__getstate__()) + + def __getstate__(self) -> dict[str, Any]: + return {"args": self.args, **self.__dict__} + + def __setstate__(self, state: dict[str, Any] | None) -> None: + state = state or {} + self.args = state.pop("args", ()) + self.__dict__.update(state) + + +def _new_error(cls: type[DataRetrievalError]) -> DataRetrievalError: + """Build a blank :class:`DataRetrievalError` for unpickling, bypassing + ``__init__``; pickle then calls ``__setstate__`` to restore its state.""" + return cls.__new__(cls) + + +# --- HTTP status errors -------------------------------------------------- + + +class HTTPError(DataRetrievalError): + """The service returned an error HTTP status. + + The numeric status is on :attr:`status_code`; branch on it, e.g. + ``except HTTPError as e: ... if e.status_code == 404``. :class:`TransientError` + (429 / 5xx) is the retryable subset, and is itself an ``HTTPError``. The one + exception to "a status is an ``HTTPError``" is a request the service rejects + as too long: it surfaces as :class:`URLTooLong` (a :class:`RequestTooLarge`), + *not* an ``HTTPError`` -- so catch :class:`DataRetrievalError` to be certain + of spanning every failure. See :func:`error_for_status` for the full mapping. + + Parameters + ---------- + message : str + Human-readable error message. + status_code : int + The HTTP status the service returned. + """ + + def __init__(self, message: str, *, status_code: int) -> None: + super().__init__(message) + self.status_code = status_code + + +class TransientError(HTTPError): + """A 429 or 5xx the server may serve on a later try -- :class:`RateLimited` + for 429, :class:`ServiceUnavailable` for 5xx. + + This only classifies the condition; it does not itself retry. Whether to + retry is up to the calling path: a single-shot request raises it for the + caller to handle (e.g. wait :attr:`retry_after` seconds, then re-issue), + while the Water Data chunker retries and resumes automatically. + + Parameters + ---------- + message : str + Human-readable error message. + status_code : int, optional + The HTTP status the service returned. Defaults to the leaf's canonical + code (429 / 503) when omitted; :func:`error_for_status` always passes the + real status. + retry_after : float, optional + Seconds to wait before retrying, parsed from the ``Retry-After`` response + header; ``None`` when the header is absent or unparseable. """ + retryable: ClassVar[bool] = True + + #: Canonical status a concrete transient stamps when built without an + #: explicit ``status_code`` (:class:`RateLimited` = 429, + #: :class:`ServiceUnavailable` = 503). ``TransientError`` itself is abstract + #: and sets none, so constructing it bare requires ``status_code``. + _DEFAULT_STATUS: ClassVar[int] + + def __init__( + self, + message: str, + *, + status_code: int | None = None, + retry_after: float | None = None, + ) -> None: + if status_code is None: + status_code = getattr(self, "_DEFAULT_STATUS", None) + if status_code is None: + raise TypeError( + f"{type(self).__name__} requires status_code " + "(only the RateLimited / ServiceUnavailable leaves default it)" + ) + super().__init__(message, status_code=status_code) + self.retry_after = retry_after + + +class RateLimited(TransientError): + """A request was rejected with HTTP 429 (too many requests).""" + + _DEFAULT_STATUS = 429 -# --- Fatal client errors ------------------------------------------------- -# The request can't succeed as written; retrying it unchanged won't help. Each -# is also a ``ValueError`` -- the built-in the legacy ``query`` path has always -# raised -- so existing ``except ValueError`` handlers keep working. +class ServiceUnavailable(TransientError): + """A request was rejected with a server error (HTTP 5xx). + + Raised by both the legacy ``query`` path and the Water Data path, so a 5xx + surfaces as one type whichever subsystem issued the request. ``.status_code`` + holds the actual 5xx; it falls back to 503 only on a bare hand-construction. + """ -class BadRequestError(DataRetrievalError, ValueError): - """The service rejected the request parameters (HTTP 400).""" + _DEFAULT_STATUS = 503 -class NotFoundError(DataRetrievalError, ValueError): - """The requested resource was not found; often an empty query (HTTP 404).""" +# --- Request can't fit (not necessarily an HTTP status) ------------------ -class RequestTooLarge(DataRetrievalError, ValueError): +class RequestTooLarge(DataRetrievalError): """The request is too large for the service to satisfy. - A base for the two ways a request can exceed what the service accepts; - catch it to handle either. The concrete subclasses are :class:`URLTooLong` - (a single request the server rejected) and :class:`Unchunkable` (the Water - Data chunker could not split the call small enough to fit). + Base for the two ways that happens; catch it to handle either: + :class:`URLTooLong` (a single request rejected for length) and + :class:`Unchunkable` (a Water Data call the chunker could not split small + enough to fit). """ class URLTooLong(RequestTooLarge): - """A single request URL exceeded the service's limit (HTTP 414, or rejected - client-side before it was sent). + """A single request URL was too long for the service. - Raised by the legacy ``query`` path, which issues one request without - chunking. Remediation: query fewer sites, or split the call manually. + Raised on the legacy ``query`` path (which sends one un-chunked request), + whether the URL is rejected client-side before sending or by the server + (see :func:`error_for_status`). Remediation: query fewer sites, or split the + call manually. """ @@ -99,56 +224,72 @@ class Unchunkable(RequestTooLarge): """ -class NoSitesError(DataRetrievalError): - """The selection criteria matched no sites/data.""" +# --- Connection failure (no HTTP response) ------------------------------- - def __init__(self, url: httpx.URL) -> None: - self.url = url - def __str__(self) -> str: - return ( - "No sites/data found using the selection criteria specified in " - f"url: {self.url}" - ) +class NetworkError(DataRetrievalError): + """The request never completed a round-trip to the service -- a DNS + failure, refused connection, or timeout -- so no HTTP response arrived to + classify. + Wraps the underlying ``httpx`` transport exception, preserved on + ``__cause__``. Worth retrying (:attr:`~DataRetrievalError.retryable` is + ``True``), but carries no ``.status_code`` because no response came back. + """ -# --- Transient transport errors ------------------------------------------ -# The service was reachable but temporarily refused the request; the same call -# may succeed if retried. Each is also a ``RuntimeError`` (the built-in the -# waterdata path has always raised). The Water Data chunker recognizes them via -# ``isinstance(exc, TransientError)`` and wraps them as resumable -# ``ChunkInterrupted`` subclasses. + retryable: ClassVar[bool] = True -class TransientError(DataRetrievalError, RuntimeError): - """Base for transient HTTP failures that are worth an automatic retry. +# --- Empty result -------------------------------------------------------- - One subclass per recoverable HTTP status family (429 -> :class:`RateLimited`, - 5xx -> :class:`ServiceUnavailable`); the Water Data chunker recognizes them - by this shared base and wraps them as resumable interruptions. - Parameters - ---------- - message : str - Human-readable error message. - retry_after : float, optional - Seconds to wait before retrying, parsed from the ``Retry-After`` - response header; stored on the :attr:`retry_after` attribute (``None`` - when the header is absent or unparseable). +class NoSitesError(DataRetrievalError): + """A request succeeded (HTTP 200) but matched no sites/data. + + A no-data result is normally **not** an error: the modern getters + (``waterdata``, ``wqp``, ``nldi``) return an empty ``DataFrame``. Only the + deprecated ``nwis`` (waterservices) path still raises this. """ - def __init__(self, message: str, *, retry_after: float | None = None) -> None: - super().__init__(message) - self.retry_after = retry_after + def __init__(self, url: httpx.URL) -> None: + self.url = url + + def __str__(self) -> str: + return ( + "No sites/data found using the selection criteria specified in " + f"url: {self.url}" + ) -class RateLimited(TransientError): - """A request was rejected with HTTP 429 (too many requests).""" +def error_for_status( + status: int, message: str, *, retry_after: float | None = None +) -> DataRetrievalError: + """Return the typed :class:`DataRetrievalError` for an HTTP error *status*. + The one status-to-type mapping every request path shares (the legacy + ``query`` path, ``waterdata``, ``nadp`` / ``streamstats``), so a given status + becomes the same type everywhere: -class ServiceUnavailable(TransientError): - """A request was rejected with a server error (HTTP 5xx). + * **413, 414** -> :class:`URLTooLong` (a :class:`RequestTooLarge`) -- the + "too long" semantic is more actionable than a bare status, and it matches + the client-side over-long-URL case + * **429** -> :class:`RateLimited` + * **5xx** -> :class:`ServiceUnavailable` + * **anything else** -> :class:`HTTPError` - Raised by both the legacy ``query`` path and the Water Data path, so a 5xx - surfaces as one type regardless of which subsystem issued the request. + ``message`` is used verbatim; ``retry_after`` is attached only to the + transient (:class:`TransientError`) types. *status* must be an error status + (``>= 400``) -- classifying a success or redirect is a usage error and raises + :class:`ValueError`. """ + if status < 400: + raise ValueError( + f"error_for_status expects an HTTP error status (>= 400), got {status}" + ) + if status in (413, 414): + return URLTooLong(message) + if status == 429: + return RateLimited(message, status_code=status, retry_after=retry_after) + if 500 <= status < 600: + return ServiceUnavailable(message, status_code=status, retry_after=retry_after) + return HTTPError(message, status_code=status) diff --git a/dataretrieval/nadp.py b/dataretrieval/nadp.py index a02b6671..4966fa92 100644 --- a/dataretrieval/nadp.py +++ b/dataretrieval/nadp.py @@ -43,9 +43,7 @@ import warnings import zipfile -import httpx - -from dataretrieval.utils import HTTPX_DEFAULTS +from dataretrieval.utils import HTTPX_DEFAULTS, _get, _raise_for_status _DEPRECATION_MESSAGE = ( "The `nadp` module is deprecated and will be removed from `dataretrieval` " @@ -229,8 +227,8 @@ def get_zip(url: str, filename: str) -> NADP_ZipFile: """ _warn_deprecated() - req = httpx.get(url + filename, **HTTPX_DEFAULTS) - req.raise_for_status() + req = _get(url + filename, **HTTPX_DEFAULTS) + _raise_for_status(req) # z = zipfile.ZipFile(io.BytesIO(req.content)) z = NADP_ZipFile(io.BytesIO(req.content)) diff --git a/dataretrieval/nldi.py b/dataretrieval/nldi.py index cb4e2488..9a169414 100644 --- a/dataretrieval/nldi.py +++ b/dataretrieval/nldi.py @@ -19,13 +19,11 @@ def _query_nldi( url: str, query_params: dict[str, str], - error_message: str, ) -> dict[str, Any] | list[Any]: - # A helper function to query the NLDI API + # A helper function to query the NLDI API. ``query()`` already raises a + # typed ``DataRetrievalError`` for any HTTP error response, so a returned + # response is a success that we only need to parse. response = query(url, payload=query_params) - if response.status_code != 200: - raise ValueError(f"{error_message}. Error reason: {response.reason_phrase}") - response_data: dict[str, Any] | list[Any] = {} try: response_data = response.json() @@ -112,15 +110,7 @@ def get_flowlines( if stop_comid is not None: query_params["stopComid"] = str(stop_comid) - if feature_source: - err_msg = ( - f"Error getting flowlines for feature source '{feature_source}'" - f" and feature_id '{feature_id}'" - ) - else: - err_msg = f"Error getting flowlines for comid '{comid}'" - - feature_collection = cast("dict[str, Any]", _query_nldi(url, query_params, err_msg)) + feature_collection = cast("dict[str, Any]", _query_nldi(url, query_params)) if as_json: return feature_collection gdf = _features_to_gdf(feature_collection) @@ -172,11 +162,7 @@ def get_basin( "simplified": simplified_str, "splitCatchment": split_catchment_str, } - err_msg = ( - f"Error getting basin for feature source '{feature_source}' and " - f"feature_id '{feature_id}'" - ) - feature_collection = cast("dict[str, Any]", _query_nldi(url, query_params, err_msg)) + feature_collection = cast("dict[str, Any]", _query_nldi(url, query_params)) if as_json: return feature_collection gdf = _features_to_gdf(feature_collection) @@ -266,7 +252,6 @@ def get_features( ) url = f"{NLDI_API_BASE_URL}/comid/position" query_params = {"coords": f"POINT({long} {lat})"} - err_msg = f"Error getting features for lat '{lat}' and long '{long}'" else: if (comid is not None or data_source is not None) and navigation_mode is None: raise ValueError( @@ -290,9 +275,8 @@ def get_features( else: url = f"{NLDI_API_BASE_URL}/{feature_source}/{feature_id}" query_params = {} - err_msg = _features_err_msg(feature_source, feature_id, comid, data_source) - feature_collection = cast("dict[str, Any]", _query_nldi(url, query_params, err_msg)) + feature_collection = cast("dict[str, Any]", _query_nldi(url, query_params)) if as_json: return feature_collection gdf = _features_to_gdf(feature_collection) @@ -327,8 +311,7 @@ def get_features_by_data_source(data_source: str) -> gpd.GeoDataFrame: # validate the data source _validate_data_source(data_source) url = f"{NLDI_API_BASE_URL}/{data_source}" - err_msg = f"Error getting features for data source '{data_source}'" - feature_collection = cast("dict[str, Any]", _query_nldi(url, {}, err_msg)) + feature_collection = cast("dict[str, Any]", _query_nldi(url, {})) gdf = _features_to_gdf(feature_collection) return gdf @@ -477,9 +460,7 @@ def _validate_data_source(data_source: str) -> None: # get the available data/feature sources - if not already cached if _AVAILABLE_DATA_SOURCES is None: url = f"{NLDI_API_BASE_URL}/" - available_data_sources = _query_nldi( - url, {}, "Error getting available data sources" - ) + available_data_sources = _query_nldi(url, {}) if not isinstance(available_data_sources, list) or not all( isinstance(ds, dict) and "source" in ds for ds in available_data_sources ): @@ -498,20 +479,6 @@ def _validate_data_source(data_source: str) -> None: raise ValueError(err_msg) -def _features_err_msg( - feature_source: str | None, - feature_id: str | None, - comid: int | None, - data_source: str | None, -) -> str: - if feature_source is not None: - return ( - f"Error getting features for feature source '{feature_source}'" - f" and feature_id '{feature_id}', and data source '{data_source}'" - ) - return f"Error getting features for comid '{comid}' and data source '{data_source}'" - - def _validate_navigation_mode(navigation_mode: str | None) -> str: if navigation_mode is None: raise ValueError( diff --git a/dataretrieval/streamstats.py b/dataretrieval/streamstats.py index d17aa7df..1458494a 100644 --- a/dataretrieval/streamstats.py +++ b/dataretrieval/streamstats.py @@ -12,7 +12,7 @@ import httpx -from dataretrieval.utils import HTTPX_DEFAULTS +from dataretrieval.utils import HTTPX_DEFAULTS, _get, _raise_for_status def download_workspace(workspaceID: str, format: str = "") -> httpx.Response: @@ -37,9 +37,9 @@ def download_workspace(workspaceID: str, format: str = "") -> httpx.Response: payload = {"workspaceID": workspaceID, "format": format} url = "https://streamstats.usgs.gov/streamstatsservices/download" - r = httpx.get(url, params=payload, **HTTPX_DEFAULTS) + r = _get(url, params=payload, **HTTPX_DEFAULTS) - r.raise_for_status() + _raise_for_status(r) return r # data = r.raw.read() @@ -144,9 +144,9 @@ def get_watershed( } url = "https://streamstats.usgs.gov/streamstatsservices/watershed.geojson" - r = httpx.get(url, params=payload, **HTTPX_DEFAULTS) + r = _get(url, params=payload, **HTTPX_DEFAULTS) - r.raise_for_status() + _raise_for_status(r) if format == "geojson": return r diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 9751b269..f0cec64b 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -14,11 +14,10 @@ import dataretrieval from dataretrieval.codes import tz from dataretrieval.exceptions import ( - BadRequestError, + NetworkError, NoSitesError, - NotFoundError, - ServiceUnavailable, URLTooLong, + error_for_status, ) # Typed as ``dict[str, Any]`` (not the inferred ``dict[str, object]``) so that @@ -289,32 +288,45 @@ def _url_too_long_error(detail: str) -> URLTooLong: ) +def _network_error(url: str | httpx.URL, exc: httpx.TransportError) -> NetworkError: + """Build the :class:`~dataretrieval.exceptions.NetworkError` for a failed + round-trip ``exc`` (no HTTP response: timeout, DNS, refused connection).""" + # Some httpx transport errors stringify empty (e.g. ``ConnectTimeout()``); + # fall back to the class name so the message is always informative. + detail = str(exc) or type(exc).__name__ + return NetworkError(f"Could not reach the service at {url}: {detail}") + + +def _get(url: str | httpx.URL, **kwargs: Any) -> httpx.Response: + """``httpx.get`` for the single-shot paths, surfacing a transport failure as + a typed :class:`~dataretrieval.exceptions.NetworkError` (the chunker wraps its + own as resumable interruptions, so it stays off this wrapper).""" + try: + return httpx.get(url, **kwargs) + except httpx.TransportError as exc: + raise _network_error(url, exc) from exc + + def _raise_for_status(response: httpx.Response) -> None: - """Map an unsuccessful HTTP status to a typed :class:`DataRetrievalError`; + """Raise the typed :class:`DataRetrievalError` for an HTTP error response; return ``None`` on success. - Shared by the legacy :func:`query` path. The 4xx types stay - :class:`ValueError`-compatible (this path's historical contract), but a 5xx - raises the transient :class:`ServiceUnavailable` (a :class:`RuntimeError`), - since a server failure is retryable rather than a bad request. + Shared by the legacy :func:`query` path (and ``nadp`` / ``streamstats``). + Delegates the status-to-type mapping to + :func:`dataretrieval.exceptions.error_for_status`, except a too-long-URL + status (413 / 414): that gets the same actionable "split your query" + remediation as the client-side over-long-URL case below, rather than a bare + ``HTTP 414`` (both still raise :class:`~dataretrieval.exceptions.URLTooLong`). """ status = response.status_code - if status == 400: - raise BadRequestError( - f"Bad Request, check that your parameters are correct. URL: {response.url}" - ) - elif status == 404: - raise NotFoundError( - "Page Not Found Error. May be the result of an empty query. " - f"URL: {response.url}" - ) - elif status == 414: + if status < 400: + return + if status in (413, 414): raise _url_too_long_error(f"API response reason: {response.reason_phrase}") - elif 500 <= status < 600: - raise ServiceUnavailable( - f"Service Unavailable: {status} {response.reason_phrase}. " - f"The service at {response.url} may be down or experiencing issues." - ) + raise error_for_status( + status, + f"HTTP {status} {response.reason_phrase}".rstrip() + f" (URL: {response.url})", + ) def query( @@ -348,13 +360,12 @@ def query( Raises ------ DataRetrievalError - On failure: :class:`~dataretrieval.exceptions.BadRequestError` (400), - :class:`~dataretrieval.exceptions.NotFoundError` (404), - :class:`~dataretrieval.exceptions.URLTooLong` (414 or a client-side - over-long URL), :class:`~dataretrieval.exceptions.ServiceUnavailable` - (5xx), or :class:`~dataretrieval.exceptions.NoSitesError` (no sites/data - matched). The 4xx types are also :class:`ValueError`; - ``ServiceUnavailable`` is a :class:`RuntimeError`. + On an HTTP error response, the typed subclass for the status (see + :func:`dataretrieval.exceptions.error_for_status` for the mapping); or + :class:`~dataretrieval.exceptions.NoSitesError` when a 200 response + reports no data matched; or :class:`~dataretrieval.exceptions.NetworkError` + on a connection-level failure (timeout, DNS), with the underlying + ``httpx`` exception on ``__cause__``. """ for key, value in payload.items(): @@ -366,7 +377,7 @@ def query( user_agent = {"user-agent": f"python-dataretrieval/{dataretrieval.__version__}"} try: - response = httpx.get( + response = _get( url, params=payload, headers=user_agent, @@ -378,6 +389,8 @@ def query( _raise_for_status(response) + # USGS waterservices signals an empty result with a 200 whose body starts + # "No sites/data ..." (its legacy wording); surface it as NoSitesError. if response.text.startswith("No sites/data"): raise NoSitesError(response.url) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 5c2a4657..f4ecba47 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -20,6 +20,7 @@ HTTPX_DEFAULTS, BaseMetadata, _attach_datetime_columns, + _get, to_str, ) from dataretrieval.waterdata.filters import FILTER_LANG @@ -2126,7 +2127,7 @@ def get_codes(code_service: CODE_SERVICES) -> tuple[pd.DataFrame, BaseMetadata]: url = f"{SAMPLES_URL}/codeservice/{code_service}?mimeType=application%2Fjson" - response = httpx.get(url, headers=_default_headers(), **HTTPX_DEFAULTS) + response = _get(url, headers=_default_headers(), **HTTPX_DEFAULTS) _raise_for_non_200(response) @@ -2150,7 +2151,7 @@ def _get_samples_csv( as metadata and applies any per-getter post-step. """ logger.debug("Request: %s", httpx.URL(url).copy_merge_params(params)) - response = httpx.get( + response = _get( url, params=params, verify=ssl_check, diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 82f5a37e..6626aba9 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -390,7 +390,7 @@ def _passthrough_result( return frame, response -class ChunkInterrupted(DataRetrievalError, RuntimeError): +class ChunkInterrupted(DataRetrievalError): """ Base class for mid-stream chunk failures whose completed work is preserved and resumable. @@ -496,6 +496,15 @@ def __init__( self.partial_frame = call.partial_frame.copy() self.partial_response = call.partial_response + def __getstate__(self) -> dict[str, Any]: + # Drop the live ChunkedCall before pickling: its ``.fetch`` is an + # undecorated module function pickle can't reference by name, so the + # interruption can't cross a process boundary with ``.call`` attached. + # The degraded ``call=None`` form keeps the counts, retry hint, and + # partial frame / response; only ``.resume()`` is lost (cross-process + # resume was never possible anyway). + return {**super().__getstate__(), "call": None} + class QuotaExhausted(ChunkInterrupted): """ @@ -1034,18 +1043,18 @@ def _classify_chunk_error( Notes ----- - ``_walk_pages`` re-wraps mid-pagination failures as - ``RuntimeError`` with the typed transport exception linked as + ``_walk_pages`` re-wraps mid-pagination failures as a base + ``DataRetrievalError`` with the typed transport exception linked as ``__cause__``, so this function must walk the chain rather than just ``isinstance`` the top-level exception. Bare ``httpx.HTTPError`` (``ConnectError``, ``TimeoutException``, etc.) and ``httpx.InvalidURL`` (server-supplied cursor URL too long, oversize follow-up) are also treated as transport failures - and wrapped as :class:`ServiceInterrupted` — these don't inherit - from ``RuntimeError`` (and ``InvalidURL`` doesn't even inherit - from ``HTTPError``), so without explicit handling they would - escape the chunker's catch with no resumable handle. + and wrapped as :class:`ServiceInterrupted` — they aren't one of the + typed status errors above (and ``InvalidURL`` doesn't even inherit + from ``httpx.HTTPError``), so without explicit handling they would + escape classification with no resumable handle. """ cur: BaseException | None = exc while cur is not None: @@ -1067,8 +1076,8 @@ def _retryable(exc: BaseException) -> tuple[bool, float | None]: :func:`_classify_chunk_error`, which walks the ``__cause__`` chain. The distinction matters because ``_paginate`` raises an initial-request transient (429 / 5xx / :class:`httpx.TransportError`) - *raw*, but wraps a mid-pagination failure as a ``RuntimeError``. So a - raw transient means a sub-request that made no progress and is cheap to + *raw*, but wraps a mid-pagination failure as a base ``DataRetrievalError``. + So a raw transient means a sub-request that made no progress and is cheap to re-issue, whereas a mid-pagination failure is left to escalate to a resumable :class:`ChunkInterrupted` rather than re-walked from page 1 (which would re-spend the quota just exhausted). ``httpx.InvalidURL`` diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index c0f870c1..2ffe5089 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -17,8 +17,9 @@ import httpx import pandas as pd +from dataretrieval.exceptions import DataRetrievalError from dataretrieval.rdb import extract_rdb_comment, read_rdb -from dataretrieval.utils import HTTPX_DEFAULTS +from dataretrieval.utils import HTTPX_DEFAULTS, _get from .utils import ( _DURATION_RE, @@ -184,17 +185,16 @@ def get_ratings( fid = feature["id"] try: out[fid] = _download_and_parse(feature, file_path, ssl_check) - # _download_and_parse can raise the module's typed errors via - # _raise_for_non_200 (RateLimited / ServiceUnavailable / RuntimeError — - # all RuntimeError subclasses), and a feature missing its data asset - # raises LookupError. Catch those too so one bad feature is logged and - # skipped rather than aborting the whole multi-site batch. + # One bad feature shouldn't abort the batch: log and skip the module's + # typed errors (DataRetrievalError, e.g. an HTTPError from + # _raise_for_non_200) plus the transport / parse / file / missing-asset + # errors a single download can raise. except ( + DataRetrievalError, httpx.HTTPError, - RuntimeError, - ValueError, - OSError, LookupError, + OSError, + ValueError, ) as e: logger.warning("Failed to download / parse %s: %s", fid, e) @@ -260,7 +260,7 @@ def _search( params: dict[str, Any] | None = query_params features: list[dict[str, Any]] = [] while url is not None: - response = httpx.get( + response = _get( url, params=params, headers=_default_headers(), @@ -288,9 +288,7 @@ def _download_and_parse( ) -> pd.DataFrame: """Fetch the feature's data asset, parse RDB, optionally persist to disk.""" url = feature["assets"]["data"]["href"] - response = httpx.get( - url, headers=_default_headers(), verify=ssl_check, **HTTPX_DEFAULTS - ) + response = _get(url, headers=_default_headers(), verify=ssl_check, **HTTPX_DEFAULTS) _raise_for_non_200(response) if file_path is not None: diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index a4706c15..7a2c0d5c 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -27,8 +27,8 @@ from anyio.from_thread import start_blocking_portal from dataretrieval import __version__ -from dataretrieval.exceptions import RateLimited, ServiceUnavailable -from dataretrieval.utils import HTTPX_DEFAULTS, BaseMetadata +from dataretrieval.exceptions import DataRetrievalError, RateLimited, error_for_status +from dataretrieval.utils import HTTPX_DEFAULTS, BaseMetadata, _get, _network_error from dataretrieval.waterdata import _progress, chunking from dataretrieval.waterdata.chunking import ( _QUOTA_HEADER, @@ -424,15 +424,15 @@ def _check_ogc_requests( ------ ValueError If req_type is not "queryables" or "schema". - RateLimited, ServiceUnavailable, RuntimeError - From :func:`_raise_for_non_200` on any non-200 — same typed - contract as the main data path so callers can use one - ``except`` clause everywhere. + DataRetrievalError + From :func:`_raise_for_non_200` on any non-200 (the typed subclass for + the status) — same typed contract as the main data path so callers can + use one ``except`` clause everywhere. """ if req_type not in ("queryables", "schema"): raise ValueError(f"req_type must be 'queryables' or 'schema', got {req_type!r}") url = f"{OGC_API_URL}/collections/{endpoint}/{req_type}" - resp = httpx.get(url, headers=_default_headers(), **HTTPX_DEFAULTS) + resp = _get(url, headers=_default_headers(), **HTTPX_DEFAULTS) _raise_for_non_200(resp) # ``Response.json`` is typed ``Any``; the OGC queryables/schema endpoints # return a JSON object, and callers index it as a dict. @@ -537,26 +537,24 @@ def _raise_for_non_200(resp: httpx.Response) -> None: Raises ------ - RateLimited - On HTTP 429 — typed so ``ChunkedCall`` can wrap as a resumable - :class:`~dataretrieval.waterdata.chunking.QuotaExhausted`. - ServiceUnavailable - On HTTP 5xx — typed so ``ChunkedCall`` can wrap as a resumable - :class:`~dataretrieval.waterdata.chunking.ServiceInterrupted`. - RuntimeError - On any other non-200 (4xx other than 429) — these are - programmer errors that retry won't fix. + DataRetrievalError + The typed subclass for the status (see + :func:`dataretrieval.exceptions.error_for_status` for the mapping). The + transient types (:class:`~dataretrieval.exceptions.TransientError`) are + distinguished so ``ChunkedCall`` can wrap them as a resumable + :class:`~dataretrieval.waterdata.chunking.QuotaExhausted` / + :class:`~dataretrieval.waterdata.chunking.ServiceInterrupted`; a fatal + :class:`~dataretrieval.exceptions.HTTPError` (not a ``TransientError``) + the chunker won't resume. """ status = resp.status_code - if status == 200: + if status < 400: return - body = _error_body(resp) - retry_after = _parse_retry_after(resp.headers.get("Retry-After")) - if status == 429: - raise RateLimited(body, retry_after=retry_after) - if 500 <= status < 600: - raise ServiceUnavailable(body, retry_after=retry_after) - raise RuntimeError(body) + raise error_for_status( + status, + _error_body(resp), + retry_after=_parse_retry_after(resp.headers.get("Retry-After")), + ) def _paginated_failure_message(pages_collected: int, cause: BaseException) -> str: @@ -578,7 +576,7 @@ def _paginated_failure_message(pages_collected: int, cause: BaseException) -> st Returns ------- str - A message suitable for the ``RuntimeError`` that + A message suitable for the ``DataRetrievalError`` that ``_walk_pages`` and ``get_stats_data`` raise from the original exception. """ @@ -1066,7 +1064,7 @@ async def _paginate( :func:`get_stats_data`: send the initial request, then loop calling ``follow_up`` until ``parse_response`` reports a ``None`` cursor, accumulating frames and elapsed time. Any mid-pagination failure - raises ``RuntimeError`` wrapping the cause — the API exposes no + raises ``DataRetrievalError`` wrapping the cause — the API exposes no resume cursor, so the caller's only recovery is to retry the whole call. Issuing HTTP asynchronously lets the multiple sub-requests of a chunked call run concurrently under @@ -1101,15 +1099,14 @@ async def _paginate( Raises ------ - RuntimeError - On a non-200 initial response (typed - :class:`~dataretrieval.exceptions.RateLimited` / - :class:`~dataretrieval.exceptions.ServiceUnavailable` - for 429/5xx, otherwise plain ``RuntimeError`` from - :func:`_error_body`), on an initial-page parse failure - (wrapped via :func:`_paginated_failure_message` with the - original exception on ``__cause__``), or any failure on a - subsequent page (same wrapping). + DataRetrievalError + On a non-200 initial response, the typed subclass for the status from + :func:`_raise_for_non_200` (a + :class:`~dataretrieval.exceptions.TransientError` for a retryable + 429 / 5xx, otherwise a fatal :class:`~dataretrieval.exceptions.HTTPError`); + or, on an initial-page parse failure or any subsequent-page failure, a + base ``DataRetrievalError`` wrapping the cause (built by + :func:`_paginated_failure_message`, original exception on ``__cause__``). httpx.HTTPError Network-level failures on the *initial* request (e.g. ``ConnectError``, ``TimeoutException``) propagate unmodified @@ -1132,7 +1129,7 @@ async def _paginate( # treatment as follow-up failures so callers see a consistent # diagnostic regardless of which page broke. logger.warning("Initial response parse failed.") - raise RuntimeError(_paginated_failure_message(0, e)) from e + raise DataRetrievalError(_paginated_failure_message(0, e)) from e dfs = [df] # Stop following ``next`` links once the optional row cap is reached # (see :func:`_row_cap`); ``None`` means uncapped. The concatenation @@ -1164,7 +1161,7 @@ async def _paginate( "Request failed at cursor %r. Data download interrupted.", cursor, ) - raise RuntimeError(_paginated_failure_message(len(dfs), e)) from e + raise DataRetrievalError(_paginated_failure_message(len(dfs), e)) from e # Aggregate headers / elapsed onto a COPY of the initial # response so the user's caller never sees an in-place @@ -1231,7 +1228,7 @@ async def _walk_pages( Raises ------ - RuntimeError + DataRetrievalError See :func:`_paginate`. httpx.HTTPError See :func:`_paginate`. @@ -1740,7 +1737,12 @@ def _run_sync( """ with _progress.progress_context(service=service): with start_blocking_portal() as portal: - return portal.call(make_coro) + try: + return portal.call(make_coro) + except httpx.TransportError as exc: + # The initial-request connection failure ``_paginate`` lets + # through raw; mid-pagination failures are already typed. + raise _network_error(OGC_API_URL, exc) from exc def get_stats_data( @@ -1785,6 +1787,14 @@ def get_stats_data( A DataFrame containing the retrieved and processed statistical data. BaseMetadata A metadata object containing request information including URL and query time. + + Raises + ------ + DataRetrievalError + The typed subclass for an HTTP error response (see :func:`_paginate`); + or :class:`~dataretrieval.exceptions.NetworkError` if the initial request + can't reach the service (timeout / DNS), the ``httpx`` exception chained + on ``__cause__``. """ url = f"{STATISTICS_API_URL}/{service}" diff --git a/docs/source/reference/exceptions.rst b/docs/source/reference/exceptions.rst new file mode 100644 index 00000000..1d8de47e --- /dev/null +++ b/docs/source/reference/exceptions.rst @@ -0,0 +1,8 @@ +.. _exceptions: + +dataretrieval.exceptions +------------------------ + +.. automodule:: dataretrieval.exceptions + :members: + :show-inheritance: diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index ec1eec43..43def275 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -7,6 +7,7 @@ API reference .. toctree:: :maxdepth: 1 + exceptions nadp nldi nwis diff --git a/docs/source/userguide/errors.rst b/docs/source/userguide/errors.rst new file mode 100644 index 00000000..cd81f546 --- /dev/null +++ b/docs/source/userguide/errors.rst @@ -0,0 +1,103 @@ +.. _handling-errors: + +=============== +Handling errors +=============== + +Every failed request raises a subclass of +:class:`~dataretrieval.exceptions.DataRetrievalError`, so a single ``except`` +clause handles any failure regardless of which service you called: + +.. code-block:: python + + import dataretrieval + + try: + df, md = dataretrieval.waterdata.get_daily( + monitoring_location_id="USGS-05427718" + ) + except dataretrieval.DataRetrievalError: + ... # any request failure: error status, connection loss, too-large, ... + +Connection-level failures (timeouts, DNS, refused connections) are wrapped as +:class:`~dataretrieval.exceptions.NetworkError`, so the clause above covers them +too -- you never have to catch an ``httpx`` exception. A *no-data* result is **not** an +error: the modern getters return an empty ``DataFrame`` when nothing matches, so +check ``df.empty`` rather than catching anything. + +Branch without knowing the concrete type +========================================= + +Every :class:`~dataretrieval.exceptions.DataRetrievalError` exposes three +read-anywhere fields, so you rarely need to import the specific subclasses: + +* ``.status_code`` -- the HTTP status, or ``None`` when the failure carried no + response (a connection error, an over-long URL, ...). +* ``.retry_after`` -- seconds the server asked you to wait (its ``Retry-After`` + header), or ``None``. +* ``.retryable`` -- ``True`` when re-issuing the same request might succeed (a + 429 / 5xx, or a connection failure); ``False`` otherwise. + +.. code-block:: python + + except dataretrieval.DataRetrievalError as e: + if e.status_code == 404: + ... # not found + elif e.retryable: + ... # transient -- see the retry recipe below + else: + raise + +Retry transient failures with backoff +===================================== + +``.retryable`` and ``.retry_after`` make a backoff loop type-agnostic -- it +covers rate limits (429), server errors (5xx), and connection failures alike, +honoring the server's ``Retry-After`` hint when present: + +.. code-block:: python + + import time + import dataretrieval + + for attempt in range(5): + try: + df, md = dataretrieval.waterdata.get_continuous( + monitoring_location_id=sites + ) + break + except dataretrieval.DataRetrievalError as e: + if not e.retryable or attempt == 4: + raise + time.sleep(e.retry_after or 2 ** attempt) + +Resume a large Water Data request +================================= + +The Water Data getters transparently split an over-large request into chunks. +When a transient failure interrupts one mid-stream, the work already completed +is preserved: catch ``ChunkInterrupted`` and call ``exc.call.resume()`` once the +condition clears -- only the unfinished sub-requests are re-issued. + +.. code-block:: python + + import time + from dataretrieval.waterdata import get_daily + from dataretrieval.waterdata.chunking import ChunkInterrupted + + try: + df, md = get_daily(monitoring_location_id=long_list_of_sites) + except ChunkInterrupted as exc: + while True: + time.sleep(exc.retry_after or 5 * 60) + try: + df, md = exc.call.resume() + break + except ChunkInterrupted as again: + exc = again + +The full taxonomy +================= + +See :doc:`/reference/exceptions` for the complete class tree and per-type +details. diff --git a/docs/source/userguide/index.rst b/docs/source/userguide/index.rst index 2ba5a93a..3cc4748a 100644 --- a/docs/source/userguide/index.rst +++ b/docs/source/userguide/index.rst @@ -13,5 +13,6 @@ Contents .. toctree:: :maxdepth: 1 + errors timeconventions dataportals diff --git a/tests/nldi_test.py b/tests/nldi_test.py index 2249e4f2..9092bbf3 100644 --- a/tests/nldi_test.py +++ b/tests/nldi_test.py @@ -341,12 +341,11 @@ def test_validate_navigation_mode_normalizes_lowercase(): assert _validate_navigation_mode("um") == "UM" -def test_query_nldi_non_200_surfaces_reason_phrase(httpx_mock): - """``_query_nldi`` must include the response's reason phrase in - the raised ``ValueError``. Pre-fix this crashed with - ``AttributeError: 'Response' object has no attribute 'reason'`` - because the migration to httpx renamed ``.reason`` → - ``.reason_phrase`` but missed this call site.""" +def test_query_nldi_non_200_raises_typed_error(httpx_mock): + """A non-200 NLDI response surfaces a typed ``DataRetrievalError`` (here a + 429 → ``RateLimited``, raised by the shared ``query`` path).""" + from dataretrieval.exceptions import RateLimited + httpx_mock.add_response( method="GET", url=f"{NLDI_API_BASE_URL}/WQP/USGS-MISSING/basin" @@ -354,7 +353,7 @@ def test_query_nldi_non_200_surfaces_reason_phrase(httpx_mock): status_code=429, ) mock_request_data_sources(httpx_mock) - with pytest.raises(ValueError, match="Error reason:"): + with pytest.raises(RateLimited, match="429"): nldi.get_basin(feature_source="WQP", feature_id="USGS-MISSING") @@ -374,15 +373,15 @@ def test_validate_data_source_rejects_malformed_catalog(httpx_mock, monkeypatch) def test_query_504_raises_service_unavailable(httpx_mock): - """``utils.query`` must classify 504 Gateway Timeout as a 5xx failure - (the transient ``ServiceUnavailable``). Pre-fix: the membership check - ``[500, 502, 503]`` missed 504 and returned the response unchanged, - leading downstream callers (e.g. ``_query_nldi``) to silently swallow - the failure as an empty dict via JSONDecodeError.""" + """``utils.query`` classifies any 5xx (here 504 Gateway Timeout) as the + transient ``ServiceUnavailable`` -- the whole 5xx range, not an enumerated + subset of codes.""" from dataretrieval.exceptions import ServiceUnavailable from dataretrieval.utils import query url = "https://example.invalid/x" httpx_mock.add_response(method="GET", url=f"{url}?a=1", status_code=504) - with pytest.raises(ServiceUnavailable, match="Service Unavailable: 504"): + # Match on the status number — robust against the exact message, which the + # legacy query path renders verbatim as "HTTP 504 (URL: ...)". + with pytest.raises(ServiceUnavailable, match="504"): query(url, {"a": "1"}) diff --git a/tests/nwis_test.py b/tests/nwis_test.py index f343f26e..dab46d6a 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -9,6 +9,7 @@ import pandas as pd import pytest +from dataretrieval.exceptions import DataRetrievalError from dataretrieval.nwis import ( NWIS_Metadata, _read_rdb, @@ -73,8 +74,9 @@ def test_nwis_service_live(): try: # Minimal query: just most recent record get_iv(sites=site) - except ValueError as e: - # Catch known transient service failures surfaced as ValueError + except (DataRetrievalError, ValueError) as e: + # Catch known transient service failures: a typed DataRetrievalError + # (e.g. ServiceUnavailable on a 5xx, a RuntimeError) or a legacy ValueError error_text = str(e) if any( err in error_text diff --git a/tests/utils_test.py b/tests/utils_test.py index 00cec52e..d90821ae 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -15,16 +15,14 @@ def test_url_too_long(self): """Test to confirm error when query URL too long. Test based on GitHub Issue #64. - The server may respond with a 414 (converted to ValueError by query()) - or abruptly close the connection (ConnectionError). Both are valid - responses to an excessively long URL. + The server may respond with a 414 (converted to URLTooLong by query()) + or abruptly close the connection (a transport error, now wrapped as + NetworkError). Both are valid responses to an excessively long URL. """ - import httpx - # all sites in MD sites, _ = nwis.what_sites(stateCd="MD") # raise error by trying to query them all, so URL is way too long - with pytest.raises((ValueError, httpx.ConnectError)): + with pytest.raises((exceptions.URLTooLong, exceptions.NetworkError)): nwis.get_iv(sites=sites.site_no.values.tolist()) def test_header(self): @@ -45,35 +43,62 @@ def test_header(self): class Test_error_taxonomy: """The unified request-error hierarchy. - Every module's request failures are catchable as ``DataRetrievalError``, - while remaining backward-compatible with the built-in type each path - historically raised (``ValueError`` for the legacy ``query`` path, - ``RuntimeError`` for the waterdata retryable types). + Every module's request failure is catchable as ``DataRetrievalError``. + A status error is an ``HTTPError`` carrying ``.status_code`` (the retryable + 429 / 5xx subset is ``TransientError``); a connection failure is a + ``NetworkError``. The sole base is ``DataRetrievalError`` -- no builtin + (``ValueError`` / ``RuntimeError``) mixins. """ @pytest.mark.parametrize( - "status, exc_name, match, builtin", + "status, exc_name", [ - (400, "BadRequestError", "Bad Request", ValueError), - (404, "NotFoundError", "Page Not Found", ValueError), - (414, "URLTooLong", "Request URL too long", ValueError), - (503, "ServiceUnavailable", "Service Unavailable: 503", RuntimeError), + (400, "HTTPError"), + (403, "HTTPError"), + (404, "HTTPError"), + (429, "RateLimited"), + (503, "ServiceUnavailable"), ], ) - def test_query_maps_status_to_typed_error( - self, httpx_mock, status, exc_name, match, builtin - ): - """``query`` maps each HTTP status family to a typed error that is both a - ``DataRetrievalError`` (new, unified) and the built-in this path - historically raised for that kind of failure -- ``ValueError`` for a bad - request, ``RuntimeError`` for a transient 5xx -- with the message kept.""" + def test_query_maps_status_to_typed_error(self, httpx_mock, status, exc_name): + """``query`` maps each HTTP status to the right typed ``DataRetrievalError``: + a generic ``HTTPError`` (carrying ``.status_code``) for a fatal 4xx, and + the transient ``RateLimited`` / ``ServiceUnavailable`` for 429 / 5xx. The + too-long-URL statuses (413 / 414) are covered separately because their + message is the actionable remediation, not the bare status number.""" exc_cls = getattr(exceptions, exc_name) url = "https://example.invalid/x" httpx_mock.add_response(method="GET", url=f"{url}?a=1", status_code=status) - with pytest.raises(exc_cls, match=match) as excinfo: + with pytest.raises(exc_cls, match=str(status)) as excinfo: utils.query(url, {"a": "1"}) assert isinstance(excinfo.value, exceptions.DataRetrievalError) - assert isinstance(excinfo.value, builtin) # backward compatibility + if isinstance(excinfo.value, exceptions.HTTPError): + assert excinfo.value.status_code == status + + @pytest.mark.parametrize("status", [413, 414]) + def test_query_too_long_url_gives_actionable_message(self, httpx_mock, status): + """A server 413 / 414 surfaces as ``URLTooLong`` carrying the actionable + "Modify your query" remediation (the same message as the client-side + over-long-URL path), not a bare ``HTTP 414`` status line.""" + url = "https://example.invalid/x" + httpx_mock.add_response(method="GET", url=f"{url}?a=1", status_code=status) + with pytest.raises(exceptions.URLTooLong, match="Modify your query") as excinfo: + utils.query(url, {"a": "1"}) + assert isinstance(excinfo.value, exceptions.RequestTooLarge) + + def test_transport_error_wrapped_as_network_error(self, httpx_mock): + """A connection-level failure (no HTTP response) surfaces as the typed + ``NetworkError`` -- catchable via ``except DataRetrievalError`` like the + response-based errors, with the original ``httpx`` exception on + ``__cause__`` -- rather than leaking a raw ``httpx`` exception.""" + import httpx + + httpx_mock.add_exception(httpx.ConnectError("name resolution failed")) + with pytest.raises(exceptions.NetworkError) as excinfo: + utils.query("https://example.invalid/x", {"a": "1"}) + assert isinstance(excinfo.value, exceptions.DataRetrievalError) + assert not isinstance(excinfo.value, exceptions.HTTPError) # no status + assert isinstance(excinfo.value.__cause__, httpx.ConnectError) def test_query_failure_catchable_as_base(self, httpx_mock): """A bare ``except DataRetrievalError`` catches a legacy query failure.""" @@ -82,16 +107,72 @@ def test_query_failure_catchable_as_base(self, httpx_mock): with pytest.raises(exceptions.DataRetrievalError): utils.query(url, {"a": "1"}) + def test_uniform_retry_attributes_readable_on_every_error(self): + """Every error exposes ``.status_code`` / ``.retry_after`` / ``.retryable`` + so a base ``except DataRetrievalError as e`` can branch and retry without + an ``AttributeError`` on the types that lack a status (URLTooLong, + NetworkError, NoSitesError, ...). ``.retryable`` marks the 429/5xx and + connection failures.""" + import httpx + + # (error, status_code, retry_after, retryable) + cases = [ + (exceptions.error_for_status(404, "x"), 404, None, False), + (exceptions.error_for_status(429, "x", retry_after=5.0), 429, 5.0, True), + (exceptions.error_for_status(503, "x"), 503, None, True), + (exceptions.error_for_status(414, "x"), None, None, False), # URLTooLong + (exceptions.NetworkError("x"), None, None, True), + (exceptions.NoSitesError(httpx.URL("https://x/y")), None, None, False), + (exceptions.Unchunkable("x"), None, None, False), + ] + for err, status, retry_after, retryable in cases: + assert err.status_code == status, err + assert err.retry_after == retry_after, err + assert err.retryable is retryable, err + def test_no_sites_error_is_data_retrieval_error(self): - """``NoSitesError`` joins the root (was a bare ``Exception``).""" + """``NoSitesError`` (the legacy nwis no-data signal) roots at + ``DataRetrievalError`` and is not a builtin ``ValueError``, so it is + caught by the unified ``except dataretrieval.DataRetrievalError``.""" assert issubclass(exceptions.NoSitesError, exceptions.DataRetrievalError) - assert not issubclass(exceptions.NoSitesError, ValueError) # unchanged + assert not issubclass(exceptions.NoSitesError, ValueError) + import dataretrieval + + assert dataretrieval.NoSitesError is exceptions.NoSitesError + + def test_typed_errors_survive_pickle_and_deepcopy(self): + """Typed errors round-trip through pickle/deepcopy -- they get pickled + back from multiprocessing / lithops workers, and their constructor fields + (status_code, retry_after, url) must survive the trip.""" + import copy + import pickle + + import httpx + + samples = [ + exceptions.error_for_status(404, "not found"), # bare HTTPError + exceptions.error_for_status(429, "slow down", retry_after=5.0), + exceptions.error_for_status(503, "down"), + exceptions.TransientError("boom", status_code=502, retry_after=1.5), + exceptions.NoSitesError(httpx.URL("https://example.invalid/x?a=1")), + exceptions.NetworkError("could not reach the service"), + ] + for err in samples: + for revived in (pickle.loads(pickle.dumps(err)), copy.deepcopy(err)): + assert type(revived) is type(err) + assert str(revived) == str(err) + if isinstance(err, exceptions.HTTPError): + assert revived.status_code == err.status_code + if isinstance(err, exceptions.TransientError): + assert revived.retry_after == err.retry_after + if isinstance(err, exceptions.NoSitesError): + assert revived.url == err.url def test_waterdata_exceptions_share_the_root(self): """waterdata's typed exceptions are ``DataRetrievalError`` too, so one - ``except`` clause spans the legacy and waterdata subsystems — while - keeping their historical ``RuntimeError`` / ``ValueError`` bases and the - shared family bases (``TransientError``, ``RequestTooLarge``).""" + ``except`` clause spans the legacy and waterdata subsystems, and they + slot under the shared family bases (``HTTPError`` / ``TransientError`` / + ``RequestTooLarge``).""" from dataretrieval.waterdata.chunking import ( ChunkInterrupted, RateLimited, @@ -101,13 +182,12 @@ def test_waterdata_exceptions_share_the_root(self): for cls in (RateLimited, ServiceUnavailable, Unchunkable, ChunkInterrupted): assert issubclass(cls, exceptions.DataRetrievalError) - # Transient transport failures: RuntimeError, under TransientError. + # Transient 429/5xx: an HTTPError-with-status, under TransientError. assert issubclass(RateLimited, exceptions.TransientError) assert issubclass(ServiceUnavailable, exceptions.TransientError) - assert issubclass(ServiceUnavailable, RuntimeError) - # "Too large" failures: ValueError, under RequestTooLarge. + assert issubclass(ServiceUnavailable, exceptions.HTTPError) + # "Too large" failures slot under RequestTooLarge. assert issubclass(Unchunkable, exceptions.RequestTooLarge) - assert issubclass(Unchunkable, ValueError) def test_base_exported_at_top_level(self): """Users can write ``except dataretrieval.DataRetrievalError``.""" diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py index 4ee4b555..0857da4a 100644 --- a/tests/waterdata_chunking_test.py +++ b/tests/waterdata_chunking_test.py @@ -31,6 +31,7 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) +from dataretrieval.exceptions import DataRetrievalError from dataretrieval.waterdata import chunking as _chunking from dataretrieval.waterdata import utils as _utils from dataretrieval.waterdata.chunking import ( @@ -38,6 +39,7 @@ _NEVER_CHUNK, _OR_SEP, _QUOTA_HEADER, + ChunkedCall, ChunkInterrupted, ChunkPlan, QuotaExhausted, @@ -455,11 +457,11 @@ async def fetch(args): state["i"] += 1 if i == 2: # Match _walk_pages's wrapping: a generic mid-pagination - # RuntimeError with the typed RateLimited as __cause__. + # DataRetrievalError with the typed RateLimited as __cause__. try: raise RateLimited("429: Too many requests made.") except RateLimited as cause: - raise RuntimeError( + raise DataRetrievalError( "Paginated request failed after collecting 0 page(s): " "429: Too many requests made." ) from cause @@ -687,7 +689,7 @@ async def fetch(args): try: raise ServiceUnavailable("503: Service unavailable.") except ServiceUnavailable as cause: - raise RuntimeError(str(cause)) from cause + raise DataRetrievalError(str(cause)) from cause return ( pd.DataFrame({"sites": list(args["sites"])}), _quota_response(500), @@ -717,10 +719,72 @@ def test_chunk_interrupted_base_class_catches_both(): and ``ServiceInterrupted`` must both subclass it.""" assert issubclass(QuotaExhausted, ChunkInterrupted) assert issubclass(ServiceInterrupted, ChunkInterrupted) - # Sanity: ``ChunkInterrupted`` is itself a ``RuntimeError`` so - # bare ``except RuntimeError`` callers don't suddenly miss the - # wrapped failures after this refactor. - assert issubclass(ChunkInterrupted, RuntimeError) + # ``ChunkInterrupted`` roots at ``DataRetrievalError`` like the rest of the + # taxonomy (no ``RuntimeError`` mixin), so one ``except DataRetrievalError`` + # spans chunked and single-shot failures alike. + assert issubclass(ChunkInterrupted, DataRetrievalError) + assert not issubclass(ChunkInterrupted, RuntimeError) + + +def test_chunk_interrupted_pickles_as_degraded_across_process_boundary(): + """A real ChunkInterrupted carries a live ChunkedCall whose ``fetch`` is not + stdlib-picklable, so a worker raising it inside a multiprocessing / + ProcessPoolExecutor pool could not ship it back. ``__getstate__`` drops + ``.call`` and pickles the documented degraded ``call=None`` state -- counts + and retry hint preserved, ``.resume()`` gone (un-resumable cross-process).""" + import pickle + + plan = ChunkPlan( + {"monitoring_location_id": ["A", "B", "C"]}, _fake_build, url_limit=8000 + ) + # A local function isn't picklable by reference -- mirrors production, where + # ChunkedCall.fetch is the undecorated _fetch_once shadowed by its wrapper. + call = ChunkedCall(plan, lambda args: (pd.DataFrame(), None)) + exc = call.wrap_failure(RateLimited("429: too many requests", retry_after=12.0)) + assert isinstance(exc, QuotaExhausted) and exc.call is call + # the live fetch handle alone can't pickle (the whole point of the override) + with pytest.raises((pickle.PicklingError, AttributeError)): + pickle.dumps(exc.call.fetch) + + revived = pickle.loads(pickle.dumps(exc)) + assert isinstance(revived, QuotaExhausted) + assert revived.call is None # degraded: no cross-process resume handle + assert revived.completed_chunks == exc.completed_chunks + assert revived.total_chunks == exc.total_chunks + assert revived.retry_after == 12.0 + assert str(revived) == str(exc) + + +def test_chunk_interrupted_with_partial_data_pickles_intact(): + """The degrade drops only the live ``.call``; the captured *partial work* + must still cross the boundary so a worker can report what it salvaged. + Exercises the path the no-completed-chunks case above doesn't: a real + ``partial_frame`` (rows) and ``partial_response`` (a live ``httpx.Response``, + which must itself remain picklable).""" + import pickle + + plan = ChunkPlan( + {"monitoring_location_id": ["A", "B", "C"]}, _fake_build, url_limit=8000 + ) + call = ChunkedCall(plan, lambda args: (pd.DataFrame(), None)) + # One sub-request completed before the failure: a real frame + response. + call._chunks[0] = ( + pd.DataFrame({"id": ["A"]}), + httpx.Response( + 200, + request=httpx.Request("GET", "https://example.invalid/a"), + json={"features": []}, + ), + ) + exc = call.wrap_failure(ServiceUnavailable("503: down")) + assert exc.completed_chunks == 1 + assert not exc.partial_frame.empty and exc.partial_response is not None + + revived = pickle.loads(pickle.dumps(exc)) + assert revived.call is None + assert revived.partial_frame["id"].tolist() == ["A"] + assert isinstance(revived.partial_response, httpx.Response) + assert revived.partial_response.status_code == 200 def test_connection_error_wrapped_as_service_interrupted(): @@ -810,7 +874,7 @@ async def fetch(args): try: raise ServiceUnavailable("503: Service unavailable.") except ServiceUnavailable as cause: - raise RuntimeError(str(cause)) from cause + raise DataRetrievalError(str(cause)) from cause return ( pd.DataFrame({"sites": list(args["sites"])}), _quota_response(500), @@ -1000,7 +1064,7 @@ async def fetch(args): try: raise RateLimited("429: Too many requests.", retry_after=42.0) except RateLimited as cause: - raise RuntimeError(str(cause)) from cause + raise DataRetrievalError(str(cause)) from cause return ( pd.DataFrame({"sites": list(args["sites"])}), _quota_response(500), @@ -1525,12 +1589,12 @@ def test_combine_chunk_responses_does_not_mutate_input_urls(): def _wrap_cause(transport_exc): - """Wrap ``transport_exc`` the way ``_walk_pages`` does — a - ``RuntimeError`` with the typed transport error on ``__cause__`` — so + """Wrap ``transport_exc`` the way ``_walk_pages`` does — a base + ``DataRetrievalError`` with the typed transport error on ``__cause__`` — so chain-walking is exercised realistically.""" try: - raise RuntimeError("Paginated request failed") from transport_exc - except RuntimeError as wrapped: + raise DataRetrievalError("Paginated request failed") from transport_exc + except DataRetrievalError as wrapped: return wrapped @@ -1603,18 +1667,23 @@ def test_retry_policy_from_env_honors_monkeypatched_constants(monkeypatch): def test_retryable_taxonomy(): + from dataretrieval.exceptions import HTTPError + assert _retryable(RateLimited("429", retry_after=5.0)) == (True, 5.0) assert _retryable(ServiceUnavailable("503")) == (True, None) assert _retryable(httpx.ReadTimeout("slow")) == (True, None) assert _retryable(httpx.ConnectError("down")) == (True, None) # InvalidURL is resumable but NOT retryable (a too-long cursor won't fix). assert _retryable(httpx.InvalidURL("too long")) == (False, None) - # Plain non-transient (e.g. a 4xx programmer error wrapped as RuntimeError). + # A fatal HTTP error (a plain HTTPError, not a TransientError) is never + # retried; nor is a bare RuntimeError. + assert _retryable(HTTPError("400", status_code=400)) == (False, None) + assert _retryable(HTTPError("403", status_code=403)) == (False, None) assert _retryable(RuntimeError("400")) == (False, None) def test_retryable_skips_wrapped_midpagination_transient(): - # A transient surfaced mid-pagination is re-wrapped as RuntimeError by + # A transient surfaced mid-pagination is re-wrapped as DataRetrievalError by # _paginate; it must NOT be auto-retried (re-walking from page 1 # would re-spend quota) — it escalates to the resumable handle instead. # Only the raw, top-level (initial-request) transient is retryable. diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 01012d73..10733773 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -9,6 +9,7 @@ import pytest import dataretrieval.waterdata.utils as _utils_module +from dataretrieval.exceptions import DataRetrievalError, HTTPError, TransientError from dataretrieval.waterdata.chunking import RateLimited, ServiceUnavailable from dataretrieval.waterdata.utils import ( OGC_API_URL, @@ -240,7 +241,7 @@ def test_walk_pages_raises_on_connection_error_mid_pagination(): """A connection error mid-pagination must raise with the upstream cause chained, and the wrapper message must include recovery guidance that is NOT rate-limit-specific (no quota window involved).""" - with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: _walk_pages_with_failure(httpx.ConnectError("boom")) msg = str(excinfo.value) @@ -254,7 +255,7 @@ def test_walk_pages_raises_with_class_name_when_cause_stringifies_empty(): """Some ``httpx`` exceptions (e.g. ``TimeoutException("")``) stringify to ``""``. The wrapper must still produce an informative message — fall back to the exception class name.""" - with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: _walk_pages_with_failure(httpx.TimeoutException("")) msg = str(excinfo.value) @@ -276,7 +277,7 @@ def test_walk_pages_raises_on_5xx_mid_pagination(): } page2_503.url = "https://example.com/page2" - with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: _walk_pages_with_failure(page2_503) msg = str(excinfo.value) @@ -291,7 +292,7 @@ def test_walk_pages_raises_on_mid_pagination_429(): page2_429.status_code = 429 page2_429.url = "https://example.com/page2" - with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: _walk_pages_with_failure(page2_429) msg = str(excinfo.value) @@ -320,7 +321,7 @@ def test_walk_pages_wraps_initial_page_parse_error(): mock_req.headers = {} mock_req.url = "https://example.com/page1" - with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: _run_walk_pages(geopd=False, req=mock_req, client=mock_client) # The JSONDecodeError causing it is on __cause__ so callers can drill in. @@ -445,7 +446,7 @@ def test_get_stats_data_raises_on_mid_pagination_failure(monkeypatch): exercised by the ``_walk_pages`` triplet above. This single ``get_stats_data`` mid-pagination case proves the stats-specific follow-up callback is wired into ``_paginate`` correctly.""" - with pytest.raises(RuntimeError, match="Paginated request failed") as excinfo: + with pytest.raises(DataRetrievalError, match="Paginated request failed") as excinfo: _run_get_stats_data_with_failure( httpx.ConnectError("stats-boom"), monkeypatch, @@ -468,7 +469,7 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): "description": "upstream timeout", } - with pytest.raises(RuntimeError): + with pytest.raises(DataRetrievalError): _run_get_stats_data_with_failure(page2_503, monkeypatch) warnings_ = [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] @@ -791,9 +792,9 @@ def test_parse_retry_after_returns_none_for_unparseable(): def test_raise_for_non_200_raises_service_unavailable_for_5xx(): - """5xx must surface as the typed ``ServiceUnavailable`` (not bare - ``RuntimeError``) so the chunker can wrap it as a resumable - ``ServiceInterrupted`` rather than treating it as a fatal error.""" + """5xx must surface as the typed ``ServiceUnavailable`` so the chunker can + wrap it as a resumable ``ServiceInterrupted`` rather than treating it as a + fatal error.""" resp = _make_response(503, "", reason="Service Unavailable") resp.headers["Retry-After"] = "120" with pytest.raises(ServiceUnavailable) as excinfo: @@ -812,22 +813,22 @@ def test_raise_for_non_200_attaches_retry_after_to_rate_limited(): assert excinfo.value.retry_after == 60.0 -def test_raise_for_non_200_still_raises_bare_runtimeerror_for_other_4xx(): - """4xx other than 429 (e.g. 400 Bad Request) is a programmer error - that retry won't fix. Must remain bare ``RuntimeError`` so the - chunker's classifier doesn't wrap it as resumable.""" +def test_raise_for_non_200_400_raises_http_error(): + """400 raises a fatal ``HTTPError`` (status_code=400) the chunker won't + resume. It must NOT be a ``TransientError`` so the chunker's classifier + treats it as fatal rather than wrapping it as resumable.""" resp = _make_response( 400, '{"code": "BadRequest", "description": "missing parameter"}', reason="Bad Request", content_type="application/json", ) - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(HTTPError) as excinfo: _raise_for_non_200(resp) - # Must be exactly RuntimeError — not RateLimited, not - # ServiceUnavailable. Both subclass RuntimeError, so a plain - # ``pytest.raises(RuntimeError)`` would match either. - assert type(excinfo.value) is RuntimeError + assert excinfo.value.status_code == 400 + # Fatal, not transient: the chunker keys off ``isinstance(_, TransientError)`` + # to decide whether to wrap a failure as a resumable ChunkInterrupted. + assert not isinstance(excinfo.value, TransientError) def test_next_req_url_rejects_cross_host(): @@ -848,11 +849,9 @@ def test_next_req_url_rejects_cross_host(): def test_check_ogc_requests_raises_typed_on_5xx(httpx_mock): - """``_check_ogc_requests`` previously called ``resp.raise_for_status()``, - which leaks raw ``httpx.HTTPStatusError``. Now routes through - ``_raise_for_non_200`` so callers see ``ServiceUnavailable`` / - ``RateLimited`` / ``RuntimeError`` — the same typed contract as - the main data path.""" + """``_check_ogc_requests`` routes a non-200 through ``_raise_for_non_200``, + so a 5xx surfaces as the typed ``ServiceUnavailable`` — the same typed + contract as the main data path, not a raw ``httpx`` error.""" httpx_mock.add_response( method="GET", url=f"{OGC_API_URL}/collections/daily/schema", diff --git a/tests/waterservices_test.py b/tests/waterservices_test.py index 874c2a0e..4b30456e 100644 --- a/tests/waterservices_test.py +++ b/tests/waterservices_test.py @@ -3,6 +3,7 @@ import pytest from pandas import DataFrame +from dataretrieval.exceptions import HTTPError, NoSitesError from dataretrieval.nwis import ( get_discharge_peaks, get_dv, @@ -15,7 +16,6 @@ query_waterservices, what_sites, ) -from dataretrieval.utils import NoSitesError try: import geopandas as gpd @@ -62,7 +62,7 @@ def test_query_validation(httpx_mock): "https://waterservices.usgs.gov/nwis/stat?sites=bad_site_id&format=rdb" ) httpx_mock.add_response(method="GET", url=request_url, status_code=400) - with pytest.raises(ValueError) as type_error: + with pytest.raises(HTTPError) as type_error: get_stats(sites="bad_site_id") assert request_url in str(type_error)