scaleapi · luke-e-schaefer · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.18.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.3) - 2026-05-28
+
+### Added
+- **Evaluations V2** client support for COCO-style metrics on model runs via stored `evaluation_match_v2` rows. `NucleusClient` exposes `create_evaluation_v2()`, `get_evaluation_v2()`, and `list_evaluations_v2()`. The `EvaluationV2` resource supports `wait_for_completion()`, `charts()` (mAP, confusion matrix, PR curve, TIDE, and related aggregates), `examples()` (paginated TP/FP/FN rows), `delete()`, and `refresh()`. `AllowedLabelMatch` configures allowed ground-truth / prediction label pairs; filter and response types include `EvaluationV2FilterArgs`, `EvaluationV2Charts`, `EvaluationV2ExamplesPage`, and `EvaluationV2MatchExample`. Sphinx docs cover the workflow under Evaluations V2.
+
 ## [0.18.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.3) - 2026-05-18
 
 ### Added

diff --git a/docs/index.rst b/docs/index.rst
@@ -12,6 +12,36 @@ Scale Nucleus helps you:
 
 Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.
 
+.. _evaluations-v2:
+
+Evaluations V2
+--------------
+
+Evaluation V2 measures how well a **model run** matches ground-truth annotations.
+Create a run with :meth:`NucleusClient.create_evaluation_v2`, wait with
+:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`, then read summary metrics with
+:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or individual matches with
+:meth:`nucleus.evaluation_v2.EvaluationV2.examples`.
+
+.. code-block:: python
+
+   import nucleus
+
+   client = nucleus.NucleusClient(api_key="YOUR_API_KEY")
+   evaluation = client.create_evaluation_v2(
+       model_run_id="run_xxx",
+       name="my-eval",
+       allowed_label_matches=[
+           nucleus.AllowedLabelMatch(
+               ground_truth_label="car",
+               model_prediction_label="vehicle",
+           ),
+       ],
+   )
+   evaluation.wait_for_completion()
+   charts = evaluation.charts(iou_threshold=0.5)
+   fps = evaluation.examples(match_type="FP", limit=20)
+
 .. _installation:
 
 Installation

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
@@ -1,7 +1,8 @@
-"""Nucleus Python SDK. """
+"""Nucleus Python SDK."""
 
 __all__ = [
     "AsyncJob",
+    "AllowedLabelMatch",
     "EmbeddingsExportJob",
     "BoxAnnotation",
     "DeduplicationJob",
@@ -17,6 +18,12 @@
     "DatasetInfo",
     "DatasetItem",
     "DatasetItemRetrievalError",
+    "EvaluationV2",
+    "EvaluationV2Charts",
+    "EvaluationV2ExamplesPage",
+    "EvaluationV2FilterArgs",
+    "EvaluationV2MatchExample",
+    "EvaluationV2Status",
     "Frame",
     "Keypoint",
     "KeypointsAnnotation",
@@ -129,6 +136,12 @@
 )
 from .data_transfer_object.dataset_details import DatasetDetails
 from .data_transfer_object.dataset_info import DatasetInfo
+from .data_transfer_object.evaluation_v2 import (
+    EvaluationV2Charts,
+    EvaluationV2ExamplesPage,
+    EvaluationV2FilterArgs,
+    EvaluationV2MatchExample,
+)
 from .data_transfer_object.job_status import JobInfoRequestPayload
 from .dataset import Dataset
 from .dataset_item import DatasetItem
@@ -146,6 +159,7 @@
     NotFoundError,
     NucleusAPIError,
 )
+from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
 from .job import CustomerJobTypes
 from .model import Model
 from .model_run import ModelRun
@@ -875,6 +889,76 @@ def commit_model_run(
             payload = {}
         return self.make_request(payload, f"modelRun/{model_run_id}/commit")
 
+    def create_evaluation_v2(
+        self,
+        model_run_id: str,
+        *,
+        name: Optional[str] = None,
+        allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
+        allowed_label_matches_id: Optional[str] = None,
+    ) -> EvaluationV2:
+        """Create an evaluation for a model run.
+
+        The evaluation runs in the background. Call
+        :meth:`EvaluationV2.wait_for_completion`, then
+        :meth:`EvaluationV2.charts` or :meth:`EvaluationV2.examples` for results.
+
+        Parameters:
+            model_run_id: Model run id (``run_*``).
+            name: Optional display name.
+            allowed_label_matches: Optional label pairs to treat as matches.
+            allowed_label_matches_id: Optional id of a saved label-match configuration.
+
+        Returns:
+            :class:`EvaluationV2`: The created evaluation.
+        """
+        payload: Dict[str, Any] = {}
+        if name is not None:
+            payload["name"] = name
+        if allowed_label_matches is not None:
+            payload["allowed_label_matches"] = [
+                m.to_api_dict() for m in allowed_label_matches
+            ]
+        if allowed_label_matches_id is not None:
+            payload["allowed_label_matches_id"] = allowed_label_matches_id
+        result = self.make_request(
+            payload, f"modelRun/{model_run_id}/evaluationsV2"
+        )
+        eval_id = result.get("evaluation_id")
+        if not eval_id:
+            raise RuntimeError(
+                f"Unexpected create evaluation V2 response: {result}"
+            )
+        return self.get_evaluation_v2(str(eval_id))
+
+    def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
+        """Get an evaluation by id.
+
+        Parameters:
+            evaluation_id: Evaluation id (``evalv2_*``).
+
+        Returns:
+            :class:`EvaluationV2`.
+        """
+        data = self.get(f"evaluationsV2/{evaluation_id}")
+        return EvaluationV2.from_json(data, self)
+
+    def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]:
+        """List evaluations for a model run (newest first).
+
+        Parameters:
+            model_run_id: Model run id (``run_*``).
+
+        Returns:
+            List of :class:`EvaluationV2`.
+        """
+        rows = self.get(f"modelRun/{model_run_id}/evaluationsV2")
+        if not isinstance(rows, list):
+            raise RuntimeError(
+                f"Unexpected list evaluations V2 response: {rows!r}"
+            )
+        return [EvaluationV2.from_json(r, self) for r in rows]
+
     @deprecated(msg="Prefer calling Dataset.info() directly.")
     def dataset_info(self, dataset_id: str):
         dataset = self.get_dataset(dataset_id)

diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py
@@ -0,0 +1,162 @@
+"""Response and filter models for Evaluation V2."""
+
+from typing import Any, Dict, List, Literal, Optional
+
+from nucleus.pydantic_base import DictCompatibleModel
+
+
+def _snake_to_camel(name: str) -> str:
+    parts = name.split("_")
+    if len(parts) == 1:
+        return name
+    return parts[0] + "".join(part.capitalize() for part in parts[1:])
+
+
+def _camelize_filter_value(value: Any) -> Any:
+    if isinstance(value, dict):
+        return {
+            _snake_to_camel(key): (
+                val if key == "value" else _camelize_filter_value(val)
+            )
+            for key, val in value.items()
+        }
+    if isinstance(value, list):
+        return [_camelize_filter_value(item) for item in value]
+    return value
+
+
+class RangeNum(DictCompatibleModel):
+    min: Optional[float] = None
+    max: Optional[float] = None
+
+
+class MetadataPredicate(DictCompatibleModel):
+    key: str
+    op: Literal["EQ", "IN", "GT", "LT"]
+    value: Optional[Any] = None
+
+
+_FILTER_API_KEYS = {
+    "confidence_range": "confidenceRange",
+    "iou_range": "iouRange",
+    "pred_labels": "predLabels",
+    "gt_labels": "gtLabels",
+    "item_metadata": "itemMetadata",
+    "prediction_metadata": "predictionMetadata",
+    "label_equality": "labelEquality",
+    "has_ground_truth": "hasGroundTruth",
+    "tide_background": "tideBackground",
+}
+
+
+class EvaluationV2FilterArgs(DictCompatibleModel):
+    """Optional filters for :meth:`nucleus.evaluation_v2.EvaluationV2.charts` and :meth:`nucleus.evaluation_v2.EvaluationV2.examples`."""
+
+    confidence_range: Optional[RangeNum] = None
+    iou_range: Optional[RangeNum] = None
+    pred_labels: Optional[List[str]] = None
+    gt_labels: Optional[List[str]] = None
+    item_metadata: Optional[List[MetadataPredicate]] = None
+    prediction_metadata: Optional[List[MetadataPredicate]] = None
+    label_equality: Optional[Literal["EQ", "NEQ"]] = None
+    has_ground_truth: Optional[bool] = None
+    tide_background: Optional[bool] = None
+
+    def to_api_filters(self) -> Dict[str, Any]:
+        """Return filters as a dict ready for API requests."""
+        d = self.dict(exclude_none=True)
+        return {
+            api_key: _camelize_filter_value(d[snake_key])
+            for snake_key, api_key in _FILTER_API_KEYS.items()
+            if snake_key in d
+        }
+
+
+class MapSummary(DictCompatibleModel):
+    mapAt50: Optional[float] = None
+    mapAt75: Optional[float] = None
+    mapAt5095: Optional[float] = None
+
+
+class PerClassAp(DictCompatibleModel):
+    classLabel: str
+    ap: float
+
+
+class ConfusionEntry(DictCompatibleModel):
+    gtLabel: str
+    predLabel: str
+    count: int
+
+
+class ScoreHistogramBucket(DictCompatibleModel):
+    bucketMin: float
+    bucketMax: float
+    count: int
+
+
+class TotalCounts(DictCompatibleModel):
+    tp: int
+    fp: int
+    fn: int
+    predsWithConfidence: int
+
+
+class ApBySize(DictCompatibleModel):
+    small: Optional[float] = None
+    medium: Optional[float] = None
+    large: Optional[float] = None
+
+
+class PrCurvePoint(DictCompatibleModel):
+    classLabel: str
+    recall: float
+    precision: float
+
+
+class TideAttribution(DictCompatibleModel):
+    truePositive: int
+    localization: int
+    classification: int
+    both: int
+    duplicate: int
+    background: int
+    missed: int
+
+
+class EvaluationV2Charts(DictCompatibleModel):
+    mapSummary: MapSummary
+    perClassAp: List[PerClassAp]
+    confusionMatrix: List[ConfusionEntry]
+    scoreHistogram: List[ScoreHistogramBucket]
+    computedIouRanges: List[float]
+    totalCounts: TotalCounts
+    apBySize: ApBySize
+    prCurve: List[PrCurvePoint]
+    tideAttribution: TideAttribution
+
+
+class EvaluationV2MatchExample(DictCompatibleModel):
+    id: str
+    evaluation_id: str
+    dataset_item_id: str
+    model_prediction_id: Optional[str] = None
+    ground_truth_annotation_id: Optional[str] = None
+    pred_canonical_label: Optional[str] = None
+    gt_canonical_label: Optional[str] = None
+    pred_raw_label: Optional[str] = None
+    gt_raw_label: Optional[str] = None
+    iou: Optional[float] = None
+    confidence: Optional[float] = None
+    true_positive: bool
+    match_type: str
+    gt_area: Optional[float] = None
+    item_metadata: Optional[Dict[str, Any]] = None
+    prediction_metadata: Optional[Dict[str, Any]] = None
+    prediction_row: Optional[Dict[str, Any]] = None
+    annotation_row: Optional[Dict[str, Any]] = None
+
+
+class EvaluationV2ExamplesPage(DictCompatibleModel):
+    rows: List[EvaluationV2MatchExample]
+    total: int