Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/gooddata-eval/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dev = [
test = [
"pytest~=8.3.4",
"pytest-cov~=6.0.0",
"pytest-json-report==1.5.0",
"pytest-mock>=3.14.0",
]

Expand Down
21 changes: 9 additions & 12 deletions packages/gooddata-eval/src/gooddata_eval/cli/agentic_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import time
from typing import Any, TypedDict

from gooddata_eval.core.agentic._langfuse import HttpxLangfuseClient, make_langfuse_client
from gooddata_eval.core.agentic._langfuse import make_langfuse_client
from gooddata_eval.core.agentic.alert_skill import evaluate_agentic_alert_skill
from gooddata_eval.core.agentic.conversation import ConversationFixture, evaluate_agentic_conversation
from gooddata_eval.core.agentic.general_question import evaluate_agentic_general_question
Expand All @@ -17,17 +17,14 @@
from gooddata_eval.core.models import CreatedVisualization, DatasetItem
from gooddata_eval.core.runner import EvalReport, ItemReport

_LfKw = TypedDict(
"_LfKw",
{
"langfuse": Any,
"dataset_item_id": str,
"dataset_name": str,
"run_timestamp": str,
"model_version_override": str | None,
},
total=False,
)

class _LfKw(TypedDict, total=False):
langfuse: Any
dataset_item_id: str
dataset_name: str
run_timestamp: str
model_version_override: str | None


AGENTIC_TEST_KINDS = frozenset(
{
Expand Down
2 changes: 1 addition & 1 deletion packages/gooddata-eval/src/gooddata_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from rich.console import Console
from rich.table import Table

from gooddata_eval.cli.agentic_runner import AGENTIC_TEST_KINDS, run_agentic_items
from gooddata_eval.core.chat.sse_client import ChatClient
from gooddata_eval.core.config import RunConfig
from gooddata_eval.core.connection import ConnectionError_, resolve_connection
Expand All @@ -20,7 +21,6 @@
from gooddata_eval.core.models import ChatResult, DatasetItem
from gooddata_eval.core.reporting.console import render_comparison, render_console
from gooddata_eval.core.reporting.json_report import write_multi_model_report
from gooddata_eval.cli.agentic_runner import AGENTIC_TEST_KINDS, run_agentic_items
from gooddata_eval.core.runner import ItemReport, run_items
from gooddata_eval.core.summary.http_client import SummaryClient
from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@
import os
import re
from dataclasses import dataclass

from typing import Any

from gooddata_eval.core.chat.sse_client import ChatClient
from gooddata_eval.core.agentic._catalog import CatalogMetricAlert

from gooddata_eval.core.chat.sse_client import ChatClient
from gooddata_eval.core.models import ToolCallEvent

try:
Expand Down Expand Up @@ -438,7 +436,9 @@ def evaluate_agentic_alert_skill(
model_version_override: str | None = None,
) -> None:
"""Run alert-skill evaluation, log to Langfuse, and raise AlertSkillAssertionError on failure."""
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,9 @@ def evaluate_agentic_conversation(
model_version_override: str | None = None,
) -> None:
"""Run conversation evaluation, log to Langfuse, and raise on failure."""
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,9 @@ def evaluate_agentic_general_question(
model_version_override: str | None = None,
) -> None:
"""Run general-question evaluation, log to Langfuse, and raise on failure."""
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ def evaluate_agentic_guardrail(
model_version_override: str | None = None,
) -> None:
"""Run guardrail evaluation, log to Langfuse, and raise on failure."""
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@

from __future__ import annotations

from typing import Any

import os
import re
from dataclasses import dataclass
from typing import Any

from gooddata_eval.core.chat.sse_client import ChatClient
from gooddata_eval.core.models import ToolCallEvent
Expand Down Expand Up @@ -247,7 +246,9 @@ def evaluate_agentic_metric_skill(
model_version_override: str | None = None,
) -> None:
"""Run metric-skill evaluation, log to Langfuse, and raise MetricSkillAssertionError on failure."""
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ def evaluate_agentic_search_tool(
model_version_override: str | None = None,
) -> None:
"""Run search-tool evaluation, log to Langfuse, and raise SearchToolAssertionError on failure."""
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,9 @@ def evaluate_agentic_visualization(
) -> None:
"""Run visualization evaluation, log to Langfuse, and raise VisualizationAssertionError on failure."""
import json as _json # noqa: PLC0415
from datetime import datetime as _dt, timezone as _tz # noqa: PLC0415
from datetime import datetime as _dt # noqa: PLC0415
from datetime import timezone as _tz # noqa: PLC0415

from gooddata_eval.core.agentic._langfuse import try_make_langfuse_client # noqa: PLC0415

if langfuse is None:
Expand Down
2 changes: 0 additions & 2 deletions packages/gooddata-eval/tests/test_agentic_alert_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
from unittest.mock import MagicMock, patch

from gooddata_eval.core.agentic.alert_skill import (
AgenticAlertSummary,
AlertEvaluation,
AlertRunResult,
_deep_subset,
_to_number,
run_agentic_alert_skill,
Expand Down
3 changes: 1 addition & 2 deletions packages/gooddata-eval/tests/test_agentic_conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@

from gooddata_eval.core.agentic.conversation import (
ConversationFixture,
ConversationResult,
TurnDefinition,
TurnResult,
_resolve_refs,
run_agentic_conversation,
)
from gooddata_eval.core.models import ChatResult, ToolCallEvent
from gooddata_eval.core.models import ToolCallEvent


def test_turn_definition_model():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
from unittest.mock import MagicMock, patch

from gooddata_eval.core.agentic.general_question import (
AgenticGeneralQuestionSummary,
GeneralQuestionResult,
run_agentic_general_question,
)
from gooddata_eval.core.models import ChatResult


def test_general_question_result_fields():
Expand Down
2 changes: 0 additions & 2 deletions packages/gooddata-eval/tests/test_agentic_guardrail.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
from unittest.mock import MagicMock, patch

from gooddata_eval.core.agentic.guardrail import (
AgenticGuardrailSummary,
GuardrailResult,
run_agentic_guardrail,
)
from gooddata_eval.core.models import ChatResult


def test_guardrail_result_fields():
Expand Down
2 changes: 0 additions & 2 deletions packages/gooddata-eval/tests/test_agentic_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from unittest.mock import MagicMock, patch

from gooddata_eval.core.agentic.search_tool import (
AgenticSearchSummary,
SearchResult,
_tool_correctness,
_tool_selection,
run_agentic_search_tool,
Expand Down
13 changes: 5 additions & 8 deletions packages/gooddata-eval/tests/test_agentic_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,9 @@
All tests mock ChatClient so no network is needed.
"""

from dataclasses import dataclass
from unittest.mock import MagicMock, call, patch

import pytest

from gooddata_eval.core.agentic.visualization import (
AgenticRunSummary,
RunResult,
_execute_single_run,
run_agentic_visualization,
)
Expand Down Expand Up @@ -154,7 +149,8 @@ def test_run_agentic_visualization_uses_initial_conversation_for_run_0():
# create_conversation should NOT be called for run 0
instance.create_conversation.assert_not_called()
instance.send_message.assert_called_once_with("existing-conv", "Show revenue")
instance.delete_conversation.assert_called_once_with("existing-conv")
# the caller-supplied conversation is left intact; the function only deletes conversations it created
instance.delete_conversation.assert_not_called()
assert len(summary.run_results) == 1


Expand All @@ -176,7 +172,8 @@ def test_run_agentic_visualization_creates_fresh_conversations_for_remaining_run
)

assert instance.create_conversation.call_count == 1 # only for run 1
assert instance.delete_conversation.call_count == 2 # existing-conv + fresh-1
# only the self-created fresh-1 is deleted; the caller-supplied existing-conv is left intact
instance.delete_conversation.assert_called_once_with("fresh-1")
assert len(summary.run_results) == 2


Expand Down Expand Up @@ -233,7 +230,7 @@ def test_run_agentic_visualization_creates_conversation_when_no_initial_id():
instance.create_conversation.side_effect = ["new-0", "new-1"]
instance.send_message.return_value = _chat_with_viz()

summary = run_agentic_visualization(
run_agentic_visualization(
host="https://example.com",
token="tok",
workspace_id="ws",
Expand Down
19 changes: 11 additions & 8 deletions packages/gooddata-eval/tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# (C) 2026 GoodData Corporation
import io
from concurrent.futures import ThreadPoolExecutor, as_completed

import httpx
import orjson
import pytest
Expand Down Expand Up @@ -86,7 +89,8 @@ def _fake_run(
]
)
assert exit_code == 0
assert orjson.loads(out.read_bytes())["runs"]["gpt-5.2"]["summary"]["passed"] == 1
# run keys are provider-prefixed (provider_name/model) to stay collision-free across providers
assert orjson.loads(out.read_bytes())["runs"]["Test Provider/gpt-5.2"]["summary"]["passed"] == 1


def test_cli_operational_error_exits_nonzero(monkeypatch, fixtures_dir):
Expand Down Expand Up @@ -405,9 +409,10 @@ def _fake_run(items, backend, *, runs, model, workspace_id, **kw):
]
)
data = orjson.loads(out.read_bytes())
assert data["models"] == ["gpt-5.2", "gpt-4o"]
# keys are provider-prefixed (provider_name/model); provider_name is "P" here
assert data["models"] == ["P/gpt-5.2", "P/gpt-4o"]
assert "runs" in data and "comparison" in data
assert data["comparison"]["gpt-5.2"]["passed"] == 1
assert data["comparison"]["P/gpt-5.2"]["passed"] == 1


def test_cli_restore_fires_even_when_model_loop_raises(monkeypatch, fixtures_dir):
Expand Down Expand Up @@ -534,10 +539,6 @@ def test_cli_rejects_negative_concurrency(monkeypatch, fixtures_dir):

def test_progress_callbacks_thread_safe():
"""Verify progress callbacks can be called from multiple threads without error."""
import io
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

console = Console(file=io.StringIO(), force_terminal=False)
on_item_start, on_run_done, on_item_done = cli_main._make_progress_callbacks(console)

Expand All @@ -554,7 +555,9 @@ def _worker(index: int) -> None:
)
on_item_start(index, 100, item)
on_run_done(index, 100, 1, 1, index % 2 == 0, 1.5)
report = ItemReport(id=f"test-{index}", dataset_name="test", test_kind="general_question")
report = ItemReport(
id=f"test-{index}", dataset_name="test", test_kind="general_question", question=f"Question {index}"
)
report.runs = 1
report.latency_s = 1.5
report.pass_at_k = index % 2 == 0
Expand Down
3 changes: 2 additions & 1 deletion packages/gooddata-eval/tests/test_langfuse_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def test_item_from_raw_dict_input():
item = _item_from_raw(raw, dataset_name="ds", test_kind="visualization")
assert item.id == "lf-1"
assert item.question == "Show revenue"
assert item.test_kind == "visualization"
# expected_output carries a "visualization" key, so _infer_test_kind classifies it as production agentic vis
assert item.test_kind == "vis_agentic"
assert item.dataset_name == "ds"


Expand Down
16 changes: 16 additions & 0 deletions packages/gooddata-eval/tox.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# (C) 2026 GoodData Corporation
[tox]
envlist = py3{10,11,12,13,14}

[testenv]
runner = uv-venv-lock-runner
package = wheel
wheel_build_env = .pkg
extras =
llm-judge
dependency_groups =
test
setenv =
COVERAGE_CORE=sysmon
commands =
pytest -v --cov --cov-report=xml tests {posargs} --json-report --json-report-file=.json-report-{envname}.json
2 changes: 1 addition & 1 deletion packages/gooddata-fdw/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ test = [
"pytest~=8.3.4",
"pytest-cov~=6.0.0",
"pytest-json-report==1.5.0",
"vcrpy~=8.0.0",
"vcrpy~=8.2.1",
# TODO - Bump the version together with bumping the version of openapi generator
"urllib3~=2.6.0",
"pyyaml",
Expand Down
2 changes: 1 addition & 1 deletion packages/gooddata-pandas/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ test = [
"pytest-json-report==1.5.0",
"pytest-snapshot==0.9.0",
"pytest-order~=1.3.0",
"vcrpy~=8.0.0",
"vcrpy~=8.2.1",
"urllib3~=2.6.0",
"python-dotenv~=1.0.0",
"pyyaml",
Expand Down
2 changes: 1 addition & 1 deletion packages/gooddata-sdk/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ test = [
"pytest-json-report==1.5.0",
"pytest-snapshot==0.9.0",
"pytest-order~=1.3.0",
"vcrpy~=8.0.0",
"vcrpy~=8.2.1",
"urllib3~=2.6.0",
"python-dotenv~=1.0.0",
"deepdiff~=8.5.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from datetime import datetime
from importlib.util import find_spec
from typing import Any, Literal, TypeAlias, Union, cast
from typing import Any, Literal, TypeAlias, Union

from attrs import define
from gooddata_api_client.model.inline_filter_definition_inline import InlineFilterDefinitionInline
Expand Down Expand Up @@ -670,7 +670,7 @@ def description(self, labels: dict[str, str], format_locale: str | None = None)
metric_id = self.metric.id if isinstance(self.metric, ObjId) else self.metric
if self.operator in ["BETWEEN", "NOT_BETWEEN"] and len(self.values) == 2:
not_between = "not " if self.operator == "NOT_BETWEEN" else ""
values = cast(tuple[float, float], self.values)
values = self.values
return f"{labels.get(metric_id, metric_id)}: {not_between}between {values[0]} - {values[1]}"
else:
return (
Expand Down
Loading
Loading