diff --git a/lib/sdk/server-ai/README.md b/lib/sdk/server-ai/README.md
index b2d8d4f2..d6afbb44 100644
--- a/lib/sdk/server-ai/README.md
+++ b/lib/sdk/server-ai/README.md
@@ -43,8 +43,40 @@ The companion `agentConfig`/`agentConfigs` and `judgeConfig` methods retrieve ag
configs respectively. Within a prompt message or agent instruction, the evaluation context is
available as `{{ldctx}}` (for example `{{ldctx.key}}`).
-Metric tracking and manual judge evaluation will be added as the SDK is built out (see epic
-AIC-2629).
+## Tracking AI runs
+
+Every retrieved config exposes a tracker via `config.createTracker()`. Use it to record duration,
+time-to-first-token, success/error, token usage, tool calls, and feedback for an AI run. Trackers
+are thread-safe, and at-most-once metrics (duration, time-to-first-token, outcome, feedback, tokens)
+emit a single event even under concurrent calls. A run can be correlated across processes with
+`tracker.getResumptionToken()` and rebuilt later via `aiClient.createTracker(token, context)`.
+
+## Evaluating responses with judges (manual)
+
+A judge is an AI Config with `mode: judge` that scores another config's output against an evaluation
+metric.
+
+In `v1.0`, evaluation is **manual only**. The SDK parses `judgeConfiguration` and exposes it on
+configs, but it does **not** automatically invoke judges on completion or agent calls. Sample-rate
+driven auto-attachment is deferred past `v1.0`. Because no provider-specific runners ship yet, you
+supply your own `Runner` that calls your model and returns structured `{score, reasoning}` output.
+
+```java
+Runner runner = input -> {
+ // Call your model with `input`, then return its score/reasoning as structured output.
+ // metrics carries success/tokens/duration for the invocation.
+ return RunnerResult.builder(Metrics.builder(true).build())
+ .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "grounded").build())
+ .build();
+};
+
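+// Passing null as the default means a disabled default is used; 1.0 samples every evaluation.
+// createJudge returns null when the judge config is disabled or no runner is supplied.
+Judge judge = aiClient.createJudge("my-judge-key", context, null, variables, runner, 1.0);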
+if (judge != null) {
+ JudgeResult result = judge.evaluate(originalInput, modelOutput);
+ // Recording the result is the caller's responsibility:
+ completionTracker.trackJudgeResult(result);
+}
+```
## Internal API convention
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
new file mode 100644
index 00000000..8e53f024
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
@@ -0,0 +1,127 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Runs a fixed set of {@link Judge}s against one input/output pair and collects their results.
+ *
+ * Each judge runs with fault isolation: a judge that throws or times out yields a
+ * failed {@link JudgeResult} for that judge while every other judge's result is preserved, in the
+ * original order. Judges run concurrently and each is bounded by a per-judge timeout so a single
+ * hung judge cannot stall the whole evaluation.
+ *
+ * The evaluator does not record results; recording the returned {@link JudgeResult}s (for example
+ * via a tracker) is the caller's responsibility. Instances are immutable and thread-safe.
+ *
+ * This type is not part of the public API in v1.0 and may change without notice.
+ */
+final class Evaluator {
+ /**
+ * Default per-judge timeout used when one is not supplied.
+ */
+ public static final Duration DEFAULT_PER_JUDGE_TIMEOUT = Duration.ofSeconds(30);
+
+ private final List judges;
+ private final Duration perJudgeTimeout;
+ private final LDLogger logger;
+
+ /**
+ * Creates an evaluator using the {@link #DEFAULT_PER_JUDGE_TIMEOUT default per-judge timeout}.
+ *
+ * @param judges the judges to run; must not be {@code null}
+ * @param logger the logger; must not be {@code null}
+ */
+ public Evaluator(List judges, LDLogger logger) {
+ this(judges, DEFAULT_PER_JUDGE_TIMEOUT, Objects.requireNonNull(logger, "logger"));
+ }
+
+ /**
+ * Creates an evaluator with an explicit per-judge timeout.
+ *
+ * @param judges the judges to run; must not be {@code null}
+ * @param perJudgeTimeout the maximum time to wait for each judge; must not be {@code null}
+ * @param logger the logger; must not be {@code null}
+ */
+ public Evaluator(List judges, Duration perJudgeTimeout, LDLogger logger) {
+ this.judges = Collections.unmodifiableList(new ArrayList<>(Objects.requireNonNull(judges, "judges")));
+ this.perJudgeTimeout = Objects.requireNonNull(perJudgeTimeout, "perJudgeTimeout");
+ this.logger = Objects.requireNonNull(logger, "logger");
+ }
+
+ /**
+ * Returns an evaluator with no judges. Its {@link #evaluate} returns an empty list and logs
+ * nothing.
+ *
+ * @return a no-op evaluator
+ */
+ public static Evaluator noop() {
+ return new Evaluator(
+ Collections.emptyList(), DEFAULT_PER_JUDGE_TIMEOUT, LDLogger.withAdapter(Logs.none(), ""));
+ }
+
+ /**
+ * Runs every judge against the given input and output.
+ *
+ * @param input the input that was provided to the AI being evaluated
+ * @param output the AI-generated response to score
+ * @return one {@link JudgeResult} per judge, in the judges' order; empty when there are no judges
+ */
+ public List evaluate(String input, String output) {
+ if (judges.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ ExecutorService pool = Executors.newFixedThreadPool(judges.size());
+ try {
+ List> futures = new ArrayList<>(judges.size());
+ for (Judge judge : judges) {
+ futures.add(pool.submit(() -> judge.evaluate(input, output)));
+ }
+
+ List results = new ArrayList<>(judges.size());
+ for (int i = 0; i < judges.size(); i++) {
+ results.add(awaitResult(judges.get(i), futures.get(i)));
+ }
+ return results;
+ } finally {
+ pool.shutdownNow();
+ }
+ }
+
+ private JudgeResult awaitResult(Judge judge, Future future) {
+ String key = judge.getAIConfig().getKey();
+ try {
+ return future.get(perJudgeTimeout.toMillis(), TimeUnit.MILLISECONDS);
+ } catch (TimeoutException e) {
+ future.cancel(true);
+ logger.warn("Judge {} timed out after {} ms", key, perJudgeTimeout.toMillis());
+ return failed(key, "Judge evaluation timed out");
+ } catch (ExecutionException e) {
+ Throwable cause = e.getCause() != null ? e.getCause() : e;
+ logger.error("Judge {} failed: {}", key, cause.toString());
+ return failed(key, cause.getMessage() != null ? cause.getMessage() : "Unknown error");
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ future.cancel(true);
+ return failed(key, "Judge evaluation interrupted");
+ }
+ }
+
+ private static JudgeResult failed(String key, String message) {
+ return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+ }
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
new file mode 100644
index 00000000..33ba7c4d
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
@@ -0,0 +1,243 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.LDValueType;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Evaluates the output of another AI Config using a judge AI Config and a caller-supplied
+ * {@link Runner}.
+ *
+ * A judge is an AI Config with {@code mode: judge} that scores a model response. Obtain one from
+ * {@link LDAIClient#createJudge}, then call {@link #evaluate(String, String)} with the original
+ * input and the response to score. Evaluation is synchronous.
+ *
+ * The judge records invocation metrics (duration, success, tokens) on its own tracker but does
+ * not emit the score via {@code trackJudgeResult}; recording the returned
+ * {@link JudgeResult} is the caller's responsibility.
+ *
+ * Instances are immutable and safe to share across threads as long as the supplied {@link Runner}
+ * is too.
+ */
+public final class Judge {
+  private final AIJudgeConfig config;
+  private final Runner runner;
+  private final double sampleRate;
+  private final LDLogger logger;
+
+  /**
+   * Creates a judge.
+   *
+   * @param config the judge AI Config; must not be {@code null}
+   * @param runner the runner used to invoke the judge model; must not be {@code null}
+   * @param sampleRate the default sampling rate in {@code [0.0, 1.0]}; non-finite, negative, or
+   *     greater-than-one values are normalized
+   * @param logger the logger; must not be {@code null}
+   */
+  public Judge(AIJudgeConfig config, Runner runner, double sampleRate, LDLogger logger) {
+    this.config = Objects.requireNonNull(config, "config");
+    this.runner = Objects.requireNonNull(runner, "runner");
+    this.sampleRate = normalizeSampleRate(sampleRate);
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Normalizes a sampling rate into {@code [0.0, 1.0]}. Non-finite rates fall back to {@code 1.0}
+   * (the default "always sample"); negative rates clamp to {@code 0.0}; rates above one clamp to
+   * {@code 1.0}.
+   *
+   * @param rate the requested rate
+   * @return the normalized rate
+   */
+  public static double normalizeSampleRate(double rate) {
+    if (Double.isNaN(rate) || Double.isInfinite(rate)) {
+      return 1.0;
+    }
+    if (rate < 0.0) {
+      return 0.0;
+    }
+    if (rate > 1.0) {
+      return 1.0;
+    }
+    return rate;
+  }
+
+  /**
+   * Returns the default sampling rate baked in at construction.
+   *
+   * @return the sampling rate
+   */
+  public double getSampleRate() {
+    return sampleRate;
+  }
+
+  /**
+   * Returns the judge AI Config.
+   *
+   * @return the config
+   */
+  public AIJudgeConfig getAIConfig() {
+    return config;
+  }
+
+  /**
+   * Returns the runner this judge invokes.
+   *
+   * @return the runner
+   */
+  public Runner getRunner() {
+    return runner;
+  }
+
+  /**
+   * Evaluates a response using the judge's default sampling rate.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluate(String input, String output) {
+    return evaluate(input, output, sampleRate);
+  }
+
+  /**
+   * Evaluates a response, deciding sampling before invoking the model.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @param samplingRate the sampling rate to use for this call, normalized with
+   *     {@link #normalizeSampleRate(double)}; an explicit {@code 0} suppresses the evaluation
+   * @return the evaluation result; never {@code null}. The result is unsuccessful when the judge
+   *     config has no evaluation metric key, when sampling skips the evaluation, when the response
+   *     cannot be parsed, or when the runner throws.
+   */
+  public JudgeResult evaluate(String input, String output, double samplingRate) {
+    double effectiveRate = normalizeSampleRate(samplingRate);
+    String key = config.getKey();
+    LDAIConfigTracker tracker = config.createTracker();
+    try {
+      String metricKey = evaluationMetricKey();
+      if (metricKey == null) {
+        logger.warn("Judge configuration is missing required evaluation metric key: {}", key);
+        return JudgeResult.builder(true, false)
+            .judgeConfigKey(key)
+            .errorMessage("Judge configuration is missing required evaluation metric key")
+            .build();
+      }
+
+      // Math.random() is in [0.0, 1.0), so ">=" guarantees that a rate of 0 never samples and a
+      // rate of 1 always samples. A strict ">" would sample at rate 0 when random() returns 0.0.
+      if (Math.random() >= effectiveRate) {
+        logger.debug("Judge evaluation skipped due to sampling rate: {}", effectiveRate);
+        return JudgeResult.builder(false, false).judgeConfigKey(key).build();
+      }
+
+      String evaluationInput = buildEvaluationInput(input, output);
+      // The tracker records the invocation metrics reported by the runner result.
+      RunnerResult response = tracker.trackMetricsOf(RunnerResult::getMetrics,
+          () -> runner.run(evaluationInput));
+
+      ParsedEvaluation parsed = parseEvaluationResponse(response.getParsed());
+      if (parsed == null) {
+        logger.warn("Could not parse judge evaluation response for: {}", key);
+        return JudgeResult.builder(true, false).judgeConfigKey(key).build();
+      }
+
+      Metrics metrics = response.getMetrics();
+      boolean success = metrics != null && metrics.isSuccess();
+      return JudgeResult.builder(true, success)
+          .judgeConfigKey(key)
+          .metricKey(metricKey)
+          .score(parsed.score)
+          .reasoning(parsed.reasoning)
+          .build();
+    } catch (Exception e) {
+      // Boundary catch: a failing runner (or a null runner result) becomes a failed result
+      // instead of propagating to the caller.
+      logger.error("Judge evaluation failed for {}: {}", key, e.toString());
+      String message = e.getMessage() != null ? e.getMessage() : "Unknown error";
+      return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+    }
+  }
+
+  /**
+   * Evaluates a response from a conversation history and a runner result, using the judge's default
+   * sampling rate.
+   *
+   * @param messages the conversation history; may be empty or {@code null}
+   * @param response the runner result whose content is scored
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response) {
+    return evaluateMessages(messages, response, sampleRate);
+  }
+
+  /**
+   * Evaluates a response from a conversation history and a runner result.
+   *
+   * Each message is rendered as {@code <role>: <content>} and the messages are joined with
+   * newlines to form the input; the response's content is the output.
+   *
+   * @param messages the conversation history; may be empty or {@code null}, but its elements and
+   *     their roles are dereferenced and so must not be {@code null}
+   * @param response the runner result whose content is scored; may be {@code null}, in which case
+   *     the output is {@code null}
+   * @param samplingRate the sampling rate to use for this call
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response, double samplingRate) {
+    StringBuilder input = new StringBuilder();
+    if (messages != null) {
+      boolean first = true;
+      for (Message message : messages) {
+        if (!first) {
+          input.append('\n');
+        }
+        input.append(message.getRole().getWireValue()).append(": ").append(message.getContent());
+        first = false;
+      }
+    }
+    String output = response == null ? null : response.getContent();
+    return evaluate(input.toString(), output, samplingRate);
+  }
+
+  /** Returns the trimmed evaluation metric key, or {@code null} when it is missing or blank. */
+  private String evaluationMetricKey() {
+    String key = config.getEvaluationMetricKey();
+    if (key != null && !key.trim().isEmpty()) {
+      return key.trim();
+    }
+    return null;
+  }
+
+  /** Builds the text handed to the runner: the input section followed by the response to score. */
+  private static String buildEvaluationInput(String input, String output) {
+    return "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output;
+  }
+
+  /**
+   * Extracts the score and reasoning from the runner's structured output.
+   *
+   * @return the parsed evaluation, or {@code null} unless the value is an object with a numeric
+   *     {@code score} in {@code [0.0, 1.0]} and a string {@code reasoning}
+   */
+  private static ParsedEvaluation parseEvaluationResponse(LDValue parsed) {
+    if (parsed == null || parsed.getType() != LDValueType.OBJECT) {
+      return null;
+    }
+    LDValue score = parsed.get("score");
+    if (!score.isNumber()) {
+      return null;
+    }
+    double value = score.doubleValue();
+    // NaN fails both range comparisons, so it has to be rejected explicitly.
+    if (Double.isNaN(value) || value < 0.0 || value > 1.0) {
+      return null;
+    }
+    LDValue reasoning = parsed.get("reasoning");
+    if (!reasoning.isString()) {
+      return null;
+    }
+    return new ParsedEvaluation(value, reasoning.stringValue());
+  }
+
+  /** A validated score/reasoning pair extracted from a judge response. */
+  private static final class ParsedEvaluation {
+    private final double score;
+    private final String reasoning;
+
+    ParsedEvaluation(double score, String reasoning) {
+      this.score = score;
+      this.reasoning = reasoning;
+    }
+  }
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
index 523e1d68..ad11114d 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
@@ -96,4 +96,29 @@ AIJudgeConfig judgeConfig(
* @throws IllegalArgumentException if the token is malformed
*/
LDAIConfigTracker createTracker(String resumptionToken, LDContext context);
+
+ /**
+ * Retrieves a judge AI Config and builds a {@link Judge} for manual evaluation.
+ *
+ * This fires only the {@code $ld:ai:usage:create-judge} usage event. In v1.0 the SDK does not
+ * auto-attach judges to completion or agent calls; evaluation is manual, driven by the returned
+ * judge. Because the SDK ships no provider runners yet, the caller supplies the {@link Runner}.
+ *
+ * Callers must check the result for {@code null} before evaluating.
+ *
+ * @param key the judge AI Config key
+ * @param context the context to evaluate the configuration in
+ * @param defaultValue the default used when the flag is absent or cannot be evaluated; when
+ * {@code null}, a disabled default is used
+ * @param variables variables interpolated into the judge prompt; may be {@code null}
+ * @param runner the runner the judge invokes; when {@code null}, no judge is created
+ * @param sampleRate the default sampling rate for the judge in {@code [0.0, 1.0]}; the
+ * {@link Judge} normalizes non-finite or out-of-range values (see
+ * {@link Judge#normalizeSampleRate(double)})
+ * @return a {@link Judge}, or {@code null} if the configuration is disabled, no runner was
+ * supplied, or the judge could not be initialized
+ */
+ Judge createJudge(
+ String key,
+ LDContext context,
+ AIJudgeConfigDefault defaultValue,
+ Map variables,
+ Runner runner,
+ double sampleRate);
}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
index 80ef29b9..4b097115 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
@@ -47,6 +47,7 @@ public final class LDAIClientImpl implements LDAIClient {
private static final String TRACK_USAGE_AGENT_CONFIG = "$ld:ai:usage:agent-config";
private static final String TRACK_USAGE_AGENT_CONFIGS = "$ld:ai:usage:agent-configs";
private static final String TRACK_USAGE_JUDGE_CONFIG = "$ld:ai:usage:judge-config";
+ // Usage event key recorded by createJudge; distinct from the judge-config usage key above.
+ private static final String TRACK_USAGE_CREATE_JUDGE = "$ld:ai:usage:create-judge";
private static final LDContext INIT_TRACK_CONTEXT = LDContext
.builder("ld-internal-tracking")
@@ -145,6 +146,37 @@ public LDAIConfigTracker createTracker(String resumptionToken, LDContext context
return LDAIConfigTrackerImpl.fromResumptionToken(resumptionToken, client, context, logger);
}
+ // Builds a Judge for manual evaluation, or returns null when the judge config is disabled, no
+ // runner was supplied, or resolving the config throws a RuntimeException.
+ @Override
+ public Judge createJudge(
+ String key,
+ LDContext context,
+ AIJudgeConfigDefault defaultValue,
+ Map variables,
+ Runner runner,
+ double sampleRate) {
+ // Manual-only path: fire the create-judge usage event, then resolve the config through the
+ // internal evaluate (which does not fire $ld:ai:usage:judge-config).
+ // The usage event is tracked before any validation, so it is recorded even when this method
+ // goes on to return null. It carries the judge key as data and a metric value of 1.
+ client.trackMetric(TRACK_USAGE_CREATE_JUDGE, context, LDValue.of(key), 1);
+ try {
+ // With no caller-supplied default, fall back to a disabled default.
+ // NOTE(review): presumably this makes an unevaluable flag take the "disabled" exit below;
+ // confirm against evaluate().
+ AIJudgeConfigDefault effectiveDefault =
+ defaultValue != null ? defaultValue : AIJudgeConfigDefault.disabled();
+ AIJudgeConfig judgeConfig =
+ (AIJudgeConfig) evaluate(key, context, effectiveDefault, Mode.JUDGE, variables);
+ if (!judgeConfig.isEnabled()) {
+ logger.info("Judge configuration is disabled: {}", key);
+ return null;
+ }
+ // Checked after the config is resolved, so a disabled config is reported in preference to a
+ // missing runner.
+ if (runner == null) {
+ logger.warn("No runner supplied for judge: {}", key);
+ return null;
+ }
+ // The Judge constructor normalizes sampleRate, so it is passed through unchanged here.
+ return new Judge(judgeConfig, runner, sampleRate, logger);
+ } catch (RuntimeException e) {
+ // Best-effort: a failure inside the try block is logged and surfaced as "no judge" rather
+ // than thrown to the caller.
+ logger.error("Failed to initialize judge {}: {}", key, e.toString());
+ return null;
+ }
+ }
+
private AIAgentConfig evaluateAgent(
String key, LDContext context, AIAgentConfigDefault defaultValue, Map variables) {
AIAgentConfigDefault effectiveDefault =
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
new file mode 100644
index 00000000..6542c33f
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
@@ -0,0 +1,26 @@
+package com.launchdarkly.sdk.server.ai;
+
+/**
+ * Invokes an AI model with a string input and returns its result.
+ *
+ * In v1.0 the AI SDK does not ship provider-specific runners; an application supplies its own
+ * {@code Runner} (for example wrapping an OpenAI or Bedrock call) when creating a {@link Judge} via
+ * {@link LDAIClient#createJudge}. Built-in provider runners are planned for a later release.
+ *
+ * For structured-output use cases such as judge evaluation, the runner is expected to make the
+ * model's parsed JSON available via {@link RunnerResult#getParsed()}.
+ *
+ * Implementations should be safe to invoke from multiple threads if the same runner is shared across
+ * concurrently-evaluating judges.
+ */
+@FunctionalInterface
+public interface Runner {
+  // This interface has exactly one abstract method and runners are routinely supplied as lambdas.
+  // @FunctionalInterface makes the compiler reject any change that would break that usage.
+
+  /**
+   * Invokes the model with the given input.
+   *
+   * @param input the input string to send to the model
+   * @return the model result; must not be {@code null}
+   * @throws Exception if the invocation fails; the caller (a {@link Judge}) records the failure and
+   *     surfaces it as a failed evaluation rather than propagating it
+   */
+  RunnerResult run(String input) throws Exception;
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
new file mode 100644
index 00000000..9994da11
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
@@ -0,0 +1,105 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+
+/**
+ * The result of a {@link Runner} invocation.
+ *
+ * Carries the model's text {@link #getContent() content}, the {@link #getMetrics() metrics} the SDK
+ * uses to track the run, and any {@link #getParsed() parsed} structured output. For judge
+ * evaluation the parsed value is expected to be a JSON object with {@code score} (a number in
+ * {@code [0.0, 1.0]}) and {@code reasoning} (a string). Instances are immutable.
+ */
+public final class RunnerResult {
+  private final String content;
+  private final Metrics metrics;
+  private final LDValue parsed;
+
+  // Only reachable through Builder#build(). A missing parsed value is stored as LDValue.ofNull()
+  // so that getParsed() never hands out a Java null.
+  private RunnerResult(Builder source) {
+    this.metrics = source.metrics;
+    this.content = source.content;
+    if (source.parsed == null) {
+      this.parsed = LDValue.ofNull();
+    } else {
+      this.parsed = source.parsed;
+    }
+  }
+
+  /**
+   * Starts building a result for an invocation that produced the given metrics.
+   *
+   * @param metrics the invocation metrics; stored as given
+   * @return a fresh {@link Builder}
+   */
+  public static Builder builder(Metrics metrics) {
+    return new Builder(metrics);
+  }
+
+  /**
+   * Gets the text the model produced.
+   *
+   * @return the response text, or {@code null} when the builder was given none
+   */
+  public String getContent() {
+    return content;
+  }
+
+  /**
+   * Gets the metrics describing this invocation.
+   *
+   * @return the metrics passed to {@link #builder(Metrics)}, or {@code null} if none were passed
+   */
+  public Metrics getMetrics() {
+    return metrics;
+  }
+
+  /**
+   * Gets the structured output parsed from the model response.
+   *
+   * @return the parsed value, which is {@link LDValue#ofNull()} rather than {@code null} when no
+   *     structured output was set
+   */
+  public LDValue getParsed() {
+    return parsed;
+  }
+
+  /**
+   * Fluent builder that assembles a {@link RunnerResult}.
+   */
+  public static final class Builder {
+    private final Metrics metrics;
+    private String content;
+    private LDValue parsed;
+
+    private Builder(Metrics metrics) {
+      this.metrics = metrics;
+    }
+
+    /**
+     * Records the model's text response.
+     *
+     * @param v the response text
+     * @return this builder, for chaining
+     */
+    public Builder content(String v) {
+      content = v;
+      return this;
+    }
+
+    /**
+     * Records the structured output parsed from the response.
+     *
+     * @param v the parsed value
+     * @return this builder, for chaining
+     */
+    public Builder parsed(LDValue v) {
+      parsed = v;
+      return this;
+    }
+
+    /**
+     * Produces the immutable result from the values recorded so far.
+     *
+     * @return a new {@link RunnerResult}
+     */
+    public RunnerResult build() {
+      return new RunnerResult(this);
+    }
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
new file mode 100644
index 00000000..2a4c8997
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
@@ -0,0 +1,121 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.mockito.Mockito.mock;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.LDContext;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+import com.launchdarkly.sdk.server.ai.internal.LDAIConfigTrackerImpl;
+import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
+
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Supplier;
+
+import org.junit.Before;
+import org.junit.Test;
+
+// Tests for Evaluator: result ordering, isolation of failing judges, and per-judge timeouts.
+@SuppressWarnings("javadoc")
+public class EvaluatorTest {
+ private LDClientInterface client;
+ private LDLogger logger;
+ private final LDContext context = LDContext.create("user-key");
+
+ @Before
+ public void setUp() {
+ // A mocked client backs the trackers; the logger writes to a capture sink.
+ client = mock(LDClientInterface.class);
+ logger = LDLogger.withAdapter(Logs.capture(), "test");
+ }
+
+ // Builds a Judge for the given config key that uses the given runner and a sample rate of 1.0.
+ // NOTE(review): the AIJudgeConfig arguments appear to be (key, enabled=true, ..., metric key
+ // "relevance", tracker factory) - confirm against the AIJudgeConfig constructor.
+ private Judge judge(String key, Runner runner) {
+ Supplier trackerFactory = () -> new LDAIConfigTrackerImpl(
+ client, "run-" + key, key, "v1", 1, "gpt-4", "openai", context, null, logger);
+ AIJudgeConfig config = new AIJudgeConfig(key, true, null, null, null, "relevance", trackerFactory);
+ return new Judge(config, runner, 1.0, logger);
+ }
+
+ // Runner that immediately returns a parsed {score, reasoning} object with the given score.
+ private static Runner scoring(double score) {
+ return input -> RunnerResult.builder(Metrics.builder(true).build())
+ .parsed(LDValue.buildObject().put("score", score).put("reasoning", "r").build())
+ .build();
+ }
+
+ // Only the empty result is asserted here; the "logs nothing" half of the name is not checked.
+ @Test
+ public void noopReturnsEmptyListAndLogsNothing() {
+ List results = Evaluator.noop().evaluate("q", "a");
+ assertThat(results, is(empty()));
+ }
+
+ // Two healthy judges: one result per judge, in the order the judges were supplied.
+ @Test
+ public void runsEveryJudgePreservingOrder() {
+ Evaluator evaluator = new Evaluator(
+ Arrays.asList(judge("first", scoring(0.1)), judge("second", scoring(0.2))), logger);
+ List results = evaluator.evaluate("q", "a");
+ assertThat(results, hasSize(2));
+ assertThat(results.get(0).getJudgeConfigKey(), is("first"));
+ assertThat(results.get(0).getScore(), is(0.1));
+ assertThat(results.get(1).getJudgeConfigKey(), is("second"));
+ assertThat(results.get(1).getScore(), is(0.2));
+ }
+
+ // A runner that throws yields a failed result carrying the exception message, while the other
+ // judge's result is unaffected.
+ @Test
+ public void faultyJudgeIsolatedAndOthersPreserved() {
+ Runner failing = input -> {
+ throw new RuntimeException("boom");
+ };
+ Evaluator evaluator = new Evaluator(
+ Arrays.asList(judge("ok", scoring(0.9)), judge("bad", failing)), logger);
+ List results = evaluator.evaluate("q", "a");
+ assertThat(results, hasSize(2));
+ assertThat(results.get(0).isSuccess(), is(true));
+ assertThat(results.get(0).getScore(), is(0.9));
+ assertThat(results.get(1).isSuccess(), is(false));
+ assertThat(results.get(1).getErrorMessage(), is("boom"));
+ }
+
+ // The slow runner sleeps far longer than the 150 ms timeout, so it must be reported as timed
+ // out while the fast judge still succeeds.
+ @Test
+ public void hungJudgeTimesOutWithoutStallingChain() {
+ Runner slow = input -> {
+ Thread.sleep(5000);
+ return RunnerResult.builder(Metrics.builder(true).build()).build();
+ };
+ Evaluator evaluator = new Evaluator(
+ Arrays.asList(judge("fast", scoring(0.7)), judge("slow", slow)),
+ Duration.ofMillis(150),
+ logger);
+ List results = evaluator.evaluate("q", "a");
+ assertThat(results, hasSize(2));
+ assertThat(results.get(0).isSuccess(), is(true));
+ assertThat(results.get(1).isSuccess(), is(false));
+ assertThat(results.get(1).getErrorMessage(), containsString("timed out"));
+ assertThat(results.get(1).getJudgeConfigKey(), is("slow"));
+ }
+
+ // The first judge finishes after the second, yet results must follow the judges' list order
+ // rather than completion order.
+ @Test
+ public void resultsAreInJudgeOrderEvenWhenCompletionOrderDiffers() {
+ Runner slowOk = input -> {
+ Thread.sleep(300);
+ return RunnerResult.builder(Metrics.builder(true).build())
+ .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "r").build())
+ .build();
+ };
+ Evaluator evaluator = new Evaluator(
+ Arrays.asList(judge("slow", slowOk), judge("fast", scoring(0.6))),
+ Duration.ofSeconds(5),
+ logger);
+ List results = evaluator.evaluate("q", "a");
+ assertThat(
+ Arrays.asList(results.get(0).getJudgeConfigKey(), results.get(1).getJudgeConfigKey()),
+ contains("slow", "fast"));
+ }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
new file mode 100644
index 00000000..7fae78b3
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
@@ -0,0 +1,170 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.nullValue;
+import static org.mockito.Mockito.mock;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.LDContext;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+import com.launchdarkly.sdk.server.ai.internal.LDAIConfigTrackerImpl;
+import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Supplier;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/** Tests {@link Judge}: sampling, input formatting, verdict parsing, and failure handling. */
+@SuppressWarnings("javadoc")
+public class JudgeTest {
+  private LDClientInterface client;
+  private LDLogger logger;
+  private final LDContext context = LDContext.create("user-key");
+
+  @Before
+  public void setUp() {
+    client = mock(LDClientInterface.class);
+    logger = LDLogger.withAdapter(Logs.capture(), "test");
+  }
+
+  // Judge config under key "judge-key"; its tracker factory uses the mocked client and test context.
+  private AIJudgeConfig judgeConfig(String metricKey, boolean enabled) {
+    return new AIJudgeConfig("judge-key", enabled, null, null, null, metricKey,
+        () -> new LDAIConfigTrackerImpl(
+            client, "run-1", "judge-key", "v1", 1, "gpt-4", "openai", context, null, logger));
+  }
+
+  // Runner stub built with Metrics.builder(true) whose parsed output carries the given verdict.
+  private static Runner runnerReturning(double score, String reasoning) {
+    LDValue verdict = LDValue.buildObject().put("score", score).put("reasoning", reasoning).build();
+    return input -> RunnerResult.builder(Metrics.builder(true).build())
+        .content("evaluated").parsed(verdict).build();
+  }
+
+  @Test
+  public void evaluateScoresResponseAndReportsMetricKey() {
+    Judge judge = new Judge(judgeConfig("relevance", true), runnerReturning(0.8, "well grounded"), 1.0, logger);
+    JudgeResult result = judge.evaluate("the question", "the answer");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(true));
+    assertThat(result.getScore(), is(0.8));
+    assertThat(result.getReasoning(), is("well grounded"));
+    assertThat(result.getMetricKey(), is("relevance"));
+    assertThat(result.getJudgeConfigKey(), is("judge-key"));
+  }
+
+  @Test
+  public void evaluateBuildsWellKnownInputFormat() {
+    AtomicReference<String> captured = new AtomicReference<>();
+    Runner capturing = input -> {
+      captured.set(input);
+      return RunnerResult.builder(Metrics.builder(true).build())
+          .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "ok").build()).build();
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), capturing, 1.0, logger);
+    judge.evaluate("what is 2+2?", "4");
+    assertThat(captured.get(), is("MESSAGE HISTORY:\nwhat is 2+2?\n\nRESPONSE TO EVALUATE:\n4"));
+  }
+
+  @Test
+  public void zeroSamplingRateSkipsInvocation() {
+    // With a 0.0 sampling rate the runner must never be called; the flag records any invocation.
+    AtomicReference<Boolean> invoked = new AtomicReference<>(false);
+    Runner runner = input -> {
+      invoked.set(true);
+      return RunnerResult.builder(Metrics.builder(true).build()).build();
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 0.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(false));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(invoked.get(), is(false));
+  }
+
+  @Test
+  public void missingEvaluationMetricKeyYieldsFailure() {
+    // A whitespace-only metric key is expected to be treated like a missing one.
+    Judge judge = new Judge(judgeConfig(" ", true), runnerReturning(0.8, "x"), 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getErrorMessage(), containsString("evaluation metric key"));
+  }
+
+  @Test
+  public void outOfRangeScoreFailsToParse() {
+    Judge judge = new Judge(judgeConfig("relevance", true), runnerReturning(1.5, "too high"), 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getScore(), is(nullValue()));
+  }
+
+  @Test
+  public void missingReasoningFailsToParse() {
+    Runner runner = input -> RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", 0.5).build()).build();
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getScore(), is(nullValue()));
+  }
+
+  @Test
+  public void runnerFailureYieldsFailedResult() {
+    Runner runner = input -> {
+      throw new RuntimeException("model exploded");
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getErrorMessage(), is("model exploded"));
+  }
+
+  @Test
+  public void runnerReportingFailureMetricsYieldsUnsuccessfulResult() {
+    Runner runner = input -> RunnerResult.builder(Metrics.builder(false).build())
+        .parsed(LDValue.buildObject().put("score", 0.3).put("reasoning", "weak").build()).build();
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    // Parsed successfully, but the runner's own metrics say the call did not succeed.
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getScore(), is(0.3));
+  }
+
+  @Test
+  public void evaluateMessagesRendersRolePrefixedHistory() {
+    AtomicReference<String> captured = new AtomicReference<>();
+    Runner capturing = input -> {
+      captured.set(input);
+      return RunnerResult.builder(Metrics.builder(true).build())
+          .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "great").build()).build();
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), capturing, 1.0, logger);
+    RunnerResult response = RunnerResult.builder(Metrics.builder(true).build()).content("the answer").build();
+    judge.evaluateMessages(
+        Arrays.asList(new Message(Message.Role.SYSTEM, "be helpful"), new Message(Message.Role.USER, "hi")),
+        response);
+    assertThat(captured.get(),
+        is("MESSAGE HISTORY:\nsystem: be helpful\nuser: hi\n\nRESPONSE TO EVALUATE:\nthe answer"));
+  }
+
+  @Test
+  public void normalizeSampleRateClampsAndDefaults() {
+    assertThat(Judge.normalizeSampleRate(-0.5), is(0.0));
+    assertThat(Judge.normalizeSampleRate(2.0), is(1.0));
+    assertThat(Judge.normalizeSampleRate(Double.NaN), is(1.0));
+    assertThat(Judge.normalizeSampleRate(Double.POSITIVE_INFINITY), is(1.0));
+    assertThat(Judge.normalizeSampleRate(0.42), is(0.42));
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java
index 7ba4798e..0ab4f63d 100644
--- a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java
@@ -9,9 +9,11 @@
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyDouble;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
@@ -24,6 +26,7 @@
import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Mode;
import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Model;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
import java.util.ArrayList;
@@ -320,4 +323,57 @@ public void eachCreateTrackerCallStartsANewRun() {
assertThat(runA, is(notNullValue()));
assertThat(runA.equals(runB), is(false));
}
+
+ // ---- createJudge ----------------------------------------------------------
+ // JUDGE_JSON: flag payload for an enabled judge-mode config with one evaluation metric key.
+ private static final String JUDGE_JSON =
+ "{\"_ldMeta\":{\"enabled\":true,\"mode\":\"judge\"},\"evaluationMetricKeys\":[\"relevance\"]}";
+
+  // Runner stub built with Metrics.builder(true) that always returns a fixed score/reasoning payload.
+  private static Runner stubRunner() {
+    LDValue verdict = LDValue.buildObject().put("score", 0.5).put("reasoning", "r").build();
+    return input -> RunnerResult.builder(Metrics.builder(true).build()).parsed(verdict).build();
+  }
+
+ @Test
+ public void createJudgeFiresOnlyCreateJudgeUsageEvent() {
+ when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON));
+
+ ai.createJudge("judge-key", context, null, null, stubRunner(), 1.0);
+
+ verify(client).trackMetric(eq("$ld:ai:usage:create-judge"), eq(context), eq(LDValue.of("judge-key")), eq(1.0));
+ verify(client, never()).trackMetric(eq("$ld:ai:usage:judge-config"), any(), any(), anyDouble());
+ }
+
+  @Test
+  public void createJudgeReturnsJudgeForEnabledConfig() {
+    LDValue judgeFlag = LDValue.parse(JUDGE_JSON);
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(judgeFlag);
+    Runner suppliedRunner = stubRunner();
+    Judge created = ai.createJudge("judge-key", context, null, null, suppliedRunner, 1.0);
+    // The judge exposes the parsed config and the runner that was passed in.
+    assertThat(created, is(notNullValue()));
+    assertThat(created.getAIConfig().getKey(), is("judge-key"));
+    assertThat(created.getAIConfig().getEvaluationMetricKey(), is("relevance"));
+    assertThat(created.getRunner(), is(suppliedRunner));
+  }
+
+  @Test
+  public void createJudgeReturnsNullWhenDisabled() {
+    // Judge-mode payload whose _ldMeta.enabled is false.
+    LDValue disabledFlag = LDValue.parse("{\"_ldMeta\":{\"enabled\":false,\"mode\":\"judge\"}}");
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(disabledFlag);
+    Judge created = ai.createJudge("judge-key", context, null, null, stubRunner(), 1.0);
+    assertThat(created, is(nullValue()));
+  }
+
+  @Test
+  public void createJudgeReturnsNullWhenNoRunner() {
+    LDValue judgeFlag = LDValue.parse(JUDGE_JSON);
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(judgeFlag);
+    Judge created = ai.createJudge("judge-key", context, null, null, null, 1.0);
+    assertThat(created, is(nullValue()));
+    // A null runner yields no judge, yet the create-judge usage metric is still recorded.
+    verify(client).trackMetric(eq("$ld:ai:usage:create-judge"), eq(context), eq(LDValue.of("judge-key")), eq(1.0));
+  }
}