EntityProcess · christso · Jul 2, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/.agents/product-boundary.md b/.agents/product-boundary.md
@@ -80,6 +80,8 @@ Use public reference standards before inventing AgentV-specific contracts:
 - Hugging Face Datasets for dataset, split, record, and portable corpus conventions.
 - OpenInference for trace, span, tool-call, and model-observability semantics.
 
+Research those references from local cloned repositories first when a clone is available, and use DeepWiki MCP for repository-level orientation or cross-repo questions. Broad web search is a fallback, not the default. If current public documentation matters for the decision, use official docs and record the exact source or commit alongside the conclusion.
+
 Treat these as reference inputs, not dependencies. AgentV should adopt the shared lowest common denominator when it fits the repo-native artifact model, and document any intentional divergence in the relevant plan, ADR, or contract docs.
 
 ### 5. YAGNI - You Aren't Gonna Need It

diff --git a/AGENTS.md b/AGENTS.md
@@ -26,6 +26,7 @@ Design guardrails:
 - Document composition patterns before inventing a new feature.
 - Match industry-standard lowest-common-denominator contracts when possible.
 - When designing AgentV contracts, check public reference standards such as Claude Skills, Vercel agent-eval, Hugging Face Datasets, and OpenInference before inventing AgentV-specific shapes. Use their shared lowest common denominator where it fits, and document any intentional divergence.
+- For peer-framework research, prefer local cloned repositories and DeepWiki MCP over broad web search. If a public contract must be verified as current, use official docs and record the source or commit behind the conclusion.
 - Apply YAGNI aggressively and solve the current request with the smallest surface that works.
 - Keep extensions non-breaking unless a same-week unreleased surface should be hard-corrected.
 - Design for AI comprehension with self-describing modules, clear extension points, and no dead scaffolding.

diff --git a/apps/cli/src/commands/eval/commands/bundle.ts b/apps/cli/src/commands/eval/commands/bundle.ts
@@ -30,7 +30,7 @@ function unique(values: readonly string[]): readonly string[] {
 
 function targetReferenceNames(target: TargetDefinition): readonly string[] {
   const references: string[] = [];
-  for (const key of ['use_target', 'grader_target', 'judge_target'] as const) {
+  for (const key of ['use_target', 'grader_target'] as const) {
     const value = target[key];
     if (typeof value === 'string' && value.trim().length > 0 && !value.includes('${{')) {
       references.push(value.trim());

diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts
@@ -497,7 +497,7 @@ function buildEvalCase(
 
 function targetReferenceNames(target: TargetDefinition): readonly string[] {
   const references: string[] = [];
-  for (const key of ['use_target', 'grader_target', 'judge_target'] as const) {
+  for (const key of ['use_target', 'grader_target'] as const) {
     const value = target[key];
     if (typeof value === 'string' && value.trim().length > 0 && !value.includes('${{')) {
       references.push(value.trim());
@@ -831,7 +831,7 @@ async function collectWorkspaceReferences(
 
     for (const hookName of ['before_all', 'before_each', 'after_each', 'after_all'] as const) {
       const hook = hooks[hookName];
-      const command = hook?.command ?? hook?.script;
+      const command = hook?.command;
       if (!command || command.length === 0) {
         continue;
       }

diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
@@ -293,7 +293,7 @@ async function writeGraderConfigs(
         weight: r.weight ?? 1.0,
         ...(r.score_ranges ? { score_range: r.score_ranges } : {}),
         ...(r.required !== undefined ? { required: r.required } : {}),
-        ...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}),
+        ...(r.min_score !== undefined ? { min_score: r.min_score } : {}),
       }));
 
       await writeJson(join(llmGradersDir, `${config.name}.json`), {

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -476,7 +476,7 @@ async function writeGraderConfigs(
         weight: r.weight ?? 1.0,
         ...(r.score_ranges ? { score_range: r.score_ranges } : {}),
         ...(r.required !== undefined ? { required: r.required } : {}),
-        ...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}),
+        ...(r.min_score !== undefined ? { min_score: r.min_score } : {}),
       }));
 
       await writeJson(join(llmGradersDir, `${config.name}.json`), {

diff --git a/apps/cli/src/commands/runs/rerun.ts b/apps/cli/src/commands/runs/rerun.ts
@@ -141,7 +141,7 @@ function resolveWholeEnvReference(value: unknown): string | undefined {
 
 function referencedTargetNames(definition: Record<string, unknown>): readonly string[] {
   const names: string[] = [];
-  for (const key of ['use_target', 'grader_target', 'judge_target'] as const) {
+  for (const key of ['use_target', 'grader_target'] as const) {
     const resolved = resolveWholeEnvReference(definition[key]);
     if (resolved && !resolved.includes('${{')) {
       names.push(resolved);

diff --git a/apps/cli/src/templates/.agentv/targets.yaml b/apps/cli/src/templates/.agentv/targets.yaml
@@ -1,6 +1,7 @@
 # A list of all supported evaluation targets for the project.
 # Each target defines a provider and its specific configuration.
 # Actual values for paths/keys are stored in the local .env file.
+# Agent and CLI targets use grader_target to reference an LLM target for scoring.
 
 targets:
   - name: default
@@ -12,7 +13,7 @@ targets:
 
   - name: codex
     provider: codex
-    judge_target: azure-llm
+    grader_target: azure-llm
     # Uses the Codex CLI (defaults to `codex` on PATH)
     # executable: ${{ CODEX_CLI_PATH }}        # Optional: override executable path
     # args:                             # Optional additional CLI arguments
@@ -29,7 +30,7 @@ targets:
   # Claude - Anthropic's Claude Agent SDK
   - name: claude
     provider: claude
-    judge_target: azure-llm
+    grader_target: azure-llm
     # Uses the @anthropic-ai/claude-agent-sdk
     # model: claude-sonnet-4-20250514          # Optional: override model
     # cwd: ${{ CLAUDE_WORKSPACE_DIR }}         # Optional: working directory (defaults to process.cwd())
@@ -53,7 +54,7 @@ targets:
 
   - name: local_cli
     provider: cli
-    judge_target: azure-llm
+    grader_target: azure-llm
     # Passes the fully rendered prompt and any attached files to a local Python script
     # NOTE: Do not add quotes around {PROMPT} or {FILES} - they are already shell-escaped
     command: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE}

diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -260,6 +260,48 @@ describe('buildGradingArtifact', () => {
     expect(grading.graders?.[1].score).toBe(0.7);
   });
 
+  it('preserves multi-aspect grader assertions at top level and under the grader', () => {
+    const rubricAssertions = [
+      {
+        text: '[accuracy] Answer matches the reference - Score: 8/10 (strong)',
+        passed: true,
+        evidence: 'The answer includes the expected facts.',
+      },
+      {
+        text: '[citations] Answer cites the source - Score: 4/10 (weak)',
+        passed: false,
+        evidence: 'The answer does not cite a source.',
+      },
+    ];
+    const result = makeResult({
+      assertions: rubricAssertions,
+      scores: [
+        makeEvaluatorResult({
+          name: 'rubric-review',
+          type: 'llm-grader',
+          score: 0.6,
+          assertions: rubricAssertions,
+        }),
+      ],
+    });
+
+    const grading = buildGradingArtifact(result);
+
+    expect(grading.assertions).toEqual(rubricAssertions);
+    expect(grading.summary).toEqual({
+      passed: 1,
+      failed: 1,
+      total: 2,
+      pass_rate: 0.5,
+    });
+    expect(grading.graders?.[0]).toMatchObject({
+      name: 'rubric-review',
+      type: 'llm-grader',
+      score: 0.6,
+      assertions: rubricAssertions,
+    });
+  });
+
   it('keeps grading.json focused on grading evidence', () => {
     const result = makeResult({ error: 'Timeout exceeded' });
     const grading = buildGradingArtifact(result);

diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
@@ -345,15 +345,16 @@ Any grader in `assertions` can be marked as `required`. When a required grader f
 | Value | Behavior |
 |-------|----------|
 | `required: true` | Must score >= 0.8 (default threshold) to pass |
-| `required: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) |
+| `required: true` + `min_score: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) |
 
 ```yaml
 assertions:
   - type: contains
     value: "DENIED"
     required: true          # must pass (>= 0.8)
   - type: rubrics
-    required: 0.6           # must score at least 0.6
+    required: true
+    min_score: 0.6          # must score at least 0.6
     criteria:
       - id: quality
         outcome: Response is well-structured

diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx
@@ -74,7 +74,7 @@ assertions:
 | `score_ranges` | — | Score range definitions (analytic mode) |
 
 :::note
-`required_min_score` (0–10 integer scale) is deprecated. Use `min_score` (0–1 scale) instead. For example, `required_min_score: 8` becomes `min_score: 0.8`.
+Use `min_score` (0–1 scale) for analytic rubric gating. The only 0–10 values in authored rubrics are `score_ranges` bands and grader outputs.
 :::
 
 ### Criterion Operators

diff --git a/apps/web/src/content/docs/docs/graders/custom-graders.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx
@@ -77,7 +77,7 @@ final_score = sum(score_i * weight_i) / sum(weight_i)
 ```
 
 If `weight` is omitted, it defaults to `1.0` (equal weighting).
-If any grader has `required: true` (or `required: <threshold>`) and scores below its required threshold, the overall test score is forced to `0`.
+If any grader has `required: true` and scores below its required threshold, the overall test score is forced to `0`. Use `min_score` for a custom threshold.
 
 ## Best Practices
 

diff --git a/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx b/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx
@@ -119,13 +119,12 @@ For each materialized repo, AgentV resolves acquisition in this order:
 
 | Order | Source | How it is used |
 |-------|--------|----------------|
-| 1 | Explicit resolver | `workspace.repos[].resolver` names a configured command resolver. If it returns `handled:false`, AgentV fails clearly instead of guessing. |
-| 2 | Pattern resolver | The first non-`default` `repo_resolvers[]` entry whose `repos` pattern matches the repo URL or identity. If it returns `handled:false`, AgentV continues to the default resolver. |
-| 3 | Default resolver | The resolver named `default`, if configured. It must not declare `repos`; it is the unconditional project default. If it returns `handled:false`, AgentV continues to the built-in git resolver. |
-| 4 | Registered project | A project in `$AGENTV_HOME/projects.yaml` whose `origin` matches the repo identity. AgentV seeds its mirror cache from that local checkout, then clones the cache into the workspace and resets `origin` to the declared repo URL. |
-| 5 | Configured mirror | A path listed under `git_cache.mirrors`. AgentV seeds its mirror cache from that checkout or bare mirror, then clones the cache into the workspace. |
-| 6 | Mirror cache | An AgentV-owned bare cache under `$AGENTV_DATA_DIR/git-cache/<hash>`. Cache population is locked, cloned into a temporary path, verified, and atomically renamed before use. |
-| 7 | Remote clone | The normalized clone URL from the eval's `repo` field. |
+| 1 | Pattern resolver | The first non-`default` `repo_resolvers[]` entry whose `repos` pattern matches the repo URL or identity. If it returns `handled:false`, AgentV continues to the default resolver. |
+| 2 | Default resolver | The resolver named `default`, if configured. It must not declare `repos`; it is the unconditional project default. If it returns `handled:false`, AgentV continues to the built-in git resolver. |
+| 3 | Registered project | A project in `$AGENTV_HOME/projects.yaml` whose `origin` matches the repo identity. AgentV seeds its mirror cache from that local checkout, then clones the cache into the workspace and resets `origin` to the declared repo URL. |
+| 4 | Configured mirror | A path listed under `git_cache.mirrors`. AgentV seeds its mirror cache from that checkout or bare mirror, then clones the cache into the workspace. |
+| 5 | Mirror cache | An AgentV-owned bare cache under `$AGENTV_DATA_DIR/git-cache/<hash>`. Cache population is locked, cloned into a temporary path, verified, and atomically renamed before use. |
+| 6 | Remote clone | The normalized clone URL from the eval's `repo` field. |
 
 Workspace clones are independent from user-owned checkouts, configured mirrors,
 and resolver source directories. AgentV does not leave Git alternates pointing

diff --git a/apps/web/src/content/docs/docs/targets/coding-agents.mdx b/apps/web/src/content/docs/docs/targets/coding-agents.mdx
@@ -5,7 +5,7 @@ sidebar:
   order: 3
 ---
 
-Coding agent targets evaluate AI coding assistants and CLI-based agents. These targets require a `grader_target` (also accepts `judge_target` for backward compatibility) to run LLM-based graders.
+Coding agent targets evaluate AI coding assistants and CLI-based agents. These targets require a `grader_target` to run LLM-based graders.
 
 ## Prompt format
 
@@ -73,39 +73,6 @@ targets:
 | `cwd` | No | Working directory |
 | `grader_target` | Yes | LLM target for evaluation |
 
-## cc-mirror
-
-[cc-mirror](https://github.com/numman-ali/cc-mirror) creates isolated Claude Code variants that route through alternative providers (Z.ai, Kimi, MiniMax, OpenRouter, etc.). The `cc-mirror` provider alias resolves to `claude-cli` and auto-discovers the binary path from `~/.cc-mirror/<variant>/variant.json`.
-
-```yaml
-targets:
-  # Explicit variant with known executable
-  - name: claude-zai
-    provider: cc-mirror
-    executable: claude-zai
-    grader_target: azure-base
-
-  # Auto-discover binary from variant.json
-  - name: my-kimi
-    provider: cc-mirror
-    grader_target: azure-base
-```
-
-| Field | Required | Description |
-|-------|----------|-------------|
-| `executable` | No | CLI binary name or path. When set, used directly (skips variant.json lookup). |
-| `variant` | No | Variant name (directory under `~/.cc-mirror/`). Defaults to target `name`. Used to locate `variant.json` when `executable` is not set. |
-| `cwd` | No | Working directory |
-| `grader_target` | Yes | LLM target for evaluation |
-
-Setup a variant first, then reference it by name:
-
-```bash
-npx cc-mirror quick --provider zai --name claude-zai --api-key "$Z_AI_API_KEY"
-```
-
-Since `cc-mirror` resolves to `claude-cli`, all Claude target fields (model, system_prompt, timeout_seconds, etc.) are also supported.
-
 ## Codex CLI
 
 ```yaml
@@ -131,7 +98,7 @@ targets:
 ```yaml
 targets:
   - name: copilot
-    provider: copilot
+    provider: copilot-cli
     model: gpt-5-mini
     grader_target: azure-base
 ```
@@ -140,7 +107,7 @@ targets:
 |-------|----------|-------------|
 | `model` | No | Model to use (defaults to copilot's default) |
 | `cwd` | No | Working directory |
-| `subprovider` | No | OpenAI-compatible provider type for `copilot`, `copilot-cli`, or `copilot-sdk`, such as `openai` or `azure` |
+| `subprovider` | No | OpenAI-compatible provider type for `copilot-cli` or `copilot-sdk`, such as `openai` or `azure` |
 | `base_url` | No | Provider base URL or Azure resource URL/name |
 | `api_key` | No | Provider API key. Prefer `${{ ENV_VAR }}` references. |
 | `bearer_token` | No | Provider bearer token. Prefer `${{ ENV_VAR }}` references. Takes precedence over `api_key` when set. |
@@ -308,7 +275,7 @@ The VS Code provider uses a **subagent file-messaging architecture**. AgentV pro
 ```yaml
 targets:
   - name: copilot
-    provider: copilot
+    provider: copilot-cli
     executable: ${{ COPILOT_EXE }}
     grader_target: azure-base
 ```

diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx
@@ -76,7 +76,7 @@ tests:
 
 ## Grader Target
 
-Agent targets that need LLM-based evaluation specify a `grader_target` (also accepts `judge_target` for backward compatibility) — the LLM used to run LLM grader graders:
+Agent targets that need LLM-based evaluation specify a `grader_target` — the LLM used to run LLM-based graders:
 
 ```yaml
 targets:
@@ -171,7 +171,6 @@ workspace:
 |-------|-------------|
 | `repos[].path` | Directory within the workspace to clone into |
 | `repos[].repo` | Repository identity: full clone URL or GitHub `org/name` shorthand |
-| `repos[].resolver` | Optional configured `repo_resolvers[].name` override |
 | `repos[].commit` | Branch, tag, or SHA to check out (default: `HEAD`) |
 | `repos[].base_commit` | Alias for `commit`, useful for SWE-bench-style datasets |
 | `repos[].ancestor` | Walk N commits back from the checked-out ref (e.g., `1` for parent) |

diff --git a/apps/web/src/content/docs/docs/tools/dashboard.mdx b/apps/web/src/content/docs/docs/tools/dashboard.mdx
@@ -314,7 +314,7 @@ results:
   auto_push: false
 ```
 
-Project-local `.agentv/config.yaml` is for portable eval defaults such as `execution`, `eval_patterns`, and `dashboard`. Do not put `projects` in project-local config; AgentV warns and ignores it there. `results_by_project` is deprecated; use `projects[].results` in `$AGENTV_HOME/config.yaml`.
+Project-local `.agentv/config.yaml` is for portable eval defaults such as `execution`, `eval_patterns`, and `dashboard`. Do not put `projects` in project-local config; AgentV warns and ignores it there. Put per-project results settings in `projects[].results` in `$AGENTV_HOME/config.yaml`.
 
 The project `repo` and the `results` block sync different repositories:
 

diff --git a/docs/adr/0013-experiment-is-metadata-expressed-as-tags-experiment.md b/docs/adr/0013-experiment-is-metadata-expressed-as-tags-experiment.md
@@ -4,7 +4,7 @@ Date: 2026-07-01
 
 ## Status
 
-Accepted
+Accepted, then **superseded** (eval-authoring portions) by [ADR 0016](0016-promptfoo-superset-eval-authoring-contract.md) as part of the promptfoo-superset restructure (2026-07-02).
 
 Extends [ADR 0009](0009-eval-path-result-identity-and-default-experiment.md) and
 builds on [ADR 0012](0012-finalize-run-artifact-layout.md), which established

diff --git a/docs/adr/0013-stabilize-eval-authoring-contract.md b/docs/adr/0013-stabilize-eval-authoring-contract.md
@@ -4,7 +4,7 @@ Date: 2026-07-01
 
 ## Status
 
-Accepted
+Accepted, then **superseded** (eval-authoring portions) by [ADR 0016](0016-promptfoo-superset-eval-authoring-contract.md) as part of the promptfoo-superset restructure (2026-07-02).
 
 Supersedes the eval-authoring placement portions of
 [ADR 0002](0002-keep-harbor-benchmark-execution-behind-runner-boundary.md),