diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts
index 9aec62e11..7e86e6872 100644
--- a/apps/cli/src/commands/eval/task-bundle.ts
+++ b/apps/cli/src/commands/eval/task-bundle.ts
@@ -298,7 +298,7 @@ async function copyDirectory(sourcePath: string, destinationPath: string): Promi
 }
 
 function shouldCopyDirectory(reference: BundleSourceReference): boolean {
-  if (reference.kind !== 'code_grader_cwd') {
+  if (reference.kind !== 'script_grader_cwd' && reference.kind !== 'code_grader_cwd') {
     return true;
   }
   return !path.isAbsolute(reference.displayPath);
diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
index 738924275..ef4eb3773 100644
--- a/apps/cli/src/commands/pipeline/grade.ts
+++ b/apps/cli/src/commands/pipeline/grade.ts
@@ -107,6 +107,7 @@ export async function runCodeGraders(
   const executeCodeGrader = async (graderConfig: Record<string, unknown>, task: GraderTask) => {
     const { testId, resultsDir, responseText, inputData } = task;
     const graderName = graderConfig.name as string;
+    const graderType = typeof graderConfig.type === 'string' ? graderConfig.type : 'script';
     const messages = [{ role: 'assistant' as const, content: responseText }];
     const trace = buildTraceFromMessages({
       input: inputData.input,
@@ -157,7 +158,7 @@ export async function runCodeGraders(
 
       await writeFile(
         join(resultsDir, `${graderName}.json`),
-        `${JSON.stringify({ name: graderName, type: 'code-grader', score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`,
+        `${JSON.stringify({ name: graderName, type: graderType, score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`,
         'utf8',
       );
     } catch (error) {
@@ -167,7 +168,7 @@ export async function runCodeGraders(
 
       await writeFile(
         join(resultsDir, `${graderName}.json`),
-        `${JSON.stringify({ name: graderName, type: 'code-grader', score: 0, weight: graderConfig.weight ?? 1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`,
+        `${JSON.stringify({ name: graderName, type: graderType, score: 0, weight: graderConfig.weight ?? 1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`,
         'utf8',
       );
     }
diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
index 72ee29955..efd07824c 100644
--- a/apps/cli/src/commands/pipeline/input.ts
+++ b/apps/cli/src/commands/pipeline/input.ts
@@ -22,7 +22,7 @@ import { readFile } from 'node:fs/promises';
 import { mkdir, writeFile } from 'node:fs/promises';
 import { dirname, join, relative, resolve } from 'node:path';
 
-import type { CodeGraderConfig, GraderConfig, LlmGraderConfig } from '@agentv/core';
+import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core';
 
 /** Assertion types that can be graded deterministically without external scripts or LLMs. */
 const BUILTIN_ASSERTION_TYPES = new Set([
@@ -252,15 +252,15 @@ async function writeGraderConfigs(
   let hasLlmGraders = false;
 
   for (const assertion of assertions) {
-    if (assertion.type === 'code-grader') {
+    if (assertion.type === 'script' || assertion.type === 'code-grader') {
       if (!hasCodeGraders) {
         await mkdir(codeGradersDir, { recursive: true });
         hasCodeGraders = true;
       }
-      const config = assertion as CodeGraderConfig;
+      const config = assertion as ScriptGraderConfig;
       await writeJson(join(codeGradersDir, `${config.name}.json`), {
         name: config.name,
-        type: 'code-grader',
+        type: 'script',
         command: config.command,
         cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
         weight: config.weight ?? 1.0,
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index 99672c5bf..8437f732d 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -18,7 +18,7 @@ import { tmpdir } from 'node:os';
 import { dirname, join, relative, resolve } from 'node:path';
 
 import { deriveCategory, loadTestSuite } from '@agentv/core';
-import type { CodeGraderConfig, GraderConfig, LlmGraderConfig } from '@agentv/core';
+import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core';
 import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';
 
 import { buildDefaultRunDir } from '../eval/result-layout.js';
@@ -439,14 +439,15 @@ async function writeGraderConfigs(
   let hasLlmGraders = false;
 
   for (const assertion of assertions) {
-    if (assertion.type === 'code-grader') {
+    if (assertion.type === 'script' || assertion.type === 'code-grader') {
       if (!hasCodeGraders) {
         await mkdir(codeGradersDir, { recursive: true });
         hasCodeGraders = true;
       }
-      const config = assertion as CodeGraderConfig;
+      const config = assertion as ScriptGraderConfig;
       await writeJson(join(codeGradersDir, `${config.name}.json`), {
         name: config.name,
+        type: 'script',
         command: config.command,
         cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
         weight: config.weight ?? 1.0,
diff --git a/apps/cli/test/commands/eval/task-bundle.test.ts b/apps/cli/test/commands/eval/task-bundle.test.ts
index c44b34988..9d7aeb4be 100644
--- a/apps/cli/test/commands/eval/task-bundle.test.ts
+++ b/apps/cli/test/commands/eval/task-bundle.test.ts
@@ -72,7 +72,7 @@ describe('materializeTaskBundle', () => {
             graderName: 'quality',
           },
           {
-            kind: 'code_grader_command',
+            kind: 'script_grader_command',
             displayPath: scriptPath,
             resolvedPath: scriptPath,
             graderName: 'quality',
diff --git a/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx b/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx
index 80c20f7a0..1e2d66724 100644
--- a/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx
@@ -54,7 +54,7 @@ tests:
 
     assertions:
       - name: decision-check
-        type: code-grader
+        type: script
         command: [bun, run, ./scripts/check-output.ts]
         cwd: .
 
@@ -82,7 +82,7 @@ tests:
 
     assertions:
       - name: decision-check
-        type: code-grader
+        type: script
         command: [bun, run, ./scripts/check-output.ts]
         cwd: .
 ```
@@ -141,7 +141,7 @@ AgentV extracts tool calls directly from `output[].tool_calls[]` for `tool_traje
 
 ## Grader Implementation
 
-Each test has its own grader that validates the batch runner output. The grader receives the standard `code_grader` input via stdin.
+Each test has its own grader that validates the batch runner output. The grader receives the standard `script` grader input via stdin.
 
 **Input (stdin):**
 ```json
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
index 4431a1a57..baf8040b9 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
@@ -24,14 +24,12 @@ tests:
 | Field | Required | Description |
 |-------|----------|-------------|
 | `id` | Yes | Unique identifier for the test |
-| `criteria` | Conditional | Description of what a correct response should contain. Required only when the case has no `expected_output` or `assertions` |
 | `input` | Yes | Input sent to the target (string, object, or message array) |
-| `expected_output` | No | Expected response for comparison (string, object, or message array) |
+| `expected_output` | No | Passive reference response available to graders (string, object, or message array) |
+| `assertions` / `assert` | Yes | Per-test graders; plain strings become `g-eval` rubric checks |
 | `execution` | No | Per-case grader/default overrides such as `skip_defaults`; target selection belongs in top-level `target` or CLI `--target` |
 | `workspace` | No | Per-case workspace config (overrides suite-level) |
 | `metadata` | No | Arbitrary key-value pairs passed to graders and workspace scripts |
-| `rubrics` | No | Structured evaluation criteria |
-| `assertions` | No | Per-test graders |
 
 ## Input
 
@@ -41,7 +39,7 @@ The simplest form is a string, which expands to a single user message:
 input: What is 15 + 27?
 ```
 
-Structured object input also expands to a single user message while preserving the object for code graders and batch runners:
+Structured object input also expands to a single user message while preserving the object for script graders and batch runners:
 
 ```yaml
 input:
@@ -71,8 +69,8 @@ Optional reference response for comparison by graders. Write `expected_output` a
 a golden answer or reference response the target could have produced, not as a
 rubric or "the agent should..." criteria list. `expected_output` is passive
 reference data: it is stored on the case and passed to graders, but it does not
-choose a grader by itself when `assertions` is present. Add explicit assertion
-strings, `llm-grader`, `code-grader`, `field-accuracy`, or another
+choose a grader by itself. Add explicit assertion
+strings, `llm-grader`, `script`, `field-accuracy`, or another
 reference-aware grader when you want the reference answer evaluated.
 
 A string expands to a single assistant message:
@@ -98,10 +96,10 @@ eval suites, or tags/filters for target-specific cases.
 ```yaml
 tests:
   - id: complex-case
-    criteria: Provides detailed explanation
     input: Explain quicksort algorithm
 
     assertions:
+      - Provides a detailed explanation
       - name: depth_check
         type: llm-grader
         prompt: ./graders/depth.md
@@ -117,16 +115,17 @@ assertions:
 
 tests:
   - id: normal-case
-    criteria: Returns correct answer
     input: What is 2+2?
+    assertions:
+      - Returns the correct answer
     # Gets latency_check from root-level assertions
 
   - id: special-case
-    criteria: Handles edge case
     input: Handle this edge case
     execution:
       skip_defaults: true
     assertions:
+      - Handles the edge case
       - name: custom_eval
         type: llm-grader
     # Does NOT get latency_check
@@ -144,16 +143,18 @@ workspace:
 
 tests:
   - id: case-1
-    criteria: Should work
     input: Do something
+    assertions:
+      - Completes the requested task
     workspace:
       hooks:
         before_all:
           command: ["bun", "run", "custom-setup.ts"]
 
   - id: case-2
-    criteria: Should also work
     input: Do something else
+    assertions:
+      - Completes the requested task
     # Inherits suite-level hooks.before_all
 ```
 
@@ -287,17 +288,17 @@ All deterministic assertions support these optional fields:
 ```yaml
 tests:
   - id: no-competitors
-    criteria: Response must not mention any competitor
     input: "Describe our product advantages."
     assertions:
+      - Response must not mention any competitor
       - type: contains-any
         value: ["CompetitorA", "CompetitorB", "CompetitorC"]
         negate: true
 
   - id: required-inputs
-    criteria: Agent asks for missing rule codes
     input: "Process customs entry for country BE."
     assertions:
+      - Agent asks for missing rule codes
       - name: asks-for-rule-codes
         type: icontains-any
         value: ["rule code", "rule codes"]
@@ -311,13 +312,12 @@ Assertion graders auto-generate a `name` when one is not provided (e.g., `contai
 
 ### Advanced Rubric Assertions
 
-Use `type: rubrics` with a `criteria` array only when you need weights,
+Use `type: g-eval` with a `criteria` array only when you need weights,
 required flags, or score ranges:
 
 ```yaml
 tests:
   - id: denied-party
-    criteria: Must identify denied party
     input:
       - role: user
         content: Screen "Acme Corp" against denied parties list
@@ -328,7 +328,7 @@ tests:
       - type: contains
         value: "DENIED"
         required: true
-      - type: rubrics
+      - type: g-eval
         criteria:
           - id: accuracy
             outcome: Correctly identifies the denied party
@@ -352,7 +352,7 @@ assertions:
   - type: contains
     value: "DENIED"
     required: true          # must pass (>= 0.8)
-  - type: rubrics
+  - type: g-eval
     required: true
     min_score: 0.6          # must score at least 0.6
     criteria:
@@ -373,24 +373,22 @@ Required gates are evaluated after all graders run. If any required grader falls
 
 ## How Reference Fields and `assertions` Interact
 
-The `criteria` and `expected_output` fields are **data fields** that describe what the
-response should accomplish. They are not graders themselves — how they get used depends
-on whether `assertions` is present.
+`expected_output` is reference data, not a grader. It is stored on the case and
+provided to graders that know how to use it, but it does not create an LLM
+grading call by itself. Put the grading contract in `assertions` or `assert`.
 
-### No `assertions` — implicit LLM grader
-
-When a test has no `assertions` field, a default `llm-grader` grader runs automatically
-and uses the case context, including `criteria` and `expected_output` when present:
+Plain assertion strings are the default shape for semantic checks:
 
 ```yaml
 tests:
   - id: simple-eval
-    criteria: Assistant correctly explains the bug and proposes a fix
     input: "Debug this function..."
-    # No assertions → default llm-grader evaluates against criteria
+    assertions:
+      - Assistant correctly explains the bug and proposes a fix
 ```
 
-Suite-level `preprocessors` also apply to this implicit grader. That matters when the agent output is a `ContentFile` block rather than plain text:
+Suite-level `preprocessors` apply to explicit LLM graders. That matters when the
+agent output is a `ContentFile` block rather than plain text:
 
 ```yaml
 preprocessors:
@@ -399,16 +397,15 @@ preprocessors:
 
 tests:
   - id: spreadsheet-eval
-    criteria: Output includes the revenue rows
     input: Generate the spreadsheet report
+    assertions:
+      - Output includes the revenue rows
 ```
 
-### `assertions` present — explicit graders only
-
-When `assertions` is defined, only the declared graders run. No implicit grader is added
-because `criteria` or `expected_output` exists. Graders that are declared (such as
-plain rubric strings, `llm-grader`, `code-grader`, or `rubrics`) receive the case
-context, including `criteria` and `expected_output`, as input automatically.
+When `assertions` is defined, only the declared graders run. No implicit grader is
+added because `expected_output` exists. Declared graders such as plain rubric
+strings, `llm-grader`, `script`, or `g-eval` receive the case context, including
+`expected_output`, as input automatically.
 
 This means a case with `expected_output` and only deterministic assertions evaluates only
 those deterministic assertions:
@@ -424,7 +421,7 @@ tests:
 ```
 
 For contract-style evals where assertion strings express every semantic check,
-omit `criteria`:
+keep those checks in `assertions`:
 
 ```yaml
 tests:
@@ -440,21 +437,11 @@ tests:
       - The answer avoids preserving one-off observations as durable guidance.
 ```
 
-If `assertions` contains only deterministic graders (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted:
-
-```
-Warning: Test 'my-test': criteria is defined but no grader in assertions
-will evaluate it. Add a rubric assertion string or another grader to assertions,
-or remove criteria if it is documentation-only.
-```
-
-To use `criteria` alongside deterministic checks, add a rubric assertion string
-or another grader explicitly:
+To combine deterministic checks with semantic checks, add both explicitly:
 
 ```yaml
 tests:
   - id: mixed-eval
-    criteria: Response is helpful and mentions the fix
     input: "Debug this function..."
     assertions:
       - Explains why the bug happens
@@ -471,9 +458,9 @@ preprocessors:
 
 tests:
   - id: mixed-eval
-    criteria: Response is helpful and mentions the fix
     input: "Debug this function..."
     assertions:
+      - Response is helpful and mentions the fix
       - type: llm-grader        # use explicit form for custom preprocessors
         preprocessors:
           - type: xlsx
@@ -489,11 +476,12 @@ Pass additional context through the `metadata` field:
 ```yaml
 tests:
   - id: code-gen
-    criteria: Generates valid Python
     metadata:
       language: python
       difficulty: medium
     input: Write a function to sort a list
+    assertions:
+      - Generates valid Python
 ```
 
 `metadata` is passed to workspace lifecycle hooks as `case_metadata`, preserved
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
index c53d61e4f..88a3bfc45 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
@@ -239,7 +239,7 @@ tests:
 ```
 
 `assertions` supports rubric shorthand strings, deterministic assertion types
-(`contains`, `regex`, `is_json`, `equals`), `rubrics`, LLM graders, and code
+(`contains`, `regex`, `is_json`, `equals`), `g-eval`, LLM graders, and script
 graders. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for
 per-test assertions usage.
 
diff --git a/apps/web/src/content/docs/docs/evaluation/examples.mdx b/apps/web/src/content/docs/docs/evaluation/examples.mdx
index 9d69a737c..3d19bfea7 100644
--- a/apps/web/src/content/docs/docs/evaluation/examples.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/examples.mdx
@@ -69,7 +69,7 @@ tests:
 
 ## Multi-Grader
 
-Combine a code grader and an LLM grader on the same test:
+Combine a script grader and an LLM grader on the same test:
 
 ```yaml
 description: JSON generation with validation
@@ -81,7 +81,7 @@ tests:
 
     assertions:
       - name: json_format_validator
-        type: code-grader
+        type: script
         command: [uv, run, validate_json.py]
         cwd: ./graders
       - name: content_evaluator
@@ -310,7 +310,7 @@ tests:
 
     assertions:
       - name: decision-check
-        type: code-grader
+        type: script
         command: [bun, run, ./scripts/check-batch-cli-output.ts]
         cwd: .
 
@@ -343,7 +343,7 @@ tests:
 
     assertions:
       - name: decision-check
-        type: code-grader
+        type: script
         command: [bun, run, ./scripts/check-batch-cli-output.ts]
         cwd: .
 ```
diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx
index 2fb186e6f..acebb043f 100644
--- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx
@@ -22,11 +22,11 @@ tests:
       - States time complexity
 ```
 
-All strings are collected into a single rubrics grader automatically.
+All strings are collected into a single `g-eval` grader automatically.
 
 ### Full form for advanced options
 
-Use `type: rubrics` explicitly when you need weights, required flags, or score ranges:
+Use `type: g-eval` explicitly when you need weights, required flags, or score ranges:
 
 ```yaml
 tests:
@@ -34,7 +34,7 @@ tests:
     criteria: Explain how quicksort works
     input: Explain quicksort algorithm
     assertions:
-      - type: rubrics
+      - type: g-eval
         criteria:
           - Mentions divide-and-conquer approach
           - Explains partition step
@@ -47,7 +47,7 @@ For fine-grained control, use rubric objects with weights and requirements:
 
 ```yaml
 assertions:
-  - type: rubrics
+  - type: g-eval
     criteria:
       - id: core-concept
         outcome: Explains divide-and-conquer
@@ -74,7 +74,7 @@ assertions:
 | `score_ranges` | — | Score range definitions (analytic mode) |
 
 :::note
-Use `min_score` for analytic rubric gating. The only 0–10 values in authored rubrics are `score_ranges` bands and grader outputs.
+Use `min_score` for analytic rubric gating. The only 0–10 values in authored `g-eval` rubrics are `score_ranges` bands and grader outputs.
 :::
 
 ### Criterion Operators
@@ -83,7 +83,7 @@ Use `operator` when the criterion outcome should be interpreted with a specific
 
 ```yaml
 assertions:
-  - type: rubrics
+  - type: g-eval
     criteria:
       - id: supported-revenue
         operator: correctness
@@ -103,7 +103,7 @@ For quality gradients instead of binary pass/fail, use score ranges:
 
 ```yaml
 assertions:
-  - type: rubrics
+  - type: g-eval
     criteria:
       - id: accuracy
         outcome: Provides correct answer
@@ -170,12 +170,12 @@ tests:
     criteria: Generates correct, clean Python code
     input: Write a fibonacci function
     assertions:
-      - type: rubrics
+      - type: g-eval
         criteria:
           - Returns correct values for n=0,1,2,10
           - Uses meaningful variable names
           - Includes docstring
       - name: syntax_check
-        type: code-grader
+        type: script
         command: [./validators/check_python.py]
 ```
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index b13595cfd..1b5b9caad 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -42,7 +42,7 @@ Each `scores[]` entry includes per-grader timing:
 }
 ```
 
-The `duration_ms`, `started_at`, and `ended_at` fields are present on every grader result (including `code-grader`), enabling per-grader bottleneck analysis.
+The `duration_ms`, `started_at`, and `ended_at` fields are present on every grader result (including `script` graders), enabling per-grader bottleneck analysis.
 
 ## Common Options
 
@@ -419,7 +419,7 @@ fixture so AgentV still runs graders against real or frozen candidate output.
 
 ## Run a Single Assertion
 
-Run a code-grader assertion in isolation without executing a full eval suite:
+Run a script assertion in isolation without executing a full eval suite:
 
 ```bash
 agentv eval assert <name> --agent-output <text> --agent-input <text>
@@ -441,7 +441,7 @@ The `--file` option reads a JSON file with `{ "output": "...", "input": "..." }`
 
 **Exit codes:** 0 if score >= 0.5 (pass), 1 if score < 0.5 (fail).
 
-This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `assertions` instructions for code graders so external grading agents can execute them directly.
+This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `agentv eval assert` instructions for script graders so external grading agents can execute them directly.
 
 ## Offline Grading
 
diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx
index 10698004b..4b2f75222 100644
--- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx
@@ -9,7 +9,7 @@ YAML remains AgentV's canonical, portable eval format. The SDK surfaces below ar
 
 AgentV currently provides two npm packages for programmatic use:
 
-- **`@agentv/sdk`** — user-facing SDK for `evaluate()`, YAML-aligned eval authoring, custom assertions, and code graders
+- **`@agentv/sdk`** — user-facing SDK for `evaluate()`, YAML-aligned eval authoring, custom assertions, and script graders
 - **`@agentv/core`** — core implementation package and typed configuration
 
 ## Installation
@@ -140,7 +140,7 @@ export default defineEval({
         graders.exact('{"message":"Hello"}', { name: 'exact-json', minScore: 1 }),
         graders.regex(/"message"\s*:/, { name: 'message-key' }),
         graders.json({ name: 'valid-json', required: true }),
-        graders.rubrics(['Greets the user'], { name: 'rubric-review' }),
+        graders.gEval(['Greets the user'], { name: 'rubric-review' }),
         graders.llmGrader({
           name: 'llm-review',
           prompt: 'Grade whether the answer is useful.',
@@ -153,7 +153,7 @@ export default defineEval({
 });
 ```
 
-The catalog covers `contains`, `equals`/`exact`, `regex`, `is-json`/`json`, `rubrics`, `llm-grader`, and `code-grader`. CamelCase SDK options such as `minScore`, `maxSteps`, and rubric `scoreRanges` lower to `min_score`, `max_steps`, and `score_ranges` when AgentV loads or serializes the suite.
+The catalog covers `contains`, `equals`/`exact`, `regex`, `is-json`/`json`, `g-eval`, `llm-grader`, and `script`. CamelCase SDK options such as `minScore`, `maxSteps`, and rubric `scoreRanges` lower to `min_score`, `max_steps`, and `score_ranges` when AgentV loads or serializes the suite.
 
 ## AgentV-Native Helper Factories
 
@@ -262,7 +262,7 @@ assertions:
     value: "Hello"
 ```
 
-## Code Graders
+## Script Graders
 
 Use `defineCodeGrader` from `@agentv/sdk` for full control over scoring with an explicit assertions array:
 
@@ -294,7 +294,7 @@ it('links to the dashboard', () => {
 ```yaml
 assertions:
   - name: vitest-welcome-banner
-    type: code-grader
+    type: script
     command: [agentv, eval, graders/welcome-banner.test.ts]
 ```
 
@@ -311,9 +311,9 @@ export default defineWorkspaceGrader(async ({ workspace }) => [
 ]);
 ```
 
-`defineCodeGrader`, `defineVitestWorkspaceGrader`, and `defineWorkspaceGrader` custom scripts are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. Plain Vitest verifier files can use `command: [agentv, eval, graders/check.test.ts]` without a custom wrapper; use `agentv eval vitest` when you need adapter flags. `defineAssertion` uses convention-based discovery instead — just place in `.agentv/assertions/` and reference by name.
+`defineCodeGrader`, `defineVitestWorkspaceGrader`, and `defineWorkspaceGrader` custom scripts are referenced in YAML with `type: script` and `command: [bun, run, grader.ts]`. Plain Vitest verifier files can use `command: [agentv, eval, graders/check.test.ts]` without a custom wrapper; use `agentv eval vitest` when you need adapter flags. `defineAssertion` uses convention-based discovery instead — just place in `.agentv/assertions/` and reference by name.
 
-For detailed patterns, input/output contracts, and language-agnostic examples, see [Code Graders](/docs/graders/code-graders/).
+For detailed patterns, input/output contracts, and language-agnostic examples, see [Script Graders](/docs/graders/code-graders/).
 
 ## Wire Format vs SDK Format
 
diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
index b844759d4..59fefede3 100644
--- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
+++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx
@@ -55,7 +55,7 @@ tests:
 
     assertions:
       - name: math_check
-        type: code-grader
+        type: script
         command: [./validators/check_math.py]
 ```
 
diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx
index 4c775491a..ea328cc84 100644
--- a/apps/web/src/content/docs/docs/graders/code-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/code-graders.mdx
@@ -1,22 +1,21 @@
 ---
-title: Code Graders
-description: Deterministic code graders in Python or TypeScript
+title: Script Graders
+description: Deterministic script graders in Python or TypeScript
 sidebar:
   order: 1
 ---
 
-Code graders are scripts that evaluate agent responses deterministically. Write them in any language — Python, TypeScript, Node, or any executable.
+Script graders are scripts that evaluate agent responses deterministically. Write them in any language — Python, TypeScript, Node, or any executable.
 
 ## Contract
 
-Code graders receive eval context via stdin JSON and return a result via stdout.
+Script graders receive eval context via stdin JSON and return a result via stdout.
 
 **Input (stdin, raw wire format):**
 ```json
 {
   "input": [{ "role": "user", "content": "What is 15 + 27?" }],
   "input_files": [],
-  "criteria": "Correctly calculates 15 + 27 = 42",
   "output": "The answer is 42.",
   "expected_output": [{ "role": "assistant", "content": "42" }],
   "messages": [{ "role": "assistant", "content": "The answer is 42." }],
@@ -86,7 +85,7 @@ fi
 
 ```yaml
 assertions:
-  - type: code-grader
+  - type: script
     command: [bash, scripts/check-pages.sh]
 ```
 
@@ -94,7 +93,7 @@ Silent one-liners work too — stdout is optional:
 
 ```yaml
 assertions:
-  - type: code-grader
+  - type: script
     command: ["bash", "-c", "[ $(wc -l < output.txt) -ge 10 ]"]
 ```
 
@@ -129,7 +128,7 @@ print(json.dumps({
 The repo-local helper in `examples/features/sdk-python/` wraps the same contract for that example checkout:
 
 ```python
-from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
+from agentv_py.grader import Assertion, CodeGraderResult, define_script
 
 
 def evaluate(context):
@@ -146,7 +145,7 @@ def evaluate(context):
     )
 
 if __name__ == "__main__":
-    define_code_grader(evaluate)
+    define_script(evaluate)
 ```
 
 Deprecated wire aliases like `output_text`, `input_text`, `reference_answer`, and `expected_output_text` are not accepted by the Python helper.
@@ -181,7 +180,7 @@ console.log(JSON.stringify({
 ```yaml
 assertions:
   - name: my_validator
-    type: code-grader
+    type: script
     command: [./validators/check_answer.py]
 ```
 
@@ -238,12 +237,12 @@ describe('welcome banner', () => {
 });
 ```
 
-Then use AgentV's built-in Vitest adapter as the `code-grader` command. The adapter copies verifier files into a temporary workspace-local path when needed, runs Vitest in `workspace_path`, reads the JSON reporter output, and maps each test outcome to an AgentV assertion:
+Then use AgentV's built-in Vitest adapter as the `script` command. The adapter copies verifier files into a temporary workspace-local path when needed, runs Vitest in `workspace_path`, reads the JSON reporter output, and maps each test outcome to an AgentV assertion:
 
 ```yaml
 assertions:
   - name: vitest-welcome-banner
-    type: code-grader
+    type: script
     command: [agentv, eval, graders/welcome-banner.test.ts]
 ```
 
@@ -271,7 +270,7 @@ Prefer Vitest verifiers when the checks naturally fit `expect(...)`. Use `define
 
 ## Target Access
 
-Code graders can call an LLM through a target proxy for metrics that require multiple LLM calls (contextual precision, semantic similarity, etc.).
+Script graders can call an LLM through a target proxy for metrics that require multiple LLM calls (contextual precision, semantic similarity, etc.).
 
 ### Configuration
 
@@ -280,7 +279,7 @@ Add a `target` block to the grader config:
 ```yaml
 assertions:
   - name: contextual-precision
-    type: code-grader
+    type: script
     command: [bun, scripts/contextual-precision.ts]
     target:
       max_calls: 10  # Default: 50
@@ -324,7 +323,7 @@ Use `target.invokeBatch(requests)` for multiple calls in parallel.
 
 ## Advanced Input Fields
 
-Beyond the basic fields (`input`, `output`, `expected_output`, `criteria`), code graders receive additional structured context:
+Beyond the basic fields (`input`, `output`, `expected_output`), script graders receive additional structured context:
 
 | Field | Type | Description |
 |-------|------|-------------|
@@ -366,7 +365,7 @@ Use `expected_output` for reference answers and `output` for the actual final an
 
 ## Workspace Access
 
-When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.repos`, or lifecycle hooks), code graders receive the prepared workspace path in two ways:
+When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.repos`, or lifecycle hooks), script graders receive the prepared workspace path in two ways:
 
 1. **JSON payload**: `workspace_path` field in the stdin input
 2. **Environment variable**: `AGENTV_WORKSPACE_PATH`
@@ -426,11 +425,11 @@ target: my_agent
 
 tests:
   - id: implement-feature
-    criteria: Agent implements the feature correctly
     input: "Implement the TODO functions in src/index.ts"
     assertions:
+      - Agent implements the feature correctly
       - name: functional-check
-        type: code-grader
+        type: script
         command: [bun, scripts/functional-check.ts]
 ```
 
@@ -465,7 +464,7 @@ The command:
 3. Prints the grader's JSON result to stdout
 4. Exits 0 if score >= 0.5, exit 1 otherwise
 
-This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `agentv eval assert` instructions for code graders so external grading agents can run them directly.
+This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `agentv eval assert` instructions for script graders so external grading agents can run them directly.
 
 ### With stdin pipe
 
diff --git a/apps/web/src/content/docs/docs/graders/composite.mdx b/apps/web/src/content/docs/docs/graders/composite.mdx
index a755c7367..ac92bce6d 100644
--- a/apps/web/src/content/docs/docs/graders/composite.mdx
+++ b/apps/web/src/content/docs/docs/graders/composite.mdx
@@ -20,7 +20,7 @@ assertions:
         type: llm-grader
         prompt: ./prompts/check1.md
       - name: evaluator_2
-        type: code-grader
+        type: script
         command: [uv, run, check2.py]
     aggregator:
       type: weighted_average
@@ -32,7 +32,7 @@ assertions:
 Each sub-grader runs independently, then the aggregator combines their results.
 Use `assertions` for composite members. `graders` is still accepted for backward compatibility.
 
-If you only need weighted-average aggregation, a plain test-level `assertions` list already computes a weighted mean across graders. Use `composite` when you need a custom aggregation strategy (`threshold`, `code-grader`, `llm-grader`) or nested grader groups.
+If you only need weighted-average aggregation, a plain test-level `assertions` list already computes a weighted mean across graders. Use `composite` when you need a custom aggregation strategy (`threshold`, `script`, `llm-grader`) or nested grader groups.
 
 ## Aggregator Types
 
@@ -100,7 +100,7 @@ Because this is an average, the final score is the fraction of passing children
 
 ### OR Logic (Strict)
 
-For a strict OR, add a custom code-grader aggregator and return `1.0` when any child score passes.
+For a strict OR, add a custom script aggregator and return `1.0` when any child score passes.
 
 Composite aggregator execution accepts either a direct script path or a shell command.
 The `bun run` form is the recommended pattern:
@@ -110,7 +110,7 @@ assertions:
   - name: strict_or
     type: composite
     aggregator:
-      type: code-grader
+      type: script
       path: bun run ../scripts/or-aggregator.js
     assertions:
       - name: mentions-paris
@@ -138,13 +138,13 @@ console.log(
   );
 ```
 
-### Code Grader Aggregator
+### Script Grader Aggregator
 
 Run a custom command to decide the final score based on all grader results:
 
 ```yaml
 aggregator:
-  type: code-grader
+  type: script
   path: bun run ./scripts/safety-gate.js
   cwd: ./graders  # optional working directory
 ```
@@ -187,7 +187,7 @@ Inside the prompt file, use the `{{EVALUATOR_RESULTS_JSON}}` variable to inject
 
 ### Safety Gate
 
-Block outputs that fail safety even if quality is high. A code grader aggregator can enforce hard gates:
+Block outputs that fail safety even if quality is high. A script grader aggregator can enforce hard gates:
 
 ```yaml
 tests:
@@ -207,7 +207,7 @@ tests:
             type: llm-grader
             prompt: ./prompts/quality-check.md
         aggregator:
-          type: code-grader
+          type: script
           path: ./scripts/safety-gate.js
 ```
 
@@ -225,7 +225,7 @@ Assign different importance to each evaluation dimension:
       type: llm-grader
       prompt: ./prompts/correctness.md
     - name: style
-      type: code-grader
+      type: script
       command: [uv, run, style_checker.py]
     - name: security
       type: llm-grader
@@ -313,7 +313,7 @@ Assertions from sub-graders are prefixed with the grader name (e.g., `[safety]`)
 ## Best Practices
 
 1. **Name graders clearly** -- names appear in results and debugging output, so use descriptive labels like `safety` or `correctness` rather than `eval_1`.
-2. **Use safety gates for critical checks** -- do not let high quality scores override safety failures. A code grader aggregator can enforce hard gates.
+2. **Use safety gates for critical checks** -- do not let high quality scores override safety failures. A script grader aggregator can enforce hard gates.
 3. **Balance weights thoughtfully** -- consider which aspects matter most for your use case and assign weights accordingly.
 4. **Keep nesting shallow** -- deep nesting makes debugging harder. Two levels of composites is usually sufficient.
 5. **Test aggregators independently** -- verify custom aggregation logic with unit tests before wiring it into a composite grader.
diff --git a/apps/web/src/content/docs/docs/graders/custom-assertions.mdx b/apps/web/src/content/docs/docs/graders/custom-assertions.mdx
index bd453db76..11061a152 100644
--- a/apps/web/src/content/docs/docs/graders/custom-assertions.mdx
+++ b/apps/web/src/content/docs/docs/graders/custom-assertions.mdx
@@ -14,11 +14,11 @@ AgentV provides two SDK functions for custom evaluation logic:
 | Function | Best For | Discovery |
 |----------|----------|-----------|
 | `defineAssertion()` | Pass/fail checks, reusable assertion types | Convention-based (`.agentv/assertions/`) |
-| `defineCodeGrader()` | Full scoring control with explicit assertions array | Referenced via `type: code-grader` + `command:` |
+| `defineCodeGrader()` | Full scoring control with explicit assertions array | Referenced via `type: script` + `command:` |
 
 **Use `defineAssertion()`** when you want a named assertion type that can be referenced across eval files without specifying a command path. It uses a simplified result contract focused on `pass` and optional `score`.
 
+**Use `defineCodeGrader()`** when you need full control over scoring with explicit `assertions` arrays, or when the grader is a one-off tied to a specific eval. See [Script Graders](/docs/graders/code-graders/) for details.
+**Use `defineCodeGrader()`** when you need full control over scoring with explicit `assertions` arrays, or when the grader is a one-off grader tied to a specific eval. See [Script Graders](/docs/graders/code-graders/) for details.
 
 Both functions handle stdin/stdout JSON parsing, snake_case-to-camelCase conversion, Zod validation, and error handling automatically.
 
@@ -111,7 +111,7 @@ The handler must return an `AssertionScore` object:
 
 ## Context Available to Assertions
 
-The handler receives an `AssertionContext` with the same fields as a code grader:
+The handler receives an `AssertionContext` with the same fields as a script grader:
 
 | Field | Type | Description |
 |-------|------|-------------|
@@ -224,19 +224,19 @@ target: default
 
 tests:
   - id: greeting-response
-    criteria: Agent gives a multi-word greeting
     input: "Say hello and introduce yourself"
     expected_output: "Hello! I'm an AI assistant here to help you."
     assertions:
+      - Agent gives a multi-word greeting
       - type: contains
         value: "Hello"
       - type: word-count
 
   - id: short-answer
-    criteria: Agent gives a short but valid response
     input: "What is 2+2?"
     expected_output: "The answer is 4."
     assertions:
+      - Agent gives a short but valid response
       - type: contains
         value: "4"
       - type: word-count
diff --git a/apps/web/src/content/docs/docs/graders/custom-graders.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx
index 568d5b989..ebac41f1a 100644
--- a/apps/web/src/content/docs/docs/graders/custom-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/custom-graders.mdx
@@ -11,9 +11,9 @@ AgentV supports multiple grader types that can be combined for comprehensive eva
 
 | Type | Description | Use Case |
 |------|-------------|----------|
-| `code_grader` | Deterministic command (Python/TS/any) | Exact matching, format validation, programmatic checks |
-| `llm_grader` | LLM-based evaluation with custom prompt | Semantic evaluation, nuance, subjective quality |
-| `rubrics` | Structured rubric grader via `assertions` | Multi-criterion grading with weights |
+| `script` | Deterministic command (Python/TS/any) | Exact matching, format validation, programmatic checks |
+| `llm-grader` | LLM-based evaluation with custom prompt | Semantic evaluation, nuance, subjective quality |
+| `g-eval` | Structured rubric grader via `assertions` | Multi-criterion grading with weights |
 
 ## Referencing Graders
 
@@ -39,11 +39,11 @@ tests:
 ```yaml
 tests:
   - id: test-1
-    criteria: Returns valid JSON
     input: Generate a JSON config
     assertions:
+      - Returns valid JSON
       - name: json_check
-        type: code-grader
+        type: script
         command: [./validators/check_json.py]
 ```
 
@@ -54,14 +54,13 @@ Use multiple graders on the same case for comprehensive scoring:
 ```yaml
 tests:
   - id: code-generation
-    criteria: Generates correct Python code
     input: Write a sorting function
     assertions:
       - Code is syntactically valid
       - Handles edge cases such as empty lists and single-element lists
       - Uses an appropriate algorithm
       - name: syntax_check
-        type: code-grader
+        type: script
         command: [./validators/check_syntax.py]
       - name: quality_review
         type: llm-grader
@@ -82,8 +81,8 @@ If any grader has `required: true` and scores below its required threshold, the
 ## Best Practices
 
 - **Use plain assertion strings first for semantic checks** — AgentV treats them as rubric criteria
-- **Use code graders for deterministic checks** — exact value matching, format validation, schema compliance
+- **Use script graders for deterministic checks** — exact value matching, format validation, schema compliance
 - **Use LLM graders for semantic evaluation** — meaning, quality, helpfulness
-- **Use rubrics for structured multi-criteria grading** — when you need weighted, itemized scoring
+- **Use `g-eval` for structured multi-criteria grading** — when you need weighted, itemized scoring
 - **Combine grader types** for comprehensive coverage
-- **Test code graders locally** before running full evaluations
+- **Test script graders locally** before running full evaluations
diff --git a/apps/web/src/content/docs/docs/graders/execution-metrics.mdx b/apps/web/src/content/docs/docs/graders/execution-metrics.mdx
index 7b12d3f69..e3abb1c3f 100644
--- a/apps/web/src/content/docs/docs/graders/execution-metrics.mdx
+++ b/apps/web/src/content/docs/docs/graders/execution-metrics.mdx
@@ -112,7 +112,7 @@ Fails if total token usage exceeds the threshold.
 |----------|----------------------|
 | Check multiple metrics at once | `execution_metrics` |
 | Simple single-threshold check | `latency`, `cost`, or `token_usage` |
-| Complex custom formulas | `code_grader` with custom command |
+| Complex custom formulas | `script` grader with custom command |
 
 ## Combining with Other Graders
 
diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
index b828d240f..7f7d72d17 100644
--- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
@@ -7,19 +7,22 @@ sidebar:
 
 LLM graders use a language model to evaluate agent responses against custom criteria defined in a prompt file.
 
-## Default Grader
+## Explicit LLM Graders
 
-When a test defines `criteria` but has **no `assertions` field**, a default `llm-grader` runs automatically. The built-in prompt evaluates the response against your `criteria` and `expected_output`:
+Put semantic grading requirements in `assertions` or `assert`. Plain strings are
+handled by the built-in `g-eval` rubric grader. Use `type: llm-grader` when you
+need a custom prompt, target, or grader-specific preprocessing:
 
 ```yaml
 tests:
   - id: simple-eval
-    criteria: Correctly explains the bug and proposes a fix
     input: "Debug this function..."
-    # No assertions needed — default llm-grader evaluates against criteria
+    assertions:
+      - Correctly explains the bug and proposes a fix
 ```
 
-When `assertions` **is** present, no default grader is added. To use an LLM grader alongside other graders, declare it explicitly. See [How criteria and assertions interact](/docs/evaluation/eval-cases/#how-criteria-and-assertions-interact).
+`expected_output` is passive reference data; it is available to graders but does
+not create an LLM grading call by itself. See [How reference fields and assertions interact](/docs/evaluation/eval-cases/#how-reference-fields-and-assertions-interact).
 
 ## Configuration
 
@@ -71,7 +74,7 @@ Score the response from 0.0 to 1.0 based on:
 | `output` | Candidate answer text |
 | `metadata` | Test metadata as formatted JSON |
 | `metadata_json` | Test metadata as compact JSON |
-| `rubrics` | LLM-grader rubric items as formatted JSON |
+| `rubrics` | LLM-grader rubric items as formatted JSON |
 | `rubrics_json` | LLM-grader rubric items as compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
@@ -99,7 +102,7 @@ tests:
       - name: dexter_semantic
         type: llm-grader
         prompt: file://prompts/dexter-grader.md
-        rubrics:
+        g-eval:
           - operator: correctness
             criteria: Uses the provided ticker and company.
 ```
@@ -196,9 +199,9 @@ preprocessors:
 
 tests:
   - id: spreadsheet-output
-    criteria: Output includes the revenue rows
     input: Generate the spreadsheet report
     assertions:
+      - Output includes the revenue rows
       - name: spreadsheet-check
         type: llm-grader
         prompt: |
@@ -215,8 +218,6 @@ Resolution order:
 - if no preprocessor matches, AgentV falls back to a UTF-8 text read
 - if the fallback read looks binary or invalid, the grader receives a warning note instead of failing the test run
 
-The implicit default `llm-grader` also inherits suite-level `preprocessors`, so you can omit `assertions` and still preprocess file outputs before grading.
-
 See [`examples/features/preprocessors/`](../../../../examples/features/preprocessors/) for a runnable example with a file-producing target and a custom preprocessor script.
 
 ## Available Context Fields
diff --git a/apps/web/src/content/docs/docs/graders/python-helpers.mdx b/apps/web/src/content/docs/docs/graders/python-helpers.mdx
index 6394785db..b997c7817 100644
--- a/apps/web/src/content/docs/docs/graders/python-helpers.mdx
+++ b/apps/web/src/content/docs/docs/graders/python-helpers.mdx
@@ -1,6 +1,6 @@
 ---
 title: Repo-Local Python Helpers
-description: Example-local Python helpers for canonical AgentV code-graders and eval authoring
+description: Example-local Python helpers for canonical AgentV script graders and eval authoring
 sidebar:
   order: 7
 ---
@@ -15,7 +15,7 @@ The helper lives in `examples/features/sdk-python/`.
 
 ## Scope
 
-- `agentv_py.grader` wraps Python `code-grader` scripts over canonical `snake_case` fields.
+- `agentv_py.grader` wraps Python `script` graders over canonical `snake_case` fields.
 - `agentv_py.evals` builds AgentV-shaped eval definitions and JSONL datasets.
 - `run_agentv_eval()` shells out to `agentv eval` or the repo source CLI.
 
@@ -35,7 +35,7 @@ Use canonical fields instead:
 ## Example
 
 ```python
-from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
+from agentv_py.grader import Assertion, CodeGraderResult, define_script
 
 
 def evaluate(context):
@@ -54,7 +54,7 @@ def evaluate(context):
 
 
 if __name__ == "__main__":
-    define_code_grader(evaluate)
+    define_script(evaluate)
 ```
 
 ## Authoring evals
diff --git a/apps/web/src/content/docs/docs/graders/structured-data.mdx b/apps/web/src/content/docs/docs/graders/structured-data.mdx
index 41af50f5a..b9338ba0e 100644
--- a/apps/web/src/content/docs/docs/graders/structured-data.mdx
+++ b/apps/web/src/content/docs/docs/graders/structured-data.mdx
@@ -54,7 +54,7 @@ assertions:
 | `date` | Compares dates after parsing | `formats` -- list of accepted date formats |
 | `numeric_tolerance` | Numeric compare within tolerance | `tolerance` -- absolute threshold; `relative: true` for relative tolerance |
 
-For fuzzy string matching, use a `code_grader` grader (e.g. Levenshtein distance) instead of adding a fuzzy mode to `field_accuracy`.
+For fuzzy string matching, use a `script` grader (e.g. Levenshtein distance) instead of adding a fuzzy mode to `field_accuracy`.
 
 ### Aggregation
 
diff --git a/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx b/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx
index b9e4bddfa..87ebfd354 100644
--- a/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx
+++ b/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx
@@ -257,4 +257,4 @@ tests:
 2. **Combine with other graders** — use tool trajectory for execution validation and LLM graders for output quality.
 3. **Inspect traces first** with `--dump-traces` to understand agent behavior before writing graders.
 4. **Use generous latency thresholds** to avoid flaky tests from timing variance.
-5. **Use code graders for custom validation** — write custom tool validation scripts when built-in modes are insufficient.
+5. **Use script graders for custom validation** — write custom tool validation scripts when built-in modes are insufficient.
diff --git a/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx
index e9a9fe2d1..03dd91ba7 100644
--- a/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx
+++ b/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx
@@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based
 
 | Concern | AgentV grader |
 |---------|-----------------|
-| Plan quality & coherence | `rubrics` |
-| Workspace-aware auditing | `rubrics` with `required: true` criteria |
+| Plan quality & coherence | `g-eval` |
+| Workspace-aware auditing | `g-eval` with `required: true` criteria |
 
 ```yaml
 # Layer 1: Reasoning — verify the agent's plan makes sense
@@ -24,7 +24,7 @@ assertions:
   - Agent formed a coherent plan before acting
   - Agent selected appropriate tools for the task
   - name: workspace-audit
-    type: rubrics
+    type: g-eval
     criteria:
       - id: plan-before-act
         outcome: Agent formed a plan before making changes
@@ -43,7 +43,7 @@ Covers tool call correctness, argument validity, execution path, and redundancy.
 | Tool sequence | `tool_trajectory` (`in_order`, `exact`) |
 | Minimum tool usage | `tool_trajectory` (`any_order`) |
 | Argument correctness | `tool_trajectory` with `args` matching |
-| Custom validation logic | `code_grader` |
+| Custom validation logic | `script` |
 
 ```yaml
 # Layer 2: Action — verify the agent called the right tools
@@ -72,7 +72,7 @@ Covers task completion, output correctness, step efficiency, latency, and cost.
 
 | Concern | AgentV grader |
 |---------|-----------------|
-| Output correctness | `rubrics`, `equals`, `contains`, `regex` |
+| Output correctness | `g-eval`, `equals`, `contains`, `regex` |
 | Structured data accuracy | `field_accuracy` |
 | Efficiency budgets | `execution_metrics` |
 | Multi-signal rollup | `composite` |
@@ -102,8 +102,8 @@ Covers prompt injection resilience, policy adherence, bias, and content safety.
 
 | Concern | AgentV grader |
 |---------|-----------------|
-| Content safety | `rubrics` |
-| Policy enforcement | `code_grader` with policy command |
+| Content safety | `g-eval` |
+| Policy enforcement | `script` grader with policy command |
 | "Must NOT" assertions | Any grader with `negate: true` |
 
 ```yaml
diff --git a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
index 87241b4dd..fcb3a4a7d 100644
--- a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
+++ b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
@@ -32,7 +32,7 @@ Use this split when deciding where a benchmark key belongs:
 | `workspace.isolation` | Yes | Controls shared vs per-case folder isolation. Runtime workspace paths are machine-local config/CLI bindings, not benchmark provenance. |
 | `experiment` | Yes | Selects targets, thresholds, repeat policy, budgets, and default grader behavior. Concurrency is an operator/run setting from `--workers` or project config. |
 | `input`, `input_files`, `expected_output` | Yes | Builds the target prompt and passive reference answer. |
-| `assertions` | Yes | Runs deterministic, LLM, composite, or code graders. |
+| `assertions` | Yes | Runs deterministic, LLM, composite, or script graders. |
 | Top-level `name`, `version`, `tags`, `license`, `requires` | Informational | Identifies and categorizes the suite. |
 | `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and extension context; in-process custom assertions can also read it. |
 
@@ -74,7 +74,7 @@ Benchmark task packs map cleanly onto AgentV fields at authoring time:
 | Source checkout | `workspace.repos[].repo` and `workspace.repos[].commit` |
 | Per-case setup | `extensions: ["file://scripts/setup.mjs:beforeEach"]` reading `case_metadata` |
 | Gold answer | `expected_output` when the answer is passive reference data |
-| Active verification | `assertions`, especially `code-grader` for commands or artifact checks |
+| Active verification | `assertions`, especially `script` graders for commands or artifact checks |
 | Provenance | `tests[].metadata` with source pins, generator rows, and curation labels |
 | Bulky task files | Optional `tests: ./cases/` with per-case directories and supporting files |
 
@@ -113,7 +113,7 @@ extensions:
 
 assertions:
   - name: focused-tests
-    type: code-grader
+    type: script
     command: ["python", "./graders/run-focused-tests.py"]
     required: true
 
@@ -136,7 +136,7 @@ In this example, `workspace.repos[].commit` is the actual checkout. The
 matching `metadata.source_commit` is audit data that gets recorded with the case
 and is available to extensions. `apply-test-patch.mjs` can read
 `case_metadata.test_patch` and `case_metadata.fail_to_pass_tests`, then apply
-the patch and write the selected test list into the workspace. The code grader
+the patch and write the selected test list into the workspace. The script grader
 can read that workspace file through its `workspace_path` payload. Repo
 acquisition remains outside the eval; use registered projects or
 `git_cache.mirrors` when a local machine needs faster large-repo setup. See
@@ -167,7 +167,7 @@ target: codex
 
 assertions:
   - name: tests-pass
-    type: code-grader
+    type: script
     command: ["python", "./graders/run-tests.py"]
     required: true
 ```
@@ -220,7 +220,7 @@ letting a parent eval compare targets, repeat policy, and gates consistently.
 
 Generated datasets often need stable row provenance more than workspace setup.
 Keep the generated row identity in metadata, use `expected_output` for the gold
-answer, and score with rubrics or an LLM/code grader.
+answer, and score with rubrics or an LLM/script grader.
 
 ```yaml
 name: finance-research-generated
@@ -334,7 +334,7 @@ script.
 - Do not duplicate operational checkout state only in metadata. Put the real
   checkout under `workspace.repos`.
 - Keep `metadata` snake_case because it crosses process and result boundaries.
-- Prefer `expected_output` for passive gold answers and `code-grader` for active
+- Prefer `expected_output` for passive gold answers and `script` graders for active
   commands, file checks, or generated artifact validation.
 - Prefer case directories over long inline YAML only for bulky source inputs;
   the generated run folder remains the portable artifact contract.
diff --git a/apps/web/src/content/docs/docs/guides/evaluation-types.mdx b/apps/web/src/content/docs/docs/guides/evaluation-types.mdx
index 368ffd26d..fe9953cdc 100644
--- a/apps/web/src/content/docs/docs/guides/evaluation-types.mdx
+++ b/apps/web/src/content/docs/docs/guides/evaluation-types.mdx
@@ -47,7 +47,7 @@ Trigger quality evaluates whether the right skill is activated for the right pro
 |-----------|------------------|-----------------|
 | **Question** | "Does it help?" | "Does it activate?" |
 | **Signal type** | Deterministic-ish | Noisy / statistical |
-| **Test method** | Fixed assertions, rubrics, graders | Repeated trials, train/test splits |
+| **Test method** | Fixed assertions, `g-eval` rubrics, graders | Repeated trials, train/test splits |
 | **What you tune** | Agent logic, prompts, tool use | Skill descriptions, trigger metadata |
 | **Failure mode** | Wrong output | Wrong routing |
 | **Optimization** | Pass/fail per test case | Accuracy rate over a sample |
@@ -64,7 +64,7 @@ AgentV's eval tooling is designed for **execution quality**:
 - **`EVAL.yaml`** — define test cases with inputs, expected outputs, and assertions
 - **`evals.json`** — lightweight skill evaluation format (prompt/expected-output pairs)
 - **`agentv eval`** — execute evaluations and collect results
-- **Graders** — `llm-grader`, `code-grader`, `tool-trajectory`, `rubrics`, `contains`, `regex`, and others all measure execution behavior
+- **Graders** — `llm-grader`, `script`, `tool-trajectory`, `g-eval`, `contains`, `regex`, and others all measure execution behavior
 
 These tools assume the skill is already loaded and invoked. They measure what happens *after* routing, not the routing decision itself.
 
diff --git a/apps/web/src/content/docs/docs/guides/human-review.mdx b/apps/web/src/content/docs/docs/guides/human-review.mdx
index 26a9ad601..50dcf8b6e 100644
--- a/apps/web/src/content/docs/docs/guides/human-review.mdx
+++ b/apps/web/src/content/docs/docs/guides/human-review.mdx
@@ -27,7 +27,7 @@ Skip the review step for routine CI gate runs where you only need pass/fail.
 | **False positive** | A `contains` check passes on a coincidental substring match |
 | **False negative** | An LLM grader penalizes a correct answer that uses different phrasing |
 | **Qualitative regression** | Scores stay the same but tone, formatting, or helpfulness degrades |
-| **Grader miscalibration** | A code grader is too strict on whitespace; a rubric is too lenient on accuracy |
+| **Grader miscalibration** | A script grader is too strict on whitespace; a rubric is too lenient on accuracy |
 | **Flaky results** | The same test produces wildly different scores across runs |
 
 ## How to review
@@ -92,7 +92,7 @@ The `feedback.json` file is a structured annotation of a single eval run. It rec
       "verdict": "needs_improvement",
       "notes": "Missing coverage of multi-document queries.",
       "evaluator_overrides": {
-        "code-grader:format-check": "Too strict — penalized valid output with trailing newline",
+        "script:format-check": "Too strict — penalized valid output with trailing newline",
         "llm-grader:quality": "Score 0.6 seems fair, answer was incomplete"
       },
       "workspace_notes": "Workspace had stale cached files from previous run — may have affected retrieval results."
@@ -137,14 +137,14 @@ The `feedback.json` file is a structured annotation of a single eval run. It rec
 
 ### Grader overrides (workspace evaluations)
 
-For workspace evaluations with multiple graders (code graders, LLM graders, tool trajectory checks), the `evaluator_overrides` field lets the reviewer annotate specific grader results:
+For workspace evaluations with multiple graders (script graders, LLM graders, tool trajectory checks), the `evaluator_overrides` field lets the reviewer annotate specific grader results:
 
 ```json
 {
   "test_id": "test-refactor-api",
   "verdict": "needs_improvement",
   "evaluator_overrides": {
-    "code-grader:test-pass": "Tests pass but the refactored code has a subtle race condition the tests don't cover",
+    "script:test-pass": "Tests pass but the refactored code has a subtle race condition the tests don't cover",
     "llm-grader:quality": "Score 0.9 is too high — the agent left dead code behind",
     "tool-trajectory:efficiency": "Used 12 tool calls where 5 would suffice, but the result is correct"
   },
diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
index 0bea029db..41253c202 100644
--- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
+++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
@@ -212,7 +212,7 @@ Keep your baseline stable across iterations. Only re-run the baseline when the t
 
 ## Graduating to EVAL.yaml
 
-When `evals.json` becomes limiting — you need workspace isolation, code graders, tool trajectory checks, or multi-turn conversations — graduate to EVAL.yaml:
+When `evals.json` becomes limiting — you need workspace isolation, script graders, tool trajectory checks, or multi-turn conversations — graduate to EVAL.yaml:
 
 ```bash
 agentv convert evals.json -o eval.yaml
@@ -241,7 +241,7 @@ tests:
 After converting, you can:
 - Replace `llm-grader` assertions with faster deterministic graders (`contains`, `regex`, `equals`)
 - Add `workspace` configuration for file-system isolation
-- Use `code-grader` for custom scoring logic
+- Use `script` graders for custom scoring logic
 - Define `tool-trajectory` assertions to check tool usage patterns
 
 See [Skill Evals (evals.json)](/docs/integrations/agent-skills-evals/) for the full field mapping and side-by-side comparison.
@@ -258,7 +258,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your
 | `summary.json` (read) | `<output>/summary.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape |
 | n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows |
 | with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison |
-| Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. |
+| Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, script graders, etc. |
 
 **Key takeaway:** You do not need to rewrite your `evals.json`. AgentV reads it directly and adds a richer evaluation engine on top.
 
@@ -310,7 +310,7 @@ Start simple and add complexity only when the evaluation results demand it:
 
 1. **Start with `evals.json`** — 5-10 test cases, natural-language assertions
 2. **Add deterministic checks** — when you find assertions that can be exact (`contains`, `regex`)
-3. **Graduate to EVAL.yaml** — when you need workspace isolation or code graders
+3. **Graduate to EVAL.yaml** — when you need workspace isolation or script graders
 4. **Add tool trajectory checks** — when tool usage patterns matter
 5. **Use rubrics** — when you need weighted, structured scoring criteria
 
diff --git a/apps/web/src/content/docs/docs/index.mdx b/apps/web/src/content/docs/docs/index.mdx
index 5855bd2b6..10844cd03 100644
--- a/apps/web/src/content/docs/docs/index.mdx
+++ b/apps/web/src/content/docs/docs/index.mdx
@@ -5,7 +5,7 @@ sidebar:
   order: 1
 ---
 
-AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents locally with multi-objective scoring (correctness, latency, cost, safety) from YAML specifications. Deterministic code graders + customizable LLM graders, all version-controlled in Git.
+AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents locally with multi-objective scoring (correctness, latency, cost, safety) from YAML specifications. Deterministic script graders + customizable LLM graders, all version-controlled in Git.
 
 ## Why AgentV?
 
@@ -38,7 +38,7 @@ AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents lo
 - **Eval files** — YAML or JSONL definitions of test cases
 - **Tests** — Individual test entries with input messages and expected outcomes
 - **Targets** — The agent or LLM provider being evaluated
-- **Graders** — Code graders (Python/TypeScript) or LLM graders that score responses
+- **Graders** — Script graders (Python/TypeScript) or LLM graders that score responses
 - **Rubrics** — Structured criteria with weights for grading
 - **Results** — JSONL output with scores, reasoning, and execution traces
 
@@ -50,7 +50,7 @@ Use this topic map when you are an AI agent trying to decide which primitive or
 | --- | --- | --- |
 | Create a first eval | [Quickstart](/docs/getting-started/quickstart/) → [Eval files](/docs/evaluation/eval-files/) | Defines the smallest runnable YAML shape before adding advanced fields. |
 | Run or resume evals | [Running evals](/docs/evaluation/running-evals/) → [WIP checkpoints](/docs/tools/wip-checkpoints/) | Covers `agentv eval`, concurrency, `--resume`, `--rerun-failed`, and remote partial-run recovery. |
-| Choose graders | [Rubrics](/docs/evaluation/rubrics/) → [Code graders](/docs/graders/code-graders/) → [LLM graders](/docs/graders/llm-graders/) | Keeps deterministic checks, rubric scoring, and LLM judgment separate. |
+| Choose graders | [Rubrics](/docs/evaluation/rubrics/) → [Script graders](/docs/graders/code-graders/) → [LLM graders](/docs/graders/llm-graders/) | Keeps deterministic checks, rubric scoring, and LLM judgment separate. |
 | Evaluate tool use or agents | [Tool trajectory](/docs/graders/tool-trajectory/) → [Coding agents](/docs/targets/coding-agents/) → [CLI provider](/docs/targets/cli-provider/) | Shows how targets, transcripts, and tool-call assertions compose. |
 | Share and inspect results | [Result artifact contract](/docs/reference/result-artifacts/) → [Results](/docs/tools/results/) → [Dashboard](/docs/tools/dashboard/) | Explains canonical run bundles, local artifacts, reports, remote result repositories, and Dashboard review flows. |
 | Compare runs | [Compare](/docs/tools/compare/) → [Dashboard Analytics](/docs/tools/dashboard/#analytics) | Use CLI metrics for automation and Dashboard analytics for interactive inspection. |
diff --git a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
index fe8f0d175..3d583fdad 100644
--- a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
+++ b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
@@ -152,7 +152,7 @@ The generated YAML includes comments about available AgentV features you can use
 # Converted from Agent Skills evals.json
 # AgentV features you can add:
 #   - type: is_json, contains, regex for deterministic graders
-#   - type: code-grader for custom scoring scripts
+#   - type: script for custom scoring scripts
 #   - Multi-turn conversations via input message arrays
 #   - Composite graders with weighted scoring
 #   - Workspace isolation with repos and hooks
diff --git a/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx b/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx
index 48d69c0b7..1b2ae8159 100644
--- a/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx
+++ b/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx
@@ -1,6 +1,6 @@
 ---
 title: Autoevals Integration
-description: Use Braintrust's open-source autoevals scorers (Factuality, Faithfulness, etc.) as code-grader graders in AgentV.
+description: Use Braintrust's open-source autoevals scorers (Factuality, Faithfulness, etc.) as script graders in AgentV.
 sidebar:
   order: 3
 ---
@@ -13,7 +13,7 @@ sidebar:
 
 - Works standalone — no Braintrust platform account required
 - Uses any OpenAI-compatible endpoint for LLM-based scorers
-- Integrates with AgentV via the `code-grader` type: wrap any autoevals scorer in a command that reads stdin and writes the AgentV grader result to stdout
+- Integrates with AgentV via the `script` grader type: wrap any autoevals scorer in a command that reads stdin and writes the AgentV grader result to stdout
 
 ## Installation
 
@@ -50,7 +50,7 @@ All LLM-based scorers return a `score` (0–1) and `metadata.rationale` explaini
 
 ## TypeScript Example
 
-Use the `Factuality` scorer as an AgentV `code-grader` to verify answer correctness.
+Use the `Factuality` scorer as an AgentV `script` grader to verify answer correctness.
 
 **EVAL.yaml:**
 
@@ -63,7 +63,7 @@ tests:
     expected_output: "Paris is the capital of France."
     assertions:
       - name: factuality
-        type: code-grader
+        type: script
         command: ["bun", "run", "graders/factuality.ts"]
 ```
 
@@ -101,7 +101,7 @@ console.log(
 );
 ```
 
-The code grader reads the canonical AgentV stdin payload (`input`, `expected_output`, `output`), maps those fields to autoevals parameters (`input`, `output`, `expected`), runs the scorer, and writes the AgentV result format (with `assertions` array) to stdout.
+The script grader reads the canonical AgentV stdin payload (`input`, `expected_output`, `output`), maps those fields to autoevals parameters (`input`, `output`, `expected`), runs the scorer, and writes the AgentV result format (with `assertions` array) to stdout.
 
 ## Python Example
 
@@ -118,7 +118,7 @@ tests:
     expected_output: "The paper found that transformer models outperform RNNs on long-range tasks."
     assertions:
       - name: faithfulness
-        type: code-grader
+        type: script
         command: ["python", "graders/faithfulness.py"]
 ```
 
@@ -202,7 +202,7 @@ const result = await Factuality({
 
 ## RAG Evaluation Suite
 
-Combine multiple autoevals scorers in a single code grader for comprehensive RAG evaluation.
+Combine multiple autoevals scorers in a single script grader for comprehensive RAG evaluation.
 
 **EVAL.yaml:**
 
@@ -215,7 +215,7 @@ tests:
     expected_output: "Exercise improves cardiovascular health, mental well-being, and longevity."
     assertions:
       - name: rag-quality
-        type: code-grader
+        type: script
         command: ["bun", "run", "graders/rag-suite.ts"]
         weight: 1.0
 ```
diff --git a/apps/web/src/content/docs/docs/reference/comparison.mdx b/apps/web/src/content/docs/docs/reference/comparison.mdx
index a354160c1..a91911931 100644
--- a/apps/web/src/content/docs/docs/reference/comparison.mdx
+++ b/apps/web/src/content/docs/docs/reference/comparison.mdx
@@ -15,7 +15,7 @@ AgentV is the **evaluation layer** in the AI agent lifecycle. It works alongside
 
 ### AgentV — Evaluate
 
-Offline evaluation and testing. Run eval cases against agents, score with deterministic code graders + LLM judges, detect regressions, gate CI/CD pipelines. Everything lives in Git.
+Offline evaluation and testing. Run eval cases against agents, score with deterministic script graders + LLM judges, detect regressions, gate CI/CD pipelines. Everything lives in Git.
 
 ```
 agentv eval evals/my-agent.yaml
diff --git a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx
index 77acc974f..76e04c3f7 100644
--- a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx
+++ b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx
@@ -89,7 +89,7 @@ reserved for rebuildable local state and are skipped by run discovery.
 | `result.json` | Compact per-attempt manifest for one attempt directory, including AgentV `execution_status` and `verdict`. | Loading one attempt without scanning the whole run index. |
 | `grading.json` | Grader outputs, assertions, rubric evidence, execution-metric grader facts, and scoring provenance. | Explaining why a row passed or failed. |
 | `metrics.json` | Derived executor behavior summary, such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, metric-style graders, adapter projections, and lightweight analysis. |
-| `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and code graders still receive the same full diff through `file_changes`. |
+| `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and script graders still receive the same full diff through `file_changes`. |
 | `timing.json` | Duration, token usage, cost usage, and source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. | Cost/latency reporting and provider-accounting audits. |
 | `transcript.json` | AgentV-normalized transcript/timeline document with canonical `tool_name` values and `transcript_summary`. | Portable human review, transcript-aware graders, and tool-trajectory analysis. |
 | `transcript-raw.jsonl` | Native provider or harness evidence when available. | Parser debugging, forensic review, and preserving source bytes without making provider schemas public AgentV fields. |
diff --git a/apps/web/src/content/docs/docs/tools/convert.mdx b/apps/web/src/content/docs/docs/tools/convert.mdx
index 1a754967a..ba91b090a 100644
--- a/apps/web/src/content/docs/docs/tools/convert.mdx
+++ b/apps/web/src/content/docs/docs/tools/convert.mdx
@@ -37,7 +37,7 @@ Converts an [Agent Skills `evals.json`](/docs/integrations/agent-skills-evals) f
 - Maps `expected_output` → `expected_output`
 - Maps `assertions` → `assertions` graders (llm-grader)
 - Resolves `files[]` paths relative to the evals.json directory
-- Adds TODO comments for AgentV-specific features (workspace setup, code graders, rubrics)
+- Adds TODO comments for AgentV-specific features (workspace setup, script graders, rubrics)
 
 This is a one-way conversion — use it as a starting point, then enhance the generated YAML with AgentV features.
 
diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx
index 834b0680c..276a3775f 100644
--- a/apps/web/src/content/docs/docs/tools/import.mdx
+++ b/apps/web/src/content/docs/docs/tools/import.mdx
@@ -204,7 +204,7 @@ Each instance becomes an EVAL.yaml with:
 - `input` — the problem statement
 - `workspace.docker.image` — the pre-built SWE-bench Docker image (`ghcr.io/epoch-research/swe-bench.eval.x86_64.<instance_id>:latest`)
 - `workspace.repos[].base_commit` — the commit to reset to before the agent runs
-- `assertions` — `code-grader` tasks that run `FAIL_TO_PASS` and `PASS_TO_PASS` pytest suites inside the container
+- `assertions` — `script` tasks that run `FAIL_TO_PASS` and `PASS_TO_PASS` pytest suites inside the container
 
 Run an imported SWE-bench eval against any coding agent target:
 
diff --git a/apps/web/src/content/docs/docs/tools/prepare.mdx b/apps/web/src/content/docs/docs/tools/prepare.mdx
index ac07cddcc..b9f2a8dbb 100644
--- a/apps/web/src/content/docs/docs/tools/prepare.mdx
+++ b/apps/web/src/content/docs/docs/tools/prepare.mdx
@@ -64,7 +64,7 @@ Supported `--trace` inputs:
 | `agentv.trace.v1` JSON or JSONL | Explicit trace replay/export files |
 | AgentV transcript JSONL | `agentv import claude`, `agentv import codex`, or `agentv import copilot` output |
 
-Single-record trace files are accepted directly. Multi-record files are matched by `test_id` and target. The selected trace is projected into AgentV's normal `trace` and `messages` grader context, so `tool-trajectory`, execution-metrics, and code graders receive the same shape they see during eval runs.
+Single-record trace files are accepted directly. Multi-record files are matched by `test_id` and target. The selected trace is projected into AgentV's normal `trace` and `messages` grader context, so `tool-trajectory`, execution-metrics, and script graders receive the same shape they see during eval runs.
 
 Use `--response` when the final answer text should be graded independently of the trace. If `--response` is omitted and the trace contains an assistant message with content, AgentV uses the last assistant message as the candidate answer.
 
diff --git a/packages/core/src/evaluation/graders/composite.ts b/packages/core/src/evaluation/graders/composite.ts
index 66c88fe2b..df89613b1 100644
--- a/packages/core/src/evaluation/graders/composite.ts
+++ b/packages/core/src/evaluation/graders/composite.ts
@@ -71,6 +71,7 @@ export class CompositeGrader implements Grader {
     const aggregator = this.config.aggregator;
 
     switch (aggregator.type) {
+      case 'script':
       case 'code-grader':
         return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
       case 'llm-grader':
diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts
index 27633b8ef..6194e4d50 100644
--- a/packages/core/src/evaluation/graders/llm-grader.ts
+++ b/packages/core/src/evaluation/graders/llm-grader.ts
@@ -13,7 +13,7 @@ import type { Message, Provider, ProviderResponse, ProviderTool } from '../provi
 import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js';
 import { TEMPLATE_VARIABLES } from '../template-variables.js';
 import type { TokenUsage } from '../trace.js';
-import type { AssertionEntry, JsonObject, RubricItem } from '../types.js';
+import type { AssertionEntry, GraderConfig, JsonObject, RubricItem } from '../types.js';
 import { formatRubricOperatorGuidance, formatRubricOperatorLabel } from './rubric-operators.js';
 import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js';
 import type { EvaluationContext, EvaluationScore, Grader } from './types.js';
@@ -164,7 +164,7 @@ function buildTemplateVariables(context: EvaluationContext): Record<string, stri
     context.promptInputs.question && context.promptInputs.question.trim().length > 0
       ? context.promptInputs.question
       : context.evalCase.question;
-  const rubrics = context.evaluator?.type === 'llm-grader' ? context.evaluator.rubrics : undefined;
+  const rubrics = getRubrics(context.evaluator);
 
   return {
     [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -180,6 +180,18 @@ function buildTemplateVariables(context: EvaluationContext): Record<string, stri
   };
 }
 
+function getRubrics(config: GraderConfig | undefined): readonly RubricItem[] | undefined {
+  return config?.type !== 'llm-grader' && config?.type !== 'g-eval' ? undefined : config.rubrics;
+}
+
+/** Type guard: true for 'llm-grader', 'g-eval', and 'llm-rubric' grader configs. */
+function isLlmBackedWithPreprocessors(
+  config: GraderConfig | undefined,
+): config is Extract<GraderConfig, { readonly type: 'llm-grader' | 'g-eval' | 'llm-rubric' }> {
+  const kind = config?.type;
+  return kind === 'llm-grader' || kind === 'g-eval' || kind === 'llm-rubric';
+}
+
 function resolveContentBasePath(context: EvaluationContext): string | undefined {
   if (context.workspacePath) {
     return context.workspacePath;
@@ -242,7 +254,11 @@ export class LlmGrader implements Grader {
 
     // LLM mode: structured JSON evaluation
     const config = preparedContext.evaluator;
-    if (config?.type === 'llm-grader' && config.rubrics && config.rubrics.length > 0) {
+    if (
+      (config?.type === 'llm-grader' || config?.type === 'g-eval') &&
+      config.rubrics &&
+      config.rubrics.length > 0
+    ) {
       return this.evaluateWithRubrics(preparedContext, graderProvider, config.rubrics);
     }
 
@@ -251,7 +267,7 @@ export class LlmGrader implements Grader {
 
   private async prepareContext(context: EvaluationContext): Promise<EvaluationContext> {
     const config = context.evaluator;
-    if (config?.type !== 'llm-grader' || !context.output) {
+    if (!isLlmBackedWithPreprocessors(config) || !context.output) {
       return context;
     }
 
@@ -505,7 +521,7 @@ export class LlmGrader implements Grader {
     const userPrompt = this.buildAgentUserPrompt(context);
 
     const config = context.evaluator;
-    const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
+    const rubrics = getRubrics(config);
 
     const fsTools = createFilesystemTools(workspacePath);
 
@@ -625,7 +641,7 @@ export class LlmGrader implements Grader {
       }
 
       const config = context.evaluator;
-      const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
+      const rubrics = getRubrics(config);
 
       const details: JsonObject = {
         mode: modeLabel,
@@ -669,7 +685,7 @@ export class LlmGrader implements Grader {
    */
   private buildAgentSystemPrompt(context: EvaluationContext): string {
     const config = context.evaluator;
-    const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
+    const rubrics = getRubrics(config);
 
     const parts: string[] = [
       'You are an expert grader with access to the workspace filesystem.',
@@ -705,7 +721,7 @@ export class LlmGrader implements Grader {
     }
 
     const config = context.evaluator;
-    const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
+    const rubrics = getRubrics(config);
 
     const parts: string[] = [
       'Evaluate the candidate answer by investigating the workspace.',
@@ -763,7 +779,7 @@ export class LlmGrader implements Grader {
         : context.evalCase.question;
 
     const config = context.evaluator;
-    const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
+    const rubrics = getRubrics(config);
 
     const template = context.graderTemplateOverride ?? this.graderTemplate;
     if (template) {
diff --git a/packages/core/src/evaluation/graders/promptfoo-assertions.ts b/packages/core/src/evaluation/graders/promptfoo-assertions.ts
new file mode 100644
index 000000000..6847984c7
--- /dev/null
+++ b/packages/core/src/evaluation/graders/promptfoo-assertions.ts
@@ -0,0 +1,372 @@
+import { execFileWithStdin } from '../../runtime/exec.js';
+import { serializeSnakeCaseBoundaryPayload } from '../case-conversion.js';
+import type {
+  AssertSetGraderConfig,
+  AssertionEntry,
+  JsonObject,
+  JsonValue,
+  ScriptAssertionGraderConfig,
+  SimilarGraderConfig,
+} from '../types.js';
+import { clampScore } from './scoring.js';
+import type { EvaluationContext, EvaluationScore, Grader } from './types.js';
+
+type ScriptResult =
+  | boolean // plain pass/fail verdict
+  | number // raw score; clamped by normalizeScriptResult
+  | {
+      readonly pass?: boolean; // explicit verdict; takes precedence over the threshold check
+      readonly score?: number; // raw score; clamped by normalizeScriptResult
+      readonly reason?: string; // text for the single synthesized assertion
+      readonly assertions?: readonly AssertionEntry[]; // used as-is when non-empty
+      readonly details?: JsonObject; // copied onto the EvaluationScore when present
+    };
+
+function buildAssertionContext(context: EvaluationContext): Record<string, unknown> {
+  return {
+    criteria: context.evalCase.criteria, // eval-case fields are passed through unchanged
+    expectedOutput: context.evalCase.expected_output,
+    input: context.evalCase.input,
+    metadata: context.evalCase.metadata ?? null, // from here on, missing values become null
+    trace: context.trace ?? null,
+    tokenUsage: context.tokenUsage ?? null,
+    costUsd: context.costUsd ?? null,
+    durationMs: context.durationMs ?? null,
+    fileChanges: context.fileChanges ?? null,
+    workspacePath: context.workspacePath ?? null,
+    dependencyResults: context.dependencyResults ?? null,
+  };
+}
+
+/**
+ * Normalizes a raw script or webhook result into an EvaluationScore.
+ *
+ * - boolean: score 1 or 0.
+ * - number: clamped score; passes when >= `threshold` (default Number.EPSILON).
+ * - object: `score` and/or `pass`, plus optional `reason`, `assertions`, and `details`.
+ * - anything else (e.g. `null` from a script that never returned): a failing score whose
+ *   assertion text names the unexpected value, instead of a TypeError on property access.
+ *
+ * @param fallbackText assertion text used when the result fails without its own reason
+ */
+function normalizeScriptResult(
+  result: ScriptResult,
+  fallbackText: string,
+  threshold?: number,
+): EvaluationScore {
+  const passThreshold = threshold ?? Number.EPSILON;
+  const single = (score: number, passed: boolean, failText = fallbackText): EvaluationScore => ({
+    score,
+    verdict: passed ? 'pass' : 'fail',
+    assertions: [{ text: passed ? 'Assertion passed' : failText, passed }],
+    expectedAspectCount: 1,
+  });
+
+  if (typeof result === 'boolean') return single(result ? 1 : 0, result);
+  if (typeof result === 'number') {
+    const score = clampScore(result);
+    return single(score, score >= passThreshold);
+  }
+  // Results come from untyped sources (JSON.parse, new Function, fetch), so re-check the shape.
+  const raw: unknown = result;
+  if (typeof raw !== 'object' || raw === null) {
+    const got = raw === null ? 'null' : typeof raw;
+    return single(0, false, `${fallbackText} (expected boolean, number, or object; got ${got})`);
+  }
+
+  // An explicit numeric score wins; otherwise `pass: true` scores 1 and anything else 0.
+  const score =
+    typeof result.score === 'number' ? clampScore(result.score) : result.pass === true ? 1 : 0;
+  const passed = result.pass ?? score >= passThreshold;
+  const assertions =
+    result.assertions && result.assertions.length > 0
+      ? result.assertions
+      : [{ text: result.reason ?? (passed ? 'Assertion passed' : fallbackText), passed }];
+  return {
+    score,
+    verdict: passed ? 'pass' : 'fail',
+    assertions,
+    expectedAspectCount: assertions.length || 1,
+    ...(result.details ? { details: result.details } : {}),
+  };
+}
+
+/** Turns inline JS into a function body: one-line expressions gain an implicit `return`. */
+function buildFunctionBody(code: string): string {
+  const source = code.trim().replace(/;+\s*$/, '');
+  // Multi-line code, or code containing its own `return`, is used verbatim.
+  if (source.includes('\n') || /\breturn\b/.test(source)) return source;
+  const cut = source.lastIndexOf(';') + 1;
+  // `const x = ...; expr` one-liners: only the text after the last semicolon is returned.
+  const declares = /^(const|let|var)\s/.test(source);
+  if (declares && cut > 0) return `${source.slice(0, cut)} return ${source.slice(cut).trim()}`;
+  return `return ${source}`;
+}
+
+/** Grades by running the configured inline JavaScript as `(output, context) => result`. */
+export class JavascriptAssertionGrader implements Grader {
+  readonly kind = 'javascript';
+
+  constructor(private readonly config: ScriptAssertionGraderConfig) {}
+
+  async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    const failText = 'Javascript assertion returned a failing result';
+    try {
+      // NOTE(review): new Function runs the eval file's code in-process with full privileges;
+      // only load assertion configs from sources you trust.
+      const run = new Function('output', 'context', buildFunctionBody(this.config.value));
+      const raw = (await run(context.candidate, buildAssertionContext(context))) as ScriptResult;
+      return normalizeScriptResult(raw, failText, this.config.threshold);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: 'fail',
+        assertions: [{ text: `Javascript assertion failed: ${message}`, passed: false }],
+        expectedAspectCount: 1,
+      };
+    }
+  }
+}
+
+/** Wraps inline Python in a stdin-to-stdout JSON program; one-liners gain an implicit return. */
+function buildPythonProgram(code: string): string {
+  // trimEnd first: a trailing "\n" (e.g. from a YAML block scalar) must not make a one-liner
+  // look multi-line, which would skip the implicit return and leave main() returning None.
+  const source = code.trimEnd();
+  // Multi-line code, or a line that already starts with `return`, is used as the body verbatim.
+  const verbatim = source.includes('\n') || /^\s*return\b/.test(source);
+  const lines = verbatim ? source.split('\n') : [`return ${source.trim()}`];
+  const body = lines.map((line) => `    ${line}`).join('\n');
+  return `import json
+import sys
+
+payload = json.load(sys.stdin)
+
+def main(output, context):
+${body}
+
+result = main(payload.get("output", ""), payload.get("context", {}))
+print(json.dumps(result))
+`;
+}
+
+/**
+ * Grades by running the configured inline Python in a `python3` subprocess: the candidate and
+ * assertion context go in as JSON on stdin, and the JSON printed on stdout is the result.
+ */
+export class PythonAssertionGrader implements Grader {
+  readonly kind = 'python';
+
+  constructor(
+    private readonly config: ScriptAssertionGraderConfig,
+    private readonly timeoutMs?: number,
+  ) {}
+
+  async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    const failText = 'Python assertion returned a failing result';
+    // Same `{ output, context }` shape that the generated program reads back from stdin.
+    const stdin = JSON.stringify({
+      output: context.candidate,
+      context: serializeSnakeCaseBoundaryPayload(buildAssertionContext(context)),
+    });
+    try {
+      const program = buildPythonProgram(this.config.value);
+      const run = await execFileWithStdin(['python3', '-c', program], stdin, {
+        timeoutMs: this.timeoutMs,
+      });
+      // A non-zero exit is routed through the catch below so it is reported like any other error.
+      if (run.exitCode !== 0) {
+        throw new Error(run.stderr.trim() || `python exited with code ${run.exitCode}`);
+      }
+      const parsed = JSON.parse(run.stdout.trim()) as ScriptResult;
+      return normalizeScriptResult(parsed, failText, this.config.threshold);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: 'fail',
+        assertions: [{ text: `Python assertion failed: ${message}`, passed: false }],
+        expectedAspectCount: 1,
+      };
+    }
+  }
+}
+
+/**
+ * Grades by POSTing `{ output, context }` as JSON to the configured URL and normalizing the
+ * JSON reply. A non-2xx response, network error, or unparsable body yields a failing score.
+ */
+export class WebhookAssertionGrader implements Grader {
+  readonly kind = 'webhook';
+
+  constructor(private readonly config: ScriptAssertionGraderConfig) {}
+
+  async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    const failText = 'Webhook assertion returned a failing result';
+    try {
+      const body = JSON.stringify({
+        output: context.candidate,
+        context: serializeSnakeCaseBoundaryPayload(buildAssertionContext(context)),
+      });
+      const headers = { 'content-type': 'application/json' };
+      const response = await fetch(this.config.value, { method: 'POST', headers, body });
+      // The thrown status text becomes the assertion message via the catch below.
+      if (!response.ok) {
+        throw new Error(`HTTP ${response.status}`);
+      }
+      const reply = (await response.json()) as ScriptResult;
+      return normalizeScriptResult(reply, failText, this.config.threshold);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: 'fail',
+        assertions: [{ text: `Webhook assertion failed: ${message}`, passed: false }],
+        expectedAspectCount: 1,
+      };
+    }
+  }
+}
+
+/** Runs child assertions in order; passes when their weighted mean score reaches the threshold. */
+export class AssertSetGrader implements Grader {
+  readonly kind = 'assert-set';
+
+  constructor(
+    private readonly config: AssertSetGraderConfig,
+    private readonly createChild: (
+      config: AssertSetGraderConfig['assertions'][number],
+    ) => Promise<Grader>,
+  ) {}
+
+  async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    const scores = [];
+    for (const childConfig of this.config.assertions) {
+      const grader = await this.createChild(childConfig);
+      const outcome = await grader.evaluate(context);
+      scores.push({
+        name: childConfig.name,
+        type: childConfig.type,
+        score: outcome.score,
+        weight: childConfig.weight ?? 1,
+        verdict: outcome.verdict,
+        assertions: outcome.assertions,
+        graderRawRequest: outcome.graderRawRequest,
+        scores: outcome.scores,
+        details: outcome.details,
+        tokenUsage: outcome.tokenUsage,
+      });
+    }
+    const assertions = scores.flatMap((item) => item.assertions);
+    // A zero total weight (e.g. an empty set) falls back to 1 so the division stays defined.
+    const weightSum = scores.reduce((sum, item) => sum + item.weight, 0) || 1;
+    const score = scores.reduce((sum, item) => sum + item.score * item.weight, 0) / weightSum;
+    const threshold = this.config.threshold ?? 1;
+    return {
+      score,
+      verdict: score >= threshold ? 'pass' : 'fail',
+      assertions,
+      expectedAspectCount: assertions.length || 1,
+      scores,
+      details: { threshold },
+    };
+  }
+}
+
+/** Embedding settings: `config.embedding_provider`, else object `provider`, else `config`. */
+function getEmbeddingConfig(config: SimilarGraderConfig): JsonObject | undefined {
+  const candidate = config.config?.embedding_provider;
+  if (candidate && typeof candidate === 'object') {
+    return candidate as JsonObject;
+  }
+  const provider = typeof config.provider === 'object' ? config.provider : undefined;
+  return provider ?? config.config;
+}
+
+function asString(value: JsonValue | undefined): string | undefined {
+  return typeof value !== 'string' || value.trim().length === 0 ? undefined : value;
+}
+
+/** POSTs `texts` to `<base_url>/embeddings` and returns one vector per input text. */
+async function embedTexts(
+  config: SimilarGraderConfig,
+  texts: readonly string[],
+): Promise<number[][]> {
+  const settings = getEmbeddingConfig(config);
+  const model = asString(settings?.model);
+  const endpoint = asString(settings?.base_url) ?? asString(settings?.endpoint);
+  if (!settings || !model || !endpoint) {
+    throw new Error(
+      'similar requires config.embedding_provider with OpenAI-compatible base_url and model',
+    );
+  }
+  const headers: Record<string, string> = { 'content-type': 'application/json' };
+  const apiKey = asString(settings.api_key);
+  if (apiKey) headers.authorization = `Bearer ${apiKey}`;
+  const response = await fetch(`${endpoint.replace(/\/+$/, '')}/embeddings`, {
+    method: 'POST',
+    headers,
+    body: JSON.stringify({ model, input: texts }),
+  });
+  if (!response.ok) {
+    throw new Error(`embedding provider returned HTTP ${response.status}`);
+  }
+  const reply = (await response.json()) as { data?: readonly { embedding?: readonly number[] }[] };
+  const vectors = (reply.data ?? []).map((item) => [...(item.embedding ?? [])]);
+  // Reject a count mismatch or any empty vector rather than scoring against partial data.
+  if (vectors.length !== texts.length || vectors.some((vector) => vector.length === 0)) {
+    throw new Error('embedding provider returned an invalid embeddings payload');
+  }
+  return vectors;
+}
+
+/** Cosine similarity over the overlapping prefix of `a` and `b`; 0 if either has zero magnitude. */
+function cosine(a: readonly number[], b: readonly number[]): number {
+  const size = Math.min(a.length, b.length);
+  const sum = { ab: 0, aa: 0, bb: 0 };
+  for (let index = 0; index < size; index += 1) {
+    sum.ab += a[index] * b[index];
+    sum.aa += a[index] * a[index];
+    sum.bb += b[index] * b[index];
+  }
+  // Guard the division: a zero-magnitude vector yields 0 rather than NaN.
+  return sum.aa === 0 || sum.bb === 0 ? 0 : sum.ab / (Math.sqrt(sum.aa) * Math.sqrt(sum.bb));
+}
+
+/** Grades by embedding the expected value and the candidate, then comparing their similarity. */
+export class SimilarAssertionGrader implements Grader {
+  readonly kind = 'similar';
+
+  constructor(private readonly config: SimilarGraderConfig) {}
+
+  async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    try {
+      const inputs = [this.config.value, context.candidate];
+      const [expected, actual] = await embedTexts(this.config, inputs);
+      // Cosine lies in [-1, 1]; it is rescaled to [0, 1] before the threshold comparison.
+      // NOTE(review): promptfoo thresholds presumably target raw cosine; confirm the rescale.
+      const similarity = clampScore((cosine(expected, actual) + 1) / 2);
+      const threshold = this.config.threshold ?? 0.75;
+      const passed = similarity >= threshold;
+      const comparison = passed ? '>=' : '<';
+      const text = `Embedding similarity ${similarity.toFixed(3)} ${comparison} ${threshold}`;
+
+      return {
+        score: similarity,
+        verdict: passed ? 'pass' : 'fail',
+        assertions: [{ text, passed }],
+        expectedAspectCount: 1,
+        details: { threshold, metric: 'cosine' },
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: 'fail',
+        assertions: [{ text: `Similar assertion failed: ${message}`, passed: false }],
+        expectedAspectCount: 1,
+      };
+    }
+  }
+}
diff --git a/packages/core/src/evaluation/loaders/case-file-loader.ts b/packages/core/src/evaluation/loaders/case-file-loader.ts
index ada3be8cd..c20103477 100644
--- a/packages/core/src/evaluation/loaders/case-file-loader.ts
+++ b/packages/core/src/evaluation/loaders/case-file-loader.ts
@@ -227,7 +227,7 @@ function parseAssertionFromString(expected: string, sourceFilePath: string): Jso
   }
   if (expected.startsWith('python:')) {
     return {
-      type: 'code-grader',
+      type: 'script',
       command: ['uv', 'run', 'python', expected.slice('python:'.length).trim()],
     };
   }
@@ -235,14 +235,14 @@ function parseAssertionFromString(expected: string, sourceFilePath: string): Jso
     const filePath = stripFileProtocol(expected).trim();
     if (!filePath.endsWith('.py')) {
       throw new Error(
-        `Unsupported promptfoo __expected file assertion "${expected}". Only file://*.py code graders are supported.`,
+        `Unsupported promptfoo __expected file assertion "${expected}". Only file://*.py script graders are supported.`,
       );
     }
     const commandPath = path.isAbsolute(filePath)
       ? filePath
       : path.resolve(path.dirname(sourceFilePath), filePath);
     return {
-      type: 'code-grader',
+      type: 'script',
       command: ['uv', 'run', 'python', commandPath],
     };
   }
diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts
index 49283f09f..20181baea 100644
--- a/packages/core/src/evaluation/loaders/grader-parser.ts
+++ b/packages/core/src/evaluation/loaders/grader-parser.ts
@@ -48,12 +48,78 @@ function isDeprecatedJudgeType(type: string): boolean {
   return type === 'code-judge' || type === 'llm-judge';
 }
 
+const UNSUPPORTED_PROMPTFOO_ASSERTION_TYPES = new Set([ // rejected by assertSupportedPromptfooType
+  'agent-rubric',
+  'answer-relevance',
+  'bleu',
+  'classifier',
+  'contains-html',
+  'contains-json',
+  'contains-sql',
+  'contains-xml',
+  'context-faithfulness',
+  'context-recall',
+  'context-relevance',
+  'conversation-relevance',
+  'factuality',
+  'finish-reason',
+  'gleu',
+  'guardrails',
+  'is-html',
+  'is-refusal',
+  'is-sql',
+  'is-valid-function-call',
+  'is-valid-openai-function-call',
+  'is-valid-openai-tools-call',
+  'is-xml',
+  'levenshtein',
+  'meteor',
+  'model-graded-closedqa',
+  'model-graded-factuality',
+  'moderation',
+  'perplexity',
+  'perplexity-score',
+  'pi',
+  'rouge-n',
+  'ruby',
+  'similar:cosine',
+  'similar:dot',
+  'similar:euclidean',
+  'select-best',
+  'human',
+  'max-score',
+  'tool-call-f1',
+  'skill-used',
+  'trajectory:goal-success',
+  'trajectory:tool-args-match',
+  'trajectory:step-count',
+  'trajectory:tool-sequence',
+  'trajectory:tool-used',
+  'trace-error-spans',
+  'trace-span-count',
+  'trace-span-duration',
+  'search-rubric',
+  'word-count',
+]);
+
+/** Throws when `type`, ignoring a leading `not-`, is listed as an unsupported promptfoo type. */
+function assertSupportedPromptfooType(type: string, evalId: string, name?: string): void {
+  const lookupKey = type.startsWith('not-') ? type.slice('not-'.length) : type;
+  if (UNSUPPORTED_PROMPTFOO_ASSERTION_TYPES.has(lookupKey)) {
+    const subject = `'${type}' in '${evalId}'${name ? ` for evaluator '${name}'` : ''}`;
+    const rationale =
+      'This type is future scope in AgentV and is not accepted as a custom assertion.';
+    throw new Error(`Unsupported promptfoo assertion type ${subject}. ${rationale}`);
+  }
+}
+
 /**
  * Parse evaluators from eval case configuration.
  */
 export async function parseGraders(
   rawEvalCase: JsonObject & {
     readonly execution?: JsonValue;
+    readonly assert?: JsonValue;
     readonly assertions?: JsonValue;
     readonly evaluators?: JsonValue;
   },
@@ -65,17 +131,19 @@ export async function parseGraders(
   const execution = rawEvalCase.execution;
   const executionObject = isJsonObject(execution) ? execution : undefined;
 
-  // Case-level graders priority: assertions > legacy execution/top-level assertion lists
+  // Case-level graders priority: assert > assertions > legacy execution/top-level assertion lists
   const caseEvaluators =
+    rawEvalCase.assert ??
     rawEvalCase.assertions ??
+    (executionObject ? executionObject.assert : undefined) ??
     (executionObject ? executionObject.evaluators : undefined) ?? // deprecated: use assertions
     rawEvalCase.evaluators; // deprecated: use assertions
 
-  // Root-level default graders: assertions > legacy execution assertion list
+  // Root-level default graders: assert > assertions > legacy execution assertion list
   const skipDefaults = executionObject?.skip_defaults === true;
   const rootEvaluators = skipDefaults
     ? undefined
-    : (globalExecution?.assertions ?? globalExecution?.evaluators); // deprecated: use assertions
+    : (globalExecution?.assert ?? globalExecution?.assertions ?? globalExecution?.evaluators); // deprecated: use assertions
 
   // Parse case-level evaluators
   const parsedCase = await parseGraderList(
@@ -247,6 +315,7 @@ async function expandGraderEntries(
 export async function collectAssertionTemplateSourceReferences(
   rawEvalCase: JsonObject & {
     readonly execution?: JsonValue;
+    readonly assert?: JsonValue;
     readonly assertions?: JsonValue;
     readonly evaluators?: JsonValue;
   },
@@ -257,13 +326,15 @@ export async function collectAssertionTemplateSourceReferences(
   const execution = rawEvalCase.execution;
   const executionObject = isJsonObject(execution) ? execution : undefined;
   const caseEvaluators =
+    rawEvalCase.assert ??
     rawEvalCase.assertions ??
+    (executionObject ? executionObject.assert : undefined) ??
     (executionObject ? executionObject.evaluators : undefined) ??
     rawEvalCase.evaluators;
   const skipDefaults = executionObject?.skip_defaults === true;
   const rootEvaluators = skipDefaults
     ? undefined
-    : (globalExecution?.assertions ?? globalExecution?.evaluators);
+    : (globalExecution?.assert ?? globalExecution?.assertions ?? globalExecution?.evaluators);
 
   return [
     ...(await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId)),
@@ -364,7 +435,7 @@ async function collectAssertionTemplateReferencesFromObject(
   includeContext: IncludeContext,
 ): Promise<readonly EvalSourceReference[]> {
   const references: EvalSourceReference[] = [];
-  for (const key of ['assertions', 'evaluators'] as const) {
+  for (const key of ['assert', 'assertions', 'evaluators'] as const) {
     references.push(
       ...(await collectAssertionTemplateReferencesFromValue(
         value[key],
@@ -426,7 +497,7 @@ async function parseGraderList(
             // e.g. [contains, "crit1", "crit2", "crit3"] → contains(w=1) + rubrics(w=3)
             // → each of the 4 visible assertions counts equally.
             result[placeholderIndex] = {
-              type: 'rubrics',
+              type: 'g-eval',
               criteria: strings,
               weight: strings.length,
             };
@@ -457,6 +528,10 @@ async function parseGraderList(
       continue;
     }
 
+    if (typeof typeValue === 'string') {
+      assertSupportedPromptfooType(typeValue, evalId, rawName);
+    }
+
     // Unknown types are treated as custom assertion types (resolved via registry discovery)
     const isCustomType = typeof typeValue === 'string' && !isGraderKind(typeValue);
     if (typeof typeValue !== 'string') {
@@ -515,11 +590,64 @@ async function parseGraderList(
       continue;
     }
 
-    if (typeValue === 'code-grader') {
+    if (typeValue === 'assert-set') {
+      const rawMembers = rawEvaluator.assert ?? rawEvaluator.assertions;
+      if (!Array.isArray(rawMembers)) {
+        logWarning(`Skipping assert-set evaluator '${name}' in '${evalId}': missing assert array`);
+        continue;
+      }
+
+      const parsedMembers = await parseGraderList(
+        rawMembers as JsonValue,
+        searchRoots,
+        `${evalId}:${name}`,
+        defaultPreprocessors,
+      );
+      if (!parsedMembers || parsedMembers.length === 0) {
+        logWarning(
+          `Skipping assert-set evaluator '${name}' in '${evalId}': no valid child assertions`,
+        );
+        continue;
+      }
+
+      const threshold =
+        typeof rawEvaluator.threshold === 'number' &&
+        rawEvaluator.threshold >= 0 &&
+        rawEvaluator.threshold <= 1
+          ? rawEvaluator.threshold
+          : undefined;
+      const weight = validateWeight(rawEvaluator.weight, name, evalId);
+      const { required, min_score } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        (rawEvaluator as Record<string, unknown>).min_score as JsonValue | undefined,
+        name,
+        evalId,
+      );
+      evaluators.push({
+        name,
+        type: 'assert-set',
+        assertions: parsedMembers,
+        ...(threshold !== undefined ? { threshold } : {}),
+        ...(weight !== undefined ? { weight } : {}),
+        ...(required !== undefined ? { required } : {}),
+        ...(min_score !== undefined ? { min_score } : {}),
+        ...(negate !== undefined ? { negate } : {}),
+      });
+      continue;
+    }
+
+    if (typeValue === 'code-grader' || typeValue === 'script') {
+      const isLegacyCodeGrader = typeValue === 'code-grader';
+      if (isLegacyCodeGrader) {
+        logWarning(
+          `Evaluator '${name}' in '${evalId}': 'code-grader' is deprecated. Use 'script' instead.`,
+        );
+      }
+      const displayType = 'script';
       let command: string[] | undefined;
       if (rawEvaluator.script !== undefined) {
         throw new Error(
-          `Grader '${name}' in '${evalId}': 'script' has been removed. Use 'command' instead.`,
+          `Grader '${name}' in '${evalId}': 'script' field has been removed. Use 'command' instead.`,
         );
       }
       const rawCommand = rawEvaluator.command;
@@ -528,19 +656,19 @@ async function parseGraderList(
         const trimmed = rawCommand.trim();
         if (trimmed.length === 0) {
           throw new Error(
-            `Invalid code-grader command for evaluator '${name}' in '${evalId}': command cannot be empty`,
+            `Invalid ${displayType} command for evaluator '${name}' in '${evalId}': command cannot be empty`,
           );
         }
         command = parseCommandToArgv(trimmed);
       } else {
         command = asStringArray(
           rawCommand,
-          `code-grader command for evaluator '${name}' in '${evalId}'`,
+          `${displayType} command for evaluator '${name}' in '${evalId}'`,
         );
       }
 
       if (!command) {
-        logWarning(`Skipping code-grader evaluator '${name}' in '${evalId}': missing command`);
+        logWarning(`Skipping ${displayType} evaluator '${name}' in '${evalId}': missing command`);
         continue;
       }
 
@@ -556,7 +684,7 @@ async function parseGraderList(
           resolvedCwd = path.resolve(resolved.resolvedPath);
         } else {
           logWarning(
-            `Code-grader evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
+            `${displayType} evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
             resolved.attempted.length > 0
               ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`)
               : undefined,
@@ -625,7 +753,7 @@ async function parseGraderList(
 
       evaluators.push({
         name,
-        type: 'code-grader',
+        type: 'script',
         command,
         ...(resolvedScriptPath ? { resolvedScriptPath } : {}),
         cwd,
@@ -642,8 +770,8 @@ async function parseGraderList(
     }
 
     if (typeValue === 'composite') {
-      // Accept assertions > evaluators (deprecated)
-      const rawMembers = rawEvaluator.assertions ?? rawEvaluator.evaluators; // evaluators deprecated
+      // Accept assert > assertions > evaluators (deprecated)
+      const rawMembers = rawEvaluator.assert ?? rawEvaluator.assertions ?? rawEvaluator.evaluators; // evaluators deprecated
       if (!Array.isArray(rawMembers)) {
         logWarning(
           `Skipping composite evaluator '${name}' in '${evalId}': missing assertions (or evaluators) array`,
@@ -675,6 +803,7 @@ async function parseGraderList(
       }
       if (
         normalizedAggregatorType !== 'weighted_average' &&
+        normalizedAggregatorType !== 'script' &&
         normalizedAggregatorType !== 'code-grader' &&
         normalizedAggregatorType !== 'llm-grader' &&
         normalizedAggregatorType !== 'threshold'
@@ -710,7 +839,7 @@ async function parseGraderList(
           continue;
         }
 
-        // Parse member evaluator (reuse existing logic for code, llm-grader, code-grader)
+        // Parse member evaluator (reuse existing logic for code, llm-grader, script)
         const memberConfigs = await parseGraders(
           { evaluators: [rawMember] },
           undefined,
@@ -749,11 +878,19 @@ async function parseGraderList(
           type: 'weighted_average',
           ...(Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}),
         };
-      } else if (normalizedAggregatorType === 'code-grader') {
+      } else if (
+        normalizedAggregatorType === 'script' ||
+        normalizedAggregatorType === 'code-grader'
+      ) {
+        if (normalizedAggregatorType === 'code-grader') {
+          logWarning(
+            `Composite evaluator '${name}' in '${evalId}': aggregator type 'code-grader' is deprecated. Use 'script' instead.`,
+          );
+        }
         const aggregatorPath = asString(rawAggregator.path);
         if (!aggregatorPath) {
           logWarning(
-            `Skipping composite evaluator '${name}' in '${evalId}': code-grader aggregator missing path`,
+            `Skipping composite evaluator '${name}' in '${evalId}': script aggregator missing path`,
           );
           continue;
         }
@@ -761,7 +898,7 @@ async function parseGraderList(
         // Set cwd to eval file directory (first search root)
         // Paths are resolved relative to this directory
         aggregator = {
-          type: 'code-grader',
+          type: 'script',
           path: aggregatorPath,
           cwd: searchRoots[0],
         };
@@ -1311,6 +1448,79 @@ async function parseGraderList(
       continue;
     }
 
+    if (typeValue === 'javascript' || typeValue === 'python' || typeValue === 'webhook') {
+      const value = asString(rawEvaluator.value);
+      if (!value || value.trim().length === 0) {
+        logWarning(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
+        continue;
+      }
+      const threshold =
+        typeof rawEvaluator.threshold === 'number' &&
+        rawEvaluator.threshold >= 0 &&
+        rawEvaluator.threshold <= 1
+          ? rawEvaluator.threshold
+          : undefined;
+      const weight = validateWeight(rawEvaluator.weight, name, evalId);
+      const { required, min_score } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        (rawEvaluator as Record<string, unknown>).min_score as JsonValue | undefined,
+        name,
+        evalId,
+      );
+      const config = isJsonObject(rawEvaluator.config) ? rawEvaluator.config : undefined;
+      evaluators.push({
+        name,
+        type: typeValue,
+        value,
+        ...(threshold !== undefined ? { threshold } : {}),
+        ...(weight !== undefined ? { weight } : {}),
+        ...(required !== undefined ? { required } : {}),
+        ...(min_score !== undefined ? { min_score } : {}),
+        ...(negate !== undefined ? { negate } : {}),
+        ...(config !== undefined ? { config } : {}),
+      });
+      continue;
+    }
+
+    if (typeValue === 'similar') {
+      const value = asString(rawEvaluator.value);
+      if (!value || value.trim().length === 0) {
+        logWarning(`Skipping similar evaluator '${name}' in '${evalId}': missing value`);
+        continue;
+      }
+      const threshold =
+        typeof rawEvaluator.threshold === 'number' &&
+        rawEvaluator.threshold >= 0 &&
+        rawEvaluator.threshold <= 1
+          ? rawEvaluator.threshold
+          : undefined;
+      const weight = validateWeight(rawEvaluator.weight, name, evalId);
+      const { required, min_score } = parseRequiredAndMinScore(
+        rawEvaluator.required,
+        (rawEvaluator as Record<string, unknown>).min_score as JsonValue | undefined,
+        name,
+        evalId,
+      );
+      const provider =
+        typeof rawEvaluator.provider === 'string' || isJsonObject(rawEvaluator.provider)
+          ? rawEvaluator.provider
+          : undefined;
+      const config = isJsonObject(rawEvaluator.config) ? rawEvaluator.config : undefined;
+      evaluators.push({
+        name,
+        type: 'similar',
+        value,
+        ...(threshold !== undefined ? { threshold } : {}),
+        ...(provider !== undefined ? { provider } : {}),
+        ...(weight !== undefined ? { weight } : {}),
+        ...(required !== undefined ? { required } : {}),
+        ...(min_score !== undefined ? { min_score } : {}),
+        ...(negate !== undefined ? { negate } : {}),
+        ...(config !== undefined ? { config } : {}),
+      });
+      continue;
+    }
+
     if (typeValue === 'contains') {
       const value = asString(rawEvaluator.value);
       if (!value) {
@@ -1526,21 +1736,14 @@ async function parseGraderList(
 
     if (typeValue === 'rubrics') {
       const rawCriteria = rawEvaluator.criteria;
-      if (!Array.isArray(rawCriteria) || rawCriteria.length === 0) {
+      const normalizedCriteria = normalizeRubricCriteria(rawCriteria);
+      if (!normalizedCriteria || normalizedCriteria.length === 0) {
         logWarning(
           `Skipping rubrics evaluator '${name}' in '${evalId}': criteria must be a non-empty array`,
         );
         continue;
       }
 
-      // Normalize string shorthands to objects before passing to parseRubricItems
-      const normalizedCriteria = rawCriteria.map((item, index) => {
-        if (typeof item === 'string') {
-          return { id: `rubric-${index + 1}`, outcome: item, weight: 1.0, required: true };
-        }
-        return item;
-      });
-
       const parsedCriteria = parseRubricItems(normalizedCriteria, name, evalId);
       if (!parsedCriteria || parsedCriteria.length === 0) {
         logWarning(`Skipping rubrics evaluator '${name}' in '${evalId}': no valid criteria found`);
@@ -1557,7 +1760,7 @@ async function parseGraderList(
 
       evaluators.push({
         name,
-        type: 'llm-grader',
+        type: 'g-eval',
         rubrics: parsedCriteria,
         ...(graderTargetName ? { target: graderTargetName } : {}),
         ...(weight !== undefined ? { weight } : {}),
@@ -1643,7 +1846,7 @@ async function parseGraderList(
 
     const rawRubrics = rawEvaluator.rubrics;
     const parsedRubrics = Array.isArray(rawRubrics)
-      ? parseRubricItems(rawRubrics, name, evalId)
+      ? parseRubricItems(normalizeRubricCriteria(rawRubrics) ?? [], name, evalId)
       : undefined;
 
     if (typeValue === 'rubric') {
@@ -1667,7 +1870,7 @@ async function parseGraderList(
       // deprecated: `type: rubric` maps to `type: llm-grader` with `rubrics`. Use `type: rubrics` with `criteria` instead.
       evaluators.push({
         name,
-        type: 'llm-grader',
+        type: 'g-eval',
         rubrics: parsedRubrics,
         ...(graderTargetName ? { target: graderTargetName } : {}),
         ...(weight !== undefined ? { weight } : {}),
@@ -1736,6 +1939,73 @@ async function parseGraderList(
     const llmTemperature =
       typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 ? rawTempLlm : undefined;
 
+    if (typeValue === 'g-eval') {
+      const rubricSource =
+        rawEvaluator.rubric_item ??
+        rawEvaluator.rubricItem ??
+        rawEvaluator.rubrics ??
+        rawEvaluator.criteria ??
+        rawEvaluator.value;
+      const normalizedCriteria = normalizeRubricCriteria(rubricSource, rawEvaluator);
+      const gEvalRubrics = normalizedCriteria
+        ? parseRubricItems(normalizedCriteria, name, evalId)
+        : undefined;
+      if (!gEvalRubrics || gEvalRubrics.length === 0) {
+        logWarning(
+          `Skipping g-eval evaluator '${name}' in '${evalId}': expected value, criteria, rubric_item, or rubrics`,
+        );
+        continue;
+      }
+
+      evaluators.push({
+        name,
+        type: 'g-eval',
+        prompt,
+        promptPath,
+        ...(promptPath ? { resolvedPromptPath: promptPath } : {}),
+        ...(resolvedPromptScript ? { resolvedPromptScript } : {}),
+        rubrics: gEvalRubrics,
+        ...(graderTargetName ? { target: graderTargetName } : {}),
+        ...(weight !== undefined ? { weight } : {}),
+        ...(required !== undefined ? { required } : {}),
+        ...(min_score !== undefined ? { min_score } : {}),
+        ...(negate !== undefined ? { negate } : {}),
+        ...(finalConfig ? { config: finalConfig } : {}),
+        ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}),
+        ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}),
+        ...(mergedPreprocessors ? { preprocessors: mergedPreprocessors } : {}),
+      });
+      continue;
+    }
+
+    if (typeValue === 'llm-rubric') {
+      const value =
+        typeof rawEvaluator.value === 'string'
+          ? rawEvaluator.value
+          : typeof rawEvaluator.criteria === 'string'
+            ? rawEvaluator.criteria
+            : undefined;
+      evaluators.push({
+        name,
+        type: 'llm-rubric',
+        prompt,
+        promptPath,
+        ...(promptPath ? { resolvedPromptPath: promptPath } : {}),
+        ...(resolvedPromptScript ? { resolvedPromptScript } : {}),
+        ...(value !== undefined ? { value } : {}),
+        ...(graderTargetName ? { target: graderTargetName } : {}),
+        ...(weight !== undefined ? { weight } : {}),
+        ...(required !== undefined ? { required } : {}),
+        ...(min_score !== undefined ? { min_score } : {}),
+        ...(negate !== undefined ? { negate } : {}),
+        ...(finalConfig ? { config: finalConfig } : {}),
+        ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}),
+        ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}),
+        ...(mergedPreprocessors ? { preprocessors: mergedPreprocessors } : {}),
+      });
+      continue;
+    }
+
     evaluators.push({
       name,
       type: 'llm-grader',
@@ -2106,6 +2376,55 @@ function parseRubricOperator(
   return undefined;
 }
 
+/**
+ * Coerces a rubric spec (bare string, array, or single object) into rubric-item candidates.
+ * When `raw` is none of those, one item is synthesized from `fallback` if it defines
+ * `score_ranges`; otherwise the result is undefined.
+ */
+function normalizeRubricCriteria(
+  raw: unknown,
+  fallback?: JsonObject,
+): readonly unknown[] | undefined {
+  // Fields inherited from `fallback` when it is supplied; weight defaults to 1.
+  const weight = typeof fallback?.weight === 'number' ? fallback.weight : 1;
+  const minScore = typeof fallback?.min_score === 'number' ? fallback.min_score : undefined;
+  const operator = typeof fallback?.operator === 'string' ? fallback.operator : undefined;
+  const floorField = minScore !== undefined ? { min_score: minScore } : {};
+  const operatorField = operator !== undefined ? { operator } : {};
+  const scoreRanges = fallback?.score_ranges;
+
+  // Bare string: one checklist item, required unless the fallback explicitly says otherwise.
+  if (typeof raw === 'string') {
+    const required = typeof fallback?.required === 'boolean' ? fallback.required : true;
+    const rangesField = scoreRanges !== undefined ? { score_ranges: scoreRanges } : {};
+    const checklistItem = { id: 'rubric-1', outcome: raw, weight, required };
+    return [{ ...checklistItem, ...floorField, ...rangesField, ...operatorField }];
+  }
+
+  // Array: expand string shorthands by position; every other entry passes through untouched.
+  if (Array.isArray(raw)) {
+    return raw.map((entry, position) =>
+      typeof entry === 'string'
+        ? { id: `rubric-${position + 1}`, outcome: entry, weight: 1.0, required: true }
+        : entry,
+    );
+  }
+
+  // Single object: wrap it so the result is always a list.
+  if (isJsonObject(raw)) {
+    return [raw];
+  }
+
+  // No usable spec: synthesize one score-range item from the fallback's own fields, if present.
+  if (fallback === undefined || scoreRanges === undefined) {
+    return undefined;
+  }
+  const outcome =
+    asString(fallback.criteria) ?? asString(fallback.value) ?? asString(fallback.outcome);
+  const identity = { id: asString(fallback.id) ?? 'rubric-1', ...(outcome ? { outcome } : {}) };
+  return [{ ...identity, score_ranges: scoreRanges, weight, ...floorField, ...operatorField }];
+}
+
 /**
  * Parse rubric items from raw YAML/JSON data.
  * Supports both checklist rubrics and score-range rubrics.
@@ -2373,11 +2692,11 @@ function parseScoreRanges(
  * - String shorthand: "Must be polite" -> { id: "rubric-1", outcome: "Must be polite", weight: 1.0, required: true }
  * - Object form with outcome, weight, required, score_ranges, min_score
  *
- * Returns an LlmGraderConfig to prepend to evaluators, or undefined if no valid rubrics.
+ * Returns a g-eval config to prepend to evaluators, or undefined if no valid rubrics.
  */
 export function parseInlineRubrics(
   rawRubrics: readonly unknown[],
-): import('../types.js').LlmGraderConfig | undefined {
+): import('../types.js').GEvalGraderConfig | undefined {
   const rubricItems = rawRubrics
     .filter((r): r is JsonObject | string => isJsonObject(r) || typeof r === 'string')
     .map((rubric, index) => {
@@ -2456,7 +2775,7 @@ export function parseInlineRubrics(
 
   return {
     name: 'rubrics',
-    type: 'llm-grader',
+    type: 'g-eval',
     rubrics: rubricItems,
   };
 }
diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts
index 4572a6191..b02560eb9 100644
--- a/packages/core/src/evaluation/loaders/jsonl-parser.ts
+++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts
@@ -50,12 +50,14 @@ type RawJsonlEvalCase = JsonObject & {
   readonly id?: JsonValue;
   readonly conversation_id?: JsonValue;
   readonly criteria?: JsonValue;
-  /** @deprecated Use `criteria` instead */
+  /** @deprecated Use `assert` instead */
   readonly expected_outcome?: JsonValue;
   readonly input?: JsonValue;
   readonly expected_output?: JsonValue;
   readonly execution?: JsonValue;
   readonly evaluators?: JsonValue;
+  readonly assert?: JsonValue;
+  readonly assertions?: JsonValue;
   readonly rubrics?: JsonValue;
 };
 
@@ -167,7 +169,7 @@ export async function loadTestsFromJsonl(
     sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
 
   // Global defaults from sidecar
-  const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 'llm-grader';
+  const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar');
   const globalExecution = sidecar.execution;
 
   if (verbose) {
@@ -197,7 +199,7 @@ export async function loadTestsFromJsonl(
       outcome = asString(testCaseConfig.expected_outcome);
       if (outcome) {
         logWarning(
-          `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`,
+          `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' has been removed. Use 'assert' instead.`,
         );
       }
     }
@@ -207,12 +209,37 @@ export async function loadTestsFromJsonl(
     // Resolve expected_output with shorthand support
     const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
 
-    // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assertions
+    const hasExplicitCaseGraders =
+      testCaseConfig.assert !== undefined ||
+      testCaseConfig.assertions !== undefined ||
+      testCaseConfig.evaluators !== undefined ||
+      testCaseConfig.rubrics !== undefined;
+    const executionObject = isJsonObject(testCaseConfig.execution)
+      ? testCaseConfig.execution
+      : undefined;
+    const hasExplicitRootGraders =
+      executionObject?.skip_defaults === true
+        ? false
+        : globalExecution?.assert !== undefined ||
+          globalExecution?.assertions !== undefined ||
+          globalExecution?.evaluators !== undefined;
+    const graderCase =
+      outcome && !hasExplicitCaseGraders && !hasExplicitRootGraders
+        ? ({ ...testCaseConfig, assert: [outcome] } satisfies RawJsonlEvalCase)
+        : testCaseConfig;
+
+    // A test is complete when it has id, input, and at least one of: criteria,
+    // expected_output, assert, or assertions. When no explicit graders are declared,
+    // legacy test-level criteria is desugared to a bare-string assert above so it uses
+    // the canonical g-eval path instead of the implicit default LLM grader.
     const hasEvaluationSpec =
-      !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== undefined;
+      !!outcome ||
+      expectedMessages.length > 0 ||
+      graderCase.assert !== undefined ||
+      graderCase.assertions !== undefined;
     if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
       logError(
-        `Skipping incomplete test at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`,
+        `Skipping incomplete test at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`,
       );
       continue;
     }
@@ -276,12 +303,7 @@ export async function loadTestsFromJsonl(
     const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
     let evaluators: Awaited<ReturnType<typeof parseGraders>>;
     try {
-      evaluators = await parseGraders(
-        testCaseConfig,
-        mergedExecution,
-        searchRoots,
-        id ?? 'unknown',
-      );
+      evaluators = await parseGraders(graderCase, mergedExecution, searchRoots, id ?? 'unknown');
     } catch (error) {
       // Skip entire test if evaluator validation fails
       const message = error instanceof Error ? error.message : String(error);
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 29518ee3a..947c41927 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -2783,6 +2783,22 @@ async function runEvaluatorsForCase(options: {
     });
   }
 
+  if (!evalCase.evaluator && (!evalCase.preprocessors || evalCase.preprocessors.length === 0)) {
+    return {
+      score: {
+        score: 1,
+        verdict: 'pass',
+        assertions: [
+          {
+            text: 'No assertions declared; grading skipped',
+            passed: true,
+          },
+        ],
+        expectedAspectCount: 1,
+      },
+    };
+  }
+
   const evaluatorKind = evalCase.evaluator ?? 'llm-grader';
   const activeEvaluator = evaluators[evaluatorKind] ?? evaluators['llm-grader'];
   if (!activeEvaluator) {
diff --git a/packages/core/src/evaluation/registry/builtin-graders.ts b/packages/core/src/evaluation/registry/builtin-graders.ts
index d3dbcf873..6bc4d691b 100644
--- a/packages/core/src/evaluation/registry/builtin-graders.ts
+++ b/packages/core/src/evaluation/registry/builtin-graders.ts
@@ -32,6 +32,13 @@ import {
 } from '../graders.js';
 import { InlineAssertGrader } from '../graders/inline-assert.js';
 import { containsTemplateVariables, resolveCustomPrompt } from '../graders/prompt-resolution.js';
+import {
+  AssertSetGrader,
+  JavascriptAssertionGrader,
+  PythonAssertionGrader,
+  SimilarAssertionGrader,
+  WebhookAssertionGrader,
+} from '../graders/promptfoo-assertions.js';
 import { isAgentProvider } from '../providers/types.js';
 import type { Provider } from '../providers/types.js';
 import type { ToolTrajectoryGraderConfig } from '../trace.js';
@@ -46,14 +53,20 @@ import type {
   EqualsGraderConfig,
   ExecutionMetricsGraderConfig,
   FieldAccuracyGraderConfig,
+  GEvalGraderConfig,
   GraderConfig,
   IcontainsAllGraderConfig,
   IcontainsAnyGraderConfig,
   IcontainsGraderConfig,
   IsJsonGraderConfig,
   LatencyGraderConfig,
+  LlmBackedGraderConfig,
   LlmGraderConfig,
+  LlmRubricGraderConfig,
   RegexGraderConfig,
+  ScriptAssertionGraderConfig,
+  ScriptGraderConfig,
+  SimilarGraderConfig,
   SkillTriggerGraderConfig,
   StartsWithGraderConfig,
   TokenUsageGraderConfig,
@@ -79,7 +92,7 @@ export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn');
  * - agentv provider: built-in AI SDK agent mode with filesystem tools
  */
 export const llmGraderFactory: GraderFactoryFn = (config, context) => {
-  const c = config as LlmGraderConfig;
+  const c = config as LlmBackedGraderConfig;
   const { llmGrader, graderProvider, targetResolver, agentTimeoutMs } = context;
 
   let evaluator = llmGrader;
@@ -111,7 +124,7 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => {
   }
 
   return {
-    kind: 'llm-grader',
+    kind: c.type,
     async evaluate(evalContext) {
       const customPrompt = await resolveCustomPrompt(
         c,
@@ -146,6 +159,9 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => {
 
       let graderTemplateOverride: string | undefined;
       let evalCase = evalContext.evalCase;
+      if (c.type === 'llm-rubric' && c.value && !customPrompt) {
+        evalCase = { ...evalCase, criteria: c.value };
+      }
       if (customPrompt) {
         if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
           graderTemplateOverride = customPrompt;
@@ -165,9 +181,15 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => {
   };
 };
 
-/** Factory for `code-grader` evaluators. */
+export const gEvalFactory: GraderFactoryFn = (config, context) =>
+  llmGraderFactory(config as GEvalGraderConfig, context); // delegates to the shared LLM grader factory
+
+export const llmRubricFactory: GraderFactoryFn = (config, context) =>
+  llmGraderFactory(config as LlmRubricGraderConfig, context); // delegates to the shared LLM grader factory
+
+/** Factory for subprocess-backed script evaluators. */
 export const codeFactory: GraderFactoryFn = (config, context) => {
-  const c = config as CodeGraderConfig;
+  const c = config as ScriptGraderConfig | CodeGraderConfig;
   return new CodeGrader({
     command: c.command,
     cwd: c.resolvedCwd ?? c.cwd,
@@ -177,6 +199,24 @@ export const codeFactory: GraderFactoryFn = (config, context) => {
   });
 };
 
+export const javascriptFactory: GraderFactoryFn = (config) =>
+  new JavascriptAssertionGrader(config as ScriptAssertionGraderConfig);
+
+export const pythonFactory: GraderFactoryFn = (config, context) =>
+  new PythonAssertionGrader(config as ScriptAssertionGraderConfig, context.agentTimeoutMs);
+
+export const webhookFactory: GraderFactoryFn = (config) =>
+  new WebhookAssertionGrader(config as ScriptAssertionGraderConfig);
+
+export const similarFactory: GraderFactoryFn = (config) =>
+  new SimilarAssertionGrader(config as SimilarGraderConfig);
+
+export const assertSetFactory: GraderFactoryFn = (config, context) => {
+  return new AssertSetGrader(config as import('../types.js').AssertSetGraderConfig, (child) =>
+    context.registry.create(child, context),
+  );
+};
+
 /** Factory for `composite` evaluators. */
 export const compositeFactory: GraderFactoryFn = (config, context) => {
   const c = config as CompositeGraderConfig;
@@ -407,7 +447,10 @@ export function createBuiltinRegistry(): GraderRegistry {
 
   registry
     .register('llm-grader', llmGraderFactory)
+    .register('g-eval', gEvalFactory)
+    .register('llm-rubric', llmRubricFactory)
     .register('code-grader', codeFactory)
+    .register('script', codeFactory)
     .register('composite', compositeFactory)
     .register('tool-trajectory', toolTrajectoryFactory)
     .register('field-accuracy', fieldAccuracyFactory)
@@ -416,6 +459,7 @@ export function createBuiltinRegistry(): GraderRegistry {
     .register('token-usage', tokenUsageFactory)
     .register('execution-metrics', executionMetricsFactory)
     .register('skill-trigger', skillTriggerFactory)
+    .register('assert-set', assertSetFactory)
     .register('contains', containsFactory)
     .register('contains-any', containsAnyFactory)
     .register('contains-all', containsAllFactory)
@@ -427,6 +471,10 @@ export function createBuiltinRegistry(): GraderRegistry {
     .register('regex', regexFactory)
     .register('is-json', isJsonFactory)
     .register('equals', equalsFactory)
+    .register('javascript', javascriptFactory)
+    .register('python', pythonFactory)
+    .register('webhook', webhookFactory)
+    .register('similar', similarFactory)
     .register('inline-assert', (config) => {
       // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires any
       const fn = (config as any)[INLINE_ASSERT_FN] as
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 806d86146..f379a0c2d 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -165,6 +165,7 @@ export function isTestMessage(value: unknown): value is TestMessage {
 
 const GRADER_KIND_VALUES = [
   'code-grader',
+  'script',
   'llm-grader',
   'rubric',
   'composite',
@@ -175,6 +176,9 @@ const GRADER_KIND_VALUES = [
   'token-usage',
   'execution-metrics',
   'skill-trigger',
+  'assert-set',
+  'g-eval',
+  'llm-rubric',
   'contains',
   'contains-any',
   'contains-all',
@@ -186,6 +190,10 @@ const GRADER_KIND_VALUES = [
   'regex',
   'is-json',
   'equals',
+  'javascript',
+  'python',
+  'webhook',
+  'similar',
   'rubrics',
   'inline-assert',
 ] as const;
@@ -387,9 +395,9 @@ export type WorkspaceConfig = {
   readonly env?: WorkspaceEnvConfig;
 };
 
-export type CodeGraderConfig = {
+export type ScriptGraderConfig = {
   readonly name: string;
-  readonly type: 'code-grader';
+  readonly type: 'script';
   readonly command: readonly string[];
   readonly resolvedScriptPath?: string;
   readonly cwd?: string;
@@ -400,7 +408,7 @@ export type CodeGraderConfig = {
   readonly min_score?: number;
   /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
   readonly negate?: boolean;
-  /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
+  /** Pass-through configuration for the script (any unrecognized YAML properties) */
   readonly config?: JsonObject;
   /** When present, enables target access via local proxy */
   readonly target?: TargetAccessConfig;
@@ -408,6 +416,11 @@ export type CodeGraderConfig = {
   readonly preprocessors?: readonly ContentPreprocessorConfig[];
 };
 
+/** @deprecated Use ScriptGraderConfig with type: 'script'. */
+export type CodeGraderConfig = Omit<ScriptGraderConfig, 'type'> & {
+  readonly type: 'code-grader';
+};
+
 /**
  * Executable prompt template configuration.
  * Matches code-grader pattern for consistency.
@@ -457,6 +470,18 @@ export type LlmGraderConfig = {
   readonly preprocessors?: readonly ContentPreprocessorConfig[];
 };
 
+export type GEvalGraderConfig = Omit<LlmGraderConfig, 'type'> & {
+  readonly type: 'g-eval';
+};
+
+export type LlmRubricGraderConfig = Omit<LlmGraderConfig, 'type' | 'rubrics'> & {
+  readonly type: 'llm-rubric';
+  /** Promptfoo-compatible free-form rubric text. */
+  readonly value?: string;
+};
+
+export type LlmBackedGraderConfig = LlmGraderConfig | GEvalGraderConfig | LlmRubricGraderConfig;
+
 /**
  * Score range definition for analytic rubric scoring.
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -510,6 +535,8 @@ export type RubricItem = {
 
 export type CompositeAggregatorConfig =
   | { readonly type: 'weighted_average'; readonly weights?: Record<string, number> }
+  | { readonly type: 'script'; readonly path: string; readonly cwd?: string }
+  /** @deprecated Use the script aggregator type. */
   | { readonly type: 'code-grader'; readonly path: string; readonly cwd?: string }
   | {
       readonly type: 'llm-grader';
@@ -860,6 +887,42 @@ export type RubricsEvaluatorConfig = {
   readonly negate?: boolean;
 };
 
+export type ScriptAssertionGraderConfig = {
+  readonly name: string;
+  readonly type: 'javascript' | 'python' | 'webhook';
+  readonly value: string;
+  readonly threshold?: number;
+  readonly weight?: number;
+  readonly required?: boolean;
+  readonly min_score?: number;
+  readonly negate?: boolean;
+  readonly config?: JsonObject;
+};
+
+export type SimilarGraderConfig = {
+  readonly name: string;
+  readonly type: 'similar';
+  readonly value: string;
+  readonly threshold?: number;
+  readonly provider?: string | JsonObject;
+  readonly weight?: number;
+  readonly required?: boolean;
+  readonly min_score?: number;
+  readonly negate?: boolean;
+  readonly config?: JsonObject;
+};
+
+export type AssertSetGraderConfig = {
+  readonly name: string;
+  readonly type: 'assert-set';
+  readonly assertions: readonly GraderConfig[];
+  readonly threshold?: number;
+  readonly weight?: number;
+  readonly required?: boolean;
+  readonly min_score?: number;
+  readonly negate?: boolean;
+};
+
 /**
  * Configuration for the skill-trigger evaluator.
  * Detects whether the agent invoked a named skill as its first tool call.
@@ -895,8 +958,11 @@ export type InlineAssertEvaluatorConfig = {
 };
 
 export type GraderConfig = (
+  | ScriptGraderConfig
   | CodeGraderConfig
   | LlmGraderConfig
+  | GEvalGraderConfig
+  | LlmRubricGraderConfig
   | CompositeGraderConfig
   | ToolTrajectoryGraderConfig
   | FieldAccuracyGraderConfig
@@ -917,6 +983,9 @@ export type GraderConfig = (
   | IsJsonGraderConfig
   | EqualsGraderConfig
   | RubricsEvaluatorConfig
+  | ScriptAssertionGraderConfig
+  | SimilarGraderConfig
+  | AssertSetGraderConfig
   | InlineAssertEvaluatorConfig
 ) & {
   /** Optional promptfoo-style named score key. Scoring aggregation support is layered separately. */
@@ -935,7 +1004,11 @@ export interface EvalSourceReference {
     | 'input_file'
     | 'llm_grader_prompt'
     | 'prompt_script'
+    | 'script_grader_command'
+    | 'script_grader_cwd'
+    /** @deprecated New eval loads emit script_grader_command. */
     | 'code_grader_command'
+    /** @deprecated New eval loads emit script_grader_cwd. */
     | 'code_grader_cwd'
     | 'assertion_template'
     | 'preprocessor_command';
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts
index 021cece7d..b120a5745 100644
--- a/packages/core/src/evaluation/validation/eval-file.schema.ts
+++ b/packages/core/src/evaluation/validation/eval-file.schema.ts
@@ -106,7 +106,7 @@ const RubricCriterionSchema = z.union([z.string().min(1), RubricItemSchema]);
 // --- Type-specific evaluator schemas ---
 
 const CodeGraderSchema = EvaluatorCommonSchema.extend({
-  type: z.enum(['code-grader', 'code_grader']),
+  type: z.literal('script'),
   command: z.union([z.string(), z.array(z.string())]),
   cwd: z.string().optional(),
   target: z.union([z.boolean(), z.object({ max_calls: z.number().optional() })]).optional(),
@@ -143,7 +143,7 @@ const AggregatorSchema = z.discriminatedUnion('type', [
     threshold: z.number().min(0).max(1),
   }),
   z.object({
-    type: z.literal('code-grader'),
+    type: z.literal('script'),
     path: z.string(),
     cwd: z.string().optional(),
   }),
@@ -158,6 +158,7 @@ const AggregatorSchema = z.discriminatedUnion('type', [
 const CompositeSchema: z.ZodType = z.lazy(() =>
   EvaluatorCommonSchema.extend({
     type: z.literal('composite'),
+    assert: z.array(EvaluatorSchema).optional(),
     assertions: z.array(EvaluatorSchema).optional(),
     evaluators: z.array(EvaluatorSchema).optional(),
     aggregator: AggregatorSchema,
@@ -264,8 +265,6 @@ const PromptfooAssertionSchema = EvaluatorCommonSchema.extend({
     'python',
     'webhook',
     'similar',
-    'select-best',
-    'human',
     'contains',
     'contains-any',
     'contains-all',
@@ -504,6 +503,7 @@ const ExecutionSchema = z.object({
   target: z.string().optional(),
   targets: z.array(z.union([z.string(), EvalTargetRefSchema])).optional(),
   workers: z.never().optional(),
+  assert: z.array(AssertionItemSchema).optional(),
   assertions: z.array(AssertionItemSchema).optional(),
   evaluators: z.array(EvaluatorSchema).optional(),
   skip_defaults: z.boolean().optional(),
@@ -585,7 +585,6 @@ const EvalTestSchema = z.object({
   id: z.string().min(1).optional(),
   description: z.string().optional(),
   vars: JsonObjectSchema.optional(),
-  criteria: z.string().optional(),
   provider: EvalTargetSchema.optional(),
   providers: EvalTargetsSchema.optional(),
   prompts: PromptsSchema.optional(),
diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts
index 1c0cc3ef9..8de2978f0 100644
--- a/packages/core/src/evaluation/validation/eval-validator.ts
+++ b/packages/core/src/evaluation/validation/eval-validator.ts
@@ -232,7 +232,7 @@ const REMOVED_TEST_FIELDS = new Map<string, string>([]);
 /** Deprecated test-level fields with migration hints. */
 const DEPRECATED_TEST_FIELDS = new Map<string, string>([
   ['evaluator', "'evaluator' is deprecated. Use 'assertions' instead."],
-  ['expected_outcome', "'expected_outcome' is deprecated. Use 'criteria' instead."],
+  ['expected_outcome', "'expected_outcome' is deprecated. Use 'assert' instead."],
 ]);
 
 /** Name field pattern: lowercase alphanumeric with hyphens. */
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 12f25ed16..cf36040cf 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -204,6 +204,7 @@ type RawTestSuite = JsonObject & {
   readonly default_test?: JsonValue;
   readonly workspace?: JsonValue;
   readonly assertions?: JsonValue;
+  readonly assert?: JsonValue;
   readonly preprocessors?: JsonValue;
   readonly extensions?: JsonValue;
   readonly on_run_complete?: JsonValue;
@@ -240,6 +241,7 @@ type RawEvalCase = JsonObject & {
   readonly run?: JsonValue;
   readonly evaluators?: JsonValue;
   readonly assertions?: JsonValue;
+  readonly assert?: JsonValue;
   readonly rubrics?: JsonValue;
   readonly workspace?: JsonValue;
   readonly metadata?: JsonValue;
@@ -324,6 +326,9 @@ function interpolateRawEvalCase(
     ...(raw.expected_output !== undefined
       ? { expected_output: interpolateCaseField(raw.expected_output, vars, filters) }
       : {}),
+    ...(raw.assert !== undefined
+      ? { assert: interpolateCaseField(raw.assert, vars, filters) }
+      : {}),
     ...(raw.assertions !== undefined
       ? { assertions: interpolateCaseField(raw.assertions, vars, filters) }
       : {}),
@@ -624,7 +629,7 @@ async function loadTestsFromParsedYamlValue(
   const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
   const evalFileDir = path.dirname(absoluteTestPath);
 
-  const globalEvaluator = coerceEvaluator(suite.evaluator, 'global') ?? 'llm-grader';
+  const globalEvaluator = coerceEvaluator(suite.evaluator, 'global');
   const suitePreprocessors = await parsePreprocessors(
     suite.preprocessors,
     searchRoots,
@@ -679,9 +684,9 @@ async function loadTestsFromParsedYamlValue(
   readSuiteRuntimeBlock(suite, evalFilePath);
 
   // Build global execution context, including suite-level assertions (which is a sibling of execution)
-  const suiteAssertions = suite.assertions;
+  const suiteAssertions = suite.assert ?? suite.assertions;
   const globalExecution: JsonObject | undefined =
-    suiteAssertions !== undefined ? { assertions: suiteAssertions } : undefined;
+    suiteAssertions !== undefined ? { assert: suiteAssertions } : undefined;
 
   const results: EvalTest[] = [];
 
@@ -712,7 +717,7 @@ async function loadTestsFromParsedYamlValue(
         outcome = asString(renderedCase.expected_outcome);
         if (outcome) {
           logWarning(
-            `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`,
+            `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'assert' instead.`,
           );
         }
       }
@@ -786,23 +791,42 @@ async function loadTestsFromParsedYamlValue(
           : undefined;
       const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue);
 
-      // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode)
+      const hasExplicitCaseGraders =
+        renderedCase.assert !== undefined ||
+        renderedCase.assertions !== undefined ||
+        renderedCase.evaluators !== undefined ||
+        renderedCase.rubrics !== undefined;
+      const hasExplicitRootGraders =
+        skipDefaults === true
+          ? false
+          : globalExecution?.assert !== undefined ||
+            globalExecution?.assertions !== undefined ||
+            globalExecution?.evaluators !== undefined;
+      const graderCase =
+        outcome && !hasExplicitCaseGraders && !hasExplicitRootGraders
+          ? ({ ...renderedCase, assert: [outcome] } satisfies RawEvalCase)
+          : renderedCase;
+
+      // A test is complete when it has id, input, and at least one of: criteria,
+      // expected_output, assert/assertions, or turns (conversation mode). With no explicit
+      // graders, legacy test-level criteria is desugared to a bare-string assert above so
+      // it uses the canonical g-eval path instead of the implicit default LLM grader.
       const hasEvaluationSpec =
         !!outcome ||
         expectedMessages.length > 0 ||
-        renderedCase.assertions !== undefined ||
+        graderCase.assertions !== undefined ||
+        graderCase.assert !== undefined ||
         (Array.isArray(renderedCase.turns) && renderedCase.turns.length > 0);
       const hasInputMessages =
         testInputMessages.length > 0 ||
         (effectiveSuiteInputMessages !== undefined && effectiveSuiteInputMessages.length > 0);
       if (!id || !hasEvaluationSpec || !hasInputMessages) {
         logError(
-          `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assertions/turns`,
+          `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assert/turns`,
         );
         continue;
       }
 
-      // Prepend suite-level input to test input (respecting skip_defaults)
       // expected_output is optional - for outcome-only evaluation
       const hasExpectedMessages = expectedMessages.length > 0;
 
@@ -870,7 +894,7 @@ async function loadTestsFromParsedYamlValue(
       let evaluators: Awaited<ReturnType<typeof parseGraders>>;
       try {
         evaluators = await parseGraders(
-          renderedCase,
+          graderCase,
           globalExecution,
           searchRoots,
           id ?? 'unknown',
@@ -884,7 +908,7 @@ async function loadTestsFromParsedYamlValue(
       }
 
       const assertionTemplateReferences = await collectAssertionTemplateSourceReferences(
-        renderedCase,
+        graderCase,
         globalExecution,
         searchRoots,
         id ?? 'unknown',
@@ -1891,10 +1915,10 @@ function collectSingleGraderSourceReferences(
 ): readonly EvalSourceReference[] {
   const references: EvalSourceReference[] = [];
 
-  if (evaluator.type === 'code-grader') {
+  if (evaluator.type === 'script' || evaluator.type === 'code-grader') {
     const command = evaluator.command ?? [];
     references.push({
-      kind: 'code_grader_command',
+      kind: 'script_grader_command',
       displayPath: evaluator.resolvedScriptPath ?? command.join(' '),
       ...(evaluator.resolvedScriptPath ? { resolvedPath: evaluator.resolvedScriptPath } : {}),
       graderName: evaluator.name,
@@ -1902,7 +1926,7 @@ function collectSingleGraderSourceReferences(
     });
     if (evaluator.resolvedCwd) {
       references.push({
-        kind: 'code_grader_cwd',
+        kind: 'script_grader_cwd',
         displayPath: evaluator.cwd ?? evaluator.resolvedCwd,
         resolvedPath: evaluator.resolvedCwd,
         graderName: evaluator.name,
@@ -1948,9 +1972,9 @@ function collectSingleGraderSourceReferences(
     for (const member of evaluator.assertions) {
       references.push(...collectSingleGraderSourceReferences(member));
     }
-    if (evaluator.aggregator.type === 'code-grader') {
+    if (evaluator.aggregator.type === 'script' || evaluator.aggregator.type === 'code-grader') {
       references.push({
-        kind: 'code_grader_command',
+        kind: 'script_grader_command',
         displayPath: evaluator.aggregator.path,
         resolvedPath: path.resolve(evaluator.aggregator.cwd ?? '', evaluator.aggregator.path),
         graderName: evaluator.name,
diff --git a/packages/core/test/evaluation/code-grader-file-backed.test.ts b/packages/core/test/evaluation/code-grader-file-backed.test.ts
index 459f60118..2106e9f05 100644
--- a/packages/core/test/evaluation/code-grader-file-backed.test.ts
+++ b/packages/core/test/evaluation/code-grader-file-backed.test.ts
@@ -49,6 +49,30 @@ async function createScoringGrader(dir: string): Promise<readonly string[]> {
   return [process.execPath, script];
 }
 
+/**
+ * Writes a grader script that validates its stdin payload and returns the command to run it.
+ */
+async function createPayloadShapeGrader(dir: string): Promise<readonly string[]> {
+  const script = join(dir, 'payload-shape-grader.js');
+  await writeFile(
+    script,
+    `const input = require('fs').readFileSync(0, 'utf8');
+const payload = JSON.parse(input);
+const passed =
+  payload.expected_output?.[0]?.content?.answer === 'Paris' &&
+  payload.config?.mode === 'strict' &&
+  payload.input?.[0]?.content === 'Test input';
+console.log(JSON.stringify({
+  score: passed ? 1 : 0,
+  assertions: [{ text: 'structured stdin preserved', passed }],
+  details: { expected_output: payload.expected_output, config: payload.config, input: payload.input }
+}));
+`,
+    'utf8',
+  );
+  return [process.execPath, script];
+}
+
 describe('CodeGrader file-backed output', () => {
   let tmpDir: string;
 
@@ -113,4 +137,24 @@ describe('CodeGrader file-backed output', () => {
     // We can't inspect the payload directly, but the grader script should run without error
     expect(result.score).toBeGreaterThanOrEqual(0);
   });
+
+  it('preserves structured expected_output, input, and config in stdin', async () => {
+    const command = await createPayloadShapeGrader(tmpDir);
+
+    const evaluator = new CodeGrader({ command, config: { mode: 'strict' } });
+    const result = await evaluator.evaluate({
+      evalCase: {
+        ...baseTestCase,
+        expected_output: [{ role: 'assistant', content: { answer: 'Paris' } }],
+      },
+      candidate: 'answer',
+      output: [{ role: 'assistant' as const, content: 'answer' }],
+    });
+
+    expect(result.score).toBe(1);
+    expect(result.assertions).toEqual([{ text: 'structured stdin preserved', passed: true }]);
+    expect(result.details?.expected_output).toEqual([
+      { role: 'assistant', content: { answer: 'Paris' } },
+    ]);
+  });
 });
diff --git a/packages/core/test/evaluation/graders/promptfoo-assertions.test.ts b/packages/core/test/evaluation/graders/promptfoo-assertions.test.ts
new file mode 100644
index 000000000..e745d250c
--- /dev/null
+++ b/packages/core/test/evaluation/graders/promptfoo-assertions.test.ts
@@ -0,0 +1,253 @@
+import { afterEach, describe, expect, it } from 'bun:test';
+import { type Server, createServer } from 'node:http';
+
+import type { EvaluationContext } from '../../../src/evaluation/graders/types.js';
+import { createBuiltinRegistry } from '../../../src/evaluation/registry/builtin-graders.js';
+import type { GraderConfig } from '../../../src/evaluation/types.js';
+
+const baseContext: EvaluationContext = {
+  evalCase: {
+    id: 'case-1',
+    question: 'Question',
+    input: [{ role: 'user', content: 'Question' }],
+    expected_output: [{ role: 'assistant', content: { answer: 'Paris' } }],
+    reference_answer: 'Paris',
+    file_paths: [],
+    criteria: 'Answer correctly',
+  },
+  candidate: 'Paris is the capital of France.',
+  target: { name: 'mock', kind: 'mock', config: {} },
+  provider: {
+    id: 'mock',
+    kind: 'mock',
+    targetName: 'mock',
+    async invoke() {
+      return { output: [{ role: 'assistant', content: 'ok' }] };
+    },
+  },
+  attempt: 1,
+  promptInputs: { question: 'Question' },
+  now: new Date('2026-07-02T00:00:00Z'),
+};
+
+/** Builds `config` via a fresh built-in registry and evaluates it against baseContext. */
+async function run(config: GraderConfig) {
+  const registry = createBuiltinRegistry();
+  // Stub LLM grader: these tests are not expected to call it, so it throws if reached.
+  const llmGrader = {
+    kind: 'llm-grader' as const,
+    evaluate(): never {
+      throw new Error('not used');
+    },
+  };
+  const grader = await registry.create(config, { llmGrader, registry });
+  return grader.evaluate(baseContext);
+}
+
+describe('promptfoo-compatible built-in assertions', () => {
+  let server: Server | undefined;
+
+  afterEach(async () => {
+    if (server) {
+      await new Promise<void>((resolve) => server?.close(() => resolve()));
+      server = undefined;
+    }
+  });
+
+  it('runs javascript assertions in-process', async () => {
+    const result = await run({
+      name: 'js',
+      type: 'javascript',
+      value: "output.includes('Paris') && context.expectedOutput[0].content.answer === 'Paris'",
+    });
+
+    expect(result.score).toBe(1);
+    expect(result.verdict).toBe('pass');
+  });
+
+  it('uses assertion thresholds for numeric script results', async () => {
+    const result = await run({
+      name: 'js-threshold',
+      type: 'javascript',
+      value: '0.5',
+      threshold: 0.75,
+    });
+
+    expect(result.score).toBe(0.5);
+    expect(result.verdict).toBe('fail');
+    expect(result.assertions[0].passed).toBe(false);
+  });
+
+  it('fails numeric javascript score zero when no threshold is set', async () => {
+    const result = await run({
+      name: 'js-zero',
+      type: 'javascript',
+      value: '0',
+    });
+
+    expect(result.score).toBe(0);
+    expect(result.verdict).toBe('fail');
+    expect(result.assertions[0].passed).toBe(false);
+  });
+
+  it('honors explicit threshold zero for numeric javascript results', async () => {
+    const result = await run({
+      name: 'js-zero-threshold',
+      type: 'javascript',
+      value: '0',
+      threshold: 0,
+    });
+
+    expect(result.score).toBe(0);
+    expect(result.verdict).toBe('pass');
+    expect(result.assertions[0].passed).toBe(true);
+  });
+
+  it('runs python assertions in a subprocess', async () => {
+    const result = await run({
+      name: 'py',
+      type: 'python',
+      value: "'Paris' in output and context['expected_output'][0]['content']['answer'] == 'Paris'",
+    });
+
+    expect(result.score).toBe(1);
+    expect(result.verdict).toBe('pass');
+  });
+
+  it('fails object python score zero when no pass flag or threshold is set', async () => {
+    const result = await run({
+      name: 'py-zero',
+      type: 'python',
+      value: "{'score': 0, 'reason': 'zero score'}",
+    });
+
+    expect(result.score).toBe(0);
+    expect(result.verdict).toBe('fail');
+    expect(result.assertions[0]).toEqual({ text: 'zero score', passed: false });
+  });
+
+  it('runs webhook assertions against an HTTP endpoint', async () => {
+    const url = await new Promise<string>((resolve) => {
+      server = createServer((req, res) => {
+        let body = '';
+        req.on('data', (chunk) => {
+          body += chunk;
+        });
+        req.on('end', () => {
+          const payload = JSON.parse(body) as { output: string };
+          res.setHeader('content-type', 'application/json');
+          res.end(
+            JSON.stringify({
+              score: payload.output.includes('Paris') ? 1 : 0,
+              assertions: [{ text: 'saw output', passed: payload.output.includes('Paris') }],
+            }),
+          );
+        });
+      }).listen(0, '127.0.0.1', () => { // loopback only, matching the URL resolved below
+        const address = server?.address();
+        if (address && typeof address === 'object') {
+          resolve(`http://127.0.0.1:${address.port}`);
+        }
+      });
+    });
+
+    const result = await run({ name: 'webhook', type: 'webhook', value: url });
+    expect(result.score).toBe(1);
+    expect(result.assertions[0].text).toBe('saw output');
+  });
+
+  it('fails webhook score zero when no pass flag or threshold is set', async () => {
+    const url = await new Promise<string>((resolve) => {
+      server = createServer((req, res) => {
+        req.resume();
+        req.on('end', () => {
+          res.setHeader('content-type', 'application/json');
+          res.end(JSON.stringify({ score: 0, reason: 'zero score' }));
+        });
+      }).listen(0, '127.0.0.1', () => { // loopback only, matching the URL resolved below
+        const address = server?.address();
+        if (address && typeof address === 'object') {
+          resolve(`http://127.0.0.1:${address.port}`);
+        }
+      });
+    });
+
+    const result = await run({ name: 'webhook-zero', type: 'webhook', value: url });
+
+    expect(result.score).toBe(0);
+    expect(result.verdict).toBe('fail');
+    expect(result.assertions[0]).toEqual({ text: 'zero score', passed: false });
+  });
+
+  it('aggregates nested assert-set children', async () => {
+    const result = await run({
+      name: 'set',
+      type: 'assert-set',
+      threshold: 0.5,
+      assertions: [
+        { name: 'contains', type: 'contains', value: 'Paris' },
+        { name: 'starts', type: 'starts-with', value: 'Paris' },
+      ],
+    });
+
+    expect(result.score).toBe(1);
+    expect(result.scores?.map((score) => score.type)).toEqual(['contains', 'starts-with']);
+  });
+
+  it('does not count zero-score script children as passing in composite thresholds', async () => {
+    const result = await run({
+      name: 'gate',
+      type: 'composite',
+      assertions: [
+        { name: 'js-zero', type: 'javascript', value: '0' },
+        { name: 'contains', type: 'contains', value: 'Paris' },
+      ],
+      aggregator: { type: 'threshold', threshold: 1 },
+    });
+
+    expect(result.score).toBe(0.5);
+    expect(result.verdict).toBe('fail');
+    expect(result.assertions[0]).toEqual({
+      text: '1/2 evaluators passed (threshold: 1)',
+      passed: false,
+    });
+    expect(result.scores?.[0]).toMatchObject({
+      name: 'js-zero',
+      type: 'javascript',
+      score: 0,
+      verdict: 'fail',
+    });
+  });
+
+  it('runs similar with an OpenAI-compatible embeddings provider', async () => {
+    const url = await new Promise<string>((resolve) => {
+      server = createServer((req, res) => {
+        req.resume();
+        req.on('end', () => {
+          res.setHeader('content-type', 'application/json');
+          res.end(
+            JSON.stringify({
+              data: [{ embedding: [1, 0, 0] }, { embedding: [1, 0, 0] }],
+            }),
+          );
+        });
+      }).listen(0, '127.0.0.1', () => { // loopback only, matching the URL resolved below
+        const address = server?.address();
+        if (address && typeof address === 'object') {
+          resolve(`http://127.0.0.1:${address.port}`);
+        }
+      });
+    });
+
+    const result = await run({
+      name: 'similar',
+      type: 'similar',
+      value: 'Paris is the capital of France.',
+      threshold: 0.9,
+      config: { embedding_provider: { base_url: url, model: 'test-embedding' } },
+    });
+
+    expect(result.score).toBe(1);
+    expect(result.verdict).toBe('pass');
+  });
+});
diff --git a/packages/core/test/evaluation/loaders/case-file-loader.test.ts b/packages/core/test/evaluation/loaders/case-file-loader.test.ts
index ddca6666e..828f558f1 100644
--- a/packages/core/test/evaluation/loaders/case-file-loader.test.ts
+++ b/packages/core/test/evaluation/loaders/case-file-loader.test.ts
@@ -136,15 +136,11 @@ describe('resolveFileReference', () => {
     expect(cases[0].assertions).toEqual([
       { type: 'latency', threshold: 1000 },
       { type: 'cost', budget: 0.01 },
-      { type: 'code-grader', command: ['uv', 'run', 'python', graderPath] },
+      { type: 'script', command: ['uv', 'run', 'python', graderPath] },
     ]);
 
     const evaluators = await parseGraders(cases[0], undefined, [tempDir], 'csv-assertions');
-    expect(evaluators.map((evaluator) => evaluator.type)).toEqual([
-      'latency',
-      'cost',
-      'code-grader',
-    ]);
+    expect(evaluators.map((evaluator) => evaluator.type)).toEqual(['latency', 'cost', 'script']);
   });
 
   it('rejects unsupported promptfoo expected DSL forms clearly', async () => {
diff --git a/packages/core/test/evaluation/loaders/grader-parser.test.ts b/packages/core/test/evaluation/loaders/grader-parser.test.ts
index 6b8e2e9e2..a05f4bb25 100644
--- a/packages/core/test/evaluation/loaders/grader-parser.test.ts
+++ b/packages/core/test/evaluation/loaders/grader-parser.test.ts
@@ -10,6 +10,7 @@ import type {
   CompositeGraderConfig,
   ContainsGraderConfig,
   EqualsGraderConfig,
+  GEvalGraderConfig,
   IsJsonGraderConfig,
   LatencyGraderConfig,
   LlmGraderConfig,
@@ -211,7 +212,7 @@ describe('parseGraders - deterministic assertion types', () => {
     expect(evaluators).toBeUndefined();
   });
 
-  it('parses type: rubrics with criteria as llm-grader', async () => {
+  it('parses type: rubrics with criteria as g-eval', async () => {
     const evaluators = await parseGraders(
       {
         evaluators: [
@@ -227,8 +228,8 @@ describe('parseGraders - deterministic assertion types', () => {
       'test-1',
     );
     expect(evaluators).toHaveLength(1);
-    expect(evaluators?.[0].type).toBe('llm-grader');
-    expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(1);
+    expect(evaluators?.[0].type).toBe('g-eval');
+    expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(1);
   });
 
   it('parses multiple assertion types in one evaluators array', async () => {
@@ -251,6 +252,75 @@ describe('parseGraders - deterministic assertion types', () => {
     expect(evaluators?.[2].type).toBe('is-json');
     expect(evaluators?.[3].type).toBe('equals');
   });
+
+  it('parses explicit g-eval criteria with score ranges', async () => {
+    const evaluators = await parseGraders(
+      {
+        assert: [
+          {
+            name: 'quality',
+            type: 'g-eval',
+            rubric_item: {
+              id: 'quality',
+              outcome: 'Answer quality',
+              min_score: 0.8,
+              score_ranges: [
+                { score_range: [0, 4], outcome: 'Weak' },
+                { score_range: [5, 7], outcome: 'Adequate' },
+                { score_range: [8, 10], outcome: 'Strong' },
+              ],
+            },
+          },
+        ],
+      },
+      undefined,
+      [tempDir],
+      'test-1',
+    );
+
+    const config = evaluators?.[0] as GEvalGraderConfig;
+    expect(config.type).toBe('g-eval');
+    expect(config.rubrics?.[0]).toMatchObject({
+      id: 'quality',
+      outcome: 'Answer quality',
+      min_score: 0.8,
+      score_ranges: [
+        { score_range: [0, 4], outcome: 'Weak' },
+        { score_range: [5, 7], outcome: 'Adequate' },
+        { score_range: [8, 10], outcome: 'Strong' },
+      ],
+    });
+  });
+
+  it('parses llm-rubric as free-form rubric text', async () => {
+    const evaluators = await parseGraders(
+      {
+        assert: [{ name: 'freeform', type: 'llm-rubric', value: 'Judge whether it is helpful' }],
+      },
+      undefined,
+      [tempDir],
+      'test-1',
+    );
+
+    expect(evaluators?.[0]).toMatchObject({
+      name: 'freeform',
+      type: 'llm-rubric',
+      value: 'Judge whether it is helpful',
+    });
+  });
+
+  it('rejects known unimplemented promptfoo assertion types', async () => {
+    await expect(
+      parseGraders(
+        {
+          assert: [{ name: 'bleu', type: 'bleu', value: 'reference' }],
+        },
+        undefined,
+        [tempDir],
+        'test-1',
+      ),
+    ).rejects.toThrow("Unsupported promptfoo assertion type 'bleu'");
+  });
 });
 
 describe('parseGraders - tool-trajectory', () => {
@@ -447,7 +517,7 @@ describe('parseGraders - tool-trajectory', () => {
   });
 });
 
-describe('parseGraders - code-grader config pass-through', () => {
+describe('parseGraders - script config pass-through', () => {
   let tempDir: string;
 
   beforeAll(async () => {
@@ -466,7 +536,7 @@ describe('parseGraders - code-grader config pass-through', () => {
       evaluators: [
         {
           name: 'fuzzy-matcher',
-          type: 'code-grader',
+          type: 'script',
           command: ['bun', 'run', './test_script.ts'],
           fields: [
             { path: 'supplier.name', threshold: 0.85 },
@@ -482,7 +552,7 @@ describe('parseGraders - code-grader config pass-through', () => {
 
     expect(evaluators).toHaveLength(1);
     const config = evaluators?.[0] as CodeGraderConfig;
-    expect(config.type).toBe('code-grader');
+    expect(config.type).toBe('script');
     expect(config.name).toBe('fuzzy-matcher');
     expect(config.config).toEqual({
       fields: [
@@ -499,7 +569,7 @@ describe('parseGraders - code-grader config pass-through', () => {
       evaluators: [
         {
           name: 'simple-grader',
-          type: 'code-grader',
+          type: 'script',
           command: ['bun', 'run', './test_script.ts'],
         },
       ],
@@ -509,7 +579,7 @@ describe('parseGraders - code-grader config pass-through', () => {
 
     expect(evaluators).toHaveLength(1);
     const config = evaluators?.[0] as CodeGraderConfig;
-    expect(config.type).toBe('code-grader');
+    expect(config.type).toBe('script');
     expect(config.config).toBeUndefined();
   });
 
@@ -518,7 +588,7 @@ describe('parseGraders - code-grader config pass-through', () => {
       evaluators: [
         {
           name: 'with-weight',
-          type: 'code-grader',
+          type: 'script',
           command: ['bun', 'run', './test_script.ts'],
           cwd: tempDir,
           weight: 2.0,
@@ -550,7 +620,7 @@ describe('parseGraders - code-grader config pass-through', () => {
       evaluators: [
         {
           name: 'shell-command',
-          type: 'code-grader',
+          type: 'script',
           command: './test_script.ts',
         },
       ],
@@ -574,7 +644,7 @@ describe('parseGraders - code-grader config pass-through', () => {
           evaluators: [
             {
               name: 'legacy-script',
-              type: 'code-grader',
+              type: 'script',
               script: './test_script.ts',
             },
           ],
@@ -583,7 +653,7 @@ describe('parseGraders - code-grader config pass-through', () => {
         [tempDir],
         'test-case',
       ),
-    ).rejects.toThrow(/'script' has been removed.*command/);
+    ).rejects.toThrow(/'script' field has been removed.*command/);
   });
 });
 
@@ -609,7 +679,7 @@ describe('parseGraders - kebab-case type normalization', () => {
     expect((evaluators?.[0] as LlmGraderConfig).target).toBe('grader-low-cost-a');
   });
 
-  it('accepts code-grader kebab-case as canonical form', async () => {
+  it('normalizes legacy code-grader to script', async () => {
     const rawEvalCase = {
       evaluators: [
         {
@@ -623,7 +693,29 @@ describe('parseGraders - kebab-case type normalization', () => {
     const evaluators = await parseGraders(rawEvalCase, undefined, [tempDir], 'test-case');
 
     expect(evaluators).toHaveLength(1);
-    expect(evaluators?.[0].type).toBe('code-grader');
+    expect(evaluators?.[0].type).toBe('script');
+  });
+
+  it('accepts script as the subprocess grader type', async () => {
+    const rawEvalCase = {
+      evaluators: [
+        {
+          name: 'subprocess-check',
+          type: 'script',
+          command: ['bun', 'run', './test_script.ts'],
+        },
+      ],
+    };
+
+    const evaluators = await parseGraders(rawEvalCase, undefined, [tempDir], 'test-case');
+
+    expect(evaluators).toHaveLength(1);
+    expect(evaluators?.[0].type).toBe('script');
+    expect((evaluators?.[0] as CodeGraderConfig).command).toEqual([
+      'bun',
+      'run',
+      './test_script.ts',
+    ]);
   });
 
   it('accepts is-json kebab-case as canonical form', async () => {
@@ -1320,17 +1412,17 @@ describe('parseGraders - assertions field', () => {
     expect(evaluators?.[0].type).toBe('contains');
   });
 
-  it('ignores the removed assertion field as evaluator input', async () => {
-    const removedKey = ['ass', 'ert'].join('');
+  it('parses canonical assert field as evaluators', async () => {
     const evaluators = await parseGraders(
       {
-        [removedKey]: [{ type: 'contains', value: 'DENIED' }],
+        assert: [{ type: 'contains', value: 'DENIED' }],
       },
       undefined,
       [tempDir],
       'test-1',
     );
-    expect(evaluators).toBeUndefined();
+    expect(evaluators).toHaveLength(1);
+    expect(evaluators?.[0].type).toBe('contains');
   });
 
   it('assertions takes precedence over execution.evaluators', async () => {
@@ -1349,6 +1441,23 @@ describe('parseGraders - assertions field', () => {
     expect(evaluators?.[0].type).toBe('contains');
   });
 
+  it('assert takes precedence over assertions and execution.evaluators', async () => {
+    const evaluators = await parseGraders(
+      {
+        assert: [{ type: 'contains', value: 'CANONICAL' }],
+        assertions: [{ type: 'contains', value: 'LEGACY' }],
+        execution: {
+          evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }],
+        },
+      },
+      undefined,
+      [tempDir],
+      'test-1',
+    );
+    expect(evaluators).toHaveLength(1);
+    expect(evaluators?.[0]).toMatchObject({ type: 'contains', value: 'CANONICAL' });
+  });
+
   it('assertions takes precedence over top-level evaluators', async () => {
     const evaluators = await parseGraders(
       {
@@ -1406,6 +1515,22 @@ describe('parseGraders - assertions field', () => {
     expect(evaluators?.[0].type).toBe('latency');
   });
 
+  it('falls back to execution.assert when case-level assertions are not present', async () => {
+    const evaluators = await parseGraders(
+      {
+        execution: {
+          assert: [{ type: 'contains', value: 'EXEC' }],
+          evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }],
+        },
+      },
+      undefined,
+      [tempDir],
+      'test-1',
+    );
+    expect(evaluators).toHaveLength(1);
+    expect(evaluators?.[0]).toMatchObject({ type: 'contains', value: 'EXEC' });
+  });
+
   it('suite-level assertions takes precedence over suite-level execution.evaluators', async () => {
     const evaluators = await parseGraders(
       {},
@@ -1432,6 +1557,20 @@ describe('parseGraders - assertions field', () => {
     expect(evaluators).toHaveLength(1);
     expect(evaluators?.[0].type).toBe('latency');
   });
+
+  it('suite-level assert takes precedence over suite-level assertions', async () => {
+    const evaluators = await parseGraders(
+      {},
+      {
+        assert: [{ type: 'contains', value: 'CANONICAL' }],
+        assertions: [{ type: 'contains', value: 'LEGACY' }],
+      },
+      [tempDir],
+      'test-1',
+    );
+    expect(evaluators).toHaveLength(1);
+    expect(evaluators?.[0]).toMatchObject({ type: 'contains', value: 'CANONICAL' });
+  });
 });
 
 describe('parseGraders - assertion templates', () => {
@@ -1660,9 +1799,9 @@ describe('parseGraders - type: rubrics with criteria', () => {
       'test-1',
     );
     expect(evaluators).toHaveLength(1);
-    expect(evaluators?.[0].type).toBe('llm-grader');
-    expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(2);
-    expect((evaluators?.[0] as LlmGraderConfig).weight).toBe(4.0);
+    expect(evaluators?.[0].type).toBe('g-eval');
+    expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(2);
+    expect((evaluators?.[0] as GEvalGraderConfig).weight).toBe(4.0);
   });
 
   it('preserves optional rubric criterion operators', async () => {
@@ -1831,9 +1970,9 @@ describe('parseGraders - type: rubrics with criteria', () => {
     );
 
     expect(evaluators).toHaveLength(1);
-    const config = evaluators?.[0] as LlmGraderConfig;
+    const config = evaluators?.[0] as GEvalGraderConfig;
     expect(config.name).toBe('rubrics');
-    expect(config.type).toBe('llm-grader');
+    expect(config.type).toBe('g-eval');
     expect(config.rubrics?.[0]?.min_score).toBe(0.8);
     expect(config.rubrics?.[0]?.score_ranges).toEqual([
       { score_range: [0, 4], outcome: 'Weak' },
@@ -1934,13 +2073,13 @@ describe('parseGraders - required field', () => {
     expect(config.required).toBe(true);
   });
 
-  it('parses required on code-grader evaluator', async () => {
+  it('parses required on script evaluator', async () => {
     const evaluators = await parseGraders(
       {
         evaluators: [
           {
             name: 'code-check',
-            type: 'code-grader',
+            type: 'script',
             command: ['bun', 'run', './test_script.ts'],
             required: true,
           },
@@ -2059,6 +2198,31 @@ describe('parseGraders - composite assertions field', () => {
     expect(evaluators?.[0].type).toBe('composite');
   });
 
+  it('parses composite with canonical assert field', async () => {
+    const evaluators = await parseGraders(
+      {
+        assert: [
+          {
+            name: 'combined',
+            type: 'composite',
+            assert: [
+              { name: 'safety', type: 'llm-grader', prompt: './safety.md' },
+              { name: 'quality', type: 'llm-grader', prompt: './quality.md' },
+            ],
+            aggregator: { type: 'weighted_average' },
+          },
+        ],
+      },
+      undefined,
+      [tempDir],
+      'test-1',
+    );
+    expect(evaluators).toHaveLength(1);
+    const composite = evaluators?.[0] as CompositeGraderConfig;
+    expect(composite.type).toBe('composite');
+    expect(composite.assertions).toHaveLength(2);
+  });
+
   it('composite still works with evaluators field (backward compat)', async () => {
     const evaluators = await parseGraders(
       {
@@ -2105,6 +2269,30 @@ describe('parseGraders - composite assertions field', () => {
     expect(composite.assertions).toHaveLength(1);
     expect(composite.assertions[0].name).toBe('safety');
   });
+
+  it('composite assert takes precedence over assertions and evaluators', async () => {
+    const evaluators = await parseGraders(
+      {
+        assert: [
+          {
+            name: 'combined',
+            type: 'composite',
+            assert: [{ name: 'safety', type: 'llm-grader', prompt: './safety.md' }],
+            assertions: [{ name: 'legacy', type: 'llm-grader', prompt: './quality.md' }],
+            evaluators: [{ name: 'quality', type: 'llm-grader', prompt: './quality.md' }],
+            aggregator: { type: 'weighted_average' },
+          },
+        ],
+      },
+      undefined,
+      [tempDir],
+      'test-1',
+    );
+    expect(evaluators).toHaveLength(1);
+    const composite = evaluators?.[0] as CompositeGraderConfig;
+    expect(composite.assertions).toHaveLength(1);
+    expect(composite.assertions[0].name).toBe('safety');
+  });
 });
 
 describe('parseGraders - string shorthand in assertions', () => {
@@ -2124,13 +2312,13 @@ describe('parseGraders - string shorthand in assertions', () => {
 
     expect(evaluators).toHaveLength(1);
     const rubrics = evaluators?.[0];
-    expect(rubrics?.type).toBe('llm-grader');
-    expect((rubrics as LlmGraderConfig).rubrics).toHaveLength(3);
-    expect((rubrics as LlmGraderConfig).rubrics?.[0].outcome).toBe(
+    expect(rubrics?.type).toBe('g-eval');
+    expect((rubrics as GEvalGraderConfig).rubrics).toHaveLength(3);
+    expect((rubrics as GEvalGraderConfig).rubrics?.[0].outcome).toBe(
       'Mentions divide-and-conquer approach',
     );
-    expect((rubrics as LlmGraderConfig).rubrics?.[1].outcome).toBe('Explains partition step');
-    expect((rubrics as LlmGraderConfig).rubrics?.[2].outcome).toBe('States time complexity');
+    expect((rubrics as GEvalGraderConfig).rubrics?.[1].outcome).toBe('Explains partition step');
+    expect((rubrics as GEvalGraderConfig).rubrics?.[2].outcome).toBe('States time complexity');
   });
 
   it('groups strings into rubrics and preserves object evaluators', async () => {
@@ -2149,9 +2337,9 @@ describe('parseGraders - string shorthand in assertions', () => {
 
     expect(evaluators).toHaveLength(2);
     // First: rubrics (at position of first string)
-    expect(evaluators?.[0].type).toBe('llm-grader');
-    expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(2);
-    expect((evaluators?.[0] as LlmGraderConfig).rubrics?.[0].outcome).toBe(
+    expect(evaluators?.[0].type).toBe('g-eval');
+    expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(2);
+    expect((evaluators?.[0] as GEvalGraderConfig).rubrics?.[0].outcome).toBe(
       'Mentions divide-and-conquer approach',
     );
     // Second: the contains evaluator
@@ -2170,9 +2358,9 @@ describe('parseGraders - string shorthand in assertions', () => {
     );
 
     expect(evaluators).toHaveLength(1);
-    expect(evaluators?.[0].type).toBe('llm-grader');
-    expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(1);
-    expect((evaluators?.[0] as LlmGraderConfig).rubrics?.[0].outcome).toBe(
+    expect(evaluators?.[0].type).toBe('g-eval');
+    expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(1);
+    expect((evaluators?.[0] as GEvalGraderConfig).rubrics?.[0].outcome).toBe(
       'Response must be polite',
     );
   });
@@ -2208,8 +2396,8 @@ describe('parseGraders - string shorthand in assertions', () => {
     );
 
     expect(evaluators).toHaveLength(2);
-    const rubrics = evaluators?.[0] as LlmGraderConfig;
-    expect(rubrics.type).toBe('llm-grader');
+    const rubrics = evaluators?.[0] as GEvalGraderConfig;
+    expect(rubrics.type).toBe('g-eval');
     expect(rubrics.rubrics).toHaveLength(3);
     expect(rubrics.weight).toBe(3);
     expect(evaluators?.[1].type).toBe('contains');
diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
index e3f377943..2f0f3b5ae 100644
--- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
+++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
@@ -69,6 +69,23 @@ describe('loadTestsFromJsonl', () => {
     expect(cases[0].input).toHaveLength(1);
     expect(cases[0].input[0].role).toBe('user');
     expect(cases[0].input[0].content).toBe('Query');
+    expect(cases[0].assertions?.[0]?.type).toBe('g-eval');
+    expect(cases[0].assertions?.[0]?.rubrics?.[0]?.outcome).toBe('Goal');
+  });
+
+  it('keeps expected_output-only JSONL cases passive without implicit assertions', async () => {
+    const jsonlPath = path.join(tempDir, 'expected-output-only.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "input": "Query", "expected_output": "Reference answer"}\n',
+    );
+
+    const cases = await loadTestsFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].criteria).toBe('');
+    expect(cases[0].expected_output[0].content).toBe('Reference answer');
+    expect(cases[0].assertions).toBeUndefined();
   });
 
   it('parses multi-line JSONL', async () => {
@@ -225,7 +242,7 @@ describe('loadTestsFromJsonl', () => {
 
     expect(cases).toHaveLength(1);
     expect(cases[0].assertions).toHaveLength(1);
-    expect(cases[0].assertions?.[0].type).toBe('llm-grader');
+    expect(cases[0].assertions?.[0].type).toBe('g-eval');
     const rubricEvaluator = cases[0].assertions?.[0] as { type: string; rubrics?: unknown[] };
     expect(rubricEvaluator.rubrics).toHaveLength(2);
   });
@@ -259,11 +276,13 @@ describe('loadTestsFromJsonl', () => {
     expect(cases[0].assertions).toHaveLength(1);
     expect(cases[0].assertions?.[0]).toMatchObject({
       name: 'rubrics',
-      type: 'llm-grader',
+      type: 'g-eval',
       rubrics: [
         {
           id: 'quality',
+          outcome: 'Answer quality',
           min_score: 0.8,
+          weight: 1,
           score_ranges: [
             { score_range: [0, 4], outcome: 'Weak' },
             { score_range: [5, 7], outcome: 'Adequate' },
@@ -406,6 +425,27 @@ describe('loadTests with format detection', () => {
 
     expect(cases).toHaveLength(1);
     expect(cases[0].id).toBe('yaml-test');
+    expect(cases[0].assertions?.[0]?.type).toBe('g-eval');
+    expect(cases[0].assertions?.[0]?.rubrics?.[0]?.outcome).toBe('Goal');
+  });
+
+  it('keeps expected_output-only YAML cases passive without implicit assertions', async () => {
+    const yamlPath = path.join(tempDir, 'expected-output-only.yaml');
+    await writeFile(
+      yamlPath,
+      `tests:
+  - id: expected-only
+    input: Query
+    expected_output: Reference answer
+`,
+    );
+
+    const cases = await loadTests(yamlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].criteria).toBe('');
+    expect(cases[0].expected_output[0].content).toBe('Reference answer');
+    expect(cases[0].assertions).toBeUndefined();
   });
 
   it('routes .yml to YAML parser', async () => {
@@ -843,8 +883,8 @@ eval_cases:
     });
   });
 
-  describe('expected_outcome → criteria alias (YAML)', () => {
-    it('supports expected_outcome as deprecated alias for criteria', async () => {
+  describe('expected_outcome → assert compatibility (YAML)', () => {
+    it('supports expected_outcome as deprecated assertion shorthand', async () => {
       const yamlPath = path.join(tempDir, 'expected-outcome-alias.yaml');
       await writeFile(
         yamlPath,
@@ -885,8 +925,8 @@ eval_cases:
     });
   });
 
-  describe('expected_outcome → criteria alias (JSONL)', () => {
-    it('supports expected_outcome as deprecated alias for criteria', async () => {
+  describe('expected_outcome → assert compatibility (JSONL)', () => {
+    it('supports expected_outcome as deprecated assertion shorthand', async () => {
       const jsonlPath = path.join(tempDir, 'expected-outcome-alias.jsonl');
       await writeFile(
         jsonlPath,
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index 4acf92dd1..385a965b4 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -2568,6 +2568,41 @@ describe('criteria with assertions runs only declared evaluators (#452)', () =>
     expect(result.score).toBe(1);
   });
 
+  it('does not run the default llm-grader for passive expected_output-only cases', async () => {
+    const provider = new SequenceProvider('mock', {
+      responses: [{ output: [{ role: 'assistant', content: 'hello world' }] }],
+    });
+    const llmEvaluate = mock(() => {
+      throw new Error('default llm-grader should not run');
+    });
+
+    const { evaluator: _evaluator, ...referenceOnlyCase } = criteriaTestCase;
+    const result = await runEvalCase({
+      evalCase: {
+        ...referenceOnlyCase,
+        criteria: '',
+        expected_output: [{ role: 'assistant', content: 'hello world' }],
+      },
+      provider,
+      target: {
+        ...baseTarget,
+        graderTarget: 'grader-target',
+      },
+      evaluators: {
+        'llm-grader': {
+          kind: 'llm-grader',
+          evaluate: llmEvaluate,
+        },
+      },
+    });
+
+    expect(llmEvaluate).not.toHaveBeenCalled();
+    expect(result.score).toBe(1);
+    expect(result.assertions).toEqual([
+      { text: 'No assertions declared; grading skipped', passed: true },
+    ]);
+  });
+
   it('criteria is available as evalCase data for evaluators that consume it', async () => {
     const provider = new SequenceProvider('mock', {
       responses: [{ output: [{ role: 'assistant', content: 'hello world' }] }],
diff --git a/packages/core/test/evaluation/rubric-operators-yaml.test.ts b/packages/core/test/evaluation/rubric-operators-yaml.test.ts
index 44f36bbdf..1e03b2132 100644
--- a/packages/core/test/evaluation/rubric-operators-yaml.test.ts
+++ b/packages/core/test/evaluation/rubric-operators-yaml.test.ts
@@ -39,9 +39,9 @@ describe('rubric criterion operators', () => {
     const tests = await loadTests(path.join(dir, 'suite.eval.yaml'), dir);
     const evaluator = tests[0]?.assertions?.[0];
 
-    expect(evaluator?.type).toBe('llm-grader');
-    if (!evaluator || evaluator.type !== 'llm-grader') {
-      throw new Error('expected rubrics to normalize to llm-grader');
+    expect(evaluator?.type).toBe('g-eval');
+    if (!evaluator || evaluator.type !== 'g-eval') {
+      throw new Error('expected rubrics to normalize to g-eval');
     }
 
     expect(evaluator.rubrics?.map((rubric) => rubric.operator)).toEqual([
diff --git a/packages/core/test/evaluation/source-traceability.test.ts b/packages/core/test/evaluation/source-traceability.test.ts
index d47a4bd26..3e3a96722 100644
--- a/packages/core/test/evaluation/source-traceability.test.ts
+++ b/packages/core/test/evaluation/source-traceability.test.ts
@@ -85,12 +85,12 @@ tests:
     const kinds = source?.references.map((reference) => reference.kind).sort();
     expect(kinds).toEqual([
       'assertion_template',
-      'code_grader_command',
-      'code_grader_cwd',
       'input_file',
       'llm_grader_prompt',
       'preprocessor_command',
       'prompt_script',
+      'script_grader_command',
+      'script_grader_cwd',
     ]);
 
     const promptFile = source?.references.find(
@@ -100,7 +100,7 @@ tests:
     expect(promptFile?.resolvedPath).toBe(path.join(tempDir, 'graders', 'prompt.md'));
 
     const codeCommand = source?.references.find(
-      (reference) => reference.kind === 'code_grader_command',
+      (reference) => reference.kind === 'script_grader_command',
     );
     expect(codeCommand?.command).toEqual(['bun', 'graders/code.ts']);
     expect(codeCommand?.resolvedPath).toBe(path.join(tempDir, 'graders', 'code.ts'));
diff --git a/packages/core/test/evaluation/validation/eval-file-schema.test.ts b/packages/core/test/evaluation/validation/eval-file-schema.test.ts
index 5f9a83b11..eee77a3d0 100644
--- a/packages/core/test/evaluation/validation/eval-file-schema.test.ts
+++ b/packages/core/test/evaluation/validation/eval-file-schema.test.ts
@@ -215,7 +215,15 @@ describe('EvalFileSchema input shorthand', () => {
               value: ['Identifies user impact', 'Avoids unsupported claims'],
               score_ranges: [{ score_range: [0, 10], outcome: 'overall quality' }],
             },
+            {
+              type: 'composite',
+              assert: [{ type: 'contains', value: 'safe' }],
+              aggregator: { type: 'weighted_average' },
+            },
           ],
+          execution: {
+            assert: [{ type: 'contains', value: 'Looks' }],
+          },
         },
       ],
       scenarios: [
diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts
index 63e54f97f..dab22e157 100644
--- a/packages/core/test/evaluation/validation/eval-validator.test.ts
+++ b/packages/core/test/evaluation/validation/eval-validator.test.ts
@@ -2120,7 +2120,7 @@ tests:
         warnings.some(
           (e) =>
             e.message.includes("'expected_outcome' is deprecated") &&
-            e.message.includes("'criteria'"),
+            e.message.includes("'assert'"),
         ),
       ).toBe(true);
     });
diff --git a/packages/sdk/src/assertion.ts b/packages/sdk/src/assertion.ts
index 3e326a103..8eda2977b 100644
--- a/packages/sdk/src/assertion.ts
+++ b/packages/sdk/src/assertion.ts
@@ -38,7 +38,10 @@ export type AssertionContext = CodeGraderInput;
 export type AssertionType =
   // kebab-case (canonical internal form)
   | 'llm-grader'
-  | 'code-grader'
+  | 'g-eval'
+  | 'llm-rubric'
+  | 'script'
+  | 'assert-set'
   | 'rubrics'
   | 'composite'
   | 'tool-trajectory'
@@ -59,6 +62,10 @@ export type AssertionType =
   | 'equals'
   | 'regex'
   | 'is-json'
+  | 'javascript'
+  | 'python'
+  | 'webhook'
+  | 'similar'
   // legacy snake_case aliases (still accepted)
   | 'llm_grader'
   | 'code_grader'
diff --git a/packages/sdk/src/graders.ts b/packages/sdk/src/graders.ts
index c5fc27083..9b0169e75 100644
--- a/packages/sdk/src/graders.ts
+++ b/packages/sdk/src/graders.ts
@@ -67,6 +67,18 @@ export interface RubricsGraderConfig extends EvalAssertionConfig, GraderCommonCo
   readonly criteria: readonly GraderRubricCriterion[];
 }
 
+export interface GEvalGraderConfig extends EvalAssertionConfig, GraderCommonConfig {
+  readonly type: 'g-eval';
+  readonly criteria: readonly GraderRubricCriterion[];
+  readonly target?: string;
+}
+
+export interface LlmRubricGraderConfig extends EvalAssertionConfig, GraderCommonConfig {
+  readonly type: 'llm-rubric';
+  readonly value: string;
+  readonly target?: string;
+}
+
 export interface GraderPromptScriptConfig {
   readonly command: readonly string[];
   readonly config?: Readonly<Record<string, unknown>>;
@@ -104,8 +116,8 @@ export interface CodeGraderOptions extends GraderHelperOptions {
   readonly preprocessors?: readonly EvalPreprocessor[];
 }
 
-export interface CodeGraderConfig extends EvalAssertionConfig, GraderCommonConfig {
-  readonly type: 'code-grader';
+export interface ScriptGraderConfig extends EvalAssertionConfig, GraderCommonConfig {
+  readonly type: 'script';
   readonly command: GraderCommand;
   readonly cwd?: string;
   readonly target?: true | CodeGraderTargetOptions;
@@ -113,14 +125,19 @@ export interface CodeGraderConfig extends EvalAssertionConfig, GraderCommonConfi
   readonly preprocessors?: readonly EvalPreprocessor[];
 }
 
+/** @deprecated Use ScriptGraderConfig with type: 'script'. */
+export type CodeGraderConfig = ScriptGraderConfig;
+
 export type GraderHelperConfig =
   | ContainsGraderConfig
   | EqualsGraderConfig
   | RegexGraderConfig
   | IsJsonGraderConfig
   | RubricsGraderConfig
+  | GEvalGraderConfig
+  | LlmRubricGraderConfig
   | LlmGraderConfig
-  | CodeGraderConfig;
+  | ScriptGraderConfig;
 
 function withCommon<T extends { readonly type: string }>(
   config: T,
@@ -180,6 +197,34 @@ export function rubricsGrader(
   return withCommon({ type: 'rubrics', criteria }, options);
 }
 
+export function gEvalGrader(
+  criteria: readonly GraderRubricCriterion[],
+  options: GraderHelperOptions & { readonly target?: string } = {},
+): GEvalGraderConfig {
+  return withCommon(
+    {
+      type: 'g-eval',
+      criteria,
+      ...(options.target !== undefined ? { target: options.target } : {}),
+    },
+    options,
+  );
+}
+
+export function llmRubricGrader(
+  value: string,
+  options: GraderHelperOptions & { readonly target?: string } = {},
+): LlmRubricGraderConfig {
+  return withCommon(
+    {
+      type: 'llm-rubric',
+      value,
+      ...(options.target !== undefined ? { target: options.target } : {}),
+    },
+    options,
+  );
+}
+
 export function llmGrader(options: LlmGraderOptions = {}): LlmGraderConfig {
   return withCommon(
     {
@@ -199,10 +244,17 @@ export function llmGrader(options: LlmGraderOptions = {}): LlmGraderConfig {
 export function codeGrader(
   command: GraderCommand,
   options: CodeGraderOptions = {},
-): CodeGraderConfig {
+): ScriptGraderConfig {
+  return scriptGrader(command, options);
+}
+
+export function scriptGrader(
+  command: GraderCommand,
+  options: CodeGraderOptions = {},
+): ScriptGraderConfig {
   return withCommon(
     {
-      type: 'code-grader',
+      type: 'script',
       command,
       ...(options.cwd !== undefined ? { cwd: options.cwd } : {}),
       ...(options.target !== undefined ? { target: options.target } : {}),
@@ -221,8 +273,12 @@ export const graders = Object.freeze({
   isJson: isJsonGrader,
   json: jsonGrader,
   rubrics: rubricsGrader,
+  gEval: gEvalGrader,
+  llmRubric: llmRubricGrader,
   llmGrader,
   codeGrader,
+  script: scriptGrader,
+  scriptGrader,
 });
 
 export type GraderCatalog = typeof graders;
diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts
index c752a5ed7..7d4d0d992 100644
--- a/packages/sdk/src/index.ts
+++ b/packages/sdk/src/index.ts
@@ -160,17 +160,21 @@ export {
   containsGrader,
   equalsGrader,
   exactGrader,
+  gEvalGrader,
   graders,
   isJsonGrader,
   jsonGrader,
   llmGrader,
+  llmRubricGrader,
   regexGrader,
   rubricsGrader,
+  scriptGrader,
   type CodeGraderConfig,
   type CodeGraderOptions,
   type CodeGraderTargetOptions,
   type ContainsGraderConfig,
   type EqualsGraderConfig,
+  type GEvalGraderConfig,
   type GraderCatalog,
   type GraderCommand,
   type GraderCommonConfig,
@@ -184,9 +188,11 @@ export {
   type IsJsonGraderConfig,
   type LlmGraderConfig,
   type LlmGraderOptions,
+  type LlmRubricGraderConfig,
   type RegexGraderConfig,
   type RegexGraderOptions,
   type RubricsGraderConfig,
+  type ScriptGraderConfig,
 } from './graders.js';
 
 // Re-export target client
diff --git a/packages/sdk/test/grader-helpers.test.ts b/packages/sdk/test/grader-helpers.test.ts
index 791a1351c..655255d2a 100644
--- a/packages/sdk/test/grader-helpers.test.ts
+++ b/packages/sdk/test/grader-helpers.test.ts
@@ -12,6 +12,7 @@ import {
   llmGrader,
   regexGrader,
   rubricsGrader,
+  scriptGrader,
   serializeEvalYaml,
   toEvalYamlObject,
 } from '../src/index.js';
@@ -64,12 +65,16 @@ describe('grader helper config builders', () => {
       }),
     ).toEqual({
       name: 'scripted-check',
-      type: 'code-grader',
+      type: 'script',
       command: ['bun', 'run', 'graders/check.ts'],
       cwd: 'graders',
       target: { maxCalls: 2 },
       config: { mode: 'strict' },
     });
+    expect(scriptGrader(['bun', 'run', 'graders/check.ts'])).toEqual({
+      type: 'script',
+      command: ['bun', 'run', 'graders/check.ts'],
+    });
   });
 
   it('composes inside defineEval and serializes to canonical AgentV YAML assertions', () => {
@@ -112,7 +117,7 @@ describe('grader helper config builders', () => {
                 },
               ],
             }),
-            graders.codeGrader(['bun', 'run', 'graders/check.ts'], {
+            graders.script(['bun', 'run', 'graders/check.ts'], {
               name: 'scripted-check',
               target: { maxCalls: 2 },
               minScore: 0.5,
@@ -163,7 +168,7 @@ describe('grader helper config builders', () => {
       },
       {
         name: 'scripted-check',
-        type: 'code-grader',
+        type: 'script',
         command: ['bun', 'run', 'graders/check.ts'],
         target: { max_calls: 2 },
         min_score: 0.5,
@@ -174,7 +179,7 @@ describe('grader helper config builders', () => {
 
     expect(yaml).toContain('assertions:');
     expect(yaml).toContain('type: llm-grader');
-    expect(yaml).toContain('type: code-grader');
+    expect(yaml).toContain('type: script');
     expect(yaml).toContain('max_steps: 2');
     expect(yaml).toContain('max_calls: 2');
     expect(yaml).toContain('min_score: 0.8');
diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md
index 7449431d2..f725beda0 100644
--- a/skills-data/agentv-eval-writer/SKILL.md
+++ b/skills-data/agentv-eval-writer/SKILL.md
@@ -3,7 +3,7 @@ name: agentv-eval-writer
 description: >-
   Write, edit, review, and validate AgentV EVAL.yaml / .eval.yaml evaluation files.
   Use when asked to create new eval files, update or fix existing ones, add or remove test cases,
-  configure graders (`llm-grader`, `code-grader`, `rubrics`), review whether an eval is correct or complete,
+  configure graders (`g-eval`, `llm-rubric`, `llm-grader`, `script`), review whether an eval is correct or complete,
   convert between EVAL.yaml and evals.json using `agentv convert`, or generate eval test cases
   from chat transcripts (markdown conversation or JSON messages).
   Do NOT use for creating SKILL.md files, writing skill definitions, or running evals —
@@ -37,8 +37,8 @@ Use `@agentv/sdk` for TypeScript helper imports. Do not use `@agentv/eval` for n
 
 ## Authoring Checklist
 
-- If `assertions` already state the grading contract, omit `criteria` instead of duplicating the same rubric twice.
-- Prefer plain assertion strings for semantic checks when the default LLM rubric grader can judge them. Use multiple named `type: llm-grader` blocks only for custom prompts, custom grader targets, or intentionally separate grader panels.
+- Put grading criteria in `assertions`/`assert`, not in test-level `criteria`. Plain assertion strings become a `g-eval` rubric grader.
+- Prefer plain assertion strings for semantic checks when the default rubric grader can judge them. Use `type: llm-rubric` for structured criteria, `type: llm-grader` for custom prompts/targets, and `type: script` when grading must execute code.
 - Write `expected_output` as a golden/reference answer the target could have produced. Do not write criteria, scoring instructions, or "the agent should..." rubric prose there.
 - For historical or repo-state evals, materialize the repo under `workspace.repos[]` pinned to the commit under test. Mentioning a SHA only in prompt prose is not enough because the agent needs an actual checkout to inspect.
 
@@ -60,7 +60,7 @@ agentv convert evals.json
 agentv eval evals.json
 ```
 
-The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`llm-grader`), and resolves `files[]` paths. The generated YAML includes TODO comments for AgentV features to add (workspace setup, code graders, rubrics, required gates).
+The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`g-eval` rubric checks), and resolves `files[]` paths. The generated YAML includes TODO comments for AgentV features to add (workspace setup, script graders, rubrics, required gates).
 
 After converting, enhance the YAML with AgentV-specific capabilities shown below.
 
@@ -133,10 +133,9 @@ tests:
 | Field | Required | Description |
 |-------|----------|-------------|
 | `id` | yes | Unique identifier |
-| `criteria` | conditional | What the response should accomplish; required only when no `expected_output` or `assertions` are present |
 | `input` | yes | Input to the agent (string/object shorthand or full message array) |
 | `expected_output` | no | Gold-standard reference answer (string shorthand or full message array) |
-| `assertions` | no | Graders: deterministic checks, rubrics, and LLM/code graders |
+| `assertions` / `assert` | yes | Graders: deterministic checks, rubrics, LLM graders, script graders, or plain-string `g-eval` checks |
 | `execution` | no | Per-case grader/default overrides such as `skip_defaults`; target selection belongs in top-level `target` or CLI `--target` |
 | `workspace` | no | Per-case workspace config (overrides suite-level) |
 | `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts |
@@ -186,7 +185,8 @@ tests: ./cases.yaml
 
 # cases.yaml — each test only needs its own query
 # - id: test-1
-#   criteria: ...
+#   assertions:
+#     - ...
 #   input: "User question here"
 ```
 
@@ -207,7 +207,7 @@ The external file can be YAML (array of test objects) or JSONL.
 
 ## Assertions Field
 
-`assertions` defines graders at the suite level or per-test level. It is the canonical field for all graders:
+`assertions` (or `assert`) defines graders at the suite level or per-test level. It is the canonical field for all graders:
 
 ```yaml
 # Mix exact checks with rubric shorthand when both matter.
@@ -230,9 +230,11 @@ tests:
 
 Plain strings in `assertions` are rubric criteria and are the preferred shape for
 qualitative agent behavior. Use deterministic assertions (`contains`, `regex`,
-`is-json`, `equals`) only for exact machine-verifiable outputs, and code graders
+`is-json`, `equals`) only for exact machine-verifiable outputs, and script graders
 when the check must inspect files, run commands, or validate structured state.
-Do not add a separate `criteria` field that just repeats these assertion strings.
+Do not add a separate test-level `criteria` field. For legacy evals that still
+use `criteria` without explicit assertions, the `criteria` text is loaded as a
+plain-string assertion for compatibility; new evals should author the assertion directly.
 
 For repo-state evals, combine a pinned checkout, a golden answer, and assertion
 shorthand:
@@ -262,33 +264,11 @@ tests:
       - The answer preserves the historical commit SHA as context.
 ```
 
-## How `criteria` and `assertions` Interact
+## Assertions and Reference Data
 
-`criteria` is a **data field** — it describes what the response should accomplish. It is **not** a grader. How it gets evaluated depends on whether `assertions` is present:
-
-| Scenario | What happens | Warning? |
-|----------|-------------|----------|
-| `criteria` + **no `assertions`** | Implicit `llm-grader` runs automatically against `criteria` | No |
-| `criteria` + **`assertions` with only deterministic graders** (contains, regex, etc.) | Only declared graders run. `criteria` is **not evaluated**. | Yes — warns that no grader will consume criteria |
-| `criteria` + **`assertions` with rubric shorthand or a grader** (plain strings, `llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No |
-
-### No assertions → implicit llm-grader
-
-The simplest path. `criteria` is automatically evaluated by the default `llm-grader`:
-
-```yaml
-tests:
-  - id: simple-eval
-    criteria: Assistant correctly explains the bug and proposes a fix
-    input: "Debug this function..."
-    # No assertions → default llm-grader evaluates against criteria
-```
-
-### assertions present → no implicit grader
-
-When `assertions` is defined, **only the declared graders run**. For semantic
-checks, add plain rubric strings. If you need a custom LLM prompt or grader
-target, declare `llm-grader` explicitly:
+When `assertions` or `assert` is defined, **only the declared graders run**. For
+semantic checks, add plain rubric strings. If you need a custom LLM prompt or
+grader target, declare `llm-grader` explicitly:
 
 ```yaml
 tests:
@@ -300,22 +280,30 @@ tests:
         value: "fix"
 ```
 
-**Common mistake:** defining `criteria` with only deterministic graders. The criteria will be ignored and a warning is emitted:
+`expected_output` is passive reference data. It is available to graders through
+`{{expected_output}}` and the script stdin payload, but it does not create an
+implicit LLM grading call by itself.
+
+**Common mistake:** putting rubric prose in `expected_output` instead of an
+assertion:
 
 ```yaml
 tests:
   - id: bad-example
-    criteria: Gives a thoughtful answer    # ⚠ NOT evaluated — no grader in assertions
     input: "What is 2+2?"
-    assertions:
-      - type: contains
-        value: "4"
-    # Warning: criteria is defined but no grader in assertions will evaluate it.
+    expected_output: The assistant should explain why the answer is 4. # reference answer field, not a grader
 ```
 
-If plain assertion strings fully express the semantic contract, leave `criteria`
-out. Keep `criteria` for the implicit-grader path or for non-duplicative context
-that a declared grader actually needs.
+Write this as:
+
+```yaml
+tests:
+  - id: good-example
+    input: "What is 2+2?"
+    expected_output: "4"
+    assertions:
+      - The answer is 4 and explains the arithmetic briefly
+```
 
 ## Required Gates
 
@@ -326,7 +314,7 @@ assertions:
   - type: contains
     value: "DENIED"
     required: true          # must score >= 0.8 (default)
-  - type: rubrics
+  - type: g-eval
     required: true
     min_score: 0.6          # must score >= 0.6 (custom threshold)
     criteria:
@@ -413,26 +401,26 @@ See https://agentv.dev/targets/configuration/#repository-lifecycle
 
 Configure via `assertions` array. Multiple graders produce a weighted average score.
 
-### code-grader
+### script
 ```yaml
 - name: format_check
-  type: code-grader
+  type: script
   command: [uv, run, validate.py]
   cwd: ./scripts          # optional working directory
   target: {}              # optional: enable LLM target proxy (max_calls: 50)
 ```
 Contract: stdin JSON -> stdout JSON `{score, assertions: [{text, passed, evidence?}], reasoning}`
-Raw stdin uses snake_case and includes: `criteria`, `input`, `expected_output`, `output` (final answer string), `messages`, `trace`, `trace_summary`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config`
+Raw stdin uses snake_case and includes: `input`, `expected_output`, `output` (final answer string), `messages`, `trace`, `trace_summary`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config`
 SDK handlers receive the same payload in camelCase: `expectedOutput`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`, `fileChanges`, `workspacePath`.
 When a workspace is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace).
 For deterministic workspace checks that fit normal Vitest `expect(...)` tests, prefer a plain verifier file and the built-in adapter:
 ```yaml
 - name: welcome_banner
-  type: code-grader
+  type: script
   command: [agentv, eval, graders/welcome-banner.test.ts]
 ```
 AgentV infers the Vitest adapter for `*.test.ts`, `*.spec.ts`, and Vercel-style `EVAL.ts` files. Use the explicit `agentv eval vitest` subcommand only when you need adapter flags such as `--cwd`, `--in-workspace`, or `--vitest-command`.
-See docs at https://agentv.dev/graders/code-graders/
+See the Script Graders docs for the full stdin/stdout contract.
 
 ### llm-grader
 ```yaml
@@ -557,15 +545,16 @@ Binary check: does output exactly equal the value (both trimmed)?
 ```
 Binary check: is the output valid JSON?
 
-### rubrics
+### g-eval / llm-rubric
 ```yaml
 - Correctly identifies the denied party
 - Provides clear reasoning
 ```
 LLM-judged structured evaluation. Plain strings are the preferred shorthand.
-Use `type: rubrics` only when you need weighted criteria, `required: false`,
-`min_score`, or score ranges. Criteria items support `id`, `outcome`, `weight`,
-and `required` fields.
+Use `type: g-eval` when you need weighted criteria, `required: false`,
+`min_score`, or score ranges. Use `type: llm-rubric` for a single structured
+rubric item with the same LLM rubric semantics. Criteria items support `id`,
+`outcome`, `weight`, and `required` fields.
 Use optional `operator: correctness` for positive support checks or `operator: contradiction` for guard criteria where omission is acceptable but incompatible claims fail.
 
 See `references/rubric-grader.md` for score-range mode and scoring formula.
@@ -663,7 +652,7 @@ export default defineEval({
 });
 ```
 
-The `graders` catalog returns ordinary `assertions` entries such as `type: is-json`, `type: regex`, `type: llm-grader`, and `type: code-grader`. `defineEval()` lowers camelCase TypeScript fields such as `expectedOutput`, `inputFiles`, and `maxSteps` to canonical snake_case YAML/runtime keys.
+The `graders` catalog returns ordinary `assertions` entries such as `type: is-json`, `type: regex`, `type: llm-grader`, and `type: script`. `defineEval()` lowers camelCase TypeScript fields such as `expectedOutput`, `inputFiles`, and `maxSteps` to canonical snake_case YAML/runtime keys.
 
 If adapting Braintrust `scores` or DeepEval metrics, write small AgentV helper factories that return `graders.*` configs:
 
@@ -714,7 +703,7 @@ export default defineCodeGrader(({ output, trace }) => {
 });
 ```
 
-`defineAssertion()` files go in `.agentv/assertions/` and are referenced by filename as `type: <name>`. `defineCodeGrader()` scripts are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. Plain Vitest workspace verifier files can use `command: [agentv, eval, graders/check.test.ts]`.
+`defineAssertion()` files go in `.agentv/assertions/` and are referenced by filename as `type: <name>`. `defineCodeGrader()` scripts are referenced in YAML with `type: script` and `command: [bun, run, grader.ts]`. Plain Vitest workspace verifier files can use `command: [agentv, eval, graders/check.test.ts]`.
 
 ### Convention-Based Discovery
 
@@ -798,14 +787,14 @@ After running evals, perform a human review before iterating. Create `feedback.j
       "test_id": "test-id",
       "verdict": "acceptable | needs_improvement | incorrect | flaky",
       "notes": "Why this verdict",
-      "evaluator_overrides": { "code-grader:name": "Override note" },
+      "evaluator_overrides": { "script:name": "Override note" },
       "workspace_notes": "Workspace state observations"
     }
   ]
 }
 ```
 
-Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "code-grader was too strict"). Use `workspace_notes` for observations about workspace state.
+Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "script grader was too strict"). Use `workspace_notes` for observations about workspace state.
 
 Review workflow: run evals → inspect results (`agentv inspect show`) → write feedback → tune prompts/graders → re-run.
 
diff --git a/skills-data/agentv-eval-writer/references/custom-evaluators.md b/skills-data/agentv-eval-writer/references/custom-evaluators.md
index 1ed851720..d058bb733 100644
--- a/skills-data/agentv-eval-writer/references/custom-evaluators.md
+++ b/skills-data/agentv-eval-writer/references/custom-evaluators.md
@@ -6,7 +6,6 @@
 
 ```json
 {
-  "criteria": "string",
   "input_files": ["path"],
   "input": [{"role": "user", "content": "..."}],
   "expected_output": [{"role": "assistant", "content": "..."}],
@@ -69,7 +68,7 @@ import {
   - `.invokeBatch(requests)` - Batch LLM calls
 - `definePromptTemplate(fn)` - Wraps prompt generation function
   - Raw stdin uses `snake_case`; SDK handlers receive `camelCase`
-  - Context fields: `input`, `expectedOutput`, `output`, `messages`, `criteria`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
+  - Context fields: `input`, `expectedOutput`, `output`, `messages`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
 
 For Python, the repo-local helper example in `examples/features/sdk-python/` keeps canonical `snake_case` fields and rejects deprecated wire aliases like `output_text`, `input_text`, and `reference_answer`. It is not a separate Python runner or a promised published package; generated evals still run through the AgentV CLI.
 
@@ -145,10 +144,13 @@ if __name__ == "__main__":
 #!/usr/bin/env bun
 import { defineCodeGrader } from '@agentv/sdk';
 
-export default defineCodeGrader(({ output, criteria }) => {
+export default defineCodeGrader(({ output, expectedOutput }) => {
   const candidate = output ?? '';
+  const expected = expectedOutput
+    ?.map((message) => (typeof message.content === 'string' ? message.content : ''))
+    .join('\n') ?? '';
   const assertions: Array<{ text: string; passed: boolean }> = [];
-  if (candidate.includes(criteria)) {
+  if (expected.length > 0 && candidate.includes(expected)) {
     assertions.push({ text: 'Matches expected outcome', passed: true });
   } else {
     assertions.push({ text: 'Does not match expected outcome', passed: false });
@@ -167,7 +169,6 @@ Derived from test fields (users never author these directly):
 
 | Variable | Source |
 |----------|--------|
-| `criteria` | Test `criteria` field |
 | `input` | Full resolved input array (JSON) |
 | `expected_output` | Full resolved expected array (JSON) |
 | `output` | Final answer / scored result string |
diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json
index 7f0511f1c..9b0400054 100644
--- a/skills-data/agentv-eval-writer/references/eval.schema.json
+++ b/skills-data/agentv-eval-writer/references/eval.schema.json
@@ -890,9 +890,6 @@
                         "properties": {},
                         "additionalProperties": {}
                       },
-                      "criteria": {
-                        "type": "string"
-                      },
                       "provider": {
                         "anyOf": [
                           {
@@ -2298,7 +2295,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["code-grader", "code_grader"]
+                                  "const": "script"
                                 },
                                 "command": {
                                   "anyOf": [
@@ -2603,8 +2600,6 @@
                                     "python",
                                     "webhook",
                                     "similar",
-                                    "select-best",
-                                    "human",
                                     "contains",
                                     "contains-any",
                                     "contains-all",
@@ -2896,6 +2891,10 @@
                                   "type": "string",
                                   "const": "composite"
                                 },
+                                "assert": {
+                                  "type": "array",
+                                  "items": {}
+                                },
                                 "assertions": {
                                   "type": "array",
                                   "items": {}
@@ -2944,7 +2943,7 @@
                                       "properties": {
                                         "type": {
                                           "type": "string",
-                                          "const": "code-grader"
+                                          "const": "script"
                                         },
                                         "path": {
                                           "type": "string"
@@ -3621,6 +3620,21 @@
                           "workers": {
                             "not": {}
                           },
+                          "assert": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {},
+                                  "additionalProperties": {}
+                                }
+                              ]
+                            }
+                          },
                           "assertions": {
                             "type": "array",
                             "items": {
@@ -3667,7 +3681,7 @@
                                     },
                                     "type": {
                                       "type": "string",
-                                      "enum": ["code-grader", "code_grader"]
+                                      "const": "script"
                                     },
                                     "command": {
                                       "anyOf": [
@@ -3972,8 +3986,6 @@
                                         "python",
                                         "webhook",
                                         "similar",
-                                        "select-best",
-                                        "human",
                                         "contains",
                                         "contains-any",
                                         "contains-all",
@@ -4265,6 +4277,10 @@
                                       "type": "string",
                                       "const": "composite"
                                     },
+                                    "assert": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
                                     "assertions": {
                                       "type": "array",
                                       "items": {}
@@ -4313,7 +4329,7 @@
                                           "properties": {
                                             "type": {
                                               "type": "string",
-                                              "const": "code-grader"
+                                              "const": "script"
                                             },
                                             "path": {
                                               "type": "string"
@@ -5617,9 +5633,6 @@
                         "properties": {},
                         "additionalProperties": {}
                       },
-                      "criteria": {
-                        "type": "string"
-                      },
                       "provider": {
                         "anyOf": [
                           {
@@ -7025,7 +7038,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": ["code-grader", "code_grader"]
+                                  "const": "script"
                                 },
                                 "command": {
                                   "anyOf": [
@@ -7330,8 +7343,6 @@
                                     "python",
                                     "webhook",
                                     "similar",
-                                    "select-best",
-                                    "human",
                                     "contains",
                                     "contains-any",
                                     "contains-all",
@@ -7623,6 +7634,10 @@
                                   "type": "string",
                                   "const": "composite"
                                 },
+                                "assert": {
+                                  "type": "array",
+                                  "items": {}
+                                },
                                 "assertions": {
                                   "type": "array",
                                   "items": {}
@@ -7671,7 +7686,7 @@
                                       "properties": {
                                         "type": {
                                           "type": "string",
-                                          "const": "code-grader"
+                                          "const": "script"
                                         },
                                         "path": {
                                           "type": "string"
@@ -8348,6 +8363,21 @@
                           "workers": {
                             "not": {}
                           },
+                          "assert": {
+                            "type": "array",
+                            "items": {
+                              "anyOf": [
+                                {
+                                  "type": "string"
+                                },
+                                {
+                                  "type": "object",
+                                  "properties": {},
+                                  "additionalProperties": {}
+                                }
+                              ]
+                            }
+                          },
                           "assertions": {
                             "type": "array",
                             "items": {
@@ -8394,7 +8424,7 @@
                                     },
                                     "type": {
                                       "type": "string",
-                                      "enum": ["code-grader", "code_grader"]
+                                      "const": "script"
                                     },
                                     "command": {
                                       "anyOf": [
@@ -8699,8 +8729,6 @@
                                         "python",
                                         "webhook",
                                         "similar",
-                                        "select-best",
-                                        "human",
                                         "contains",
                                         "contains-any",
                                         "contains-all",
@@ -8992,6 +9020,10 @@
                                       "type": "string",
                                       "const": "composite"
                                     },
+                                    "assert": {
+                                      "type": "array",
+                                      "items": {}
+                                    },
                                     "assertions": {
                                       "type": "array",
                                       "items": {}
@@ -9040,7 +9072,7 @@
                                           "properties": {
                                             "type": {
                                               "type": "string",
-                                              "const": "code-grader"
+                                              "const": "script"
                                             },
                                             "path": {
                                               "type": "string"
@@ -13979,9 +14011,6 @@
                       "properties": {},
                       "additionalProperties": {}
                     },
-                    "criteria": {
-                      "type": "string"
-                    },
                     "provider": {
                       "anyOf": [
                         {
@@ -15387,7 +15416,7 @@
                               },
                               "type": {
                                 "type": "string",
-                                "enum": ["code-grader", "code_grader"]
+                                "const": "script"
                               },
                               "command": {
                                 "anyOf": [
@@ -15692,8 +15721,6 @@
                                   "python",
                                   "webhook",
                                   "similar",
-                                  "select-best",
-                                  "human",
                                   "contains",
                                   "contains-any",
                                   "contains-all",
@@ -15985,6 +16012,10 @@
                                 "type": "string",
                                 "const": "composite"
                               },
+                              "assert": {
+                                "type": "array",
+                                "items": {}
+                              },
                               "assertions": {
                                 "type": "array",
                                 "items": {}
@@ -16033,7 +16064,7 @@
                                     "properties": {
                                       "type": {
                                         "type": "string",
-                                        "const": "code-grader"
+                                        "const": "script"
                                       },
                                       "path": {
                                         "type": "string"
@@ -16710,6 +16741,21 @@
                         "workers": {
                           "not": {}
                         },
+                        "assert": {
+                          "type": "array",
+                          "items": {
+                            "anyOf": [
+                              {
+                                "type": "string"
+                              },
+                              {
+                                "type": "object",
+                                "properties": {},
+                                "additionalProperties": {}
+                              }
+                            ]
+                          }
+                        },
                         "assertions": {
                           "type": "array",
                           "items": {
@@ -16756,7 +16802,7 @@
                                   },
                                   "type": {
                                     "type": "string",
-                                    "enum": ["code-grader", "code_grader"]
+                                    "const": "script"
                                   },
                                   "command": {
                                     "anyOf": [
@@ -17061,8 +17107,6 @@
                                       "python",
                                       "webhook",
                                       "similar",
-                                      "select-best",
-                                      "human",
                                       "contains",
                                       "contains-any",
                                       "contains-all",
@@ -17354,6 +17398,10 @@
                                     "type": "string",
                                     "const": "composite"
                                   },
+                                  "assert": {
+                                    "type": "array",
+                                    "items": {}
+                                  },
                                   "assertions": {
                                     "type": "array",
                                     "items": {}
@@ -17402,7 +17450,7 @@
                                         "properties": {
                                           "type": {
                                             "type": "string",
-                                            "const": "code-grader"
+                                            "const": "script"
                                           },
                                           "path": {
                                             "type": "string"
diff --git a/skills-data/agentv-eval-writer/references/rubric-evaluator.md b/skills-data/agentv-eval-writer/references/rubric-evaluator.md
index d0afd6225..821d6ae59 100644
--- a/skills-data/agentv-eval-writer/references/rubric-evaluator.md
+++ b/skills-data/agentv-eval-writer/references/rubric-evaluator.md
@@ -1,12 +1,12 @@
-# Rubric Grader
+# Rubric Graders
 
-Rubrics are defined as `assertions` entries with `type: rubrics`. They support binary checklist grading and score-range analytic grading.
+Rubrics are defined as `assertions` entries: either plain strings, or objects with `type: g-eval` or `type: llm-rubric`. They support binary checklist grading and score-range analytic grading.
 
 ## Field Reference
 
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
-| `type` | string | required | Must be `rubrics` |
+| `type` | string | required | Use `g-eval` for grouped criteria or `llm-rubric` for a single structured rubric |
 | `criteria` | array | required | List of criterion strings or objects |
 | `required` | boolean or number | - | Gate: `true` requires score >= 0.8; a number (0–1) sets a custom threshold |
 
@@ -33,14 +33,14 @@ assertions:
   - States time complexity
 ```
 
-Equivalent to the full form with `type: rubrics`. Use the full form only when you need weights, `required: false`, or `score_ranges`.
+Equivalent to the full form with `type: g-eval`. Use the full form only when you need weights, `required: false`, or `score_ranges`.
 
 Mixed strings and objects are supported in `assertions` — strings are grouped into a single rubrics grader at the position of the first string:
 
 ```yaml
 assertions:
   - Mentions divide-and-conquer approach  # grouped into rubrics
-  - type: code-grader
+  - type: script
     command: [check_syntax.py]
   - States time complexity                # grouped into rubrics
 ```
@@ -49,7 +49,7 @@ assertions:
 
 ```yaml
 assertions:
-  - type: rubrics
+  - type: g-eval
     criteria:
       - Mentions divide-and-conquer approach
       - id: complexity
@@ -68,7 +68,7 @@ Use `operator` when outcome text should carry grading intent without embedding w
 
 ```yaml
 assertions:
-  - type: rubrics
+  - type: g-eval
     criteria:
       - id: supported-fact
         operator: correctness
@@ -87,7 +87,7 @@ Shorthand map format (recommended):
 
 ```yaml
 assertions:
-  - type: rubrics
+  - type: g-eval
     criteria:
       - id: correctness
         weight: 2.0