From 61bcec47aa4335aab577e9e21b2969b8c1b3ce64 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 11:51:22 +0200 Subject: [PATCH 1/3] feat(core): adopt nunjucks eval templating --- .../docs/docs/evaluation/eval-files.mdx | 22 +- bun.lock | 20 +- examples/features/README.md | 4 +- examples/features/env-interpolation/README.md | 6 +- .../env-interpolation/evals/dataset.eval.yaml | 8 +- .../features/test-vars-templating/README.md | 6 +- .../evals/dataset.eval.yaml | 18 +- packages/core/package.json | 2 + packages/core/src/evaluation/interpolation.ts | 110 ++- .../src/evaluation/loaders/grader-parser.ts | 2 + .../evaluation/loaders/shorthand-expansion.ts | 22 + packages/core/src/evaluation/types.ts | 8 +- packages/core/src/evaluation/yaml-parser.ts | 661 ++++++++++-------- .../interpolation-integration.test.ts | 62 +- .../test/evaluation/interpolation.test.ts | 100 ++- .../test/evaluation/suite-level-input.test.ts | 157 ++++- .../evaluation/workspace/deps-scanner.test.ts | 2 +- packages/core/test/projects.test.ts | 4 +- 18 files changed, 787 insertions(+), 427 deletions(-) diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 9dc940b97..63d6655ab 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -438,8 +438,8 @@ tests: # workspace.yaml — works on any machine repos: - path: ./my-repo - repo: "${{ MY_REPO_URL }}" - commit: "${{ MY_REPO_COMMIT }}" + repo: "{{ env.MY_REPO_URL }}" + commit: "{{ env.MY_REPO_COMMIT }}" ``` ```bash @@ -450,31 +450,31 @@ MY_REPO_COMMIT=main ## Per-Test Template Variables -Eval YAML also supports per-test `vars` for data-driven prompt templates. Use `{{name}}` placeholders in test-facing text fields, and AgentV resolves them when the suite loads. +Eval YAML also supports per-test `vars` for data-driven prompt templates. 
Use `{{ vars.name }}` placeholders in test-facing text fields, and AgentV resolves them when the suite loads. ```yaml -input: "Answer clearly: {{question}}" +input: "Answer clearly: {{ vars.question }}" tests: - id: capital vars: question: What is the capital of France? expected_answer: Paris - criteria: "Answers {{question}} correctly" + criteria: "Answers {{ vars.question }} correctly" input: - role: user - content: "Question: {{question}}" - expected_output: "{{expected_answer}}" + content: "Question: {{ vars.question }}" + expected_output: "{{ vars.expected_answer }}" ``` ### Behavior - `vars` is defined per test as an object -- `{{name}}` and dotted paths like `{{ user.name }}` are supported -- Substitution applies to suite-level `input`, test `input`, `input_files`, `criteria`, `expected_output`, and conversation turn `input` / `expected_output` +- `{{ vars.name }}` and dotted paths like `{{ vars.user.name }}` are supported +- Substitution applies to suite-level `input`, test `input`, `input_files`, `criteria`, `expected_output`, assertion values/metrics, and conversation turn `input` / `expected_output` / assertions - When the whole string is a single placeholder, the original JSON value is preserved -- Missing variables are left unchanged, so unrelated template syntax is not silently blanked out -- `vars` interpolation is separate from environment interpolation: `{{question}}` uses test data, `${{ PROJECT_NAME }}` uses environment variables +- Missing variables render as empty strings following Nunjucks semantics +- `vars` interpolation is separate from environment interpolation: `{{ vars.question }}` uses test data, `{{ env.PROJECT_NAME }}` uses environment variables ## JSONL Format diff --git a/bun.lock b/bun.lock index bd6bd8963..d6fbdd77e 100644 --- a/bun.lock +++ b/bun.lock @@ -19,7 +19,7 @@ }, "apps/cli": { "name": "agentv", - "version": "4.42.4", + "version": "5.0.0-next.1", "bin": { "agentv": "./dist/cli.js", }, @@ -85,7 +85,7 @@ }, "packages/core": 
{ "name": "@agentv/core", - "version": "4.42.4", + "version": "5.0.0-next.1", "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@earendil-works/pi-ai": "^0.74.0", @@ -94,11 +94,13 @@ "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", + "nunjucks": "^3.2.4", "yaml": "^2.8.3", "zod": "^3.23.8", }, "devDependencies": { "@types/micromatch": "^4.0.10", + "@types/nunjucks": "^3.2.6", "zod-to-json-schema": "^3.25.1", }, "optionalDependencies": { @@ -120,7 +122,7 @@ }, "packages/sdk": { "name": "@agentv/sdk", - "version": "4.42.4", + "version": "5.0.0-next.1", "dependencies": { "@agentv/core": "workspace:*", "yaml": "^2.8.3", @@ -837,6 +839,8 @@ "@types/node": ["@types/node@24.1.0", "", { "dependencies": { "undici-types": "~7.8.0" } }, "sha512-ut5FthK5moxFKH2T1CUOC6ctR67rQRvvHdFLCD2Ql6KXmMuCrjsSsRI9UsLCm9M18BMwClv4pn327UvB7eeO1w=="], + "@types/nunjucks": ["@types/nunjucks@3.2.6", "", {}, "sha512-pHiGtf83na1nCzliuAdq8GowYiXvH5l931xZ0YEHaLMNFgynpEqx+IPStlu7UaDkehfvl01e4x/9Tpwhy7Ue3w=="], + "@types/react": ["@types/react@19.2.14", "", { "dependencies": { "csstype": "^3.2.2" } }, "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w=="], "@types/react-dom": ["@types/react-dom@19.2.3", "", { "peerDependencies": { "@types/react": "^19.2.0" } }, "sha512-jp2L/eY6fn+KgVVQAOqYItbF0VY/YApe5Mz2F0aykSO8gx31bYCZyvSeYxCHKvzHG5eZjc+zyaS5BrBWya2+kQ=="], @@ -857,6 +861,8 @@ "@vitejs/plugin-react": ["@vitejs/plugin-react@4.7.0", "", { "dependencies": { "@babel/core": "^7.28.0", "@babel/plugin-transform-react-jsx-self": "^7.27.1", "@babel/plugin-transform-react-jsx-source": "^7.27.1", "@rolldown/pluginutils": "1.0.0-beta.27", "@types/babel__core": "^7.20.5", "react-refresh": "^0.17.0" }, "peerDependencies": { "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" } }, "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA=="], + "a-sync-waterfall": ["a-sync-waterfall@1.0.1", "", {}, 
"sha512-RYTOHHdWipFUliRFMCS4X2Yn2X8M87V/OpSqWzKKOGhzqyUxzyVmhHDH9sAvG+ZuQf/TAOFsLCpMw09I1ufUnA=="], + "acorn": ["acorn@8.15.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg=="], "acorn-jsx": ["acorn-jsx@5.3.2", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="], @@ -885,6 +891,8 @@ "array-iterate": ["array-iterate@2.0.1", "", {}, "sha512-I1jXZMjAgCMmxT4qxXfPXa6SthSoE8h6gkSI9BGGNv8mP8G/v0blc+qFnZu6K42vTOiuME596QaLO0TP3Lk0xg=="], + "asap": ["asap@2.0.6", "", {}, "sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA=="], + "ast-types": ["ast-types@0.16.1", "", { "dependencies": { "tslib": "^2.0.1" } }, "sha512-6t10qk83GOG8p0vKmaCr8eiilZwO171AvbROMtvvNiwrTly62t+7XkA8RdIIVbpMhCASAsxgAzdRSwh6nw/5Dg=="], "astring": ["astring@1.9.0", "", { "bin": { "astring": "bin/astring" } }, "sha512-LElXdjswlqjWrPpJFg1Fx4wpkOCxj1TDHlSV4PlaRxHGWko024xICaa97ZkMfs6DRKlCguiAI+rbXv5GWwXIkg=="], @@ -969,7 +977,7 @@ "comma-separated-tokens": ["comma-separated-tokens@2.0.3", "", {}, "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg=="], - "commander": ["commander@4.1.1", "", {}, "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA=="], + "commander": ["commander@5.1.0", "", {}, "sha512-P0CysNDQ7rtVw4QIQtm+MRxV66vKFSvlsQvGYXZWR3qFU0jlMKHZZZgw8e+8DSah4UDKMqnknRDQz+xuQXQ/Zg=="], "common-ancestor-path": ["common-ancestor-path@1.0.1", "", {}, "sha512-L3sHRo1pXXEqX8VU28kfgUY+YGsk09hPqZiZmLacNib6XNTCM8ubYeT7ryXQw8asB1sKgcU5lkB7ONug08aB8w=="], @@ -1505,6 +1513,8 @@ "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + "nunjucks": 
["nunjucks@3.2.4", "", { "dependencies": { "a-sync-waterfall": "^1.0.0", "asap": "^2.0.3", "commander": "^5.1.0" }, "peerDependencies": { "chokidar": "^3.3.0" }, "optionalPeers": ["chokidar"], "bin": { "nunjucks-precompile": "bin/precompile" } }, "sha512-26XRV6BhkgK0VOxfbU5cQI+ICFUtMLixv1noZn1tGU38kQH5A5nmmbk/O45xdyBhD1esk47nKrY0mvQpZIhRjQ=="], + "object-assign": ["object-assign@4.1.1", "", {}, "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg=="], "ofetch": ["ofetch@1.5.1", "", { "dependencies": { "destr": "^2.0.5", "node-fetch-native": "^1.6.7", "ufo": "^1.6.1" } }, "sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA=="], @@ -1987,6 +1997,8 @@ "sitemap/@types/node": ["@types/node@17.0.45", "", {}, "sha512-w+tIMs3rq2afQdsPJlODhoUEKzFP1ayaoyl1CcnwtIlsVe7K7bA1NGm4s3PraqTLlXnbIN84zuBlxBWo1u9BLw=="], + "sucrase/commander": ["commander@4.1.1", "", {}, "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA=="], + "svgo/commander": ["commander@11.1.0", "", {}, "sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ=="], "tinyglobby/picomatch": ["picomatch@4.0.3", "", {}, "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q=="], diff --git a/examples/features/README.md b/examples/features/README.md index d8ad0867b..40153f696 100644 --- a/examples/features/README.md +++ b/examples/features/README.md @@ -72,8 +72,8 @@ Focused examples for specific AgentV capabilities. 
Find your use case below, the | [input-files-shorthand](input-files-shorthand/) | Attach files to every test using a compact shorthand | | [suite-level-input](suite-level-input/) | Prepend a shared system prompt to every test in the suite | | [suite-level-input-files](suite-level-input-files/) | Share file attachments across every test in the suite | -| [env-interpolation](env-interpolation/) | Inject environment variables into eval config with `${{ VAR }}` | -| [test-vars-templating](test-vars-templating/) | Inject per-test `vars` into `{{name}}` templates in eval fields | +| [env-interpolation](env-interpolation/) | Inject environment variables into eval config with `{{ env.VAR }}` | +| [test-vars-templating](test-vars-templating/) | Inject per-test `vars` into `{{ vars.name }}` templates in eval fields | --- diff --git a/examples/features/env-interpolation/README.md b/examples/features/env-interpolation/README.md index 920af7c8f..eb4c7725f 100644 --- a/examples/features/env-interpolation/README.md +++ b/examples/features/env-interpolation/README.md @@ -1,6 +1,6 @@ # Environment Variable Interpolation -Demonstrates `${{ VAR }}` syntax for portable eval configs. +Demonstrates `{{ env.VAR }}` syntax for portable eval configs. ## Usage @@ -14,7 +14,7 @@ Or create a `.env` file — AgentV loads `.env` files automatically from the dir ## Features -- **Full-value**: `criteria: "${{ EVAL_CRITERIA }}"` — entire field from env var -- **Partial/inline**: `"must be ${{ EXPECTED }} and clear"` — env var within a string +- **Full-value**: `criteria: "{{ env.EVAL_CRITERIA }}"` — entire field from env var +- **Partial/inline**: `"must be {{ env.EXPECTED }} and clear"` — env var within a string - **Missing vars**: resolve to empty string (downstream validation catches required blanks) - **All fields**: works in any string field — criteria, input, workspace paths, etc. 
diff --git a/examples/features/env-interpolation/evals/dataset.eval.yaml b/examples/features/env-interpolation/evals/dataset.eval.yaml index e40ff22fe..613f7bfd9 100644 --- a/examples/features/env-interpolation/evals/dataset.eval.yaml +++ b/examples/features/env-interpolation/evals/dataset.eval.yaml @@ -1,6 +1,6 @@ # Environment Variable Interpolation Example # -# All string fields support ${{ VAR }} syntax for env variable interpolation. +# Config-load fields support {{ env.VAR }} syntax for env variable interpolation. # Missing variables resolve to empty string. # # Usage: @@ -10,7 +10,7 @@ # Or use a .env file in the project root: # CUSTOM_SYSTEM_PROMPT=You are a helpful assistant who always greets warmly. -description: Demonstrates ${{ VAR }} interpolation in eval fields +description: Demonstrates {{ env.VAR }} interpolation in eval fields target: llm @@ -19,13 +19,13 @@ tests: - id: full-value criteria: Responds with a friendly greeting input: "Hello!" - expected_output: "${{ EXPECTED_GREETING }}" + expected_output: "{{ env.EXPECTED_GREETING }}" # Partial/inline interpolation: env var embedded in a larger string - id: partial-value criteria: Response uses the system prompt persona input: - role: system - content: "${{ CUSTOM_SYSTEM_PROMPT }}" + content: "{{ env.CUSTOM_SYSTEM_PROMPT }}" - role: user content: "Hi there!" diff --git a/examples/features/test-vars-templating/README.md b/examples/features/test-vars-templating/README.md index 44a29636f..c41ddb618 100644 --- a/examples/features/test-vars-templating/README.md +++ b/examples/features/test-vars-templating/README.md @@ -1,6 +1,6 @@ # Per-Test Vars Templating -Demonstrates `tests[].vars` with `{{name}}` placeholders in eval files. +Demonstrates `tests[].vars` with `{{ vars.name }}` placeholders in eval files. 
## Usage @@ -11,6 +11,6 @@ agentv eval examples/features/test-vars-templating/evals/dataset.eval.yaml ## Features - **Per-test data**: each test defines its own `vars` object -- **Template substitution**: `{{question}}` and dotted paths like `{{expected.answer}}` +- **Template substitution**: `{{ vars.question }}` and dotted paths like `{{ vars.expected.answer }}` - **Suite-level templates**: shared `input` can reference per-test vars too -- **Separate from env interpolation**: `{{question}}` uses test data, `${{ VAR }}` uses environment variables +- **Separate from env interpolation**: `{{ vars.question }}` uses test data, `{{ env.VAR }}` uses environment variables diff --git a/examples/features/test-vars-templating/evals/dataset.eval.yaml b/examples/features/test-vars-templating/evals/dataset.eval.yaml index da567890f..ae21ad1f5 100644 --- a/examples/features/test-vars-templating/evals/dataset.eval.yaml +++ b/examples/features/test-vars-templating/evals/dataset.eval.yaml @@ -1,7 +1,7 @@ # Per-test vars templating example # -# tests[].vars provides per-test data for {{name}} placeholders in eval fields. -# Placeholders support dotted paths like {{expected.answer}}. +# tests[].vars provides per-test data for {{ vars.name }} placeholders in eval fields. +# Placeholders support dotted paths like {{ vars.expected.answer }}. # # Usage: # agentv eval examples/features/test-vars-templating/evals/dataset.eval.yaml @@ -12,7 +12,7 @@ target: llm input: - role: system - content: "You are a concise assistant answering {{category}} questions." + content: "You are a concise assistant answering {{ vars.category }} questions." tests: - id: capital-france @@ -21,9 +21,9 @@ tests: question: What is the capital of France? 
expected: answer: Paris - criteria: "Answers {{question}} correctly" - input: "Question: {{question}}" - expected_output: "{{expected.answer}}" + criteria: "Answers {{ vars.question }} correctly" + input: "Question: {{ vars.question }}" + expected_output: "{{ vars.expected.answer }}" - id: greet-ada vars: @@ -32,8 +32,8 @@ tests: name: Ada expected: answer: Hello, Ada! - criteria: "Greets {{person.name}} warmly" + criteria: "Greets {{ vars.person.name }} warmly" input: - role: user - content: "Say hello to {{person.name}}." - expected_output: "{{expected.answer}}" + content: "Say hello to {{ vars.person.name }}." + expected_output: "{{ vars.expected.answer }}" diff --git a/packages/core/package.json b/packages/core/package.json index b7804a364..6e733ff2b 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -47,6 +47,7 @@ "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", + "nunjucks": "^3.2.4", "yaml": "^2.8.3", "zod": "^3.23.8" }, @@ -72,6 +73,7 @@ }, "devDependencies": { "@types/micromatch": "^4.0.10", + "@types/nunjucks": "^3.2.6", "zod-to-json-schema": "^3.25.1" } } diff --git a/packages/core/src/evaluation/interpolation.ts b/packages/core/src/evaluation/interpolation.ts index e8b396224..1ab64a496 100644 --- a/packages/core/src/evaluation/interpolation.ts +++ b/packages/core/src/evaluation/interpolation.ts @@ -1,14 +1,12 @@ +import nunjucks from 'nunjucks'; import type { EnvLookup } from './providers/types.js'; -const ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g; -const TEMPLATE_VAR_PATTERN = /\{\{\s*([A-Za-z_][A-Za-z0-9_.]*)\s*\}\}/g; -const WHOLE_TEMPLATE_VAR_PATTERN = /^\{\{\s*([A-Za-z_][A-Za-z0-9_.]*)\s*\}\}$/; +export type NunjucksFilterMap = Readonly unknown>>; -/** - * Regex that matches a string consisting of exactly one `${{ VAR }}` reference - * and nothing else. Used to detect whole-value substitutions eligible for type coercion. 
- */ -const WHOLE_VAR_PATTERN = /^\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}$/; +const WHOLE_SIMPLE_TEMPLATE_VAR_PATTERN = + /^\s*\{\{\s*([A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*)\s*\}\}\s*$/; +const ENV_OUTPUT_PATTERN = /\{\{\s*env\.[\s\S]*?\}\}/g; +const WHOLE_ENV_OUTPUT_PATTERN = /^\s*\{\{\s*env\.[\s\S]*?\}\}\s*$/; /** * Pattern matching plain integers (e.g. "42", "-7") and decimal fractions @@ -20,9 +18,6 @@ const PLAIN_NUMBER_PATTERN = /^-?(?:0|[1-9]\d*)(?:\.\d+)?$/; /** * Coerce a resolved string to its native primitive type when appropriate. * "true"/"false" become booleans; plain integer/decimal strings become numbers. - * Strings that happen to be valid JS numbers but are not plain decimal notation - * (hex, scientific notation, "Infinity") are left as strings. - * All other strings (including empty string) are returned as-is. */ function coercePrimitive(value: string): unknown { if (value === 'true') return true; @@ -49,53 +44,52 @@ function cloneTemplateValue(value: unknown): unknown { return value; } -function stringifyTemplateValue(value: unknown): string { - if (typeof value === 'string') return value; - return JSON.stringify(value); +function createNunjucksEnvironment(filters?: NunjucksFilterMap): nunjucks.Environment { + const environment = new nunjucks.Environment(undefined, { + autoescape: false, + throwOnUndefined: false, + }); + environment.addFilter('load', (value: string) => JSON.parse(value) as unknown); + for (const [name, filter] of Object.entries(filters ?? 
{})) { + environment.addFilter(name, filter); + } + return environment; } -function lookupTemplateVar( - vars: Readonly>, - expression: string, -): unknown | undefined { - if (!expression) return undefined; +function lookupPath(context: Readonly>, expression: string): unknown { return expression.split('.').reduce((current, segment) => { if (!isPlainObject(current)) { return undefined; } return current[segment]; - }, vars); + }, context); +} + +function renderString( + template: string, + context: Readonly>, + filters?: NunjucksFilterMap, +): string { + return createNunjucksEnvironment(filters).renderString(template, context); +} + +function renderEnvString(template: string, env: EnvLookup): string { + if (template.includes('${{')) { + return template; + } + return template.replace(ENV_OUTPUT_PATTERN, (match) => renderString(match, { env })); } /** - * Recursively interpolate `${{ VAR }}` references in all string values. - * Missing variables resolve to empty string. - * Non-string values pass through unchanged. Returns a new object (no mutation). - * - * Type coercion: when the **entire** string value is a single `${{ VAR }}` reference - * (no surrounding text), the resolved value is coerced to its native type — - * `"true"`/`"false"` become booleans, numeric strings become numbers. This allows - * boolean and numeric config fields to be driven by environment variables: + * Recursively render config-load `{{ env.VAR }}` templates in string values. * - * ```yaml - * # .agentv/config.yaml - * results: - * export: - * auto_push: ${{ AGENTV_AUTO_PUSH }} # AGENTV_AUTO_PUSH=true → boolean true - * ``` - * - * Inline/partial substitutions (e.g. `"prefix-${{ VAR }}"`) are always strings. + * Runtime shell variables such as `$VAR` and `${VAR}` are intentionally outside + * this syntax and pass through unchanged for CLI target subprocesses. 
*/ export function interpolateEnv(value: unknown, env: EnvLookup): unknown { if (typeof value === 'string') { - // Whole-value substitution: coerce the resolved value to its native type. - const wholeMatch = WHOLE_VAR_PATTERN.exec(value); - if (wholeMatch) { - const resolved = env[wholeMatch[1] as string] ?? ''; - return coercePrimitive(resolved); - } - // Partial/inline substitution: always produces a string. - return value.replace(ENV_VAR_PATTERN, (_, varName: string) => env[varName] ?? ''); + const rendered = renderEnvString(value, env); + return WHOLE_ENV_OUTPUT_PATTERN.test(value) ? coercePrimitive(rendered) : rendered; } if (Array.isArray(value)) { return value.map((item) => interpolateEnv(item, env)); @@ -111,35 +105,37 @@ export function interpolateEnv(value: unknown, env: EnvLookup): unknown { } /** - * Recursively interpolate `{{ var }}` references in string values using per-test vars. - * Missing variables are left unchanged so unrelated template syntaxes remain intact. - * When the whole string is a single variable reference, the original JSON value is preserved. + * Recursively render eval-time Nunjucks templates using per-test vars. + * + * The context exposes both promptfoo-style top-level vars (`{{ name }}`) and the + * explicit namespace (`{{ vars.name }}`). When the whole field is exactly a + * simple variable reference, the original JSON value is preserved. */ export function interpolateTemplateVars( value: unknown, vars: Readonly>, + filters?: NunjucksFilterMap, ): unknown { if (typeof value === 'string') { - const wholeMatch = WHOLE_TEMPLATE_VAR_PATTERN.exec(value); + const context = { ...vars, vars }; + const wholeMatch = WHOLE_SIMPLE_TEMPLATE_VAR_PATTERN.exec(value); if (wholeMatch) { - const resolved = lookupTemplateVar(vars, wholeMatch[1] as string); - return resolved === undefined ? 
value : cloneTemplateValue(resolved); + const resolved = lookupPath(context, wholeMatch[1] as string); + if (resolved !== undefined) { + return cloneTemplateValue(resolved); + } } - - return value.replace(TEMPLATE_VAR_PATTERN, (match, expression: string) => { - const resolved = lookupTemplateVar(vars, expression); - return resolved === undefined ? match : stringifyTemplateValue(resolved); - }); + return renderString(value, context, filters); } if (Array.isArray(value)) { - return value.map((item) => interpolateTemplateVars(item, vars)); + return value.map((item) => interpolateTemplateVars(item, vars, filters)); } if (isPlainObject(value)) { const result: Record = {}; for (const [key, nested] of Object.entries(value)) { - result[key] = interpolateTemplateVars(nested, vars); + result[key] = interpolateTemplateVars(nested, vars, filters); } return result; } diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index a2d74a4ea..49283f09f 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -470,6 +470,7 @@ async function parseGraderList( const name = rawName ?? (isCustomType ? typeValue : generateAssertionName(typeValue as GraderKind, rawEvaluator)); + const metric = asString(rawEvaluator.metric); if (!name) { logWarning(`Skipping evaluator with missing name in '${evalId}'`); @@ -1327,6 +1328,7 @@ async function parseGraderList( name, type: 'contains', value, + ...(metric !== undefined ? { metric } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), ...(min_score !== undefined ? 
{ min_score } : {}), diff --git a/packages/core/src/evaluation/loaders/shorthand-expansion.ts b/packages/core/src/evaluation/loaders/shorthand-expansion.ts index b6a784189..9ed2aee24 100644 --- a/packages/core/src/evaluation/loaders/shorthand-expansion.ts +++ b/packages/core/src/evaluation/loaders/shorthand-expansion.ts @@ -29,6 +29,10 @@ export function expandInputShorthand(value: JsonValue | undefined): TestMessage[ // String shorthand: single user message if (typeof value === 'string') { + const parsedMessages = parseChatArrayPrompt(value); + if (parsedMessages) { + return parsedMessages; + } return [{ role: 'user', content: value }]; } @@ -50,6 +54,24 @@ export function expandInputShorthand(value: JsonValue | undefined): TestMessage[ return undefined; } +function parseChatArrayPrompt(value: string): TestMessage[] | undefined { + const trimmed = value.trim(); + if (!trimmed.startsWith('[')) { + return undefined; + } + + try { + const parsed = JSON.parse(trimmed) as unknown; + if (!Array.isArray(parsed)) { + return undefined; + } + const messages = parsed.filter((message): message is TestMessage => isTestMessage(message)); + return messages.length === parsed.length && messages.length > 0 ? messages : undefined; + } catch { + return undefined; + } +} + /** * Expand the `expected_output` shorthand into a message array. 
* diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 1964d1336..bdb4b8aac 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -869,7 +869,7 @@ export type InlineAssertEvaluatorConfig = { readonly negate?: boolean; }; -export type GraderConfig = +export type GraderConfig = ( | CodeGraderConfig | LlmGraderConfig | CompositeGraderConfig @@ -892,7 +892,11 @@ export type GraderConfig = | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig - | InlineAssertEvaluatorConfig; + | InlineAssertEvaluatorConfig +) & { + /** Optional promptfoo-style named score key. Scoring aggregation support is layered separately. */ + readonly metric?: string; +}; /** * Source reference resolved while loading an eval definition. diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index a5a99ef18..890b28c69 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -1,5 +1,6 @@ import { readFile, realpath, stat } from 'node:fs/promises'; import path from 'node:path'; +import { pathToFileURL } from 'node:url'; import fg from 'fast-glob'; import micromatch from 'micromatch'; import { stringify as stringifyYaml } from 'yaml'; @@ -11,7 +12,11 @@ import { normalizeExperimentRunOverride, } from './experiment.js'; import { collectResolvedInputFilePaths } from './input-message-utils.js'; -import { interpolateEnv, interpolateTemplateVars } from './interpolation.js'; +import { + type NunjucksFilterMap, + interpolateEnv, + interpolateTemplateVars, +} from './interpolation.js'; import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js'; import { expandFileReferences, @@ -194,6 +199,7 @@ type RawTestSuite = JsonObject & { readonly workspace?: JsonValue; readonly assertions?: JsonValue; readonly preprocessors?: JsonValue; + readonly nunjucks_filters?: JsonValue; readonly input?: 
JsonValue; readonly metadata?: JsonValue; readonly governance?: JsonValue; @@ -254,16 +260,18 @@ function resolveTests(suite: RawTestSuite): JsonValue | undefined { function interpolateCaseField( value: T, vars: JsonObject | undefined, + filters?: NunjucksFilterMap, ): T { if (!vars || value === undefined) { return value; } - return interpolateTemplateVars(value, vars as Record) as T; + return interpolateTemplateVars(value, vars as Record, filters) as T; } function interpolateCaseTurns( turns: JsonValue | undefined, vars: JsonObject | undefined, + filters?: NunjucksFilterMap, ): JsonValue | undefined { if (!vars || !Array.isArray(turns)) { return turns; @@ -276,34 +284,123 @@ function interpolateCaseTurns( return { ...rawTurn, - input: interpolateCaseField(rawTurn.input, vars), - expected_output: interpolateCaseField(rawTurn.expected_output, vars), + input: interpolateCaseField(rawTurn.input, vars, filters), + expected_output: interpolateCaseField(rawTurn.expected_output, vars, filters), + assertions: interpolateCaseField(rawTurn.assertions, vars, filters), } satisfies JsonObject; }); } -function interpolateRawEvalCase(raw: RawEvalCase, vars: JsonObject | undefined): RawEvalCase { +function interpolateRawEvalCase( + raw: RawEvalCase, + vars: JsonObject | undefined, + filters?: NunjucksFilterMap, +): RawEvalCase { if (!vars) { return raw; } return { ...raw, - ...(raw.criteria !== undefined ? { criteria: interpolateCaseField(raw.criteria, vars) } : {}), + ...(raw.id !== undefined ? { id: interpolateCaseField(raw.id, vars, filters) } : {}), + ...(raw.criteria !== undefined + ? { criteria: interpolateCaseField(raw.criteria, vars, filters) } + : {}), ...(raw.expected_outcome !== undefined - ? { expected_outcome: interpolateCaseField(raw.expected_outcome, vars) } + ? { expected_outcome: interpolateCaseField(raw.expected_outcome, vars, filters) } : {}), - ...(raw.input !== undefined ? { input: interpolateCaseField(raw.input, vars) } : {}), + ...(raw.input !== undefined ? 
{ input: interpolateCaseField(raw.input, vars, filters) } : {}), ...(raw.input_files !== undefined - ? { input_files: interpolateCaseField(raw.input_files, vars) } + ? { input_files: interpolateCaseField(raw.input_files, vars, filters) } : {}), ...(raw.expected_output !== undefined - ? { expected_output: interpolateCaseField(raw.expected_output, vars) } + ? { expected_output: interpolateCaseField(raw.expected_output, vars, filters) } + : {}), + ...(raw.assertions !== undefined + ? { assertions: interpolateCaseField(raw.assertions, vars, filters) } : {}), - ...(raw.turns !== undefined ? { turns: interpolateCaseTurns(raw.turns, vars) } : {}), + ...(raw.evaluators !== undefined + ? { evaluators: interpolateCaseField(raw.evaluators, vars, filters) } + : {}), + ...(raw.rubrics !== undefined + ? { rubrics: interpolateCaseField(raw.rubrics, vars, filters) } + : {}), + ...(raw.turns !== undefined ? { turns: interpolateCaseTurns(raw.turns, vars, filters) } : {}), }; } +function shouldExpandVarValue(value: JsonValue): value is readonly JsonValue[] { + return Array.isArray(value) && (value.length === 0 || typeof value[0] === 'string'); +} + +function expandArrayVarCases(raw: RawEvalCase): readonly RawEvalCase[] { + if (!isJsonObject(raw.vars)) { + return [raw]; + } + + const entries = Object.entries(raw.vars); + let combinations: Record[] = [{}]; + let expanded = false; + + for (const [key, value] of entries) { + const values = shouldExpandVarValue(value) ? 
value : [value]; + expanded ||= values.length !== 1 || values[0] !== value; + const next: Record[] = []; + for (const combination of combinations) { + for (const candidate of values) { + next.push({ ...combination, [key]: candidate }); + } + } + combinations = next; + } + + if (!expanded) { + return [raw]; + } + + return combinations.map((vars) => ({ ...raw, vars })); +} + +async function loadNunjucksFilters( + rawFilters: JsonValue | undefined, + evalFileDir: string, +): Promise { + if (rawFilters === undefined) { + return undefined; + } + if (!isJsonObject(rawFilters)) { + logWarning('Invalid nunjucks_filters: expected object mapping filter names to file paths'); + return undefined; + } + + const filters: Record unknown> = {}; + for (const [name, rawFilterPath] of Object.entries(rawFilters)) { + if (typeof rawFilterPath !== 'string' || rawFilterPath.trim().length === 0) { + logWarning(`Skipping nunjucks filter '${name}': expected file path string`); + continue; + } + + const filterPath = rawFilterPath.startsWith('file://') + ? rawFilterPath.slice('file://'.length) + : rawFilterPath; + const matches = await fg(path.resolve(evalFileDir, filterPath).replaceAll('\\', '/'), { + onlyFiles: true, + absolute: true, + }); + const resolvedPath = matches.sort().at(-1) ?? path.resolve(evalFileDir, filterPath); + const imported = (await import(pathToFileURL(resolvedPath).href)) as Record; + const filter = imported.default ?? imported[name]; + if (typeof filter !== 'function') { + throw new Error( + `Invalid nunjucks filter '${name}' at ${resolvedPath}: expected default export or named export '${name}' to be a function`, + ); + } + filters[name] = filter as (...args: unknown[]) => unknown; + } + + return Object.keys(filters).length > 0 ? filters : undefined; +} + /** * Read metadata from a test suite file (like target name). * This is a convenience function for CLI tools that need metadata without loading all tests. 
@@ -528,6 +625,7 @@ async function loadTestsFromParsedYamlValue( const importedSuiteTests: EvalTest[] = []; const evalFileDir = path.dirname(absoluteTestPath); + const nunjucksFilters = await loadNunjucksFilters(suite.nunjucks_filters, evalFileDir); const parentWorkspace = parentWorkspaceLocation(suite); const importEntries = readImports(suite.imports); const expandedImports = await expandImportEntries({ @@ -578,301 +676,312 @@ async function loadTestsFromParsedYamlValue( const results: EvalTest[] = []; - for (const rawTestCase of expandedTestCases) { - if (!isJsonObject(rawTestCase)) { - logWarning('Skipping invalid test entry (expected object)'); - continue; - } + for (const rawExpandedTestCase of expandedTestCases) { + const expandedVarCases = isJsonObject(rawExpandedTestCase) + ? expandArrayVarCases(rawExpandedTestCase as RawEvalCase) + : [rawExpandedTestCase]; - const testCaseConfig = rawTestCase as RawEvalCase; - const id = asString(testCaseConfig.id); + for (const rawTestCase of expandedVarCases) { + if (!isJsonObject(rawTestCase)) { + logWarning('Skipping invalid test entry (expected object)'); + continue; + } - // Skip tests that don't match the filter pattern (glob supported) - if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { - continue; - } + const testCaseConfig = rawTestCase as RawEvalCase; + const caseVars = isJsonObject(testCaseConfig.vars) ? testCaseConfig.vars : undefined; + const renderedCase = interpolateRawEvalCase(testCaseConfig, caseVars, nunjucksFilters); + const id = asString(renderedCase.id); - const caseVars = isJsonObject(testCaseConfig.vars) ? 
testCaseConfig.vars : undefined; - const renderedCase = interpolateRawEvalCase(testCaseConfig, caseVars); + // Skip tests that don't match the filter pattern (glob supported) + if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { + continue; + } - const conversationId = asString(renderedCase.conversation_id); - let outcome = asString(renderedCase.criteria); - if (!outcome && renderedCase.expected_outcome !== undefined) { - outcome = asString(renderedCase.expected_outcome); - if (outcome) { - logWarning( - `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, - ); + const conversationId = asString(renderedCase.conversation_id); + let outcome = asString(renderedCase.criteria); + if (!outcome && renderedCase.expected_outcome !== undefined) { + outcome = asString(renderedCase.expected_outcome); + if (outcome) { + logWarning( + `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, + ); + } } - } - // Extract per-case execution config early (reused below for skip_defaults) - const caseExecution = isJsonObject(renderedCase.execution) ? renderedCase.execution : undefined; - rejectUnsupportedTestExecutionFields(caseExecution, id); - if (caseExecution?.workspace !== undefined) { - throw new Error( - `test '${id ?? 'unknown'}'.execution.workspace has been removed from eval YAML. Put machine-local workspace_path/workspace_mode in .agentv/config.local.yaml under execution, or pass --workspace-path/--workspace-mode. Keep portable task setup in test workspace or suite workspace.`, - ); - } - const skipDefaults = caseExecution?.skip_defaults === true; - const caseThreshold = - typeof caseExecution?.threshold === 'number' && - (caseExecution.threshold as number) >= 0 && - (caseExecution.threshold as number) <= 1 - ? 
(caseExecution.threshold as number) + // Extract per-case execution config early (reused below for skip_defaults) + const caseExecution = isJsonObject(renderedCase.execution) + ? renderedCase.execution : undefined; - const caseRun = mergeRunOverrides( - caseThreshold !== undefined ? { threshold: caseThreshold } : undefined, - normalizeRunOverride(renderedCase.run, `test '${id ?? 'unknown'}'.run`), - ); + rejectUnsupportedTestExecutionFields(caseExecution, id); + if (caseExecution?.workspace !== undefined) { + throw new Error( + `test '${id ?? 'unknown'}'.execution.workspace has been removed from eval YAML. Put machine-local workspace_path/workspace_mode in .agentv/config.local.yaml under execution, or pass --workspace-path/--workspace-mode. Keep portable task setup in test workspace or suite workspace.`, + ); + } + const skipDefaults = caseExecution?.skip_defaults === true; + const caseThreshold = + typeof caseExecution?.threshold === 'number' && + (caseExecution.threshold as number) >= 0 && + (caseExecution.threshold as number) <= 1 + ? (caseExecution.threshold as number) + : undefined; + const caseRun = mergeRunOverrides( + caseThreshold !== undefined ? { threshold: caseThreshold } : undefined, + normalizeRunOverride(renderedCase.run, `test '${id ?? 'unknown'}'.run`), + ); - // Resolve input with shorthand support (pass suite-level input_files for merge) - const effectiveSuiteInputFiles = - rawSuiteInputFiles && !skipDefaults - ? 
interpolateCaseField(rawSuiteInputFiles, caseVars) - : undefined; - let inputCase = renderedCase; - let inputSuiteFiles = effectiveSuiteInputFiles; - if (renderedCase.input === undefined) { - const promptFallback = await loadPromptMdFallback({ - evalFilePath: absoluteTestPath, - searchRoots, - testInputFiles: renderedCase.input_files, - suiteInputFiles: effectiveSuiteInputFiles, - }); - if (promptFallback) { - if (promptFallback.inputFilesSource === 'test') { - const { input_files: _inputFiles, ...caseWithoutInputFiles } = renderedCase; - inputCase = { - ...caseWithoutInputFiles, - input: promptFallback.promptText, - ...(promptFallback.remainingInputFiles - ? { input_files: [...promptFallback.remainingInputFiles] } - : {}), - }; - inputSuiteFiles = undefined; - } else { - inputCase = { - ...renderedCase, - input: promptFallback.promptText, - }; - if (promptFallback.inputFilesSource === 'suite') { - inputSuiteFiles = promptFallback.remainingInputFiles - ? [...promptFallback.remainingInputFiles] - : undefined; + // Resolve input with shorthand support (pass suite-level input_files for merge) + const effectiveSuiteInputFiles = + rawSuiteInputFiles && !skipDefaults + ? interpolateCaseField(rawSuiteInputFiles, caseVars, nunjucksFilters) + : undefined; + let inputCase = renderedCase; + let inputSuiteFiles = effectiveSuiteInputFiles; + if (renderedCase.input === undefined) { + const promptFallback = await loadPromptMdFallback({ + evalFilePath: absoluteTestPath, + searchRoots, + testInputFiles: renderedCase.input_files, + suiteInputFiles: effectiveSuiteInputFiles, + }); + if (promptFallback) { + if (promptFallback.inputFilesSource === 'test') { + const { input_files: _inputFiles, ...caseWithoutInputFiles } = renderedCase; + inputCase = { + ...caseWithoutInputFiles, + input: promptFallback.promptText, + ...(promptFallback.remainingInputFiles + ? 
{ input_files: [...promptFallback.remainingInputFiles] } + : {}), + }; + inputSuiteFiles = undefined; + } else { + inputCase = { + ...renderedCase, + input: promptFallback.promptText, + }; + if (promptFallback.inputFilesSource === 'suite') { + inputSuiteFiles = promptFallback.remainingInputFiles + ? [...promptFallback.remainingInputFiles] + : undefined; + } } } } - } - const testInputMessages = resolveInputMessages(inputCase, inputSuiteFiles); - // Resolve expected_output with shorthand support - const expectedMessages = resolveExpectedMessages(renderedCase) ?? []; - - // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode) - const hasEvaluationSpec = - !!outcome || - expectedMessages.length > 0 || - renderedCase.assertions !== undefined || - (Array.isArray(renderedCase.turns) && renderedCase.turns.length > 0); - if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { - logError( - `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assertions/turns`, - ); - continue; - } - - // Prepend suite-level input to test input (respecting skip_defaults) - const effectiveSuiteInputValue = - rawSuiteInput && !skipDefaults ? interpolateCaseField(rawSuiteInput, caseVars) : undefined; - const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue); - - // expected_output is optional - for outcome-only evaluation - const hasExpectedMessages = expectedMessages.length > 0; + const testInputMessages = resolveInputMessages(inputCase, inputSuiteFiles); + // Resolve expected_output with shorthand support + const expectedMessages = resolveExpectedMessages(renderedCase) ?? 
[]; + + // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode) + const hasEvaluationSpec = + !!outcome || + expectedMessages.length > 0 || + renderedCase.assertions !== undefined || + (Array.isArray(renderedCase.turns) && renderedCase.turns.length > 0); + if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { + logError( + `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assertions/turns`, + ); + continue; + } - const inputTextParts: string[] = []; + // Prepend suite-level input to test input (respecting skip_defaults) + const effectiveSuiteInputValue = + rawSuiteInput && !skipDefaults + ? interpolateCaseField(rawSuiteInput, caseVars, nunjucksFilters) + : undefined; + const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue); + + // expected_output is optional - for outcome-only evaluation + const hasExpectedMessages = expectedMessages.length > 0; + + const inputTextParts: string[] = []; + + // Process suite-level input first + const suiteResolvedInputMessages = effectiveSuiteInputMessages + ? await processMessages({ + messages: effectiveSuiteInputMessages, + searchRoots, + repoRootPath, + textParts: inputTextParts, + messageType: 'input', + verbose, + }) + : []; - // Process suite-level input first - const suiteResolvedInputMessages = effectiveSuiteInputMessages - ? 
await processMessages({ - messages: effectiveSuiteInputMessages, - searchRoots, - repoRootPath, - textParts: inputTextParts, - messageType: 'input', - verbose, - }) - : []; - - // Process test-level input - const testResolvedInputMessages = await processMessages({ - messages: testInputMessages, - searchRoots, - repoRootPath, - textParts: inputTextParts, - messageType: 'input', - verbose, - }); - const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages]; + // Process test-level input + const testResolvedInputMessages = await processMessages({ + messages: testInputMessages, + searchRoots, + repoRootPath, + textParts: inputTextParts, + messageType: 'input', + verbose, + }); + const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages]; + + // Process expected_output into segments (only if provided) + // Preserve full message structure including role and tool_calls for evaluator + const outputSegments = hasExpectedMessages + ? await processExpectedMessages({ + messages: expectedMessages, + searchRoots, + repoRootPath, + verbose, + }) + : []; - // Process expected_output into segments (only if provided) - // Preserve full message structure including role and tool_calls for evaluator - const outputSegments = hasExpectedMessages - ? 
await processExpectedMessages({ - messages: expectedMessages, + // Build reference_answer: + // Extract the content from the last message in expected_output (similar to answer) + let referenceAnswer = ''; + if (outputSegments.length > 0) { + // Get the last message + const lastMessage = outputSegments[outputSegments.length - 1]; + const content = lastMessage.content; + const toolCalls = lastMessage.tool_calls; + + if (typeof content === 'string') { + referenceAnswer = content; + } else if (content !== undefined && content !== null) { + // Serialize just the content, not the entire message + referenceAnswer = JSON.stringify(content, null, 2); + } else if (toolCalls !== undefined && toolCalls !== null) { + // Message with only tool_calls - serialize just the tool_calls + referenceAnswer = JSON.stringify(toolCalls, null, 2); + } + } + const question = inputTextParts + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join(' '); + + const testCaseEvaluatorKind = coerceEvaluator(renderedCase.evaluator, id) ?? globalEvaluator; + let evaluators: Awaited>; + try { + evaluators = await parseGraders( + renderedCase, + globalExecution, searchRoots, - repoRootPath, - verbose, - }) - : []; - - // Build reference_answer: - // Extract the content from the last message in expected_output (similar to answer) - let referenceAnswer = ''; - if (outputSegments.length > 0) { - // Get the last message - const lastMessage = outputSegments[outputSegments.length - 1]; - const content = lastMessage.content; - const toolCalls = lastMessage.tool_calls; - - if (typeof content === 'string') { - referenceAnswer = content; - } else if (content !== undefined && content !== null) { - // Serialize just the content, not the entire message - referenceAnswer = JSON.stringify(content, null, 2); - } else if (toolCalls !== undefined && toolCalls !== null) { - // Message with only tool_calls - serialize just the tool_calls - referenceAnswer = JSON.stringify(toolCalls, null, 2); + id ?? 
'unknown', + suitePreprocessors, + ); + } catch (error) { + // Skip entire test if evaluator validation fails + const message = error instanceof Error ? error.message : String(error); + logError(`Skipping test '${id}': ${message}`); + continue; } - } - const question = inputTextParts - .map((part) => part.trim()) - .filter((part) => part.length > 0) - .join(' '); - const testCaseEvaluatorKind = coerceEvaluator(renderedCase.evaluator, id) ?? globalEvaluator; - let evaluators: Awaited>; - try { - evaluators = await parseGraders( + const assertionTemplateReferences = await collectAssertionTemplateSourceReferences( renderedCase, globalExecution, searchRoots, id ?? 'unknown', - suitePreprocessors, ); - } catch (error) { - // Skip entire test if evaluator validation fails - const message = error instanceof Error ? error.message : String(error); - logError(`Skipping test '${id}': ${message}`); - continue; - } - const assertionTemplateReferences = await collectAssertionTemplateSourceReferences( - renderedCase, - globalExecution, - searchRoots, - id ?? 'unknown', - ); - - // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) - const inlineRubrics = renderedCase.rubrics; - if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { - const rubricEvaluator = parseInlineRubrics(inlineRubrics); - if (rubricEvaluator) { - // Prepend rubric evaluator to existing evaluators - evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator]; + // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) + const inlineRubrics = renderedCase.rubrics; + if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { + const rubricEvaluator = parseInlineRubrics(inlineRubrics); + if (rubricEvaluator) { + // Prepend rubric evaluator to existing evaluators + evaluators = evaluators ? 
[rubricEvaluator, ...evaluators] : [rubricEvaluator]; + } } - } - warnUnconsumedCriteria(outcome, evaluators, id ?? 'unknown'); - - const userFilePaths = collectResolvedInputFilePaths(inputMessages); - - // Parse per-case workspace config and merge with suite-level - const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir); - const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace); - - // Parse per-case metadata, then merge suite-level metadata payload. - // Arrays concatenate (suite-first, deduplicated), scalars on the case win. - const rawCaseMetadata = isJsonObject(renderedCase.metadata) - ? (renderedCase.metadata as Record) - : undefined; - const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload); - - // Extract dependency fields - const dependsOn = Array.isArray(renderedCase.depends_on) - ? (renderedCase.depends_on as readonly string[]).filter( - (v): v is string => typeof v === 'string', - ) - : undefined; - const onDependencyFailureRaw = asString(renderedCase.on_dependency_failure); - const onDependencyFailure = - onDependencyFailureRaw === 'skip' || - onDependencyFailureRaw === 'fail' || - onDependencyFailureRaw === 'run' - ? (onDependencyFailureRaw as import('./types.js').DependencyFailurePolicy) + warnUnconsumedCriteria(outcome, evaluators, id ?? 'unknown'); + + const userFilePaths = collectResolvedInputFilePaths(inputMessages); + + // Parse per-case workspace config and merge with suite-level + const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir); + const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace); + + // Parse per-case metadata, then merge suite-level metadata payload. + // Arrays concatenate (suite-first, deduplicated), scalars on the case win. + const rawCaseMetadata = isJsonObject(renderedCase.metadata) + ? 
(renderedCase.metadata as Record) : undefined; + const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload); - // Extract conversation mode fields - const modeRaw = asString(renderedCase.mode); - const mode: ConversationMode | undefined = - modeRaw === 'conversation' ? 'conversation' : undefined; - const turns = Array.isArray(renderedCase.turns) - ? parseTurns(renderedCase.turns as readonly unknown[]) - : undefined; - const aggregationRaw = asString(renderedCase.aggregation); - const aggregation: ConversationAggregation | undefined = - aggregationRaw === 'mean' || aggregationRaw === 'min' || aggregationRaw === 'max' - ? aggregationRaw + // Extract dependency fields + const dependsOn = Array.isArray(renderedCase.depends_on) + ? (renderedCase.depends_on as readonly string[]).filter( + (v): v is string => typeof v === 'string', + ) : undefined; - const onTurnFailureRaw = asString(renderedCase.on_turn_failure); - const onTurnFailure: TurnFailurePolicy | undefined = - onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' ? onTurnFailureRaw : undefined; - const windowSize = - typeof renderedCase.window_size === 'number' && renderedCase.window_size >= 1 - ? (renderedCase.window_size as number) + const onDependencyFailureRaw = asString(renderedCase.on_dependency_failure); + const onDependencyFailure = + onDependencyFailureRaw === 'skip' || + onDependencyFailureRaw === 'fail' || + onDependencyFailureRaw === 'run' + ? (onDependencyFailureRaw as import('./types.js').DependencyFailurePolicy) + : undefined; + + // Extract conversation mode fields + const modeRaw = asString(renderedCase.mode); + const mode: ConversationMode | undefined = + modeRaw === 'conversation' ? 'conversation' : undefined; + const turns = Array.isArray(renderedCase.turns) + ? parseTurns(renderedCase.turns as readonly unknown[]) : undefined; - - const category = normalizeCategoryPath(suite.category ?? 
options?.category); - - const testCase: EvalTest = { - id, - suite: suiteName, - category, - conversation_id: conversationId, - question: question, - input: inputMessages, - expected_output: outputSegments, - reference_answer: referenceAnswer, - file_paths: userFilePaths, - criteria: outcome ?? '', - evaluator: testCaseEvaluatorKind, - assertions: evaluators, - ...(suitePreprocessors ? { preprocessors: suitePreprocessors } : {}), - workspace: mergedWorkspace, - metadata, - ...(caseRun?.threshold !== undefined ? { threshold: caseRun.threshold } : {}), - ...(caseRun !== undefined ? { run: caseRun } : {}), - ...(mode ? { mode } : {}), - ...(turns && turns.length > 0 ? { turns } : {}), - ...(aggregation ? { aggregation } : {}), - ...(onTurnFailure ? { on_turn_failure: onTurnFailure } : {}), - ...(windowSize !== undefined ? { window_size: windowSize } : {}), - ...(dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {}), - ...(onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}), - source: buildEvalTestSource({ - evalFilePath, - absoluteTestPath, - repoRootPath, + const aggregationRaw = asString(renderedCase.aggregation); + const aggregation: ConversationAggregation | undefined = + aggregationRaw === 'mean' || aggregationRaw === 'min' || aggregationRaw === 'max' + ? aggregationRaw + : undefined; + const onTurnFailureRaw = asString(renderedCase.on_turn_failure); + const onTurnFailure: TurnFailurePolicy | undefined = + onTurnFailureRaw === 'continue' || onTurnFailureRaw === 'stop' + ? onTurnFailureRaw + : undefined; + const windowSize = + typeof renderedCase.window_size === 'number' && renderedCase.window_size >= 1 + ? (renderedCase.window_size as number) + : undefined; + + const category = normalizeCategoryPath(suite.category ?? 
options?.category); + + const testCase: EvalTest = { id, - renderedCase, - rawCaseSnapshots, - inputMessages, - evaluators, - assertionTemplateReferences, - }), - }; - - results.push(testCase); + suite: suiteName, + category, + conversation_id: conversationId, + question: question, + input: inputMessages, + expected_output: outputSegments, + reference_answer: referenceAnswer, + file_paths: userFilePaths, + criteria: outcome ?? '', + evaluator: testCaseEvaluatorKind, + assertions: evaluators, + ...(suitePreprocessors ? { preprocessors: suitePreprocessors } : {}), + workspace: mergedWorkspace, + metadata, + ...(caseRun?.threshold !== undefined ? { threshold: caseRun.threshold } : {}), + ...(caseRun !== undefined ? { run: caseRun } : {}), + ...(mode ? { mode } : {}), + ...(turns && turns.length > 0 ? { turns } : {}), + ...(aggregation ? { aggregation } : {}), + ...(onTurnFailure ? { on_turn_failure: onTurnFailure } : {}), + ...(windowSize !== undefined ? { window_size: windowSize } : {}), + ...(dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {}), + ...(onDependencyFailure ? 
{ on_dependency_failure: onDependencyFailure } : {}), + source: buildEvalTestSource({ + evalFilePath, + absoluteTestPath, + repoRootPath, + id, + renderedCase, + rawCaseSnapshots, + inputMessages, + evaluators, + assertionTemplateReferences, + }), + }; + + results.push(testCase); + } } return { diff --git a/packages/core/test/evaluation/interpolation-integration.test.ts b/packages/core/test/evaluation/interpolation-integration.test.ts index 664b9dbe0..fef7fefbc 100644 --- a/packages/core/test/evaluation/interpolation-integration.test.ts +++ b/packages/core/test/evaluation/interpolation-integration.test.ts @@ -26,11 +26,11 @@ describe('env interpolation in YAML loading', () => { } }); - it('interpolates ${{ VAR }} in test criteria field', async () => { + it('interpolates {{ env.VAR }} in test criteria field', async () => { const evalFile = path.join(testDir, 'interp-criteria.eval.yaml'); await writeFile( evalFile, - 'tests:\n - id: test-1\n input: "hello"\n criteria: "${{ AGENTV_TEST_CRITERIA }}"\n', + 'tests:\n - id: test-1\n input: "hello"\n criteria: "{{ env.AGENTV_TEST_CRITERIA }}"\n', ); const cases = await loadTests(evalFile, testDir); expect(cases[0].criteria).toBe('Must return correct answer'); @@ -44,11 +44,11 @@ describe('env interpolation in YAML loading', () => { 'workspace:', ' repos:', ' - path: ./RepoA', - ' repo: "${{ AGENTV_TEST_PATH }}"', + ' repo: "{{ env.AGENTV_TEST_PATH }}"', 'tests:', ' - id: test-1', ' input: "hello"', - ' criteria: "${{ AGENTV_TEST_CRITERIA }}"', + ' criteria: "{{ env.AGENTV_TEST_CRITERIA }}"', '', ].join('\n'), ); @@ -58,7 +58,7 @@ describe('env interpolation in YAML loading', () => { expect(cases[0].workspace?.repos?.[0]?.repo).toBe('https://github.com/org/from-env.git'); }); - it('interpolates ${{ VAR }} in workspace repo identity', async () => { + it('interpolates {{ env.VAR }} in workspace repo identity', async () => { const evalFile = path.join(testDir, 'interp-workspace.eval.yaml'); await writeFile( evalFile, @@ -66,7 
+66,7 @@ describe('env interpolation in YAML loading', () => { 'workspace:', ' repos:', ' - path: ./RepoA', - ' repo: "${{ AGENTV_TEST_PATH }}"', + ' repo: "{{ env.AGENTV_TEST_PATH }}"', 'tests:', ' - id: test-1', ' input: "hello"', @@ -78,11 +78,11 @@ describe('env interpolation in YAML loading', () => { expect(cases[0].workspace?.repos?.[0]?.repo).toBe('https://github.com/org/from-env.git'); }); - it('interpolates ${{ VAR }} in external workspace YAML file', async () => { + it('interpolates {{ env.VAR }} in external workspace YAML file', async () => { const workspaceFile = path.join(testDir, 'workspace.yaml'); await writeFile( workspaceFile, - ['repos:', ' - path: ./RepoB', ' repo: "${{ AGENTV_TEST_PATH }}"', ''].join('\n'), + ['repos:', ' - path: ./RepoB', ' repo: "{{ env.AGENTV_TEST_PATH }}"', ''].join('\n'), ); const evalFile = path.join(testDir, 'interp-ext-workspace.eval.yaml'); await writeFile( @@ -100,11 +100,11 @@ describe('env interpolation in YAML loading', () => { expect(cases[0].workspace?.repos?.[0]?.repo).toBe('https://github.com/org/from-env.git'); }); - it('interpolates ${{ VAR }} in external YAML case files', async () => { + it('interpolates {{ env.VAR }} in external YAML case files', async () => { const casesFile = path.join(testDir, 'cases.yaml'); await writeFile( casesFile, - ['- id: ext-1', ' input: "hello"', ' criteria: "${{ AGENTV_TEST_CRITERIA }}"', ''].join( + ['- id: ext-1', ' input: "hello"', ' criteria: "{{ env.AGENTV_TEST_CRITERIA }}"', ''].join( '\n', ), ); @@ -114,11 +114,11 @@ describe('env interpolation in YAML loading', () => { expect(cases[0].criteria).toBe('Must return correct answer'); }); - it('interpolates ${{ VAR }} in external JSONL case files', async () => { + it('interpolates {{ env.VAR }} in external JSONL case files', async () => { const casesFile = path.join(testDir, 'cases.jsonl'); await writeFile( casesFile, - '{"id": "ext-jsonl-1", "input": "hello", "criteria": "${{ AGENTV_TEST_CRITERIA }}"}\n', + '{"id": 
"ext-jsonl-1", "input": "hello", "criteria": "{{ env.AGENTV_TEST_CRITERIA }}"}\n', ); const evalFile = path.join(testDir, 'interp-external-jsonl.eval.yaml'); await writeFile(evalFile, 'tests: cases.jsonl\n'); @@ -142,9 +142,45 @@ describe('env interpolation in YAML loading', () => { // (empty criteria alone causes the test loader to skip it as incomplete) await writeFile( evalFile, - 'tests:\n - id: test-1\n input: "hello"\n criteria: "prefix ${{ AGENTV_NONEXISTENT_VAR }} suffix"\n expected_output: "some output"\n', + 'tests:\n - id: test-1\n input: "hello"\n criteria: "prefix {{ env.AGENTV_NONEXISTENT_VAR }} suffix"\n expected_output: "some output"\n', ); const cases = await loadTests(evalFile, testDir); expect(cases[0].criteria).toBe('prefix suffix'); }); + + it('resolves default filter values through env rendering', async () => { + const evalFile = path.join(testDir, 'interp-default.eval.yaml'); + await writeFile( + evalFile, + 'tests:\n - id: test-1\n input: "hello"\n criteria: "{{ env.AGENTV_NONEXISTENT_VAR | default(\\"fallback criteria\\") }}"\n', + ); + const cases = await loadTests(evalFile, testDir); + expect(cases[0].criteria).toBe('fallback criteria'); + }); + + it('leaves runtime shell variables in target commands untouched', async () => { + const evalFile = path.join(testDir, 'interp-shell-vars.eval.yaml'); + await writeFile( + evalFile, + [ + 'target:', + ' name: local-shell', + ' provider: cli', + ' command: "echo $RUNTIME ${RUNTIME} {{ env.AGENTV_TEST_PATH }}"', + 'tests:', + ' - id: test-1', + ' input: "hello"', + ' criteria: "do something"', + '', + ].join('\n'), + ); + const { targetSpec } = await import('../../src/evaluation/yaml-parser.js').then((module) => + module.readTestSuiteMetadata(evalFile), + ); + expect( + targetSpec?.definition && 'command' in targetSpec.definition + ? 
targetSpec.definition.command + : '', + ).toBe('echo $RUNTIME ${RUNTIME} https://github.com/org/from-env.git'); + }); }); diff --git a/packages/core/test/evaluation/interpolation.test.ts b/packages/core/test/evaluation/interpolation.test.ts index ecaccf299..b1b1b5884 100644 --- a/packages/core/test/evaluation/interpolation.test.ts +++ b/packages/core/test/evaluation/interpolation.test.ts @@ -4,24 +4,38 @@ import { interpolateEnv, interpolateTemplateVars } from '../../src/evaluation/in describe('interpolateEnv', () => { const env = { HOME: '/home/user', PROJECT: 'agentv', EMPTY: '' }; - it('replaces ${{ VAR }} in a string', () => { - expect(interpolateEnv('${{ HOME }}', env)).toBe('/home/user'); + it('replaces {{ env.VAR }} in a string', () => { + expect(interpolateEnv('{{ env.HOME }}', env)).toBe('/home/user'); }); - it('replaces ${{VAR}} without spaces', () => { - expect(interpolateEnv('${{HOME}}', env)).toBe('/home/user'); + it('replaces {{env.VAR}} without spaces', () => { + expect(interpolateEnv('{{env.HOME}}', env)).toBe('/home/user'); }); it('handles partial/inline interpolation', () => { - expect(interpolateEnv('${{ HOME }}/repos/${{ PROJECT }}', env)).toBe('/home/user/repos/agentv'); + expect(interpolateEnv('{{ env.HOME }}/repos/{{ env.PROJECT }}', env)).toBe( + '/home/user/repos/agentv', + ); }); it('resolves missing variables to empty string', () => { - expect(interpolateEnv('${{ MISSING }}', env)).toBe(''); + expect(interpolateEnv('{{ env.MISSING }}', env)).toBe(''); + }); + + it('supports the Nunjucks default filter for missing env vars', () => { + expect(interpolateEnv('{{ env.MISSING | default("fallback") }}', env)).toBe('fallback'); }); it('resolves missing variable inline to empty string', () => { - expect(interpolateEnv('prefix-${{ MISSING }}-suffix', env)).toBe('prefix--suffix'); + expect(interpolateEnv('prefix-{{ env.MISSING }}-suffix', env)).toBe('prefix--suffix'); + }); + + it('preserves runtime shell variables', () => { + 
expect(interpolateEnv('echo $RUNTIME ${RUNTIME}', env)).toBe('echo $RUNTIME ${RUNTIME}'); + }); + + it('does not resolve legacy ${{ VAR }} syntax', () => { + expect(interpolateEnv('${{ HOME }}', env)).toBe('${{ HOME }}'); }); it('passes through strings without interpolation syntax', () => { @@ -37,8 +51,8 @@ describe('interpolateEnv', () => { it('recursively interpolates object values', () => { const input = { - path: '${{ HOME }}/repos', - nested: { url: '${{ PROJECT }}' }, + path: '{{ env.HOME }}/repos', + nested: { url: '{{ env.PROJECT }}' }, literal: 'no-vars', }; expect(interpolateEnv(input, env)).toEqual({ @@ -49,90 +63,90 @@ describe('interpolateEnv', () => { }); it('does not mutate the original object', () => { - const input = { path: '${{ HOME }}' }; + const input = { path: '{{ env.HOME }}' }; const result = interpolateEnv(input, env); expect(result).not.toBe(input); - expect(input.path).toBe('${{ HOME }}'); + expect(input.path).toBe('{{ env.HOME }}'); }); it('recursively interpolates arrays', () => { - const input = ['${{ HOME }}', { key: '${{ PROJECT }}' }, 42]; + const input = ['{{ env.HOME }}', { key: '{{ env.PROJECT }}' }, 42]; expect(interpolateEnv(input, env)).toEqual(['/home/user', { key: 'agentv' }, 42]); }); it('handles empty string env values', () => { - expect(interpolateEnv('${{ EMPTY }}', env)).toBe(''); + expect(interpolateEnv('{{ env.EMPTY }}', env)).toBe(''); }); describe('whole-value type coercion', () => { it('coerces "true" to boolean true', () => { - expect(interpolateEnv('${{ FLAG }}', { FLAG: 'true' })).toBe(true); + expect(interpolateEnv('{{ env.FLAG }}', { FLAG: 'true' })).toBe(true); }); it('coerces "false" to boolean false', () => { - expect(interpolateEnv('${{ FLAG }}', { FLAG: 'false' })).toBe(false); + expect(interpolateEnv('{{ env.FLAG }}', { FLAG: 'false' })).toBe(false); }); it('coerces integer string to number', () => { - expect(interpolateEnv('${{ COUNT }}', { COUNT: '10' })).toBe(10); + expect(interpolateEnv('{{ 
env.COUNT }}', { COUNT: '10' })).toBe(10); }); it('coerces float string to number', () => { - expect(interpolateEnv('${{ RATIO }}', { RATIO: '0.75' })).toBe(0.75); + expect(interpolateEnv('{{ env.RATIO }}', { RATIO: '0.75' })).toBe(0.75); }); it('leaves empty string as string (missing var)', () => { - expect(interpolateEnv('${{ MISSING }}', {})).toBe(''); + expect(interpolateEnv('{{ env.MISSING }}', {})).toBe(''); }); it('leaves plain string values as strings', () => { - expect(interpolateEnv('${{ HOME }}', env)).toBe('/home/user'); + expect(interpolateEnv('{{ env.HOME }}', env)).toBe('/home/user'); }); it('does not coerce partial/inline substitutions', () => { // "true" appears only after inline replacement — no coercion - expect(interpolateEnv('enabled=${{ FLAG }}', { FLAG: 'true' })).toBe('enabled=true'); + expect(interpolateEnv('enabled={{ env.FLAG }}', { FLAG: 'true' })).toBe('enabled=true'); }); it('coerces inside nested objects', () => { - const input = { auto_push: '${{ PUSH }}', label: 'runs' }; + const input = { auto_push: '{{ env.PUSH }}', label: 'runs' }; expect(interpolateEnv(input, { PUSH: 'true' })).toEqual({ auto_push: true, label: 'runs' }); }); // Numeric edge-case regression tests — these must stay as strings it('does not coerce scientific notation (1e3)', () => { - expect(interpolateEnv('${{ VAL }}', { VAL: '1e3' })).toBe('1e3'); + expect(interpolateEnv('{{ env.VAL }}', { VAL: '1e3' })).toBe('1e3'); }); it('does not coerce hex strings (0x10)', () => { - expect(interpolateEnv('${{ VAL }}', { VAL: '0x10' })).toBe('0x10'); + expect(interpolateEnv('{{ env.VAL }}', { VAL: '0x10' })).toBe('0x10'); }); it('does not coerce "Infinity"', () => { - expect(interpolateEnv('${{ VAL }}', { VAL: 'Infinity' })).toBe('Infinity'); + expect(interpolateEnv('{{ env.VAL }}', { VAL: 'Infinity' })).toBe('Infinity'); }); it('does not coerce whitespace-only string', () => { - expect(interpolateEnv('${{ VAL }}', { VAL: ' ' })).toBe(' '); + expect(interpolateEnv('{{ env.VAL 
}}', { VAL: ' ' })).toBe(' '); }); it('does not coerce leading-zero string (00123)', () => { - expect(interpolateEnv('${{ VAL }}', { VAL: '00123' })).toBe('00123'); + expect(interpolateEnv('{{ env.VAL }}', { VAL: '00123' })).toBe('00123'); }); it('coerces negative integer', () => { - expect(interpolateEnv('${{ VAL }}', { VAL: '-7' })).toBe(-7); + expect(interpolateEnv('{{ env.VAL }}', { VAL: '-7' })).toBe(-7); }); }); it('is case-sensitive for variable names', () => { - expect(interpolateEnv('${{ home }}', env)).toBe(''); - expect(interpolateEnv('${{ HOME }}', env)).toBe('/home/user'); + expect(interpolateEnv('{{ env.home }}', env)).toBe(''); + expect(interpolateEnv('{{ env.HOME }}', env)).toBe('/home/user'); }); it('handles variables with underscores and digits', () => { const envWithSpecial = { MY_VAR_2: 'value' }; - expect(interpolateEnv('${{ MY_VAR_2 }}', envWithSpecial)).toBe('value'); + expect(interpolateEnv('{{ env.MY_VAR_2 }}', envWithSpecial)).toBe('value'); }); }); @@ -149,17 +163,33 @@ describe('interpolateTemplateVars', () => { ); }); + it('replaces namespaced {{ vars.foo }} references', () => { + expect(interpolateTemplateVars('Answer clearly: {{ vars.question }}', vars)).toBe( + 'Answer clearly: What is 2 + 2?', + ); + }); + it('supports dotted paths', () => { - expect(interpolateTemplateVars('Topic: {{ nested.topic }}', vars)).toBe('Topic: math'); + expect(interpolateTemplateVars('Topic: {{ vars.nested.topic }}', vars)).toBe('Topic: math'); }); - it('preserves missing variables instead of blanking them out', () => { - expect(interpolateTemplateVars('Answer clearly: {{missing}}', vars)).toBe( - 'Answer clearly: {{missing}}', + it('supports loops and built-in filters', () => { + const rendered = interpolateTemplateVars( + '{% for item in vars.items %}{{ item | upper }}{% if not loop.last %}, {% endif %}{% endfor %}', + { items: ['alpha', 'beta'] }, ); + expect(rendered).toBe('ALPHA, BETA'); + }); + + it('renders missing variables as empty strings', () 
=> { + expect(interpolateTemplateVars('Answer clearly: {{missing}}', vars)).toBe('Answer clearly: '); }); it('returns the original JSON value for whole-value substitutions', () => { - expect(interpolateTemplateVars('{{expected}}', vars)).toEqual({ answer: '4' }); + expect(interpolateTemplateVars('{{ vars.expected }}', vars)).toEqual({ answer: '4' }); + }); + + it('returns the full vars object for {{ vars }}', () => { + expect(interpolateTemplateVars('{{ vars }}', vars)).toEqual(vars); }); }); diff --git a/packages/core/test/evaluation/suite-level-input.test.ts b/packages/core/test/evaluation/suite-level-input.test.ts index f5c1dae9c..0909fda86 100644 --- a/packages/core/test/evaluation/suite-level-input.test.ts +++ b/packages/core/test/evaluation/suite-level-input.test.ts @@ -256,6 +256,7 @@ tests: - id: templated vars: question: "What is the capital of France?" + expected_answer: "Paris" criteria: "Answers {{question}} correctly" input: - role: user @@ -287,12 +288,160 @@ tests: role: 'assistant', content: 'Thinking about What is the capital of France?', }); - expect(tests[0].expected_output).toEqual([ - { role: 'assistant', content: '{{expected_answer}}' }, - ]); + expect(tests[0].expected_output).toEqual([{ role: 'assistant', content: 'Paris' }]); expect(tests[0].metadata).toEqual({ untouched: '{{question}}' }); }); + it('applies namespaced vars with loops in suite and test input templates', async () => { + await writeFile( + path.join(tempDir, 'templated-namespaced-input.eval.yaml'), + `input: | + Items: + {% for item in vars.group.items %}- {{ item | upper }} + {% endfor %} +tests: + - id: templated-namespaced + vars: + group: + items: + - alpha + - beta + criteria: "Mentions {{ vars.group.items | length }} items" + input: "Question: {{ vars.group.items[0] }}" +`, + ); + + const tests = await loadTests( + path.join(tempDir, 'templated-namespaced-input.eval.yaml'), + tempDir, + ); + + expect(tests).toHaveLength(1); + expect(tests[0].criteria).toBe('Mentions 2 
items'); + expect(tests[0].input[0]).toEqual({ + role: 'user', + content: 'Items:\n- ALPHA\n- BETA\n\n', + }); + expect(tests[0].input[1]).toEqual({ + role: 'user', + content: 'Question: alpha', + }); + }); + + it('loads custom nunjucks_filters for eval-time rendering', async () => { + const filterPath = path.join(tempDir, 'slug-filter.ts'); + await writeFile( + filterPath, + 'export default function slug(value: unknown) { return String(value).toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-|-$/g, ""); }\n', + ); + await writeFile( + path.join(tempDir, 'templated-custom-filter.eval.yaml'), + `nunjucks_filters: + slug: ./slug-filter.ts +tests: + - id: filter-test + vars: + title: "Hello AgentV" + criteria: "Slug is {{ vars.title | slug }}" + input: "Write {{ vars.title | slug }}" +`, + ); + + const tests = await loadTests(path.join(tempDir, 'templated-custom-filter.eval.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].criteria).toBe('Slug is hello-agentv'); + expect(tests[0].input[0]).toEqual({ role: 'user', content: 'Write hello-agentv' }); + }); + + it('expands string array vars into multiple rendered rows', async () => { + await writeFile( + path.join(tempDir, 'templated-array-vars.eval.yaml'), + `tests: + - id: "fruit-{{ vars.fruit }}" + vars: + fruit: + - apple + - pear + color: + - red + - green + tags: + - stable + criteria: "{{ vars.color }} {{ vars.fruit }}" + input: "Describe {{ vars.color }} {{ vars.fruit }}" +`, + ); + + const tests = await loadTests(path.join(tempDir, 'templated-array-vars.eval.yaml'), tempDir); + + expect(tests.map((test) => test.id)).toEqual([ + 'fruit-apple', + 'fruit-apple', + 'fruit-pear', + 'fruit-pear', + ]); + expect(tests.map((test) => test.criteria)).toEqual([ + 'red apple', + 'green apple', + 'red pear', + 'green pear', + ]); + expect(tests.map((test) => test.input[0]?.content)).toEqual([ + 'Describe red apple', + 'Describe green apple', + 'Describe red pear', + 'Describe green pear', + ]); + }); + 
+ it('renders then parses chat-array prompt strings', async () => { + await writeFile( + path.join(tempDir, 'templated-chat-array.eval.yaml'), + `tests: + - id: chat-array + vars: + topic: "templating" + criteria: "Uses chat array" + input: '[{"role":"system","content":"You review {{ vars.topic }}"},{"role":"user","content":"Explain {{ vars.topic }}"}]' +`, + ); + + const tests = await loadTests(path.join(tempDir, 'templated-chat-array.eval.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].input).toEqual([ + { role: 'system', content: 'You review templating' }, + { role: 'user', content: 'Explain templating' }, + ]); + }); + + it('renders assertion values and metrics with per-test vars', async () => { + await writeFile( + path.join(tempDir, 'templated-assertions.eval.yaml'), + `tests: + - id: assertions + vars: + expected: "DENIED" + metric_name: "policy" + input: "Check access" + assertions: + - type: contains + metric: "{{ vars.metric_name }}_decision" + value: "{{ vars.expected }}" +`, + ); + + const tests = await loadTests(path.join(tempDir, 'templated-assertions.eval.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].assertions?.[0]).toMatchObject({ + type: 'contains', + value: 'DENIED', + metric: 'policy_decision', + }); + }); + it('applies per-test vars inside conversation turns', async () => { await writeFile( path.join(tempDir, 'templated-turns.eval.yaml'), @@ -317,7 +466,7 @@ tests: { input: 'Fix parser null check', expected_output: 'Fixed parser null check', - assertions: ['Mentions {{bug}}'], + assertions: ['Mentions parser null check'], }, ]); }); diff --git a/packages/core/test/evaluation/workspace/deps-scanner.test.ts b/packages/core/test/evaluation/workspace/deps-scanner.test.ts index 0028130c6..163f8ffdf 100644 --- a/packages/core/test/evaluation/workspace/deps-scanner.test.ts +++ b/packages/core/test/evaluation/workspace/deps-scanner.test.ts @@ -273,7 +273,7 @@ tests: workspace: repos: - path: ./repo - 
repo: \${{ TEST_REPO_URL }} + repo: "{{ env.TEST_REPO_URL }}" tests: - id: test-1 input: hello diff --git a/packages/core/test/projects.test.ts b/packages/core/test/projects.test.ts index 42e308e25..7c8d42f03 100644 --- a/packages/core/test/projects.test.ts +++ b/packages/core/test/projects.test.ts @@ -324,11 +324,9 @@ dashboard: it('interpolates env vars in repo', () => { const registryPath = getProjectsRegistryPath(); mkdirSync(path.dirname(registryPath), { recursive: true }); - // Use concatenation to avoid JS template literal evaluating ${{ ... }} - const d = '$'; writeFileSync( registryPath, - `projects:\n - id: env-bench\n repo: "${d}{{ BENCH_REPO_URL }}"\n path: /srv/agentv/repo\n branch: main\n added_at: "2026-01-01T00:00:00Z"\n last_opened_at: "2026-01-01T00:00:00Z"\n`, + 'projects:\n - id: env-bench\n repo: "{{ env.BENCH_REPO_URL }}"\n path: /srv/agentv/repo\n branch: main\n added_at: "2026-01-01T00:00:00Z"\n last_opened_at: "2026-01-01T00:00:00Z"\n', 'utf-8', ); From c0b8f47caf6dd69b782e9375c4fc46e56b9020f5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 15:30:26 +0200 Subject: [PATCH 2/3] fix(eval): resolve env templates in target secrets --- packages/core/src/evaluation/interpolation.ts | 4 ++ .../core/src/evaluation/providers/targets.ts | 19 +++++++ .../test/evaluation/providers/targets.test.ts | 49 +++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/packages/core/src/evaluation/interpolation.ts b/packages/core/src/evaluation/interpolation.ts index 1ab64a496..c14b89f08 100644 --- a/packages/core/src/evaluation/interpolation.ts +++ b/packages/core/src/evaluation/interpolation.ts @@ -80,6 +80,10 @@ function renderEnvString(template: string, env: EnvLookup): string { return template.replace(ENV_OUTPUT_PATTERN, (match) => renderString(match, { env })); } +export function renderEnvTemplateString(template: string, env: EnvLookup): string { + return renderEnvString(template, env); +} + /** * Recursively render config-load 
`{{ env.VAR }}` templates in string values. * diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 5705ed8dd..437f4dbe1 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -1,6 +1,7 @@ import path from 'node:path'; import { z } from 'zod'; +import { renderEnvTemplateString } from '../interpolation.js'; import type { EnvLookup, TargetDefinition } from './types.js'; // --------------------------------------------------------------------------- @@ -819,6 +820,7 @@ export const COMMON_TARGET_SETTINGS = [ ] as const; const USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i; +const WHOLE_ENV_TEMPLATE_PATTERN = /^\s*\{\{\s*env\.[\s\S]*?\}\}\s*$/; const BASE_TARGET_SCHEMA = z .object({ @@ -2338,6 +2340,23 @@ function resolveOptionalString( return envValue; } + if (trimmed.includes('{{') && trimmed.includes('env.')) { + const allowLiteral = options?.allowLiteral ?? false; + if (!allowLiteral && !WHOLE_ENV_TEMPLATE_PATTERN.test(trimmed)) { + throw new Error( + `${description} must use a whole \${{ VARIABLE_NAME }} or {{ env.VARIABLE_NAME }} reference`, + ); + } + const rendered = renderEnvTemplateString(trimmed, env).trim(); + if (rendered.length === 0) { + if (options?.optionalEnv ?? false) { + return undefined; + } + throw new Error(`${description} env template resolved to an empty value`); + } + return rendered; + } + // Return as literal value const allowLiteral = options?.allowLiteral ?? 
false; if (!allowLiteral) { diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index 1ec0c2f5c..da80ec4bb 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -579,6 +579,55 @@ describe('resolveTargetDefinition', () => { }); }); + it('resolves openai settings from {{ env.* }} templates', () => { + const env = { + OPENAI_ENDPOINT: 'https://llm-gateway.example.com/v1', + OPENAI_API_KEY: 'openai-secret', + OPENAI_MODEL: 'gpt-5.4', + } satisfies Record<string, string>; + + const target = resolveTargetDefinition( + { + name: 'openai-target', + provider: 'openai', + endpoint: '{{ env.OPENAI_ENDPOINT }}', + api_key: '{{ env.OPENAI_API_KEY }}', + model: '{{ env.OPENAI_MODEL | default("gpt-5.4-mini") }}', + }, + env, + ); + + expect(target.kind).toBe('openai'); + if (target.kind !== 'openai') { + throw new Error('expected openai target'); + } + + expect(target.config).toMatchObject({ + baseURL: 'https://llm-gateway.example.com/v1', + apiKey: 'openai-secret', + model: 'gpt-5.4', + }); + }); + + it('rejects inline {{ env.* }} templates in secret fields', () => { + expect(() => + resolveTargetDefinition( + { + name: 'openai-target', + provider: 'openai', + endpoint: '{{ env.OPENAI_ENDPOINT }}', + api_key: 'Bearer {{ env.OPENAI_API_KEY }}', + model: '{{ env.OPENAI_MODEL }}', + }, + { + OPENAI_ENDPOINT: 'https://llm-gateway.example.com/v1', + OPENAI_API_KEY: 'openai-secret', + OPENAI_MODEL: 'gpt-5.4', + }, + ), + ).toThrow(/whole .+ env\.VARIABLE_NAME/i); + }); + it('resolves openrouter settings from environment', () => { const env = { OPENROUTER_API_KEY: 'openrouter-secret', From c654e9713002a4e27a20b204d3939bc2929c143b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 15:48:22 +0200 Subject: [PATCH 3/3] fix(eval): tighten target secret env templates --- .../docs/docs/evaluation/eval-files.mdx | 14 +++---
.../core/src/evaluation/providers/targets.ts | 5 ++- .../test/evaluation/providers/targets.test.ts | 38 +++++++++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 63d6655ab..d5abf93ab 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -407,26 +407,26 @@ dataset rows out of oversized inline YAML, see [Benchmark Provenance](/docs/guid ## Environment Variable Interpolation -All string fields in eval files support `${{ VAR }}` syntax for environment variable interpolation. This enables portable eval configs that work across machines and CI environments without hardcoded paths. +All string fields in eval files support `{{ env.VAR }}` syntax for environment variable interpolation. This enables portable eval configs that work across machines and CI environments without hardcoded paths. 
```yaml workspace: repos: - path: ./RepoA - repo: "${{ REPO_A_URL }}" - commit: "${{ REPO_A_COMMIT }}" + repo: "{{ env.REPO_A_URL }}" + commit: "{{ env.REPO_A_COMMIT }}" tests: - id: test-1 - input: "Evaluate the code in ${{ PROJECT_NAME }}" - criteria: "${{ EVAL_CRITERIA }}" + input: "Evaluate the code in {{ env.PROJECT_NAME }}" + criteria: "{{ env.EVAL_CRITERIA }}" ``` ### Behavior -- **Syntax:** `${{ VARIABLE_NAME }}` with optional whitespace around the name +- **Syntax:** `{{ env.VARIABLE_NAME }}` with optional whitespace around the name - **Missing variables** resolve to an empty string -- **Partial interpolation** is supported: `${{ HOME }}/repos/${{ PROJECT }}` becomes `/home/user/repos/myproject` +- **Partial interpolation** is supported: `{{ env.HOME }}/repos/{{ env.PROJECT }}` becomes `/home/user/repos/myproject` - **Non-string values** (numbers, booleans) are not affected - Interpolation is applied recursively to all nested objects and arrays - Works in YAML eval files, external YAML/JSONL case files, and external workspace config files diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 437f4dbe1..6a6bfdd91 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -820,7 +820,7 @@ export const COMMON_TARGET_SETTINGS = [ ] as const; const USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i; -const WHOLE_ENV_TEMPLATE_PATTERN = /^\s*\{\{\s*env\.[\s\S]*?\}\}\s*$/; +const SECRET_ENV_TEMPLATE_PATTERN = /^\s*\{\{\s*env\.([A-Za-z_][A-Za-z0-9_]*)\s*\}\}\s*$/; const BASE_TARGET_SCHEMA = z .object({ @@ -2342,7 +2342,8 @@ function resolveOptionalString( if (trimmed.includes('{{') && trimmed.includes('env.')) { const allowLiteral = options?.allowLiteral ?? 
false; - if (!allowLiteral && !WHOLE_ENV_TEMPLATE_PATTERN.test(trimmed)) { + const isSecretField = /\b(api key|bearer token|github token|token|secret)\b/i.test(description); + if (!allowLiteral && isSecretField && !SECRET_ENV_TEMPLATE_PATTERN.test(trimmed)) { throw new Error( `${description} must use a whole \${{ VARIABLE_NAME }} or {{ env.VARIABLE_NAME }} reference`, ); diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index da80ec4bb..cdd5c4c20 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -628,6 +628,44 @@ describe('resolveTargetDefinition', () => { ).toThrow(/whole .+ env\.VARIABLE_NAME/i); }); + it('rejects composed {{ env.* }} templates in secret fields', () => { + expect(() => + resolveTargetDefinition( + { + name: 'openai-target', + provider: 'openai', + endpoint: '{{ env.OPENAI_ENDPOINT }}', + api_key: '{{ env.OPENAI_API_KEY }}{{ env.OPENAI_API_KEY_2 }}', + model: '{{ env.OPENAI_MODEL }}', + }, + { + OPENAI_ENDPOINT: 'https://llm-gateway.example.com/v1', + OPENAI_API_KEY: 'openai-secret', + OPENAI_API_KEY_2: 'extra-secret', + OPENAI_MODEL: 'gpt-5.4', + }, + ), + ).toThrow(/whole .+ env\.VARIABLE_NAME/i); + }); + + it('rejects literal defaults in secret field env templates', () => { + expect(() => + resolveTargetDefinition( + { + name: 'openai-target', + provider: 'openai', + endpoint: '{{ env.OPENAI_ENDPOINT }}', + api_key: '{{ env.OPENAI_API_KEY | default("hardcoded-secret") }}', + model: '{{ env.OPENAI_MODEL }}', + }, + { + OPENAI_ENDPOINT: 'https://llm-gateway.example.com/v1', + OPENAI_MODEL: 'gpt-5.4', + }, + ), + ).toThrow(/whole .+ env\.VARIABLE_NAME/i); + }); + it('resolves openrouter settings from environment', () => { const env = { OPENROUTER_API_KEY: 'openrouter-secret',