diff --git a/apps/cli/test/commands/prepare/prepare.test.ts b/apps/cli/test/commands/prepare/prepare.test.ts index 1c7e5e6a9..ed850db91 100644 --- a/apps/cli/test/commands/prepare/prepare.test.ts +++ b/apps/cli/test/commands/prepare/prepare.test.ts @@ -255,7 +255,7 @@ describe('agentv prepare', () => { path.join(tempDir, '.agentv', 'targets.yaml'), ` targets: - - name: codex + - label: codex provider: cli command: bun ./scripts/target.ts `, diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 7391e2dc9..c53d61e4f 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -20,8 +20,9 @@ experiment format. - A **task suite** is eval YAML that owns task context: `workspace`, shared `input`, shared `assertions`, fixtures, graders, and test cases. It can run directly or be imported through `imports.suites`. -- A **raw case file** is a YAML/JSONL array, directory, or glob of cases. Import - it with `imports.tests`, `tests: ./cases.yaml`, or string shorthand; parent +- A **raw case file** is a YAML, JSON, JSONL, CSV, script-backed dataset, + directory, or glob of cases. Import it with `imports.tests`, + `tests: ./cases.yaml`, `tests: file://cases.csv`, or string shorthand; parent suite context applies because raw cases do not carry their own suite context. - A **wrapper eval** is eval YAML that imports one or more suites with `imports.suites` and binds run controls with top-level `target`, `repeat`, @@ -373,15 +374,33 @@ tests: ./cases.yaml ``` The path is resolved relative to the eval file's directory. The external raw -case file should contain a YAML array of test objects or a JSONL file with one -test per line. 
String entries inside a `tests:` list work the same way and may -use direct paths, directories, or globs: +case file can be a YAML or JSON array of test objects, a JSONL file with one +test per line, a promptfoo-compatible CSV file, or an explicit JavaScript or +Python dataset function such as `file://generate-tests.mjs:createTests` or +`file://generate_tests.py:create_tests`. String entries inside a `tests:` list +work the same way and may use direct paths, `file://` paths, directories, or +globs: ```yaml tests: - ./cases/*.cases.yaml ``` +CSV datasets support promptfoo-style magic columns. `__expected` and +`__expectedN` create AgentV assertions using the supported expected-column +mini-DSL (`contains:*`, `icontains:*`, `contains-any:*`, `contains-all:*`, +`icontains-any:*`, `icontains-all:*`, `starts-with:*`, `ends-with:*`, +`regex:*`, `equals:*`, `is-json`, `latency(<limit>)`, `cost(<limit>)`, +`grade:*`, `llm-rubric:*`, `javascript:*`, `fn:*`, `eval:*`, `python:*`, and +`file://*.py`; file paths inside CSV cells are resolved relative to the CSV +file). Unsupported promptfoo assertion forms such as `similar:*` are rejected +during validation instead of being skipped at runtime. +`__provider_output` becomes first-class `expected_output`, `__metric` names the +generated assertions, `__threshold` sets the test threshold, +`__metadata:<key>` adds metadata, and `__config:__expectedN:threshold` sets an +assertion `min_score`. Ordinary columns become `vars`, so CSV rows can rely on +suite-level `input` that interpolates those variables. + String shorthand is raw-case-only.
Import reusable task suites through `imports.suites`; use `imports.tests` when you want to drop suite context and import only raw cases into the parent context: diff --git a/examples/features/external-datasets/README.md b/examples/features/external-datasets/README.md index e2a2babc3..07121d7fd 100644 --- a/examples/features/external-datasets/README.md +++ b/examples/features/external-datasets/README.md @@ -6,6 +6,7 @@ Demonstrates loading raw test cases from external files using `imports.tests`. - Loading tests from external YAML files (`imports.tests[].path: cases/accuracy.yaml`) - Loading tests from external JSONL files (`imports.tests[].path: cases/regression.jsonl`) +- Loading tests from promptfoo-compatible CSV files (`imports.tests[].path: cases/magic.csv`) - Mixing inline `tests` with imported raw test rows - Glob patterns for loading multiple files (`imports.tests[].path: cases/**/*.yaml`) @@ -21,6 +22,7 @@ bun agentv eval examples/features/external-datasets/evals/dataset.eval.yaml - `evals/dataset.eval.yaml` — Main eval with inline tests and `imports.tests` references - `evals/cases/accuracy.yaml` — YAML array of test cases - `evals/cases/regression.jsonl` — JSONL test data (one test per line) +- `evals/cases/magic.csv` — CSV test data with promptfoo-style magic columns ## Supported Formats @@ -42,6 +44,22 @@ One JSON test object per line: {"id": "test-2", "criteria": "Another outcome", "input": "Another input"} ``` +### CSV (.csv) +CSV files use ordinary columns for `id`, `input`, and `vars`, plus promptfoo-style magic columns for assertions and metadata: + +```csv +id,input,__expected,__provider_output,__metric,__threshold,__metadata:source,locale +csv-test,Reply with a greeting,icontains:hello,Hello there,greeting,0.8,csv,en-US +``` + +`__expected` and `__expectedN` become AgentV assertions for the supported CSV +mini-DSL. 
`latency(<limit>)`, `cost(<limit>)`, and `file://*.py` map to runnable +AgentV graders, with CSV file paths resolved relative to the CSV file; +unsupported promptfoo forms such as `similar:*` are rejected during validation. +`__provider_output` becomes AgentV `expected_output`; ordinary non-magic +columns such as `locale` become `vars` and can be interpolated by suite-level +`input`. + ## Glob Patterns Use glob patterns to load from multiple files: diff --git a/examples/features/external-datasets/evals/cases/magic.csv b/examples/features/external-datasets/evals/cases/magic.csv new file mode 100644 index 000000000..0cb59809a --- /dev/null +++ b/examples/features/external-datasets/evals/cases/magic.csv @@ -0,0 +1,2 @@ +id,input,__expected,__provider_output,__metric,__threshold,__metadata:source,locale +csv-magic-greeting,Reply with a short greeting,icontains:hello,Hello there,greeting,0.8,csv,en-US diff --git a/examples/features/external-datasets/evals/dataset.eval.yaml b/examples/features/external-datasets/evals/dataset.eval.yaml index 22601c286..a07ce3ce0 100644 --- a/examples/features/external-datasets/evals/dataset.eval.yaml +++ b/examples/features/external-datasets/evals/dataset.eval.yaml @@ -7,6 +7,7 @@ imports: tests: - path: cases/accuracy.yaml - path: cases/regression.jsonl + - path: cases/magic.csv tests: - id: inline-test diff --git a/packages/core/src/evaluation/loaders/case-file-loader.ts b/packages/core/src/evaluation/loaders/case-file-loader.ts index a14265fa7..ada3be8cd 100644 --- a/packages/core/src/evaluation/loaders/case-file-loader.ts +++ b/packages/core/src/evaluation/loaders/case-file-loader.ts @@ -1,7 +1,9 @@ import { readFile, readdir, stat } from 'node:fs/promises'; import path from 'node:path'; +import { pathToFileURL } from 'node:url'; import fg from 'fast-glob'; +import { execFileWithStdin } from '../../runtime/exec.js'; import { interpolateEnv } from '../interpolation.js'; import type { JsonObject, JsonValue } from '../types.js'; import { isJsonObject
} from '../types.js'; @@ -11,6 +13,24 @@ const ANSI_YELLOW = '\u001b[33m'; const ANSI_RESET = '\u001b[0m'; const FILE_PROTOCOL = 'file://'; +const DATASET_SCRIPT_TIMEOUT_MS = 30_000; +const DEFAULT_THRESHOLD = 0.75; +const THRESHOLD_ASSERTION_TYPES = new Set(['starts-with']); +const SUPPORTED_ASSERTION_TYPES = new Set([ + 'contains', + 'contains-any', + 'contains-all', + 'icontains', + 'icontains-any', + 'icontains-all', + 'starts-with', + 'ends-with', + 'regex', + 'is-json', + 'equals', + 'latency', + 'cost', +]); /** * Check if a value in the tests array is a file:// reference string. @@ -26,6 +46,10 @@ function extractFilePath(ref: string): string { return ref.slice(FILE_PROTOCOL.length); } +function stripFileProtocol(value: string): string { + return value.startsWith(FILE_PROTOCOL) ? extractFilePath(value) : value; +} + /** * Check if a path contains glob pattern characters. */ @@ -83,36 +107,466 @@ function parseJsonlCases(content: string, filePath: string): JsonObject[] { return results; } +function assertJsonCases(value: unknown, filePath: string): JsonObject[] { + const parsed = interpolateEnv(value, process.env); + const rawCases = Array.isArray(parsed) + ? parsed + : isJsonObject(parsed) && Array.isArray(parsed.tests) + ? parsed.tests + : undefined; + if (!rawCases) { + throw new Error(`External test file must contain an array of test objects: ${filePath}`); + } + return rawCases.map((item, index) => { + if (!isJsonObject(item)) { + throw new Error( + `External test file contains non-object entry at index ${index}: ${filePath}`, + ); + } + return item; + }); +} + +function parseJsonCases(content: string, filePath: string): JsonObject[] { + try { + return assertJsonCases(JSON.parse(content) as unknown, filePath); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + throw new Error(`Malformed JSON test file: ${message}\n File: ${filePath}`); + } +} + +function parseCsvRows(content: string, filePath: string): Record[] { + const rows: string[][] = []; + let row: string[] = []; + let cell = ''; + let inQuotes = false; + let rowStart = true; + const source = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content; + + for (let index = 0; index < source.length; index++) { + const char = source[index]; + const next = source[index + 1]; + if (inQuotes) { + if (char === '"' && next === '"') { + cell += '"'; + index++; + } else if (char === '"') { + inQuotes = false; + } else { + cell += char; + } + continue; + } + if (char === '"' && cell === '') { + inQuotes = true; + rowStart = false; + continue; + } + if (char === ',') { + row.push(cell); + cell = ''; + rowStart = false; + continue; + } + if (char === '\n' || char === '\r') { + if (char === '\r' && next === '\n') { + index++; + } + row.push(cell); + if (!rowStart || row.some((value) => value.length > 0)) { + rows.push(row); + } + row = []; + cell = ''; + rowStart = true; + continue; + } + cell += char; + rowStart = false; + } + + if (inQuotes) { + throw new Error(`Malformed CSV test file: unterminated quoted cell\n File: ${filePath}`); + } + if (!rowStart || cell.length > 0 || row.length > 0) { + row.push(cell); + rows.push(row); + } + if (rows.length === 0) { + return []; + } + + const headers = rows[0].map((header) => header.trim()); + return rows.slice(1).map((values) => { + const record: Record = {}; + headers.forEach((header, index) => { + if (header.length > 0) { + record[header] = values[index] ?? ''; + } + }); + return record; + }); +} + +function parseAssertionFromString(expected: string, sourceFilePath: string): JsonObject { + if (expected.startsWith('grade:') || expected.startsWith('llm-rubric:')) { + const value = expected.slice(expected.startsWith('grade:') ? 
6 : 11).trim(); + return { + type: 'llm-grader', + rubrics: [{ id: 'rubric', outcome: value, weight: 1 }], + }; + } + const functionPrefixes = ['javascript:', 'fn:', 'eval:']; + const functionPrefix = functionPrefixes.find((prefix) => expected.startsWith(prefix)); + if (functionPrefix) { + return { + type: 'inline-assert', + code: expected.slice(functionPrefix.length).trim(), + }; + } + if (expected.startsWith('python:')) { + return { + type: 'code-grader', + command: ['uv', 'run', 'python', expected.slice('python:'.length).trim()], + }; + } + if (expected.startsWith(FILE_PROTOCOL)) { + const filePath = stripFileProtocol(expected).trim(); + if (!filePath.endsWith('.py')) { + throw new Error( + `Unsupported promptfoo __expected file assertion "${expected}". Only file://*.py code graders are supported.`, + ); + } + const commandPath = path.isAbsolute(filePath) + ? filePath + : path.resolve(path.dirname(sourceFilePath), filePath); + return { + type: 'code-grader', + command: ['uv', 'run', 'python', commandPath], + }; + } + + const regexMatch = expected.match( + /^((?:not-)?[a-z][a-z0-9-]*)(?:\((\d+(?:\.\d+)?)\))?(?::([\s\S]*))?$/, + ); + if (regexMatch) { + const [, rawType, thresholdText, rawValue] = regexMatch; + const negate = rawType.startsWith('not-'); + const type = negate ? rawType.slice('not-'.length) : rawType; + const value = rawValue?.trim(); + const parsedThreshold = thresholdText ? Number.parseFloat(thresholdText) : undefined; + const threshold = + parsedThreshold !== undefined && Number.isFinite(parsedThreshold) + ? parsedThreshold + : THRESHOLD_ASSERTION_TYPES.has(type) + ? DEFAULT_THRESHOLD + : undefined; + if (!SUPPORTED_ASSERTION_TYPES.has(type)) { + if (rawValue !== undefined || thresholdText !== undefined) { + throw new Error( + `Unsupported promptfoo __expected assertion "${type}". 
Supported assertion types: ${[ + ...SUPPORTED_ASSERTION_TYPES, + ].join(', ')}`, + ); + } + return { type: 'equals', value: expected }; + } + if ((type === 'latency' || type === 'cost') && threshold === undefined) { + throw new Error( + `promptfoo __expected ${type} assertion requires a numeric limit, e.g. ${type}(1)`, + ); + } + const assertion: Record = { + type, + }; + if (negate) { + assertion.negate = true; + } + if ( + type === 'contains-any' || + type === 'contains-all' || + type === 'icontains-any' || + type === 'icontains-all' + ) { + assertion.value = value ? value.split(',').map((item) => item.trim()) : []; + } else if (value !== undefined) { + assertion.value = value; + } + if (type === 'latency' && threshold !== undefined) { + assertion.threshold = threshold; + } else if (type === 'cost' && threshold !== undefined) { + assertion.budget = threshold; + } else if (threshold !== undefined) { + assertion.min_score = threshold; + } + return assertion; + } + + return { type: 'equals', value: expected }; +} + +function parseMetadataValue(key: string, value: string): JsonValue | undefined { + if (value.trim() === '') { + return undefined; + } + if (key.endsWith('[]')) { + return value + .split(/(? 
item.trim().replaceAll('\\,', ',')) + .filter((item) => item.length > 0); + } + return value; +} + +function parseCsvCases(content: string, filePath: string): JsonObject[] { + return parseCsvRows(content, filePath).map((row, rowIndex) => { + const vars: Record = {}; + const metadata: Record = {}; + const assertions: JsonObject[] = []; + const assertionConfigs = new Map>(); + let id: string | undefined; + let input: string | undefined; + let prefix = ''; + let suffix = ''; + let criteria: string | undefined; + let expectedOutput: string | undefined; + let metric: string | undefined; + let threshold: number | undefined; + + for (const [rawKey, rawValue] of Object.entries(row)) { + const key = rawKey.trim(); + const value = rawValue; + if (key === 'id') { + id = value; + } else if (key === 'input') { + input = value; + } else if (key.startsWith('__expected')) { + if (value.trim() !== '') { + assertions.push(parseAssertionFromString(value.trim(), filePath)); + } + } else if (key === '__prefix') { + prefix = value; + } else if (key === '__suffix') { + suffix = value; + } else if (key === '__description') { + criteria = value; + } else if (key === '__provider_output' || key === '__providerOutput') { + expectedOutput = value; + } else if (key === '__metric') { + metric = value; + } else if (key === '__threshold') { + const parsedThreshold = Number.parseFloat(value); + if (Number.isFinite(parsedThreshold)) { + threshold = parsedThreshold; + } + } else if (key.startsWith('__metadata:')) { + const metadataKey = key.slice('__metadata:'.length); + const parsed = parseMetadataValue(metadataKey, value); + if (parsed !== undefined) { + metadata[metadataKey.endsWith('[]') ? 
metadataKey.slice(0, -2) : metadataKey] = parsed; + } + } else if (key.startsWith('__config:')) { + const [expectedKey, configKey] = key.slice('__config:'.length).split(':'); + if (configKey !== 'threshold') { + throw new Error(`Invalid config key "${configKey}" in __config column: ${filePath}`); + } + const targetIndex = + expectedKey === '__expected' + ? 0 + : /^__expected\d+$/.test(expectedKey) + ? Number.parseInt(expectedKey.slice('__expected'.length), 10) - 1 + : undefined; + if (targetIndex === undefined || targetIndex < 0) { + throw new Error(`Invalid expected key "${expectedKey}" in __config column: ${filePath}`); + } + const parsedThreshold = Number.parseFloat(value); + if (!Number.isFinite(parsedThreshold)) { + throw new Error(`Invalid numeric value for ${configKey} in __config column: ${filePath}`); + } + assertionConfigs.set(targetIndex, { [configKey]: parsedThreshold }); + } else if (key.length > 0) { + vars[key] = value; + } + } + + const caseInput = input !== undefined ? `${prefix}${input}${suffix}` : undefined; + assertions.forEach((assertion, index) => { + if (metric) { + (assertion as Record).metric = metric; + (assertion as Record).name = + assertions.length === 1 ? metric : `${metric}-${index + 1}`; + } + const config = assertionConfigs.get(index); + if (config?.threshold !== undefined) { + (assertion as Record).min_score = config.threshold; + metadata.threshold = config.threshold; + } + }); + + return { + id: id && id.trim() !== '' ? id : `row-${rowIndex + 1}`, + ...(caseInput !== undefined ? { input: caseInput } : {}), + ...(criteria ? { criteria } : {}), + ...(expectedOutput ? { expected_output: expectedOutput } : {}), + ...(assertions.length > 0 ? { assertions } : {}), + ...(threshold !== undefined ? { threshold } : {}), + ...(threshold !== undefined ? { execution: { threshold } } : {}), + ...(Object.keys(vars).length > 0 ? { vars } : {}), + ...(Object.keys(metadata).length > 0 ? 
{ metadata } : {}), + }; + }); +} + +function parseDatasetFunctionReference(filePath: string): { + readonly scriptPath: string; + readonly functionName?: string; +} { + const extensionMatch = filePath.match(/\.(?:mjs|cjs|js|py)(?::([^/\\:]+))?$/i); + if (!extensionMatch) { + return { scriptPath: filePath }; + } + return { + scriptPath: filePath.slice( + 0, + filePath.length - (extensionMatch[1]?.length ?? 0) - (extensionMatch[1] ? 1 : 0), + ), + ...(extensionMatch[1] ? { functionName: extensionMatch[1] } : {}), + }; +} + +async function loadCasesFromJavaScriptFunction( + scriptPath: string, + functionName: string | undefined, +): Promise { + const module = (await import(pathToFileURL(scriptPath).href)) as Record; + const candidate = functionName ? module[functionName] : (module.default ?? module.createTests); + if (typeof candidate !== 'function') { + throw new Error( + `JavaScript dataset file must export function '${functionName ?? 'default or createTests'}': ${scriptPath}`, + ); + } + return assertJsonCases(await candidate(), scriptPath); +} + +async function loadCasesFromPythonFunction( + scriptPath: string, + functionName: string | undefined, +): Promise { + const harness = [ + 'import importlib.util, json, pathlib, sys', + 'script_path = pathlib.Path(sys.argv[1]).resolve()', + 'function_name = sys.argv[2]', + 'spec = importlib.util.spec_from_file_location("agentv_dataset_module", script_path)', + 'module = importlib.util.module_from_spec(spec)', + 'assert spec and spec.loader', + 'spec.loader.exec_module(module)', + 'fn = getattr(module, function_name)', + 'print(json.dumps(fn()))', + ].join('\n'); + const { stdout, stderr, exitCode } = await runPythonDatasetHarness( + harness, + scriptPath, + functionName ?? 
'create_tests', + ); + if (exitCode !== 0) { + throw new Error(`Python dataset function failed: ${scriptPath}\n${stderr.trim()}`); + } + return parseJsonCases(stdout, scriptPath); +} + +async function runPythonDatasetHarness( + harness: string, + scriptPath: string, + functionName: string, +): Promise<{ + readonly stdout: string; + readonly stderr: string; + readonly exitCode: number; +}> { + const cwd = path.dirname(scriptPath); + const args = ['-c', harness, scriptPath, functionName]; + const commands = [ + ['uv', 'run', 'python', ...args], + ['python3', ...args], + ['python', ...args], + ]; + let lastMissingError: unknown; + + for (const command of commands) { + try { + return await execFileWithStdin(command, '', { + cwd, + timeoutMs: DATASET_SCRIPT_TIMEOUT_MS, + }); + } catch (error) { + if (!isMissingExecutableError(error)) { + throw error; + } + lastMissingError = error; + } + } + + const message = + lastMissingError instanceof Error ? lastMissingError.message : String(lastMissingError); + throw new Error(`Python dataset function failed: no Python runner available\n${message}`); +} + +function isMissingExecutableError(error: unknown): boolean { + if (!isJsonObjectLike(error)) { + return false; + } + return error.code === 'ENOENT'; +} + +function isJsonObjectLike(value: unknown): value is { readonly [key: string]: unknown } { + return typeof value === 'object' && value !== null; +} + /** * Load test objects from a single external file (YAML or JSONL). 
*/ export async function loadCasesFromFile(filePath: string): Promise { - const ext = path.extname(filePath).toLowerCase(); + const { scriptPath, functionName } = parseDatasetFunctionReference(filePath); + const ext = path.extname(scriptPath).toLowerCase(); + if (ext === '.js' || ext === '.mjs' || ext === '.cjs') { + return loadCasesFromJavaScriptFunction(scriptPath, functionName); + } + if (ext === '.py') { + return loadCasesFromPythonFunction(scriptPath, functionName); + } + let content: string; try { - content = await readFile(filePath, 'utf8'); + content = await readFile(scriptPath, 'utf8'); } catch (error) { const message = error instanceof Error ? error.message : String(error); - throw new Error(`Cannot read external test file: ${filePath}\n ${message}`); + throw new Error(`Cannot read external test file: ${scriptPath}\n ${message}`); } if (content.trim() === '') { console.warn( - `${ANSI_YELLOW}Warning: External test file is empty, skipping: ${filePath}${ANSI_RESET}`, + `${ANSI_YELLOW}Warning: External test file is empty, skipping: ${scriptPath}${ANSI_RESET}`, ); return []; } if (ext === '.yaml' || ext === '.yml') { - return parseYamlCases(content, filePath); + return parseYamlCases(content, scriptPath); } if (ext === '.jsonl') { - return parseJsonlCases(content, filePath); + return parseJsonlCases(content, scriptPath); + } + if (ext === '.json') { + return parseJsonCases(content, scriptPath); + } + if (ext === '.csv') { + return parseCsvCases(content, scriptPath); } throw new Error( - `Unsupported external test file format '${ext}': ${filePath}. Supported: .yaml, .yml, .jsonl`, + `Unsupported external test file format '${ext}': ${scriptPath}. 
Supported: .csv, .json, .jsonl, .yaml, .yml, .js, .mjs, .cjs, .py`, ); } @@ -125,7 +579,7 @@ export async function resolveFileReference( ref: string, evalFileDir: string, ): Promise { - const rawPath = extractFilePath(ref); + const rawPath = stripFileProtocol(ref); const absolutePattern = path.resolve(evalFileDir, rawPath); if (isGlobPattern(rawPath)) { diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 0edc477ef..1c0cc3ef9 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -57,7 +57,17 @@ const PROMPTFOO_ASSERTION_TYPES = new Set([ ]); /** Valid file extensions for external test files. */ -const VALID_TEST_FILE_EXTENSIONS = new Set(['.yaml', '.yml', '.jsonl']); +const VALID_TEST_FILE_EXTENSIONS = new Set([ + '.csv', + '.cjs', + '.js', + '.json', + '.jsonl', + '.mjs', + '.py', + '.yaml', + '.yml', +]); /** Known fields at the top level of an eval file. */ const KNOWN_TOP_LEVEL_FIELDS = new Set([ @@ -1826,9 +1836,13 @@ function validateTestsStringPath( errors: ValidationError[], location = 'tests', ): boolean { - const normalizedPath = testsPath.startsWith('file://') + let normalizedPath = testsPath.startsWith('file://') ? testsPath.slice('file://'.length) : testsPath; + const scriptFunctionMatch = normalizedPath.match(/\.(?:mjs|cjs|js|py):[^/\\:]+$/i); + if (scriptFunctionMatch) { + normalizedPath = normalizedPath.slice(0, normalizedPath.lastIndexOf(':')); + } if (/\.eval\.ya?ml$/i.test(normalizedPath)) { errors.push({ severity: 'error', @@ -1880,7 +1894,10 @@ async function validateRawCaseImportPath( let caseIndex = 0; for (const casePath of caseFiles) { - const pathStat = await stat(casePath).catch(() => undefined); + const statPath = casePath.match(/\.(?:mjs|cjs|js|py):[^/\\:]+$/i) + ? 
casePath.slice(0, casePath.lastIndexOf(':')) + : casePath; + const pathStat = await stat(statPath).catch(() => undefined); const externalCases = pathStat?.isDirectory() ? await loadCasesFromDirectory(casePath) : await loadCasesFromFile(casePath); diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 2914ce8df..12f25ed16 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -777,9 +777,14 @@ async function loadTestsFromParsedYamlValue( } } } - const testInputMessages = resolveInputMessages(inputCase, inputSuiteFiles); + const testInputMessages = resolveInputMessages(inputCase, inputSuiteFiles) ?? []; // Resolve expected_output with shorthand support const expectedMessages = resolveExpectedMessages(renderedCase) ?? []; + const effectiveSuiteInputValue = + rawSuiteInput && !skipDefaults + ? interpolateCaseField(rawSuiteInput, caseVars, nunjucksFilters) + : undefined; + const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue); // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode) const hasEvaluationSpec = @@ -787,7 +792,10 @@ async function loadTestsFromParsedYamlValue( expectedMessages.length > 0 || renderedCase.assertions !== undefined || (Array.isArray(renderedCase.turns) && renderedCase.turns.length > 0); - if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { + const hasInputMessages = + testInputMessages.length > 0 || + (effectiveSuiteInputMessages !== undefined && effectiveSuiteInputMessages.length > 0); + if (!id || !hasEvaluationSpec || !hasInputMessages) { logError( `Skipping incomplete test: ${id ?? 'unknown'}. 
Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assertions/turns`, ); @@ -795,12 +803,6 @@ async function loadTestsFromParsedYamlValue( } // Prepend suite-level input to test input (respecting skip_defaults) - const effectiveSuiteInputValue = - rawSuiteInput && !skipDefaults - ? interpolateCaseField(rawSuiteInput, caseVars, nunjucksFilters) - : undefined; - const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue); - // expected_output is optional - for outcome-only evaluation const hasExpectedMessages = expectedMessages.length > 0; @@ -1502,8 +1504,11 @@ async function resolveIncludePaths( includePath: string, evalFileDir: string, ): Promise { - const absolutePattern = path.resolve(evalFileDir, includePath); - if (hasGlobMagic(includePath)) { + const normalizedPath = includePath.startsWith('file://') + ? includePath.slice('file://'.length) + : includePath; + const absolutePattern = path.resolve(evalFileDir, normalizedPath); + if (hasGlobMagic(normalizedPath)) { const matches = (await fg(absolutePattern.replaceAll('\\', '/'), { onlyFiles: true, absolute: true, diff --git a/packages/core/test/evaluation/loaders/case-file-loader.test.ts b/packages/core/test/evaluation/loaders/case-file-loader.test.ts index d580745f5..ddca6666e 100644 --- a/packages/core/test/evaluation/loaders/case-file-loader.test.ts +++ b/packages/core/test/evaluation/loaders/case-file-loader.test.ts @@ -9,6 +9,7 @@ import { loadCasesFromDirectory, resolveFileReference, } from '../../../src/evaluation/loaders/case-file-loader.js'; +import { parseGraders } from '../../../src/evaluation/loaders/grader-parser.js'; import { loadTestSuite, loadTests } from '../../../src/evaluation/yaml-parser.js'; describe('isFileReference', () => { @@ -76,6 +77,120 @@ describe('resolveFileReference', () => { expect(cases[1].id).toBe('jsonl-2'); }); + it('loads test objects from a JSON file', async () => { + await writeFile( + path.join(tempDir, 
'cases', 'tests.json'), + JSON.stringify([ + { id: 'json-1', criteria: 'Goal 1', input: 'Query 1' }, + { id: 'json-2', criteria: 'Goal 2', input: 'Query 2' }, + ]), + ); + + const cases = await resolveFileReference('file://cases/tests.json', tempDir); + + expect(cases).toHaveLength(2); + expect(cases[0].id).toBe('json-1'); + expect(cases[1].id).toBe('json-2'); + }); + + it('maps promptfoo CSV magic columns into AgentV raw cases', async () => { + await writeFile( + path.join(tempDir, 'cases', 'tests.csv'), + [ + 'id,input,__expected,__expected2,__prefix,__suffix,__description,__provider_output,__metric,__threshold,__metadata:category,__metadata:tags[],__config:__expected2:threshold,locale', + 'csv-1,"What is 2+2?",equals:4,icontains:four,"Answer briefly:"," Thanks","Arithmetic case","4","accuracy",0.7,math,"smoke,regression",0.6,en-US', + ].join('\n'), + ); + + const cases = await resolveFileReference('file://cases/tests.csv', tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0]).toMatchObject({ + id: 'csv-1', + input: 'Answer briefly:What is 2+2? 
Thanks', + expected_output: '4', + criteria: 'Arithmetic case', + threshold: 0.7, + vars: { locale: 'en-US' }, + metadata: { category: 'math', tags: ['smoke', 'regression'], threshold: 0.6 }, + assertions: [ + { type: 'equals', value: '4', metric: 'accuracy' }, + { type: 'icontains', value: 'four', metric: 'accuracy', min_score: 0.6 }, + ], + }); + }); + + it('maps supported promptfoo expected DSL forms to runnable AgentV assertions', async () => { + const graderPath = path.join(tempDir, 'cases', 'grader.py'); + await writeFile(graderPath, 'print("ok")\n'); + await writeFile( + path.join(tempDir, 'cases', 'expected-dsl.csv'), + [ + 'id,input,__expected,__expected2,__expected3', + 'csv-assertions,Hello,latency(1000),cost(0.01),file://grader.py', + ].join('\n'), + ); + + const cases = await resolveFileReference('file://cases/expected-dsl.csv', tempDir); + + expect(cases[0].assertions).toEqual([ + { type: 'latency', threshold: 1000 }, + { type: 'cost', budget: 0.01 }, + { type: 'code-grader', command: ['uv', 'run', 'python', graderPath] }, + ]); + + const evaluators = await parseGraders(cases[0], undefined, [tempDir], 'csv-assertions'); + expect(evaluators.map((evaluator) => evaluator.type)).toEqual([ + 'latency', + 'cost', + 'code-grader', + ]); + }); + + it('rejects unsupported promptfoo expected DSL forms clearly', async () => { + await writeFile( + path.join(tempDir, 'cases', 'unsupported-expected.csv'), + ['id,input,__expected', 'csv-similar,Hello,similar:hello'].join('\n'), + ); + + await expect( + resolveFileReference('file://cases/unsupported-expected.csv', tempDir), + ).rejects.toThrow(/Unsupported promptfoo __expected assertion "similar"/); + }); + + it('loads tests from explicit JavaScript function dataset files', async () => { + await writeFile( + path.join(tempDir, 'cases', 'dataset.mjs'), + `export function createTests() { + return [ + { id: 'js-1', criteria: 'JS goal', input: 'JS input' }, + ]; +} +`, + ); + + const cases = await 
resolveFileReference('file://cases/dataset.mjs:createTests', tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('js-1'); + }); + + it('loads tests from explicit Python function dataset files', async () => { + await writeFile( + path.join(tempDir, 'cases', 'dataset.py'), + `def create_tests(): + return [ + {"id": "py-1", "criteria": "Python goal", "input": "Python input"}, + ] +`, + ); + + const cases = await resolveFileReference('file://cases/dataset.py:create_tests', tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('py-1'); + }); + it('resolves glob patterns to multiple files', async () => { await mkdir(path.join(tempDir, 'glob-cases'), { recursive: true }); await writeFile( @@ -353,6 +468,112 @@ tests: ./cases.jsonl expect(tests[1].id).toBe('ext-jsonl-2'); }); + it('loads tests from file:// string paths', async () => { + await writeFile( + path.join(tempDir, 'file-url-cases.json'), + JSON.stringify([{ id: 'file-url-json', criteria: 'JSON goal', input: 'Input' }]), + ); + + await writeFile( + path.join(tempDir, 'file-url-suite.yaml'), + `name: file-url-suite +tests: file://file-url-cases.json +`, + ); + + const tests = await loadTests(path.join(tempDir, 'file-url-suite.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].id).toBe('file-url-json'); + }); + + it('keeps imports.tests select working beside file-backed tests', async () => { + await writeFile( + path.join(tempDir, 'import-cases.yaml'), + `- id: imported-keep + criteria: "Imported keep" + input: "Imported keep input" + metadata: + group: keep +- id: imported-drop + criteria: "Imported drop" + input: "Imported drop input" + metadata: + group: drop +`, + ); + await writeFile( + path.join(tempDir, 'direct-cases.jsonl'), + '{"id": "direct-case", "criteria": "Direct goal", "input": "Direct input"}\n', + ); + await writeFile( + path.join(tempDir, 'imports-and-file-tests.yaml'), + `imports: + tests: + - path: import-cases.yaml + select: + metadata: + 
group: keep +tests: file://direct-cases.jsonl +`, + ); + + const tests = await loadTests(path.join(tempDir, 'imports-and-file-tests.yaml'), tempDir); + + expect(tests.map((test) => test.id)).toEqual(['imported-keep', 'direct-case']); + }); + + it('loads promptfoo CSV magic columns through the full suite parser', async () => { + await writeFile( + path.join(tempDir, 'magic-cases.csv'), + [ + 'id,input,__expected,__metric,__threshold,__metadata:category,__provider_output', + 'magic-csv,Hello,contains:Hi,greeting,0.9,smoke,Hi there', + ].join('\n'), + ); + await writeFile( + path.join(tempDir, 'magic-suite.yaml'), + `name: magic-suite +tests: file://magic-cases.csv +`, + ); + + const tests = await loadTests(path.join(tempDir, 'magic-suite.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0].id).toBe('magic-csv'); + expect(tests[0].threshold).toBe(0.9); + expect(tests[0].metadata).toMatchObject({ category: 'smoke' }); + expect(tests[0].reference_answer).toBe('Hi there'); + expect(tests[0].assertions?.[0]).toMatchObject({ + name: 'greeting', + type: 'contains', + value: 'Hi', + }); + }); + + it('applies suite-level input to promptfoo CSV rows with vars and expected assertions', async () => { + await writeFile( + path.join(tempDir, 'promptfoo-vars.csv'), + ['id,topic,__expected', 'case,refund,contains:refund'].join('\n'), + ); + await writeFile( + path.join(tempDir, 'promptfoo-vars-suite.yaml'), + `input: Answer about {{ topic }} +tests: file://promptfoo-vars.csv +`, + ); + + const tests = await loadTests(path.join(tempDir, 'promptfoo-vars-suite.yaml'), tempDir); + + expect(tests).toHaveLength(1); + expect(tests[0]).toMatchObject({ + id: 'case', + input: [{ role: 'user', content: 'Answer about refund' }], + }); + expect(tests[0].assertions?.[0]).toMatchObject({ type: 'contains', value: 'refund' }); + }); + it('resolves relative path against eval file directory', async () => { // Create nested directory structure const dirA = path.join(tempDir, 'a'); 
diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index 97236de02..63e54f97f 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -1579,6 +1579,87 @@ tests: "./cases-shorthand-workspace.yaml" ); expect(extWarnings).toHaveLength(0); }); + + it('passes dataset loader extensions without unsupported-extension warnings', async () => { + const files = { + 'cases.csv': 'id,input,__expected\ncsv-1,Hello,contains:Hi\n', + 'cases.json': '[{"id":"json-1","criteria":"Goal","input":"Query"}]\n', + 'cases.mjs': 'export function createTests() { return []; }\n', + 'cases.py': 'def create_tests():\n return []\n', + }; + for (const [filename, content] of Object.entries(files)) { + await writeFile(path.join(tempDir, filename), content); + } + + const filePath = path.join(tempDir, 'tests-dataset-extensions.yaml'); + await writeFile( + filePath, + `imports: + tests: + - path: file://cases.csv + - path: cases.json + - path: cases.mjs:createTests + - path: cases.py:create_tests +tests: + - id: inline + criteria: Goal + input: Query +`, + ); + + const result = await validateEvalFile(filePath); + + expect(result.valid).toBe(true); + const extWarnings = result.errors.filter( + (error) => error.severity === 'warning' && error.message.includes('extension'), + ); + expect(extWarnings).toHaveLength(0); + }); + + it('passes promptfoo CSV rows that rely on suite-level input', async () => { + await writeFile( + path.join(tempDir, 'suite-input-cases.csv'), + 'id,topic,__expected\ncase,refund,contains:refund\n', + ); + + const filePath = path.join(tempDir, 'suite-input-csv.yaml'); + await writeFile( + filePath, + `input: Answer about {{ topic }} +tests: file://suite-input-cases.csv +`, + ); + + const result = await validateEvalFile(filePath); + + expect(result.valid).toBe(true); + 
expect(result.errors).toHaveLength(0); + }); + + it('rejects unsupported promptfoo CSV expected DSL forms during validation', async () => { + await writeFile( + path.join(tempDir, 'unsupported-expected-cases.csv'), + 'id,input,__expected\ncase,Hello,similar:hello\n', + ); + + const filePath = path.join(tempDir, 'unsupported-expected-csv.yaml'); + await writeFile( + filePath, + `tests: file://unsupported-expected-cases.csv +`, + ); + + const result = await validateEvalFile(filePath); + + expect(result.valid).toBe(false); + expect(result.errors).toContainEqual( + expect.objectContaining({ + severity: 'error', + location: 'tests', + message: expect.stringContaining('Unsupported promptfoo __expected assertion "similar"'), + }), + ); + }); }); describe('suite-level input validation', () => {