diff --git a/.gitignore b/.gitignore
index 5f96167..aa49fe5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,7 +103,7 @@ ipython_config.py
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
-#uv.lock
+uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
@@ -187,4 +187,4 @@ examples/checkpoints/
examples/outputs/
.codex/
-openspec/
\ No newline at end of file
+.agents/
\ No newline at end of file
diff --git a/README.md b/README.md
index 6521a97..38b2deb 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Datafast is a python library for synthetic data generation using llms.
The old dataset-class API has been removed. The canonical package is `datafast`, and the primary model is:
- create records with `Source` or `Seed`
-- transform them with composable steps
+- transform them with composable steps such as `AddUUID`, `Map`, and `Filter`
- call LLMs with `LLMStep`, `Classify`, `Score`, `Compare`, `Rewrite`, or `Extract`
- persist results with `Sink`
@@ -53,7 +53,7 @@ pipeline.run(batch_size=4)
- `Source`: load records from Python lists, files, or Hugging Face datasets
- `Seed`: generate record combinations declaratively
-- `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations
+- `AddUUID`, `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations
- `LLMStep`: free-form generation
- `Classify`, `Score`, `Compare`, `Rewrite`, `Extract`: higher-level LLM transforms
- `Branch` and `JoinBranches`: multi-path pipelines
diff --git a/datafast/__init__.py b/datafast/__init__.py
index 19bd452..6576370 100644
--- a/datafast/__init__.py
+++ b/datafast/__init__.py
@@ -31,7 +31,7 @@
is_langfuse_tracing_enabled,
)
from datafast.transforms.branch import Branch, JoinBranches
-from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join
+from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join
from datafast.transforms.llm_eval import Classify, Score, Compare
from datafast.transforms.llm_extract import Extract
from datafast.transforms.llm_step import LLMStep
@@ -64,6 +64,7 @@ def get_version() -> str:
"Seed",
"SeedDimension",
"Sample",
+ "AddUUID",
"Map",
"FlatMap",
"Filter",
diff --git a/datafast/core/runner.py b/datafast/core/runner.py
index 0a28ba2..3497605 100644
--- a/datafast/core/runner.py
+++ b/datafast/core/runner.py
@@ -233,7 +233,7 @@ def _execute_llm_step(
try:
result = model.generate(
- call.messages,
+ messages=call.messages,
metadata=build_trace_metadata(
model=model,
component="pipeline.step",
diff --git a/datafast/llms.py b/datafast/llms.py
index 092346a..754e8b8 100644
--- a/datafast/llms.py
+++ b/datafast/llms.py
@@ -18,7 +18,6 @@
# LiteLLM
import litellm
from litellm.exceptions import RateLimitError
-from litellm.utils import ModelResponse
# Internal imports
from .llm_utils import get_messages
@@ -292,17 +291,23 @@ def generate(
if response_format is not None:
completion_params["response_format"] = response_format
- # Call LiteLLM completion with batch messages - retry on rate limit
+ # Call LiteLLM completion with retry on rate limit.
+ # OpenRouter accepts single message requests via completion(), but
+ # rejects the same payload when wrapped in batch_completion().
max_retries = 3
retry_delay = 5 # Start with 5 seconds
response = None
-
+
for attempt in range(max_retries):
try:
- response: list[ModelResponse] = litellm.batch_completion(
- **completion_params)
+ if len(batch_to_send) == 1:
+ response = [litellm.completion(
+ **{**completion_params, "messages": batch_to_send[0]}
+ )]
+ else:
+ response = litellm.batch_completion(**completion_params)
break # Success, exit retry loop
- except RateLimitError as e:
+ except RateLimitError:
if attempt < max_retries - 1:
wait_time = retry_delay * (2 ** attempt) # Exponential backoff
logger.warning(
@@ -316,7 +321,7 @@ def generate(
f"Provider: {self.provider_name} | Model: {self.model_id}"
)
raise
-
+
if response is None:
raise RuntimeError("Failed to get response after retries")
diff --git a/datafast/transforms/__init__.py b/datafast/transforms/__init__.py
index 025ea3f..f7a88d2 100644
--- a/datafast/transforms/__init__.py
+++ b/datafast/transforms/__init__.py
@@ -1,7 +1,7 @@
"""Transform steps for datafast v2."""
from datafast.transforms.sample import Sample
-from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join
+from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join
from datafast.transforms.llm_step import LLMStep
from datafast.transforms.llm_eval import Classify, Score, Compare
from datafast.transforms.llm_transform import Rewrite
@@ -9,7 +9,7 @@
from datafast.transforms.branch import Branch, JoinBranches
__all__ = [
- "Sample", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join",
+ "Sample", "AddUUID", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join",
"LLMStep", "Classify", "Score", "Compare", "Rewrite", "Extract",
"Branch", "JoinBranches",
]
diff --git a/datafast/transforms/data_ops.py b/datafast/transforms/data_ops.py
index 3887460..fafb5cf 100644
--- a/datafast/transforms/data_ops.py
+++ b/datafast/transforms/data_ops.py
@@ -3,6 +3,7 @@
import itertools
import random
import re
+import uuid
from collections import defaultdict
from collections.abc import Callable, Iterable
from typing import Any
@@ -62,6 +63,34 @@ def process(self, records: Iterable[Record]) -> Iterable[Record]:
yield from self._fn(record)
+class AddUUID(Step):
+ """Add a UUID field to each record."""
+
+ def __init__(self, column: str = "id", overwrite: bool = False) -> None:
+ """
+ Initialize an AddUUID step.
+
+ Args:
+ column: Field name to write the UUID into.
+ overwrite: If True, replace existing values in the target column.
+
+ Examples:
+ >>> AddUUID()
+ >>> AddUUID(column="example_id", overwrite=True)
+ """
+ super().__init__()
+ self._column = column
+ self._overwrite = overwrite
+
+ def process(self, records: Iterable[Record]) -> Iterable[Record]:
+ """Add UUIDs while preserving all other fields."""
+ for record in records:
+ if self._column in record and not self._overwrite:
+ yield record
+ else:
+ yield {**record, self._column: str(uuid.uuid4())}
+
+
class Filter(Step):
"""Keep or drop records based on conditions."""
diff --git a/docs/api.md b/docs/api.md
index edef161..45857e2 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -36,6 +36,7 @@ from datafast import Source, LLMStep, Sink, openrouter
## Data Operations
- `Sample`
+- `AddUUID`
- `Map`
- `FlatMap`
- `Filter`
diff --git a/docs/cookbook/assets/index.md b/docs/cookbook/assets/index.md
new file mode 100644
index 0000000..7f69923
--- /dev/null
+++ b/docs/cookbook/assets/index.md
@@ -0,0 +1,80 @@
+# Cookbook Assets
+
+Prompt files and dataset details used by cookbook examples.
+
+## Text Classification
+
+### Dataset
+
+- **Source:** seed dimensions created with `Seed.product`
+- **Dimensions:** label, trail type, style, language, and model
+- **Local output:** `examples/outputs/45_text_classification_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/45_text_classification_cookbook`
+- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`
+
+This cookbook models variation directly as seed dimensions so the label, trail
+type, style, language, and model are all explicit in the
+pipeline.
+
+### Prompt
+
+| File | Style |
+| --- | --- |
+| [text_classification_generation.txt](text_classification_generation.txt) | One short trail report per call, with label, trail type, style, and language injected |
+
+## Persona Generation
+
+### Dataset
+
+- **Source:** `xsum` (Hugging Face), `validation` split
+- **Fields used:** `id`, `document`, `summary`
+- **Filter:** 300–500 words, first 100 matches
+- **Local output:** `examples/outputs/43_persona_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/43_persona_cookbook`
+- **Hub output:** set `HF_REPO_ID` and the `repo_id` in `push_records_to_hub()` to repos under your own Hugging Face username or organization
+
+The example keeps first-match sampling for reproducibility. For local JSONL corpora with metadata such as `document_filename`, stratified sampling is usually a better fit.
+
+### Prompt Variants
+
+Each LLM step picks one prompt at random per record. The script also assigns random `life_stage` and `related_life_stage` values before the corresponding LLM steps. Multiple variants add diversity.
+
+#### Text-to-Persona
+
+| File | Style |
+| --- | --- |
+| [text_to_persona_v1.txt](text_to_persona_v1.txt) | Direct inference of a reader persona |
+| [text_to_persona_v2.txt](text_to_persona_v2.txt) | XML-tagged source text, writer/reader framing |
+| [text_to_persona_v3.txt](text_to_persona_v3.txt) | System-role preamble, search-interest angle |
+
+#### Persona-to-Persona
+
+| File | Style |
+| --- | --- |
+| [persona_to_persona_v1.txt](persona_to_persona_v1.txt) | Close relationship, standalone description |
+| [persona_to_persona_v2.txt](persona_to_persona_v2.txt) | Rule-list format, explicit separation of description and relationship |
+| [persona_to_persona_v3.txt](persona_to_persona_v3.txt) | XML-tagged input, concise vivid output |
+
+### Provenance
+
+- Text-to-Persona and Persona-to-Persona prompts are paper-aligned adaptations. The Persona Hub paper states its published prompts are simplified, not exact.
+- No Persona Hub code is reused. The workflow is built with datafast primitives.
+
+## Space Engineering Text Generation
+
+### Dataset
+
+- **Source:** seed dimensions created with `Seed.product`
+- **Dimensions:** document type, topic, expertise level, and language
+- **Local output:** `examples/outputs/44_space_text_generation_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/44_space_text_generation_cookbook`
+- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`
+
+### Prompt
+
+The text-generation cookbook uses one compact prompt and relies on seed
+dimensions for variation.
+
+| File | Style |
+| --- | --- |
+| [space_text_generation.txt](space_text_generation.txt) | Minimal variable-driven request |
diff --git a/docs/cookbook/assets/persona_to_persona_v1.txt b/docs/cookbook/assets/persona_to_persona_v1.txt
new file mode 100644
index 0000000..eabb6d6
--- /dev/null
+++ b/docs/cookbook/assets/persona_to_persona_v1.txt
@@ -0,0 +1,11 @@
+Given the following persona, infer one other specific persona who is in a close relationship with them.
+
+Persona:
+{persona_description}
+
+Requirements:
+1. Use one clear relationship such as family member, colleague, friend, or neighbor, coach, teacher, married partner.
+2. Choose a related persona that adds a meaningfully different life perspective but is still likely to be in close contact with the original persona.
+3. Keep the related persona realistic and specific.
+4. Don't talk about the orginal person in the description of the related persona, as it should be self-contained description.
+5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
diff --git a/docs/cookbook/assets/persona_to_persona_v2.txt b/docs/cookbook/assets/persona_to_persona_v2.txt
new file mode 100644
index 0000000..b4e4adf
--- /dev/null
+++ b/docs/cookbook/assets/persona_to_persona_v2.txt
@@ -0,0 +1,14 @@
+Think of a person who regularly interacts with the following persona in a meaningful way.
+
+Rules:
+- Do not mention the original persona in the description of the related persona.
+- Do not mention the relationship between the two personas in the description, only in the relationship_type
+- Pick a single, concrete relationship type such as mentor-mentee, colleague, neighbor, supervisor-report, or service provider-client
+- The related person should bring a distinctly different viewpoint or expertise, and some uniqueness.
+- Keep the description realistic and standalone without mentionning with the original persona.
+- The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
+Original Persona:
+{persona_description}
+
+Now generate a related persona.
\ No newline at end of file
diff --git a/docs/cookbook/assets/persona_to_persona_v3.txt b/docs/cookbook/assets/persona_to_persona_v3.txt
new file mode 100644
index 0000000..9652161
--- /dev/null
+++ b/docs/cookbook/assets/persona_to_persona_v3.txt
@@ -0,0 +1,16 @@
+Here is the description of someone:
+
+{persona_description}
+
+
+Come up with one other description of an individual who could be part of this persona's life.
+We want the description to be detailed but super concise (max 2 sentences) and vivid.
+But we want to have the a standalone description of that new persona without mentioning the original persona or a reason in the description.
+
+Requirements:
+1. Define a clear interpersonal link such as friend, advisor, competitor, family member, or collaborator.
+2. The new persona should offer a complementary or contrasting perspective.
+3. Make the related persona vivid and believable, avoid generic archetypes.
+4. Describe the relation in relationship_type field, not in the description.
+5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
diff --git a/docs/cookbook/assets/space_text_generation.txt b/docs/cookbook/assets/space_text_generation.txt
new file mode 100644
index 0000000..ca5af4b
--- /dev/null
+++ b/docs/cookbook/assets/space_text_generation.txt
@@ -0,0 +1 @@
+Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}.
diff --git a/docs/cookbook/assets/text_classification_generation.txt b/docs/cookbook/assets/text_classification_generation.txt
new file mode 100644
index 0000000..24d28c0
--- /dev/null
+++ b/docs/cookbook/assets/text_classification_generation.txt
@@ -0,0 +1,13 @@
+Write one realistic hiker report in {language_name}.
+
+Target category: {label}
+Category definition: {label_description}
+
+Constraints:
+- The report must clearly match the target category.
+- The setting must be a {trail_type}.
+- The writing style must be {style}.
+- Keep it to 1 or 2 sentences.
+- Do not mention the category name directly.
+- Do not use bullets, numbering, or explanations.
+- Make the report concrete and varied.
diff --git a/docs/cookbook/assets/text_to_persona_v1.txt b/docs/cookbook/assets/text_to_persona_v1.txt
new file mode 100644
index 0000000..cd09909
--- /dev/null
+++ b/docs/cookbook/assets/text_to_persona_v1.txt
@@ -0,0 +1,17 @@
+Infer one specific persona who is likely to read text.
+
+Source text:
+{document}
+
+Requirements:
+1. Return a single persona, not a group.
+2. Make the persona specific and fine-grained rather than generic.
+3. Ground the persona in signals from the text such as domain, expertise, context, or likely motivation.
+4. Do not quote the source text in the persona field.
+5. Only write 1 or 2 sentences maximum.
+6. The persona is not the subject of the text, but rather someone who would be reading it.
+7. Do not refer to the source text, article, or its content in the persona description. The persona must be self-contained.
+8. The persona must be {life_stage}. Do not mention a precise age, just reflect this life stage naturally.
+
+Now figure out a persona description who would be reading this text.
+
diff --git a/docs/cookbook/assets/text_to_persona_v2.txt b/docs/cookbook/assets/text_to_persona_v2.txt
new file mode 100644
index 0000000..294577d
--- /dev/null
+++ b/docs/cookbook/assets/text_to_persona_v2.txt
@@ -0,0 +1,16 @@
+
+{document}
+
+
+Identify one precise individual who would naturally encounter or write the .
+
+Requirements:
+1. Describe exactly one person.
+2. Be as specific as possible: mention plausible occupation and/or life situation.
+3. Derive the persona strictly from cues in the text such as topic, jargon, tone, or implied audience as a potential writter / reader of this text.
+4. Do not copy or paraphrase the source text in the persona field.
+5. Only return 1 or 2 sentences maximum.
+6. The described person is not the subject of the text, but rather someone who would be encountering or writing such text as part of their life.
+7. Do not reference the source text, article, or its content in the persona description. The persona must stand on its own.
+8. The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
diff --git a/docs/cookbook/assets/text_to_persona_v3.txt b/docs/cookbook/assets/text_to_persona_v3.txt
new file mode 100644
index 0000000..3ccb077
--- /dev/null
+++ b/docs/cookbook/assets/text_to_persona_v3.txt
@@ -0,0 +1,17 @@
+You are a persona inference assistant.
+
+Based on the text content below, imagine one real person who would be interested in searching about the topic from this content.
+
+Rules:
+- Output a single, concrete persona rather than a broad demographic.
+- Include details like professional background, interests, or situational context that make the persona feel authentic.
+- Don't mention the person search or information retrieval action in the persona description, just describe the persona which could explain their interest in the topic.
+- Keep it super short and concise.
+- Do not mention or refer to the source text, article, or its content in the persona description. The persona must be self-contained.
+- The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
+Source text:
+{document}
+
+
+
diff --git a/docs/cookbook/index.md b/docs/cookbook/index.md
new file mode 100644
index 0000000..1b745ec
--- /dev/null
+++ b/docs/cookbook/index.md
@@ -0,0 +1,16 @@
+# Cookbook
+
+Cookbooks connect a runnable script to a documentation walkthrough.
+
+The Python script is the source of truth. Each cookbook page explains:
+
+- where the executable example lives
+- what inputs it uses
+- which prompt assets it depends on
+- where it writes its output artifacts
+
+## Available Cookbooks
+
+- [Text Classification](text_classification.md): generate a multilingual trail-conditions classification dataset from explicit seed dimensions.
+- [Persona Generation](persona_generation.md): infer personas from real articles and expand them through relationships using randomized prompt variants.
+- [Space Engineering Text Generation](space_text_generation.md): generate a raw multilingual technical text corpus from seed dimensions.
diff --git a/docs/cookbook/persona_generation.md b/docs/cookbook/persona_generation.md
new file mode 100644
index 0000000..f314a39
--- /dev/null
+++ b/docs/cookbook/persona_generation.md
@@ -0,0 +1,89 @@
+# Persona Generation
+
+Build personas from real articles and expand them through relationships. Inspired by the Persona Hub paper, implemented entirely with datafast.
+
+## Source
+
+- **Script:** `examples/scripts/43_cookbook_persona_generation.py`
+- **Prompt assets:** [asset index](assets/index.md)
+- **Local output:** `examples/outputs/43_persona_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/43_persona_cookbook`
+- **Hub output:** pushed to the Hugging Face Hub repo IDs configured in the script
+
+## Pipeline
+
+1. Load `xsum` articles (`validation` split), preserving the dataset `id`.
+2. Filter to documents between 300 and 500 words. Keep the first 100 matches.
+3. Assign a random life stage to the source persona.
+4. **Text-to-Persona** — infer one persona from each article and life stage.
+5. Assign a random life stage to the related persona.
+6. **Persona-to-Persona** — expand that persona into a related individual.
+7. Keep the final output fields, add a row UUID, write JSONL, checkpoint progress, and push results to Hugging Face Hub.
+
+Each LLM step randomly picks one prompt variant per record using `Sample(prompts, n=1)`. This adds diversity across generations.
+
+The cookbook keeps `Sample(n=100, strategy="first")` so runs are deterministic and easy to compare. For local corpora with source metadata, use stratified sampling, for example `Sample(n=250, strategy="stratified", by="document_filename")`, to avoid over-representing one source file.
+
+```text
+xsum article
+ │
+ ▼
+life_stage (random from configured stages)
+ │
+ ▼
+Text-to-Persona (random prompt from 3 variants)
+ │
+ ▼
+related_life_stage (random from configured stages)
+ │
+ ▼
+Persona-to-Persona (random prompt from 3 variants)
+ │
+ ▼
+Hugging Face Hub
+```
+
+## Run
+
+Prerequisites:
+
+- `OPENROUTER_API_KEY` set in a `.env` file
+- Hugging Face authentication via `HF_TOKEN` in `.env` or a cached `huggingface_hub` login
+- Base dependencies from `pyproject.toml` installed
+
+Before running, replace the example Hugging Face namespaces in the script with your own username or organization:
+
+- `HF_REPO_ID = "/new-persona-cookbook-dataset"` controls the private pipeline sink.
+- `repo_id = "/datafast-persona-cookbook"` inside `push_records_to_hub()` controls the public publish step.
+
+```bash
+python examples/scripts/43_cookbook_persona_generation.py
+```
+
+The run uses `checkpoint_dir` and `resume=True`, which is useful for paid or rate-limited LLM calls. If a run is interrupted, re-run the same command to continue from the saved checkpoints.
+
+The main example reads from Hugging Face. For a local JSONL corpus, replace `Source.huggingface(...)` with `Source.file(...)` and map your text column to `document` before `add_word_count`.
+
+## Prompt Variants
+
+Each step draws from multiple prompt files stored under `docs/cookbook/assets/`. See the [asset index](assets/index.md) for the full list.
+
+- **Text-to-Persona:** 3 variants (`text_to_persona_v1.txt`, `v2`, `v3`)
+- **Persona-to-Persona:** 3 variants (`persona_to_persona_v1.txt`, `v2`, `v3`)
+
+## Research Basis
+
+The Persona Hub paper introduces Text-to-Persona and Persona-to-Persona as scalable methods for building personas from web text. The paper states that its published prompts are simplified, not the exact experiment strings. This cookbook treats them as paper-aligned adaptations. It does not reuse any Persona Hub code.
+
+## Output Fields
+
+- `id` — generated row UUID
+- `source_id` — original XSum record identifier
+- `summary` — original article summary
+- `document` — source article text
+- `word_count` — whitespace token count
+- `life_stage` — randomly selected life stage for the inferred persona
+- `persona_description` — inferred persona
+- `relationship_type` — link between the two personas
+- `related_life_stage` — randomly selected life stage for the expanded persona
+- `related_persona_description` — the expanded related persona
diff --git a/docs/cookbook/space_text_generation.md b/docs/cookbook/space_text_generation.md
new file mode 100644
index 0000000..92c55dc
--- /dev/null
+++ b/docs/cookbook/space_text_generation.md
@@ -0,0 +1,103 @@
+# Space Engineering Text Generation
+
+Build a raw technical text corpus across document types, topics, expertise levels,
+languages, and model choices.
+
+## Source
+
+- **Script:** `examples/scripts/44_cookbook_space_text_generation.py`
+- **Prompt assets:** [asset index](assets/index.md)
+- **Local output:** `examples/outputs/44_space_text_generation_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/44_space_text_generation_cookbook`
+- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`
+
+## Pipeline
+
+1. Create a seed grid with `Seed.product`.
+2. Cross document types, topics, and expertise levels explicitly.
+3. Generate one section per seed and language with `LLMStep`.
+4. Let the prompt variables drive the corpus variation.
+5. Parse `title` and `text` from JSON mode.
+6. Keep publication fields, add a row UUID, write JSONL, checkpoint progress,
+ and optionally push to Hugging Face Hub.
+
+The default model is `nvidia/nemotron-3-super-120b-a12b:nitro` through
+OpenRouter.
+
+```text
+document_type x topic x expertise_level
+ |
+ v
+LLMStep language expansion: English and French
+ |
+ v
+JSON fields: title, text
+ |
+ v
+examples/outputs/44_space_text_generation_cookbook.jsonl
+```
+
+## Row Count
+
+The default script generates:
+
+```text
+3 document types x 8 topics x 3 expertise levels x 2 languages
+x 1 generated output x 1 model = 144 rows
+```
+
+To use several models, add provider IDs to `MODEL_IDS`. `LLMStep` will run each
+seed-language combination through every model and the row count will multiply by
+the number of models.
+
+## Run
+
+Prerequisites:
+
+- `OPENROUTER_API_KEY` set in a `.env` file
+- Base dependencies from `pyproject.toml` installed
+- Hugging Face authentication only if publishing
+
+```bash
+python examples/scripts/44_cookbook_space_text_generation.py
+```
+
+To publish, replace `HF_REPO_ID` in the script with a repository under your own
+Hugging Face username or organization, then run:
+
+```bash
+DATAFAST_PUSH_TO_HUB=1 python examples/scripts/44_cookbook_space_text_generation.py
+```
+
+The run uses `checkpoint_dir` and `resume=True`. If generation is interrupted,
+run the command again to continue from saved checkpoints.
+
+## Prompt
+
+The script uses one compact prompt file:
+
+```text
+Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}.
+```
+
+## Generation Controls
+
+- `MODEL_IDS` controls which models generate each record.
+- `LANGUAGES` controls language expansion and writes the emitted language code to
+ the `language` field.
+- `NUM_OUTPUTS` controls how many generated rows are created for each
+ seed, language, and model combination.
+- `PROMPT_PATH` controls the prompt file used for generation.
+- `SEED` controls deterministic dataset splitting when publishing.
+- `HF_REPO_ID` controls the optional Hugging Face Hub destination.
+
+## Output Fields
+
+- `id` - generated row UUID
+- `document_type` - requested document style
+- `topic` - space engineering topic
+- `expertise_level` - intended reader level
+- `language` - language code emitted by `LLMStep`
+- `model` - model ID emitted by `LLMStep`
+- `title` - generated section title
+- `text` - generated corpus text
diff --git a/docs/cookbook/text_classification.md b/docs/cookbook/text_classification.md
new file mode 100644
index 0000000..819523d
--- /dev/null
+++ b/docs/cookbook/text_classification.md
@@ -0,0 +1,118 @@
+# Text Classification
+
+Build a multilingual trail-conditions classification dataset with `datafast`.
+
+## Source
+
+- **Script:** `examples/scripts/45_cookbook_text_classification.py`
+- **Prompt assets:** [asset index](assets/index.md)
+- **Local output:** `examples/outputs/45_text_classification_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/45_text_classification_cookbook`
+- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`
+
+## Use Case
+
+This cookbook generates short hiker reports across four trail-condition labels
+so teams can monitor trail quality and surface issues quickly.
+
+The default setup is:
+
+- multi-class: 4 trail-condition labels
+- multi-lingual: English and French
+- multi-model: two generation models by default
+- publishable: optional push to Hugging Face Hub
+
+## Pipeline
+
+1. Create a seed grid from labels, trail types, and writing styles.
+2. Generate one short hiker report for each seed across all configured models
+ and languages.
+3. Keep the label and prompt-variation provenance in flat output columns.
+4. Add a UUID, write JSONL locally, and optionally push to Hugging Face Hub.
+
+Variation is modeled explicitly through `Seed.product(...)`, which keeps the
+generation axes inspectable and easy to count.
+
+```text
+label x trail_type x style
+ |
+ v
+LLMStep language expansion: English and French
+ |
+ v
+LLMStep model expansion
+ |
+ v
+examples/outputs/45_text_classification_cookbook.jsonl
+```
+
+## Row Count
+
+The default script generates:
+
+```text
+4 labels x 3 trail types x 2 styles x 2 languages
+x 2 models = 96 rows
+```
+
+Each extra model in `MODEL_IDS` multiplies the total row count.
+
+## Run
+
+Prerequisites:
+
+- `OPENROUTER_API_KEY` set in a `.env` file
+- Base dependencies from `pyproject.toml` installed
+- Hugging Face authentication only if publishing
+
+```bash
+python examples/scripts/45_cookbook_text_classification.py
+```
+
+To publish, replace `HF_REPO_ID` in the script with a repository under your own
+Hugging Face username or organization, then run:
+
+```bash
+DATAFAST_PUSH_TO_HUB=1 python examples/scripts/45_cookbook_text_classification.py
+```
+
+The run uses `checkpoint_dir` and `resume=True`. If generation is interrupted,
+run the command again to continue from saved checkpoints.
+
+If you want to use provider-specific clients directly, replace `MODEL_IDS` or
+the `model=MODELS` argument in `LLMStep` with providers such as `openai(...)`
+or `anthropic(...)`. The default setup uses multiple OpenRouter-backed models
+so it works with one API key.
+
+## Prompt
+
+The cookbook uses one prompt file and drives diversity through seed dimensions:
+
+```text
+Write one realistic hiker report in {language_name}.
+```
+
+See [text_classification_generation.txt](assets/text_classification_generation.txt)
+for the full prompt.
+
+## Generation Controls
+
+- `LABELS` defines the target classes and their prompt descriptions.
+- `TRAIL_TYPES` controls the trail settings used in generation.
+- `STYLES` controls the voice and format of each report.
+- `LANGUAGES` controls language expansion.
+- `MODEL_IDS` controls which models generate records.
+- `HF_REPO_ID` controls the optional Hugging Face Hub destination.
+
+If you want an extra quality-control pass, add a downstream `Classify` and
+`Filter` stage to verify that generated reports match their intended label.
+
+## Output Fields
+
+- `id` - generated row UUID
+- `label` - target trail-condition label
+- `trail_type` - prompt expansion axis for the trail setting
+- `style` - prompt expansion axis for the report style
+- `language` - language code emitted by `LLMStep`
+- `model` - model ID emitted by `LLMStep`
+- `text` - generated hiker report
diff --git a/docs/guides/building_pipelines.md b/docs/guides/building_pipelines.md
index 64aaaf2..b755410 100644
--- a/docs/guides/building_pipelines.md
+++ b/docs/guides/building_pipelines.md
@@ -3,11 +3,12 @@
## Minimal Pipeline
```python
-from datafast import Map, Sink, Source
+from datafast import AddUUID, Map, Sink, Source
pipeline = (
Source.list([{"text": "hello"}])
>> Map(lambda r: {**r, "length": len(r["text"])})
+ >> AddUUID()
>> Sink.list()
)
@@ -38,6 +39,7 @@ seed = Seed.product(
## Core Data Operations
+- `AddUUID`: add a UUID field to each record
- `Map`: one record in, one record out
- `FlatMap`: one record in, many records out
- `Filter`: keep or drop records
diff --git a/examples/scripts/43_cookbook_persona_generation.py b/examples/scripts/43_cookbook_persona_generation.py
new file mode 100644
index 0000000..ac4f718
--- /dev/null
+++ b/examples/scripts/43_cookbook_persona_generation.py
@@ -0,0 +1,137 @@
+"""Persona-generation cookbook: XSum article -> personas -> related personas.
+
+Demonstrates: Source.huggingface, Map, Filter, Sample, JSON-mode LLMSteps,
+and prompt assets stored under docs/cookbook/assets.
+
+Requires:
+- OPENROUTER_API_KEY
+- Hugging Face authentication via HF_TOKEN or a cached `huggingface_hub` login
+- network access to Hugging Face and OpenRouter
+"""
+
+import random
+
+from dotenv import load_dotenv
+
+from datafast import AddUUID, Filter, LLMStep, Map, Sample, Sink, Source, openrouter
+
+import litellm
+
+load_dotenv()
+
+litellm.suppress_debug_info = True
+
+
+MODEL_ID = "nvidia/nemotron-3-super-120b-a12b:nitro"
+OUTPUT_PATH = "examples/outputs/43_persona_cookbook.jsonl"
+CHECKPOINT_DIR = "examples/checkpoints/43_persona_cookbook"
+HF_REPO_ID = "patrickfleith/new-persona-cookbook-dataset"
+TEXT_TO_PERSONA_PROMPTS = [
+ "docs/cookbook/assets/text_to_persona_v1.txt",
+ "docs/cookbook/assets/text_to_persona_v2.txt",
+ "docs/cookbook/assets/text_to_persona_v3.txt",
+]
+PERSONA_TO_PERSONA_PROMPTS = [
+ "docs/cookbook/assets/persona_to_persona_v1.txt",
+ "docs/cookbook/assets/persona_to_persona_v2.txt",
+ "docs/cookbook/assets/persona_to_persona_v3.txt",
+]
+LIFE_STAGES = [
+ "a teenager",
+ "a young adult",
+ "an adult (30s/40s)",
+ "a middle-aged person (in their 50s/60s)",
+ "a senior person (in their 70s/80s)",
+]
+
+
+def add_word_count(record: dict) -> dict:
+ return {**record, "word_count": len(record["document"].split())}
+
+
+def assign_life_stage(record: dict) -> dict:
+ return {**record, "life_stage": random.choice(LIFE_STAGES)}
+
+
+def assign_related_life_stage(record: dict) -> dict:
+ return {**record, "related_life_stage": random.choice(LIFE_STAGES)}
+
+
+def keep_output_fields(record: dict) -> dict:
+ return {
+ "source_id": record["id"],
+ "summary": record["summary"],
+ "document": record["document"],
+ "word_count": record["word_count"],
+ "life_stage": record["life_stage"],
+ "persona_description": record["persona_description"],
+ "relationship_type": record["relationship_type"],
+ "related_life_stage": record["related_life_stage"],
+ "related_persona_description": record["related_persona_description"],
+ }
+
+
+def build_pipeline():
+ model = openrouter(MODEL_ID, temperature=0.7)
+
+ return (
+ Source.huggingface(
+ "xsum",
+ split="validation",
+ columns=["id", "document", "summary"],
+ )
+ # For a local JSONL corpus, replace the Hugging Face source with something
+ # like Source.file("data/articles.jsonl") and map your text field to
+ # "document" before add_word_count.
+ >> Map(add_word_count).as_step("add_word_count")
+ >> Filter(fn=lambda r: 300 <= r["word_count"] <= 500).as_step("filter_word_count")
+ >> Sample(n=10, strategy="first").as_step("take_first_100")
+ >> Map(assign_life_stage).as_step("assign_life_stage")
+ >> LLMStep(
+ prompt=Sample(TEXT_TO_PERSONA_PROMPTS, n=1),
+ input_columns=["document", "life_stage"],
+ output_columns=["persona_description"],
+ model=model,
+ parse_mode="json",
+ on_parse_error="raise",
+ ).as_step("text_to_persona")
+ >> Map(assign_related_life_stage).as_step("assign_related_life_stage")
+ >> LLMStep(
+ prompt=Sample(PERSONA_TO_PERSONA_PROMPTS, n=1),
+ input_columns=["persona_description", "related_life_stage"],
+ output_columns=["relationship_type", "related_persona_description"],
+ model=model,
+ parse_mode="json",
+ on_parse_error="raise",
+ ).as_step("persona_to_persona")
+ >> Map(keep_output_fields).as_step("keep_output_fields")
+ >> AddUUID(column="id", overwrite=True).as_step("add_uuid")
+ >> Sink.jsonl(OUTPUT_PATH)
+ >> Sink.hub(HF_REPO_ID, private=True)
+)
+
+
+def push_records_to_hub(records: list[dict]) -> None:
+ repo_id = "patrickfleith/datafast-persona-cookbook"
+ private = False
+
+ list(
+ Sink.hub(
+ repo_id=repo_id,
+ private=private,
+ commit_message=f"Publish cookbook 43 persona dataset with {MODEL_ID}",
+ ).process(records)
+ )
+
+
+def main() -> None:
+ records = build_pipeline().run(
+ batch_size=1,
+ checkpoint_dir=CHECKPOINT_DIR,
+ resume=False,
+ )
+ push_records_to_hub(records)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/scripts/44_cookbook_space_text_generation.py b/examples/scripts/44_cookbook_space_text_generation.py
new file mode 100644
index 0000000..6c5d2cb
--- /dev/null
+++ b/examples/scripts/44_cookbook_space_text_generation.py
@@ -0,0 +1,143 @@
+"""Space text-generation cookbook: seed grid -> technical text corpus.
+
+Demonstrates: Seed.product, LLMStep JSON mode, multi-language generation,
+num_outputs, checkpointing, JSONL output, and optional Hub push.
+
+Requires:
+- OPENROUTER_API_KEY
+- Hugging Face authentication only if DATAFAST_PUSH_TO_HUB=1
+- network access to OpenRouter, and to Hugging Face when publishing
+"""
+
+from __future__ import annotations
+
+import os
+
+import litellm
+from dotenv import load_dotenv
+
+from datafast import AddUUID, LLMStep, Map, Seed, Sink, openrouter
+
+load_dotenv()
+litellm.suppress_debug_info = True
+
+
+SEED = 20250304
+MODEL_IDS = ["nvidia/nemotron-3-super-120b-a12b:nitro"]
+OUTPUT_PATH = "examples/outputs/44_space_text_generation_cookbook.jsonl"
+CHECKPOINT_DIR = "examples/checkpoints/44_space_text_generation_cookbook"
+HF_REPO_ID = "patrickfleith/datafast-space-text-generation-cookbook"
+NUM_OUTPUTS = 1
+PROMPT_PATH = "docs/cookbook/assets/space_text_generation.txt"
+
+DOCUMENT_TYPES = [
+ "space engineering textbook",
+ "spacecraft design justification document",
+ "personal blog of a space engineer",
+]
+
+TOPICS = [
+ "Microgravity",
+ "Vacuum",
+ "Heavy Ions",
+ "Thermal Extremes",
+ "Atomic Oxygen",
+ "Debris Impact",
+ "Electrostatic Charging",
+ "Propellant Boil-off",
+]
+
+EXPERTISE_LEVELS = [
+ "executives",
+ "senior engineers",
+ "PhD candidates",
+]
+
+LANGUAGES = {
+ "en": "English",
+ "fr": "French",
+}
+
+
+def make_models():
+ return [openrouter(model_id, temperature=0.7) for model_id in MODEL_IDS]
+
+
+def expected_row_count(model_count: int | None = None) -> int:
+ """Return the number of rows this configuration should generate."""
+ model_total = len(MODEL_IDS) if model_count is None else model_count
+ return (
+ len(DOCUMENT_TYPES)
+ * len(TOPICS)
+ * len(EXPERTISE_LEVELS)
+ * len(LANGUAGES)
+ * NUM_OUTPUTS
+ * model_total
+ )
+
+
+def finalize_record(record: dict) -> dict:
+ """Keep the columns meant for publication."""
+ return {
+ "document_type": record["document_type"],
+ "topic": record["topic"],
+ "expertise_level": record["expertise_level"],
+ "language": record.get("_language", ""),
+ "model": record.get("_model", ""),
+ "title": record["title"],
+ "text": record["text"],
+ }
+
+
+def build_pipeline():
+ return (
+ Seed.product(
+ Seed.values("document_type", DOCUMENT_TYPES),
+ Seed.values("topic", TOPICS),
+ Seed.values("expertise_level", EXPERTISE_LEVELS),
+ ).as_step("seed_space_text_grid")
+ >> LLMStep(
+ prompt=PROMPT_PATH,
+ input_columns=["document_type", "topic", "expertise_level"],
+ output_columns=["title", "text"],
+ parse_mode="json",
+ model=make_models(),
+ language=LANGUAGES,
+ num_outputs=NUM_OUTPUTS,
+ on_parse_error="raise",
+ ).as_step("generate_space_text")
+ >> Map(finalize_record).as_step("finalize_record")
+ >> AddUUID(column="id", overwrite=True).as_step("add_uuid")
+ >> Sink.jsonl(OUTPUT_PATH)
+ )
+
+
+def push_records_to_hub(records: list[dict]) -> None:
+ list(
+ Sink.hub(
+ repo_id=HF_REPO_ID,
+ private=True,
+ train_size=0.8,
+ seed=SEED,
+ shuffle=True,
+ commit_message=f"Publish cookbook 44 text dataset with {', '.join(MODEL_IDS)}",
+ ).process(records)
+ )
+
+
+def main() -> None:
+ print(f"Expected rows: {expected_row_count()}")
+ records = build_pipeline().run(
+ batch_size=4,
+ checkpoint_dir=CHECKPOINT_DIR,
+ resume=True,
+ )
+
+ if os.getenv("DATAFAST_PUSH_TO_HUB") == "1":
+ push_records_to_hub(records)
+
+ print(f"Wrote {len(records)} records to {OUTPUT_PATH}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/scripts/45_cookbook_text_classification.py b/examples/scripts/45_cookbook_text_classification.py
new file mode 100644
index 0000000..01b7e5a
--- /dev/null
+++ b/examples/scripts/45_cookbook_text_classification.py
@@ -0,0 +1,158 @@
+"""Text-classification cookbook: seed grid -> multilingual trail reports.
+
+Demonstrates: Seed.product, prompt expansion via seed dimensions, multi-model
+generation, multi-language generation, checkpointing, JSONL output, and
+optional Hugging Face Hub publishing.
+
+Requires:
+- OPENROUTER_API_KEY
+- Hugging Face authentication only if DATAFAST_PUSH_TO_HUB=1
+- network access to OpenRouter, and to Hugging Face when publishing
+"""
+
+from __future__ import annotations
+
+import os
+
+import litellm
+from dotenv import load_dotenv
+
+from datafast import AddUUID, LLMStep, Map, Seed, SeedDimension, Sink, openrouter
+
+load_dotenv()
+litellm.suppress_debug_info = True
+
+
+SEED = 20250611
+MODEL_IDS = [
+ "nvidia/nemotron-3-super-120b-a12b:nitro",
+ "mistralai/ministral-14b-2512",
+]
+OUTPUT_PATH = "examples/outputs/45_text_classification_cookbook.jsonl"
+CHECKPOINT_DIR = "examples/checkpoints/45_text_classification_cookbook"
+HF_REPO_ID = "patrickfleith/datafast-text-classification-cookbook"
+PROMPT_PATH = "docs/cookbook/assets/text_classification_generation.txt"
+
+LABELS = [
+ {
+ "label": "trail_obstruction",
+ "label_description": (
+ "The trail is partially or fully blocked by obstacles such as "
+ "fallen trees, landslides, snow, flooding, erosion, or dense "
+ "vegetation."
+ ),
+ },
+ {
+ "label": "infrastructure_issues",
+ "label_description": (
+ "The report is about damaged or missing bridges, signs, stairs, "
+ "handrails, markers, boardwalks, or similar trail infrastructure."
+ ),
+ },
+ {
+ "label": "hazards",
+ "label_description": (
+ "The trail has immediate safety risks such as slippery surfaces, "
+ "dangerous crossings, unstable terrain, wildlife threats, or "
+ "other hazardous conditions."
+ ),
+ },
+ {
+ "label": "positive_conditions",
+ "label_description": (
+ "The report highlights clear, safe, enjoyable trail conditions "
+ "such as good maintenance, solid infrastructure, clear signage, "
+ "or scenic features."
+ ),
+ },
+]
+
+TRAIL_TYPES = [
+ "mountain trail",
+ "coastal path",
+ "forest walk",
+]
+
+STYLES = [
+ "a brief social media post",
+ "a hiking review",
+]
+
+LANGUAGES = {
+ "en": "English",
+ "fr": "French",
+}
+
+MODELS = [openrouter(model_id, temperature=0.8) for model_id in MODEL_IDS]
+EXPECTED_ROWS = (
+ len(LABELS)
+ * len(TRAIL_TYPES)
+ * len(STYLES)
+ * len(LANGUAGES)
+ * len(MODELS)
+)
+
+
+def keep_output_fields(record: dict) -> dict:
+ """Keep only the fields meant for publication."""
+ return {
+ "label": record["label"],
+ "trail_type": record["trail_type"],
+ "style": record["style"],
+ "language": record.get("_language", ""),
+ "model": record.get("_model", ""),
+ "text": record["text"],
+ }
+
+
+pipeline = (
+ Seed.product(
+ SeedDimension(
+ columns=["label", "label_description"],
+ values=LABELS,
+ ),
+ Seed.values("trail_type", TRAIL_TYPES),
+ Seed.values("style", STYLES),
+ ).as_step("seed_trail_report_grid")
+ >> LLMStep(
+ prompt=PROMPT_PATH,
+ input_columns=["label", "label_description", "trail_type", "style"],
+ output_column="text",
+ parse_mode="text",
+ model=MODELS,
+ language=LANGUAGES,
+ ).as_step("generate_trail_reports")
+ >> Map(keep_output_fields).as_step("keep_output_fields")
+ >> AddUUID(column="id", overwrite=True).as_step("add_uuid")
+ >> Sink.jsonl(OUTPUT_PATH)
+)
+
+
+def main() -> None:
+ print(f"Expected rows: {EXPECTED_ROWS}")
+ records = pipeline.run(
+ batch_size=4,
+ checkpoint_dir=CHECKPOINT_DIR,
+ resume=True,
+ )
+
+ if os.getenv("DATAFAST_PUSH_TO_HUB") == "1":
+ list(
+ Sink.hub(
+ repo_id=HF_REPO_ID,
+ private=False,
+ train_size=0.8,
+ seed=SEED,
+ shuffle=True,
+ commit_message=(
+ "Publish cookbook 45 classification dataset with "
+ f"{', '.join(MODEL_IDS)}"
+ ),
+ ).process(records)
+ )
+
+ print(f"Wrote {len(records)} records to {OUTPUT_PATH}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 87e795a..131400c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -43,6 +43,11 @@ nav:
- LLM Steps: guides/llm_steps.md
- Checkpointing: guides/checkpointing.md
- Langfuse Tracing: guides/langfuse_tracing.md
+ - Cookbook:
+ - cookbook/index.md
+ - Text Classification: cookbook/text_classification.md
+ - Persona Generation: cookbook/persona_generation.md
+ - Space Engineering Text Generation: cookbook/space_text_generation.md
- Providers: llms.md
- Models: models.md
- API: api.md
diff --git a/tests/test_add_uuid.py b/tests/test_add_uuid.py
new file mode 100644
index 0000000..e89f837
--- /dev/null
+++ b/tests/test_add_uuid.py
@@ -0,0 +1,78 @@
+import uuid
+
+from datafast import AddUUID, LLMStep, Sink, Source
+
+
+def assert_valid_uuid(value: str) -> None:
+ parsed = uuid.UUID(value)
+ assert str(parsed) == value
+
+
+def test_add_uuid_adds_id_when_missing():
+ records = list(AddUUID().process([{"text": "hello"}]))
+
+ assert records[0]["text"] == "hello"
+ assert_valid_uuid(records[0]["id"])
+
+
+def test_add_uuid_preserves_existing_id_by_default():
+ records = list(AddUUID().process([{"id": "source-1", "text": "hello"}]))
+
+ assert records == [{"id": "source-1", "text": "hello"}]
+
+
+def test_add_uuid_overwrites_existing_id_when_requested():
+ records = list(
+ AddUUID(overwrite=True).process([{"id": "source-1", "text": "hello"}])
+ )
+
+ assert records[0]["text"] == "hello"
+ assert records[0]["id"] != "source-1"
+ assert_valid_uuid(records[0]["id"])
+
+
+def test_add_uuid_generates_distinct_ids_for_multiple_records():
+ records = list(AddUUID().process([{"text": "a"}, {"text": "b"}]))
+ ids = [record["id"] for record in records]
+
+ assert len(set(ids)) == 2
+ for value in ids:
+ assert_valid_uuid(value)
+
+
+def test_add_uuid_supports_custom_column_name():
+ records = list(AddUUID(column="example_id").process([{"text": "hello"}]))
+
+ assert "id" not in records[0]
+ assert_valid_uuid(records[0]["example_id"])
+
+
+def test_add_uuid_assigns_unique_ids_to_llm_num_outputs_pipeline():
+ class FakeModel:
+ model_id = "fake-model"
+ provider_name = "fake"
+
+ def generate(self, messages, metadata=None):
+ return '{"title": "Generated", "text": "Body"}'
+
+ pipeline = (
+ Source.list([{"topic": "vacuum"}])
+ >> LLMStep(
+ prompt="Write about {topic}.",
+ input_columns=["topic"],
+ output_columns=["title", "text"],
+ parse_mode="json",
+ model=FakeModel(),
+ num_outputs=2,
+ )
+ >> AddUUID()
+ >> Sink.list()
+ )
+
+ records = pipeline.run()
+ ids = [record["id"] for record in records]
+
+ assert len(records) == 2
+ assert len(set(ids)) == 2
+ for value in ids:
+ assert_valid_uuid(value)
diff --git a/tests/test_llms_unit.py b/tests/test_llms_unit.py
new file mode 100644
index 0000000..a6e5674
--- /dev/null
+++ b/tests/test_llms_unit.py
@@ -0,0 +1,80 @@
+import datafast.llms as llms_module
+from datafast.llms import OpenRouterProvider
+
+
+class _DummyMessage:
+ def __init__(self, content: str) -> None:
+ self.content = content
+
+
+class _DummyChoice:
+ def __init__(self, content: str) -> None:
+ self.message = _DummyMessage(content)
+
+
+class _DummyResponse:
+ def __init__(self, content: str) -> None:
+ self.choices = [_DummyChoice(content)]
+
+
+def test_openrouter_single_messages_use_completion(monkeypatch):
+ monkeypatch.setattr(llms_module, "load_env_once", lambda: None)
+ monkeypatch.setattr(
+ llms_module,
+ "maybe_configure_langfuse_tracing",
+ lambda load_env=False: False,
+ )
+
+ calls = {"completion": 0, "batch_completion": 0}
+
+ def fake_completion(**kwargs):
+ calls["completion"] += 1
+ assert kwargs["messages"] == [{"role": "user", "content": "ping"}]
+ return _DummyResponse("ok")
+
+ def fake_batch_completion(**kwargs):
+ calls["batch_completion"] += 1
+ raise AssertionError("single-message requests should not use batch_completion")
+
+ monkeypatch.setattr(llms_module.litellm, "completion", fake_completion)
+ monkeypatch.setattr(llms_module.litellm, "batch_completion", fake_batch_completion)
+
+ provider = OpenRouterProvider(model_id="demo-model", api_key="test-key")
+
+ response = provider.generate(messages=[{"role": "user", "content": "ping"}])
+
+ assert response == "ok"
+ assert calls == {"completion": 1, "batch_completion": 0}
+
+
+def test_openrouter_batch_messages_use_batch_completion(monkeypatch):
+ monkeypatch.setattr(llms_module, "load_env_once", lambda: None)
+ monkeypatch.setattr(
+ llms_module,
+ "maybe_configure_langfuse_tracing",
+ lambda load_env=False: False,
+ )
+
+ calls = {"completion": 0, "batch_completion": 0}
+
+ def fake_completion(**kwargs):
+ calls["completion"] += 1
+ raise AssertionError("batched requests should not use completion")
+
+ def fake_batch_completion(**kwargs):
+ calls["batch_completion"] += 1
+ assert len(kwargs["messages"]) == 2
+ return [_DummyResponse("first"), _DummyResponse("second")]
+
+ monkeypatch.setattr(llms_module.litellm, "completion", fake_completion)
+ monkeypatch.setattr(llms_module.litellm, "batch_completion", fake_batch_completion)
+
+ provider = OpenRouterProvider(model_id="demo-model", api_key="test-key")
+
+ response = provider.generate(messages=[
+ [{"role": "user", "content": "one"}],
+ [{"role": "user", "content": "two"}],
+ ])
+
+ assert response == ["first", "second"]
+ assert calls == {"completion": 0, "batch_completion": 1}
diff --git a/tests/test_public_api.py b/tests/test_public_api.py
index 7eaf787..ac56477 100644
--- a/tests/test_public_api.py
+++ b/tests/test_public_api.py
@@ -1,4 +1,5 @@
from datafast import (
+ AddUUID,
Branch,
Classify,
Compare,
@@ -70,6 +71,7 @@ def test_factory_exports_are_available(monkeypatch):
assert Sink is not None
assert Seed is not None
assert Sample is not None
+ assert AddUUID is not None
assert Map is not None
assert FlatMap is not None
assert Filter is not None
diff --git a/tests/test_runner_llm_messages.py b/tests/test_runner_llm_messages.py
new file mode 100644
index 0000000..d870093
--- /dev/null
+++ b/tests/test_runner_llm_messages.py
@@ -0,0 +1,47 @@
+from datafast import LLMStep, ListSink, Source
+
+
+def test_runner_passes_llm_messages_by_keyword():
+ class FakeModel:
+ provider_name = "fake"
+ model_id = "fake-model"
+
+ def __init__(self) -> None:
+ self.calls: list[dict] = []
+
+ def generate(
+ self,
+ prompt=None,
+ messages=None,
+ metadata=None,
+ response_format=None,
+ ):
+ self.calls.append({
+ "prompt": prompt,
+ "messages": messages,
+ "metadata": metadata,
+ })
+ return "done"
+
+ model = FakeModel()
+ sink = ListSink()
+
+ pipeline = (
+ Source.list([{"topic": "robotics"}])
+ >> LLMStep(
+ prompt="Write one short line about {topic}.",
+ input_columns=["topic"],
+ output_column="result",
+ model=model,
+ ).as_step("generate_copy")
+ >> sink
+ )
+
+ output = pipeline.run()
+
+ assert output == [{"topic": "robotics", "result": "done", "_model": "fake-model"}]
+ assert len(model.calls) == 1
+ assert model.calls[0]["prompt"] is None
+ assert model.calls[0]["messages"] == [
+ {"role": "user", "content": "Write one short line about robotics."}
+ ]