patrickfleith · patrickfleith · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026
diff --git a/.gitignore b/.gitignore
@@ -103,7 +103,7 @@ ipython_config.py
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
-#uv.lock
+uv.lock
 
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
@@ -187,4 +187,4 @@ examples/checkpoints/
 examples/outputs/
 
 .codex/
-openspec/
+.agents/
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ Datafast is a python library for synthetic data generation using llms.
 The old dataset-class API has been removed. The canonical package is `datafast`, and the primary model is:
 
 - create records with `Source` or `Seed`
-- transform them with composable steps
+- transform them with composable steps such as `AddUUID`, `Map`, and `Filter`
 - call LLMs with `LLMStep`, `Classify`, `Score`, `Compare`, `Rewrite`, or `Extract`
 - persist results with `Sink`
 
@@ -53,7 +53,7 @@ pipeline.run(batch_size=4)
 
 - `Source`: load records from Python lists, files, or Hugging Face datasets
 - `Seed`: generate record combinations declaratively
-- `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations
+- `AddUUID`, `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations
 - `LLMStep`: free-form generation
 - `Classify`, `Score`, `Compare`, `Rewrite`, `Extract`: higher-level LLM transforms
 - `Branch` and `JoinBranches`: multi-path pipelines

diff --git a/datafast/__init__.py b/datafast/__init__.py
@@ -31,7 +31,7 @@
     is_langfuse_tracing_enabled,
 )
 from datafast.transforms.branch import Branch, JoinBranches
-from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join
+from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join
 from datafast.transforms.llm_eval import Classify, Score, Compare
 from datafast.transforms.llm_extract import Extract
 from datafast.transforms.llm_step import LLMStep
@@ -64,6 +64,7 @@ def get_version() -> str:
     "Seed",
     "SeedDimension",
     "Sample",
+    "AddUUID",
     "Map",
     "FlatMap",
     "Filter",

diff --git a/datafast/core/runner.py b/datafast/core/runner.py
@@ -233,7 +233,7 @@ def _execute_llm_step(
 
                 try:
                     result = model.generate(
-                        call.messages,
+                        messages=call.messages,
                         metadata=build_trace_metadata(
                             model=model,
                             component="pipeline.step",

diff --git a/datafast/llms.py b/datafast/llms.py
@@ -18,7 +18,6 @@
 # LiteLLM
 import litellm
 from litellm.exceptions import RateLimitError
-from litellm.utils import ModelResponse
 
 # Internal imports
 from .llm_utils import get_messages
@@ -292,17 +291,23 @@ def generate(
             if response_format is not None:
                 completion_params["response_format"] = response_format
 
-            # Call LiteLLM completion with batch messages - retry on rate limit
+            # Call LiteLLM completion with retry on rate limit.
+            # OpenRouter accepts single message requests via completion(), but
+            # rejects the same payload when wrapped in batch_completion().
             max_retries = 3
             retry_delay = 5  # Start with 5 seconds
             response = None
-            
+
             for attempt in range(max_retries):
                 try:
-                    response: list[ModelResponse] = litellm.batch_completion(
-                        **completion_params)
+                    if len(batch_to_send) == 1:
+                        response = [litellm.completion(
+                            **{**completion_params, "messages": batch_to_send[0]}
+                        )]
+                    else:
+                        response = litellm.batch_completion(**completion_params)
                     break  # Success, exit retry loop
-                except RateLimitError as e:
+                except RateLimitError:
                     if attempt < max_retries - 1:
                         wait_time = retry_delay * (2 ** attempt)  # Exponential backoff
                         logger.warning(
@@ -316,7 +321,7 @@ def generate(
                             f"Provider: {self.provider_name} | Model: {self.model_id}"
                         )
                         raise
-            
+
             if response is None:
                 raise RuntimeError("Failed to get response after retries")
 

diff --git a/datafast/transforms/__init__.py b/datafast/transforms/__init__.py
@@ -1,15 +1,15 @@
 """Transform steps for datafast v2."""
 
 from datafast.transforms.sample import Sample
-from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join
+from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join
 from datafast.transforms.llm_step import LLMStep
 from datafast.transforms.llm_eval import Classify, Score, Compare
 from datafast.transforms.llm_transform import Rewrite
 from datafast.transforms.llm_extract import Extract
 from datafast.transforms.branch import Branch, JoinBranches
 
 __all__ = [
-    "Sample", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join",
+    "Sample", "AddUUID", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join",
     "LLMStep", "Classify", "Score", "Compare", "Rewrite", "Extract",
     "Branch", "JoinBranches",
 ]
diff --git a/datafast/transforms/data_ops.py b/datafast/transforms/data_ops.py
@@ -3,6 +3,7 @@
 import itertools
 import random
 import re
+import uuid
 from collections import defaultdict
 from collections.abc import Callable, Iterable
 from typing import Any
@@ -62,6 +63,34 @@ def process(self, records: Iterable[Record]) -> Iterable[Record]:
             yield from self._fn(record)
 
 
+class AddUUID(Step):
+    """Add a UUID field to each record.
+
+    Records that need an identifier are re-emitted as a new dict with a
+    random UUID4 string stored under the configured column. Records that
+    already contain the column are passed through as-is unless
+    ``overwrite`` is enabled.
+    """
+
+    def __init__(self, column: str = "id", overwrite: bool = False) -> None:
+        """
+        Initialize an AddUUID step.
+
+        Args:
+            column: Field name to write the UUID into.
+            overwrite: If True, replace existing values in the target column.
+                If False (the default), records that already have the
+                column are yielded unchanged.
+
+        Examples:
+            >>> AddUUID()
+            >>> AddUUID(column="example_id", overwrite=True)
+        """
+        super().__init__()
+        self._column = column
+        self._overwrite = overwrite
+
+    def process(self, records: Iterable[Record]) -> Iterable[Record]:
+        """Add UUIDs while preserving all other fields.
+
+        Args:
+            records: Input records, consumed lazily one at a time.
+
+        Yields:
+            The original record object when the column is already present
+            and overwriting is disabled; otherwise a new dict holding the
+            record's fields plus ``str(uuid.uuid4())`` under the column.
+        """
+        for record in records:
+            # Presence is tested with ``in``, so an existing column counts as
+            # present whatever value it holds.
+            if self._column in record and not self._overwrite:
+                yield record
+            else:
+                # Build a fresh dict rather than assigning into ``record``;
+                # the new key is listed last so it wins over an existing one.
+                yield {**record, self._column: str(uuid.uuid4())}
+
+
 class Filter(Step):
     """Keep or drop records based on conditions."""
 

diff --git a/docs/api.md b/docs/api.md
@@ -36,6 +36,7 @@ from datafast import Source, LLMStep, Sink, openrouter
 ## Data Operations
 
 - `Sample`
+- `AddUUID`
 - `Map`
 - `FlatMap`
 - `Filter`

diff --git a/docs/cookbook/assets/index.md b/docs/cookbook/assets/index.md
@@ -0,0 +1,80 @@
+# Cookbook Assets
+
+Prompt files and dataset details used by cookbook examples.
+
+## Text Classification
+
+### Dataset
+
+- **Source:** seed dimensions created with `Seed.product`
+- **Dimensions:** label, trail type, style, language, and model
+- **Local output:** `examples/outputs/45_text_classification_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/45_text_classification_cookbook`
+- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`
+
+This cookbook models variation directly as seed dimensions so the label, trail
+type, style, language, and model are all explicit in the
+pipeline.
+
+### Prompt
+
+| File | Style |
+| --- | --- |
+| [text_classification_generation.txt](text_classification_generation.txt) | One short trail report per call, with label, trail type, style, and language injected |
+
+## Persona Generation
+
+### Dataset
+
+- **Source:** `xsum` (Hugging Face), `validation` split
+- **Fields used:** `id`, `document`, `summary`
+- **Filter:** 300–500 words, first 100 matches
+- **Local output:** `examples/outputs/43_persona_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/43_persona_cookbook`
+- **Hub output:** set `HF_REPO_ID` and the `repo_id` in `push_records_to_hub()` to repos under your own Hugging Face username or organization
+
+The example keeps first-match sampling for reproducibility. For local JSONL corpora with metadata such as `document_filename`, stratified sampling is usually a better fit.
+
+### Prompt Variants
+
+Each LLM step picks one prompt at random per record. The script also assigns random `life_stage` and `related_life_stage` values before the corresponding LLM steps. Multiple variants add diversity.
+
+#### Text-to-Persona
+
+| File | Style |
+| --- | --- |
+| [text_to_persona_v1.txt](text_to_persona_v1.txt) | Direct inference of a reader persona |
+| [text_to_persona_v2.txt](text_to_persona_v2.txt) | XML-tagged source text, writer/reader framing |
+| [text_to_persona_v3.txt](text_to_persona_v3.txt) | System-role preamble, search-interest angle |
+
+#### Persona-to-Persona
+
+| File | Style |
+| --- | --- |
+| [persona_to_persona_v1.txt](persona_to_persona_v1.txt) | Close relationship, standalone description |
+| [persona_to_persona_v2.txt](persona_to_persona_v2.txt) | Rule-list format, explicit separation of description and relationship |
+| [persona_to_persona_v3.txt](persona_to_persona_v3.txt) | XML-tagged input, concise vivid output |
+
+### Provenance
+
+- Text-to-Persona and Persona-to-Persona prompts are paper-aligned adaptations. The Persona Hub paper states its published prompts are simplified, not exact.
+- No Persona Hub code is reused. The workflow is built with datafast primitives.
+
+## Space Engineering Text Generation
+
+### Dataset
+
+- **Source:** seed dimensions created with `Seed.product`
+- **Dimensions:** document type, topic, expertise level, and language
+- **Local output:** `examples/outputs/44_space_text_generation_cookbook.jsonl`
+- **Checkpoints:** `examples/checkpoints/44_space_text_generation_cookbook`
+- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`
+
+### Prompt
+
+The text-generation cookbook uses one compact prompt and relies on seed
+dimensions for variation.
+
+| File | Style |
+| --- | --- |
+| [space_text_generation.txt](space_text_generation.txt) | Minimal variable-driven request |
diff --git a/docs/cookbook/assets/persona_to_persona_v1.txt b/docs/cookbook/assets/persona_to_persona_v1.txt
@@ -0,0 +1,11 @@
+Given the following persona, infer one other specific persona who is in a close relationship with them.
+
+Persona:
+{persona_description}
+
+Requirements:
+1. Use one clear relationship such as family member, colleague, friend, neighbor, coach, teacher, or married partner.
+2. Choose a related persona that adds a meaningfully different life perspective but is still likely to be in close contact with the original persona.
+3. Keep the related persona realistic and specific.
+4. Don't talk about the original person in the description of the related persona, as it should be a self-contained description.
+5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
diff --git a/docs/cookbook/assets/persona_to_persona_v2.txt b/docs/cookbook/assets/persona_to_persona_v2.txt
@@ -0,0 +1,14 @@
+Think of a person who regularly interacts with the following persona in a meaningful way.
+
+Rules:
+- Do not mention the original persona in the description of the related persona.
+- Do not mention the relationship between the two personas in the description, only in the relationship_type
+- Pick a single, concrete relationship type such as mentor-mentee, colleague, neighbor, supervisor-report, or service provider-client
+- The related person should bring a distinctly different viewpoint or expertise, and some uniqueness.
+- Keep the description realistic and standalone, without mentioning the original persona.
+- The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
+Original Persona:
+{persona_description}
+
+Now generate a related persona.
diff --git a/docs/cookbook/assets/persona_to_persona_v3.txt b/docs/cookbook/assets/persona_to_persona_v3.txt
@@ -0,0 +1,16 @@
+Here is the description of someone:
+<description>
+{persona_description}
+</description>
+
+Come up with one other description of an individual who could be part of this persona's life.
+We want the description to be detailed but super concise (max 2 sentences) and vivid.
+But we want a standalone description of that new persona, without mentioning the original persona or a reason in the description.
+
+Requirements:
+1. Define a clear interpersonal link such as friend, advisor, competitor, family member, or collaborator.
+2. The new persona should offer a complementary or contrasting perspective.
+3. Make the related persona vivid and believable, avoid generic archetypes.
+4. Describe the relation in relationship_type field, not in the description.
+5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
diff --git a/docs/cookbook/assets/space_text_generation.txt b/docs/cookbook/assets/space_text_generation.txt
@@ -0,0 +1 @@
+Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}.
diff --git a/docs/cookbook/assets/text_classification_generation.txt b/docs/cookbook/assets/text_classification_generation.txt
@@ -0,0 +1,13 @@
+Write one realistic hiker report in {language_name}.
+
+Target category: {label}
+Category definition: {label_description}
+
+Constraints:
+- The report must clearly match the target category.
+- The setting must be a {trail_type}.
+- The writing style must be {style}.
+- Keep it to 1 or 2 sentences.
+- Do not mention the category name directly.
+- Do not use bullets, numbering, or explanations.
+- Make the report concrete and varied.
diff --git a/docs/cookbook/assets/text_to_persona_v1.txt b/docs/cookbook/assets/text_to_persona_v1.txt
@@ -0,0 +1,17 @@
+Infer one specific persona who is likely to read the text below.
+
+Source text:
+{document}
+
+Requirements:
+1. Return a single persona, not a group.
+2. Make the persona specific and fine-grained rather than generic.
+3. Ground the persona in signals from the text such as domain, expertise, context, or likely motivation.
+4. Do not quote the source text in the persona field.
+5. Only write 1 or 2 sentences maximum.
+6. The persona is not the subject of the text, but rather someone who would be reading it.
+7. Do not refer to the source text, article, or its content in the persona description. The persona must be self-contained.
+8. The persona must be {life_stage}. Do not mention a precise age, just reflect this life stage naturally.
+
+Now figure out a persona description who would be reading this text.
+
diff --git a/docs/cookbook/assets/text_to_persona_v2.txt b/docs/cookbook/assets/text_to_persona_v2.txt
@@ -0,0 +1,16 @@
+<source_text>
+{document}
+</source_text>
+
+Identify one precise individual who would naturally encounter or write the <source_text>.
+
+Requirements:
+1. Describe exactly one person.
+2. Be as specific as possible: mention plausible occupation and/or life situation.
+3. Derive the persona strictly from cues in the text such as topic, jargon, tone, or implied audience, as a potential writer / reader of this text.
+4. Do not copy or paraphrase the source text in the persona field.
+5. Only return 1 or 2 sentences maximum.
+6. The described person is not the subject of the text, but rather someone who would be encountering or writing such text as part of their life.
+7. Do not reference the source text, article, or its content in the persona description. The persona must stand on its own.
+8. The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
diff --git a/docs/cookbook/assets/text_to_persona_v3.txt b/docs/cookbook/assets/text_to_persona_v3.txt
@@ -0,0 +1,17 @@
+You are a persona inference assistant.
+
+Based on the text content below, imagine one real person who would be interested in searching about the topic from this content.
+
+Rules:
+- Output a single, concrete persona rather than a broad demographic.
+- Include details like professional background, interests, or situational context that make the persona feel authentic.
+- Don't mention the person's search or information retrieval action in the persona description; just describe the persona in a way that could explain their interest in the topic.
+- Keep it super short and concise.
+- Do not mention or refer to the source text, article, or its content in the persona description. The persona must be self-contained.
+- The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally.
+
+Source text:
+{document}
+
+
+
diff --git a/docs/cookbook/index.md b/docs/cookbook/index.md
@@ -0,0 +1,16 @@
+# Cookbook
+
+Cookbooks connect a runnable script to a documentation walkthrough.
+
+The Python script is the source of truth. Each cookbook page explains:
+
+- where the executable example lives
+- what inputs it uses
+- which prompt assets it depends on
+- where it writes its output artifacts
+
+## Available Cookbooks
+
+- [Text Classification](text_classification.md): generate a multilingual trail-conditions classification dataset from explicit seed dimensions.
+- [Persona Generation](persona_generation.md): infer personas from real articles and expand them through relationships using randomized prompt variants.
+- [Space Engineering Text Generation](space_text_generation.md): generate a raw multilingual technical text corpus from seed dimensions.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}.