Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ ipython_config.py
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
Expand Down Expand Up @@ -187,4 +187,4 @@ examples/checkpoints/
examples/outputs/

.codex/
openspec/
.agents/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Datafast is a python library for synthetic data generation using llms.
The old dataset-class API has been removed. The canonical package is `datafast`, and the primary model is:

- create records with `Source` or `Seed`
- transform them with composable steps
- transform them with composable steps such as `AddUUID`, `Map`, and `Filter`
- call LLMs with `LLMStep`, `Classify`, `Score`, `Compare`, `Rewrite`, or `Extract`
- persist results with `Sink`

Expand Down Expand Up @@ -53,7 +53,7 @@ pipeline.run(batch_size=4)

- `Source`: load records from Python lists, files, or Hugging Face datasets
- `Seed`: generate record combinations declaratively
- `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations
- `AddUUID`, `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations
- `LLMStep`: free-form generation
- `Classify`, `Score`, `Compare`, `Rewrite`, `Extract`: higher-level LLM transforms
- `Branch` and `JoinBranches`: multi-path pipelines
Expand Down
3 changes: 2 additions & 1 deletion datafast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
is_langfuse_tracing_enabled,
)
from datafast.transforms.branch import Branch, JoinBranches
from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join
from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join
from datafast.transforms.llm_eval import Classify, Score, Compare
from datafast.transforms.llm_extract import Extract
from datafast.transforms.llm_step import LLMStep
Expand Down Expand Up @@ -64,6 +64,7 @@ def get_version() -> str:
"Seed",
"SeedDimension",
"Sample",
"AddUUID",
"Map",
"FlatMap",
"Filter",
Expand Down
2 changes: 1 addition & 1 deletion datafast/core/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def _execute_llm_step(

try:
result = model.generate(
call.messages,
messages=call.messages,
metadata=build_trace_metadata(
model=model,
component="pipeline.step",
Expand Down
19 changes: 12 additions & 7 deletions datafast/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
# LiteLLM
import litellm
from litellm.exceptions import RateLimitError
from litellm.utils import ModelResponse

# Internal imports
from .llm_utils import get_messages
Expand Down Expand Up @@ -292,17 +291,23 @@ def generate(
if response_format is not None:
completion_params["response_format"] = response_format

# Call LiteLLM completion with batch messages - retry on rate limit
# Call LiteLLM completion with retry on rate limit.
# OpenRouter accepts single message requests via completion(), but
# rejects the same payload when wrapped in batch_completion().
max_retries = 3
retry_delay = 5 # Start with 5 seconds
response = None

for attempt in range(max_retries):
try:
response: list[ModelResponse] = litellm.batch_completion(
**completion_params)
if len(batch_to_send) == 1:
response = [litellm.completion(
**{**completion_params, "messages": batch_to_send[0]}
)]
else:
response = litellm.batch_completion(**completion_params)
break # Success, exit retry loop
except RateLimitError as e:
except RateLimitError:
if attempt < max_retries - 1:
wait_time = retry_delay * (2 ** attempt) # Exponential backoff
logger.warning(
Expand All @@ -316,7 +321,7 @@ def generate(
f"Provider: {self.provider_name} | Model: {self.model_id}"
)
raise

if response is None:
raise RuntimeError("Failed to get response after retries")

Expand Down
4 changes: 2 additions & 2 deletions datafast/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
"""Transform steps for datafast v2."""

from datafast.transforms.sample import Sample
from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join
from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join
from datafast.transforms.llm_step import LLMStep
from datafast.transforms.llm_eval import Classify, Score, Compare
from datafast.transforms.llm_transform import Rewrite
from datafast.transforms.llm_extract import Extract
from datafast.transforms.branch import Branch, JoinBranches

__all__ = [
"Sample", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join",
"Sample", "AddUUID", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join",
"LLMStep", "Classify", "Score", "Compare", "Rewrite", "Extract",
"Branch", "JoinBranches",
]
29 changes: 29 additions & 0 deletions datafast/transforms/data_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import itertools
import random
import re
import uuid
from collections import defaultdict
from collections.abc import Callable, Iterable
from typing import Any
Expand Down Expand Up @@ -62,6 +63,34 @@ def process(self, records: Iterable[Record]) -> Iterable[Record]:
yield from self._fn(record)


class AddUUID(Step):
    """Step that stamps each record with a freshly generated UUID string."""

    def __init__(self, column: str = "id", overwrite: bool = False) -> None:
        """
        Configure where the UUID is written and how collisions are handled.

        Args:
            column: Name of the field that receives the UUID.
            overwrite: When False (the default), records that already
                contain ``column`` pass through untouched; when True the
                existing value is replaced with a new UUID.

        Examples:
            >>> AddUUID()
            >>> AddUUID(column="example_id", overwrite=True)
        """
        super().__init__()
        self._column = column
        self._overwrite = overwrite

    def process(self, records: Iterable[Record]) -> Iterable[Record]:
        """Yield every record, adding a UUID under the configured column.

        Input records are never mutated: a record that needs a UUID is
        copied into a new dict with the extra field, and a record that is
        left alone is yielded as-is.
        """
        for item in records:
            # Stamp when the field is missing, or when the caller asked for
            # existing values to be replaced.
            if self._column not in item or self._overwrite:
                fresh_id = str(uuid.uuid4())
                yield {**item, self._column: fresh_id}
                continue
            yield item


class Filter(Step):
"""Keep or drop records based on conditions."""

Expand Down
1 change: 1 addition & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ from datafast import Source, LLMStep, Sink, openrouter
## Data Operations

- `Sample`
- `AddUUID`
- `Map`
- `FlatMap`
- `Filter`
Expand Down
80 changes: 80 additions & 0 deletions docs/cookbook/assets/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Cookbook Assets

Prompt files and dataset details used by cookbook examples.

## Text Classification

### Dataset

- **Source:** seed dimensions created with `Seed.product`
- **Dimensions:** label, trail type, style, language, and model
- **Local output:** `examples/outputs/45_text_classification_cookbook.jsonl`
- **Checkpoints:** `examples/checkpoints/45_text_classification_cookbook`
- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`

This cookbook models variation directly as seed dimensions so the label, trail
type, style, language, and model are all explicit in the
pipeline.

### Prompt

| File | Style |
| --- | --- |
| [text_classification_generation.txt](text_classification_generation.txt) | One short trail report per call, with label, trail type, style, and language injected |

## Persona Generation

### Dataset

- **Source:** `xsum` (Hugging Face), `validation` split
- **Fields used:** `id`, `document`, `summary`
- **Filter:** 300–500 words, first 100 matches
- **Local output:** `examples/outputs/43_persona_cookbook.jsonl`
- **Checkpoints:** `examples/checkpoints/43_persona_cookbook`
- **Hub output:** set `HF_REPO_ID` and the `repo_id` in `push_records_to_hub()` to repos under your own Hugging Face username or organization

The example keeps first-match sampling for reproducibility. For local JSONL corpora with metadata such as `document_filename`, stratified sampling is usually a better fit.

### Prompt Variants

Each LLM step picks one prompt at random per record. The script also assigns random `life_stage` and `related_life_stage` values before the corresponding LLM steps. Multiple variants add diversity.

#### Text-to-Persona

| File | Style |
| --- | --- |
| [text_to_persona_v1.txt](text_to_persona_v1.txt) | Direct inference of a reader persona |
| [text_to_persona_v2.txt](text_to_persona_v2.txt) | XML-tagged source text, writer/reader framing |
| [text_to_persona_v3.txt](text_to_persona_v3.txt) | System-role preamble, search-interest angle |

#### Persona-to-Persona

| File | Style |
| --- | --- |
| [persona_to_persona_v1.txt](persona_to_persona_v1.txt) | Close relationship, standalone description |
| [persona_to_persona_v2.txt](persona_to_persona_v2.txt) | Rule-list format, explicit separation of description and relationship |
| [persona_to_persona_v3.txt](persona_to_persona_v3.txt) | XML-tagged input, concise vivid output |

### Provenance

- Text-to-Persona and Persona-to-Persona prompts are paper-aligned adaptations. The Persona Hub paper states its published prompts are simplified, not exact.
- No Persona Hub code is reused. The workflow is built with datafast primitives.

## Space Engineering Text Generation

### Dataset

- **Source:** seed dimensions created with `Seed.product`
- **Dimensions:** document type, topic, expertise level, and language
- **Local output:** `examples/outputs/44_space_text_generation_cookbook.jsonl`
- **Checkpoints:** `examples/checkpoints/44_space_text_generation_cookbook`
- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1`

### Prompt

The text-generation cookbook uses one compact prompt and relies on seed
dimensions for variation.

| File | Style |
| --- | --- |
| [space_text_generation.txt](space_text_generation.txt) | Minimal variable-driven request |
11 changes: 11 additions & 0 deletions docs/cookbook/assets/persona_to_persona_v1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Given the following persona, infer one other specific persona who is in a close relationship with them.

Persona:
{persona_description}

Requirements:
1. Use one clear relationship such as family member, colleague, friend, neighbor, coach, teacher, or married partner.
2. Choose a related persona that adds a meaningfully different life perspective but is still likely to be in close contact with the original persona.
3. Keep the related persona realistic and specific.
4. Don't talk about the original person in the description of the related persona, as it should be a self-contained description.
5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.
14 changes: 14 additions & 0 deletions docs/cookbook/assets/persona_to_persona_v2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Think of a person who regularly interacts with the following persona in a meaningful way.

Rules:
- Do not mention the original persona in the description of the related persona.
- Do not mention the relationship between the two personas in the description, only in the relationship_type
- Pick a single, concrete relationship type such as mentor-mentee, colleague, neighbor, supervisor-report, or service provider-client
- The related person should bring a distinctly different viewpoint or expertise, and some uniqueness.
- Keep the description realistic and standalone, without mentioning the original persona.
- The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.

Original Persona:
{persona_description}

Now generate a related persona.
16 changes: 16 additions & 0 deletions docs/cookbook/assets/persona_to_persona_v3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Here is the description of someone:
<description>
{persona_description}
</description>

Come up with one other description of an individual who could be part of this persona's life.
We want the description to be detailed but super concise (max 2 sentences) and vivid.
But we want to have a standalone description of that new persona, without mentioning the original persona or a reason in the description.

Requirements:
1. Define a clear interpersonal link such as friend, advisor, competitor, family member, or collaborator.
2. The new persona should offer a complementary or contrasting perspective.
3. Make the related persona vivid and believable, avoid generic archetypes.
4. Describe the relation in relationship_type field, not in the description.
5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally.

1 change: 1 addition & 0 deletions docs/cookbook/assets/space_text_generation.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}.
13 changes: 13 additions & 0 deletions docs/cookbook/assets/text_classification_generation.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Write one realistic hiker report in {language_name}.

Target category: {label}
Category definition: {label_description}

Constraints:
- The report must clearly match the target category.
- The setting must be a {trail_type}.
- The writing style must be {style}.
- Keep it to 1 or 2 sentences.
- Do not mention the category name directly.
- Do not use bullets, numbering, or explanations.
- Make the report concrete and varied.
17 changes: 17 additions & 0 deletions docs/cookbook/assets/text_to_persona_v1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Infer one specific persona who is likely to read the text below.

Source text:
{document}

Requirements:
1. Return a single persona, not a group.
2. Make the persona specific and fine-grained rather than generic.
3. Ground the persona in signals from the text such as domain, expertise, context, or likely motivation.
4. Do not quote the source text in the persona field.
5. Only write 1 or 2 sentences maximum.
6. The persona is not the subject of the text, but rather someone who would be reading it.
7. Do not refer to the source text, article, or its content in the persona description. The persona must be self-contained.
8. The persona must be {life_stage}. Do not mention a precise age, just reflect this life stage naturally.

Now figure out a persona description who would be reading this text.

16 changes: 16 additions & 0 deletions docs/cookbook/assets/text_to_persona_v2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<source_text>
{document}
</source_text>

Identify one precise individual who would naturally encounter or write the <source_text>.

Requirements:
1. Describe exactly one person.
2. Be as specific as possible: mention plausible occupation and/or life situation.
3. Derive the persona strictly from cues in the text such as topic, jargon, tone, or implied audience as a potential writer / reader of this text.
4. Do not copy or paraphrase the source text in the persona field.
5. Only return 1 or 2 sentences maximum.
6. The described person is not the subject of the text, but rather someone who would be encountering or writing such text as part of their life.
7. Do not reference the source text, article, or its content in the persona description. The persona must stand on its own.
8. The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally.

17 changes: 17 additions & 0 deletions docs/cookbook/assets/text_to_persona_v3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
You are a persona inference assistant.

Based on the text content below, imagine one real person who would be interested in searching about the topic from this content.

Rules:
- Output a single, concrete persona rather than a broad demographic.
- Include details like professional background, interests, or situational context that make the persona feel authentic.
- Don't mention the person's search or information-retrieval action in the persona description; just describe the persona in a way that could explain their interest in the topic.
- Keep it super short and concise.
- Do not mention or refer to the source text, article, or its content in the persona description. The persona must be self-contained.
- The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally.

Source text:
{document}



16 changes: 16 additions & 0 deletions docs/cookbook/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Cookbook

Cookbooks connect a runnable script to a documentation walkthrough.

The Python script is the source of truth. Each cookbook page explains:

- where the executable example lives
- what inputs it uses
- which prompt assets it depends on
- where it writes its output artifacts

## Available Cookbooks

- [Text Classification](text_classification.md): generate a multilingual trail-conditions classification dataset from explicit seed dimensions.
- [Persona Generation](persona_generation.md): infer personas from real articles and expand them through relationships using randomized prompt variants.
- [Space Engineering Text Generation](space_text_generation.md): generate a raw multilingual technical text corpus from seed dimensions.
Loading