diff --git a/.gitignore b/.gitignore index 5f96167..aa49fe5 100644 --- a/.gitignore +++ b/.gitignore @@ -103,7 +103,7 @@ ipython_config.py # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. -#uv.lock +uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. @@ -187,4 +187,4 @@ examples/checkpoints/ examples/outputs/ .codex/ -openspec/ \ No newline at end of file +.agents/ \ No newline at end of file diff --git a/README.md b/README.md index 6521a97..38b2deb 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Datafast is a python library for synthetic data generation using llms. The old dataset-class API has been removed. The canonical package is `datafast`, and the primary model is: - create records with `Source` or `Seed` -- transform them with composable steps +- transform them with composable steps such as `AddUUID`, `Map`, and `Filter` - call LLMs with `LLMStep`, `Classify`, `Score`, `Compare`, `Rewrite`, or `Extract` - persist results with `Sink` @@ -53,7 +53,7 @@ pipeline.run(batch_size=4) - `Source`: load records from Python lists, files, or Hugging Face datasets - `Seed`: generate record combinations declaratively -- `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations +- `AddUUID`, `Map`, `FlatMap`, `Filter`, `Group`, `Pair`, `Concat`, `Join`: data operations - `LLMStep`: free-form generation - `Classify`, `Score`, `Compare`, `Rewrite`, `Extract`: higher-level LLM transforms - `Branch` and `JoinBranches`: multi-path pipelines diff --git a/datafast/__init__.py b/datafast/__init__.py index 19bd452..6576370 100644 --- a/datafast/__init__.py +++ b/datafast/__init__.py @@ -31,7 +31,7 @@ is_langfuse_tracing_enabled, ) from datafast.transforms.branch import Branch, JoinBranches -from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join +from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join from datafast.transforms.llm_eval import Classify, Score, Compare from datafast.transforms.llm_extract import Extract from datafast.transforms.llm_step import LLMStep @@ -64,6 +64,7 @@ def get_version() -> str: "Seed", "SeedDimension", "Sample", + "AddUUID", "Map", "FlatMap", "Filter", diff --git a/datafast/core/runner.py b/datafast/core/runner.py index 0a28ba2..3497605 100644 --- a/datafast/core/runner.py +++ b/datafast/core/runner.py @@ -233,7 +233,7 @@ def _execute_llm_step( try: result = model.generate( - call.messages, + messages=call.messages, metadata=build_trace_metadata( model=model, component="pipeline.step", diff --git a/datafast/llms.py b/datafast/llms.py index 092346a..754e8b8 100644 --- a/datafast/llms.py +++ b/datafast/llms.py @@ -18,7 +18,6 @@ # LiteLLM import litellm from litellm.exceptions import RateLimitError -from litellm.utils import ModelResponse # Internal imports from .llm_utils import get_messages @@ -292,17 +291,23 @@ def generate( if response_format is not None: completion_params["response_format"] = response_format - # Call LiteLLM completion with batch messages - retry on rate limit + # Call LiteLLM completion with retry on rate limit. + # OpenRouter accepts single message requests via completion(), but + # rejects the same payload when wrapped in batch_completion(). max_retries = 3 retry_delay = 5 # Start with 5 seconds response = None - + for attempt in range(max_retries): try: - response: list[ModelResponse] = litellm.batch_completion( - **completion_params) + if len(batch_to_send) == 1: + response = [litellm.completion( + **{**completion_params, "messages": batch_to_send[0]} + )] + else: + response = litellm.batch_completion(**completion_params) break # Success, exit retry loop - except RateLimitError as e: + except RateLimitError: if attempt < max_retries - 1: wait_time = retry_delay * (2 ** attempt) # Exponential backoff logger.warning( @@ -316,7 +321,7 @@ def generate( f"Provider: {self.provider_name} | Model: {self.model_id}" ) raise - + if response is None: raise RuntimeError("Failed to get response after retries") diff --git a/datafast/transforms/__init__.py b/datafast/transforms/__init__.py index 025ea3f..f7a88d2 100644 --- a/datafast/transforms/__init__.py +++ b/datafast/transforms/__init__.py @@ -1,7 +1,7 @@ """Transform steps for datafast v2.""" from datafast.transforms.sample import Sample -from datafast.transforms.data_ops import Map, FlatMap, Filter, Group, Pair, Concat, Join +from datafast.transforms.data_ops import AddUUID, Map, FlatMap, Filter, Group, Pair, Concat, Join from datafast.transforms.llm_step import LLMStep from datafast.transforms.llm_eval import Classify, Score, Compare from datafast.transforms.llm_transform import Rewrite @@ -9,7 +9,7 @@ from datafast.transforms.branch import Branch, JoinBranches __all__ = [ - "Sample", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join", + "Sample", "AddUUID", "Map", "FlatMap", "Filter", "Group", "Pair", "Concat", "Join", "LLMStep", "Classify", "Score", "Compare", "Rewrite", "Extract", "Branch", "JoinBranches", ] diff --git a/datafast/transforms/data_ops.py b/datafast/transforms/data_ops.py index 3887460..fafb5cf 100644 --- a/datafast/transforms/data_ops.py +++ b/datafast/transforms/data_ops.py @@ -3,6 +3,7 @@ import itertools import random import re +import uuid from collections import defaultdict from collections.abc import Callable, Iterable from typing import Any @@ -62,6 +63,34 @@ def process(self, records: Iterable[Record]) -> Iterable[Record]: yield from self._fn(record) +class AddUUID(Step): + """Add a UUID field to each record.""" + + def __init__(self, column: str = "id", overwrite: bool = False) -> None: + """ + Initialize an AddUUID step. + + Args: + column: Field name to write the UUID into. + overwrite: If True, replace existing values in the target column. + + Examples: + >>> AddUUID() + >>> AddUUID(column="example_id", overwrite=True) + """ + super().__init__() + self._column = column + self._overwrite = overwrite + + def process(self, records: Iterable[Record]) -> Iterable[Record]: + """Add UUIDs while preserving all other fields.""" + for record in records: + if self._column in record and not self._overwrite: + yield record + else: + yield {**record, self._column: str(uuid.uuid4())} + + class Filter(Step): """Keep or drop records based on conditions.""" diff --git a/docs/api.md b/docs/api.md index edef161..45857e2 100644 --- a/docs/api.md +++ b/docs/api.md @@ -36,6 +36,7 @@ from datafast import Source, LLMStep, Sink, openrouter ## Data Operations - `Sample` +- `AddUUID` - `Map` - `FlatMap` - `Filter` diff --git a/docs/cookbook/assets/index.md b/docs/cookbook/assets/index.md new file mode 100644 index 0000000..7f69923 --- /dev/null +++ b/docs/cookbook/assets/index.md @@ -0,0 +1,80 @@ +# Cookbook Assets + +Prompt files and dataset details used by cookbook examples. + +## Text Classification + +### Dataset + +- **Source:** seed dimensions created with `Seed.product` +- **Dimensions:** label, trail type, style, language, and model +- **Local output:** `examples/outputs/45_text_classification_cookbook.jsonl` +- **Checkpoints:** `examples/checkpoints/45_text_classification_cookbook` +- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1` + +This cookbook models variation directly as seed dimensions so the label, trail +type, style, language, and model are all explicit in the +pipeline. + +### Prompt + +| File | Style | +| --- | --- | +| [text_classification_generation.txt](text_classification_generation.txt) | One short trail report per call, with label, trail type, style, and language injected | + +## Persona Generation + +### Dataset + +- **Source:** `xsum` (Hugging Face), `validation` split +- **Fields used:** `id`, `document`, `summary` +- **Filter:** 300–500 words, first 100 matches +- **Local output:** `examples/outputs/43_persona_cookbook.jsonl` +- **Checkpoints:** `examples/checkpoints/43_persona_cookbook` +- **Hub output:** set `HF_REPO_ID` and the `repo_id` in `push_records_to_hub()` to repos under your own Hugging Face username or organization + +The example keeps first-match sampling for reproducibility. For local JSONL corpora with metadata such as `document_filename`, stratified sampling is usually a better fit. + +### Prompt Variants + +Each LLM step picks one prompt at random per record. The script also assigns random `life_stage` and `related_life_stage` values before the corresponding LLM steps. Multiple variants add diversity. + +#### Text-to-Persona + +| File | Style | +| --- | --- | +| [text_to_persona_v1.txt](text_to_persona_v1.txt) | Direct inference of a reader persona | +| [text_to_persona_v2.txt](text_to_persona_v2.txt) | XML-tagged source text, writer/reader framing | +| [text_to_persona_v3.txt](text_to_persona_v3.txt) | System-role preamble, search-interest angle | + +#### Persona-to-Persona + +| File | Style | +| --- | --- | +| [persona_to_persona_v1.txt](persona_to_persona_v1.txt) | Close relationship, standalone description | +| [persona_to_persona_v2.txt](persona_to_persona_v2.txt) | Rule-list format, explicit separation of description and relationship | +| [persona_to_persona_v3.txt](persona_to_persona_v3.txt) | XML-tagged input, concise vivid output | + +### Provenance + +- Text-to-Persona and Persona-to-Persona prompts are paper-aligned adaptations. The Persona Hub paper states its published prompts are simplified, not exact. +- No Persona Hub code is reused. The workflow is built with datafast primitives. + +## Space Engineering Text Generation + +### Dataset + +- **Source:** seed dimensions created with `Seed.product` +- **Dimensions:** document type, topic, expertise level, and language +- **Local output:** `examples/outputs/44_space_text_generation_cookbook.jsonl` +- **Checkpoints:** `examples/checkpoints/44_space_text_generation_cookbook` +- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1` + +### Prompt + +The text-generation cookbook uses one compact prompt and relies on seed +dimensions for variation. + +| File | Style | +| --- | --- | +| [space_text_generation.txt](space_text_generation.txt) | Minimal variable-driven request | diff --git a/docs/cookbook/assets/persona_to_persona_v1.txt b/docs/cookbook/assets/persona_to_persona_v1.txt new file mode 100644 index 0000000..eabb6d6 --- /dev/null +++ b/docs/cookbook/assets/persona_to_persona_v1.txt @@ -0,0 +1,11 @@ +Given the following persona, infer one other specific persona who is in a close relationship with them. + +Persona: +{persona_description} + +Requirements: +1. Use one clear relationship such as family member, colleague, friend, or neighbor, coach, teacher, married partner. +2. Choose a related persona that adds a meaningfully different life perspective but is still likely to be in close contact with the original persona. +3. Keep the related persona realistic and specific. +4. Don't talk about the orginal person in the description of the related persona, as it should be self-contained description. +5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally. diff --git a/docs/cookbook/assets/persona_to_persona_v2.txt b/docs/cookbook/assets/persona_to_persona_v2.txt new file mode 100644 index 0000000..b4e4adf --- /dev/null +++ b/docs/cookbook/assets/persona_to_persona_v2.txt @@ -0,0 +1,14 @@ +Think of a person who regularly interacts with the following persona in a meaningful way. + +Rules: +- Do not mention the original persona in the description of the related persona. +- Do not mention the relationship between the two personas in the description, only in the relationship_type +- Pick a single, concrete relationship type such as mentor-mentee, colleague, neighbor, supervisor-report, or service provider-client +- The related person should bring a distinctly different viewpoint or expertise, and some uniqueness. +- Keep the description realistic and standalone without mentionning with the original persona. +- The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally. + +Original Persona: +{persona_description} + +Now generate a related persona. \ No newline at end of file diff --git a/docs/cookbook/assets/persona_to_persona_v3.txt b/docs/cookbook/assets/persona_to_persona_v3.txt new file mode 100644 index 0000000..9652161 --- /dev/null +++ b/docs/cookbook/assets/persona_to_persona_v3.txt @@ -0,0 +1,16 @@ +Here is the description of someone: + +{persona_description} + + +Come up with one other description of an individual who could be part of this persona's life. +We want the description to be detailed but super concise (max 2 sentences) and vivid. +But we want to have the a standalone description of that new persona without mentioning the original persona or a reason in the description. + +Requirements: +1. Define a clear interpersonal link such as friend, advisor, competitor, family member, or collaborator. +2. The new persona should offer a complementary or contrasting perspective. +3. Make the related persona vivid and believable, avoid generic archetypes. +4. Describe the relation in relationship_type field, not in the description. +5. The related persona must be {related_life_stage}. Do not state a precise age, just reflect this life stage naturally. + diff --git a/docs/cookbook/assets/space_text_generation.txt b/docs/cookbook/assets/space_text_generation.txt new file mode 100644 index 0000000..ca5af4b --- /dev/null +++ b/docs/cookbook/assets/space_text_generation.txt @@ -0,0 +1 @@ +Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}. diff --git a/docs/cookbook/assets/text_classification_generation.txt b/docs/cookbook/assets/text_classification_generation.txt new file mode 100644 index 0000000..24d28c0 --- /dev/null +++ b/docs/cookbook/assets/text_classification_generation.txt @@ -0,0 +1,13 @@ +Write one realistic hiker report in {language_name}. + +Target category: {label} +Category definition: {label_description} + +Constraints: +- The report must clearly match the target category. +- The setting must be a {trail_type}. +- The writing style must be {style}. +- Keep it to 1 or 2 sentences. +- Do not mention the category name directly. +- Do not use bullets, numbering, or explanations. +- Make the report concrete and varied. diff --git a/docs/cookbook/assets/text_to_persona_v1.txt b/docs/cookbook/assets/text_to_persona_v1.txt new file mode 100644 index 0000000..cd09909 --- /dev/null +++ b/docs/cookbook/assets/text_to_persona_v1.txt @@ -0,0 +1,17 @@ +Infer one specific persona who is likely to read text. + +Source text: +{document} + +Requirements: +1. Return a single persona, not a group. +2. Make the persona specific and fine-grained rather than generic. +3. Ground the persona in signals from the text such as domain, expertise, context, or likely motivation. +4. Do not quote the source text in the persona field. +5. Only write 1 or 2 sentences maximum. +6. The persona is not the subject of the text, but rather someone who would be reading it. +7. Do not refer to the source text, article, or its content in the persona description. The persona must be self-contained. +8. The persona must be {life_stage}. Do not mention a precise age, just reflect this life stage naturally. + +Now figure out a persona description who would be reading this text. + diff --git a/docs/cookbook/assets/text_to_persona_v2.txt b/docs/cookbook/assets/text_to_persona_v2.txt new file mode 100644 index 0000000..294577d --- /dev/null +++ b/docs/cookbook/assets/text_to_persona_v2.txt @@ -0,0 +1,16 @@ + +{document} + + +Identify one precise individual who would naturally encounter or write the . + +Requirements: +1. Describe exactly one person. +2. Be as specific as possible: mention plausible occupation and/or life situation. +3. Derive the persona strictly from cues in the text such as topic, jargon, tone, or implied audience as a potential writter / reader of this text. +4. Do not copy or paraphrase the source text in the persona field. +5. Only return 1 or 2 sentences maximum. +6. The described person is not the subject of the text, but rather someone who would be encountering or writing such text as part of their life. +7. Do not reference the source text, article, or its content in the persona description. The persona must stand on its own. +8. The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally. + diff --git a/docs/cookbook/assets/text_to_persona_v3.txt b/docs/cookbook/assets/text_to_persona_v3.txt new file mode 100644 index 0000000..3ccb077 --- /dev/null +++ b/docs/cookbook/assets/text_to_persona_v3.txt @@ -0,0 +1,17 @@ +You are a persona inference assistant. + +Based on the text content below, imagine one real person who would be interested in searching about the topic from this content. + +Rules: +- Output a single, concrete persona rather than a broad demographic. +- Include details like professional background, interests, or situational context that make the persona feel authentic. +- Don't mention the person search or information retrieval action in the persona description, just describe the persona which could explain their interest in the topic. +- Keep it super short and concise. +- Do not mention or refer to the source text, article, or its content in the persona description. The persona must be self-contained. +- The persona must be {life_stage}. Do not state a precise age, just reflect this life stage naturally. + +Source text: +{document} + + + diff --git a/docs/cookbook/index.md b/docs/cookbook/index.md new file mode 100644 index 0000000..1b745ec --- /dev/null +++ b/docs/cookbook/index.md @@ -0,0 +1,16 @@ +# Cookbook + +Cookbooks connect a runnable script to a documentation walkthrough. + +The Python script is the source of truth. Each cookbook page explains: + +- where the executable example lives +- what inputs it uses +- which prompt assets it depends on +- where it writes its output artifacts + +## Available Cookbooks + +- [Text Classification](text_classification.md): generate a multilingual trail-conditions classification dataset from explicit seed dimensions. +- [Persona Generation](persona_generation.md): infer personas from real articles and expand them through relationships using randomized prompt variants. +- [Space Engineering Text Generation](space_text_generation.md): generate a raw multilingual technical text corpus from seed dimensions. diff --git a/docs/cookbook/persona_generation.md b/docs/cookbook/persona_generation.md new file mode 100644 index 0000000..f314a39 --- /dev/null +++ b/docs/cookbook/persona_generation.md @@ -0,0 +1,89 @@ +# Persona Generation + +Build personas from real articles and expand them through relationships. Inspired by the Persona Hub paper, implemented entirely with datafast. + +## Source + +- **Script:** `examples/scripts/43_cookbook_persona_generation.py` +- **Prompt assets:** [asset index](assets/index.md) +- **Local output:** `examples/outputs/43_persona_cookbook.jsonl` +- **Checkpoints:** `examples/checkpoints/43_persona_cookbook` +- **Hub output:** pushed to the Hugging Face Hub repo IDs configured in the script + +## Pipeline + +1. Load `xsum` articles (`validation` split), preserving the dataset `id`. +2. Filter to documents between 300 and 500 words. Keep the first 100 matches. +3. Assign a random life stage to the source persona. +4. **Text-to-Persona** — infer one persona from each article and life stage. +5. Assign a random life stage to the related persona. +6. **Persona-to-Persona** — expand that persona into a related individual. +7. Keep the final output fields, add a row UUID, write JSONL, checkpoint progress, and push results to Hugging Face Hub. + +Each LLM step randomly picks one prompt variant per record using `Sample(prompts, n=1)`. This adds diversity across generations. + +The cookbook keeps `Sample(n=100, strategy="first")` so runs are deterministic and easy to compare. For local corpora with source metadata, use stratified sampling, for example `Sample(n=250, strategy="stratified", by="document_filename")`, to avoid over-representing one source file. + +```text +xsum article + │ + ▼ +life_stage (random from configured stages) + │ + ▼ +Text-to-Persona (random prompt from 3 variants) + │ + ▼ +related_life_stage (random from configured stages) + │ + ▼ +Persona-to-Persona (random prompt from 3 variants) + │ + ▼ +Hugging Face Hub +``` + +## Run + +Prerequisites: + +- `OPENROUTER_API_KEY` set in a `.env` file +- Hugging Face authentication via `HF_TOKEN` in `.env` or a cached `huggingface_hub` login +- Base dependencies from `pyproject.toml` installed + +Before running, replace the example Hugging Face namespaces in the script with your own username or organization: + +- `HF_REPO_ID = "/new-persona-cookbook-dataset"` controls the private pipeline sink. +- `repo_id = "/datafast-persona-cookbook"` inside `push_records_to_hub()` controls the public publish step. + +```bash +python examples/scripts/43_cookbook_persona_generation.py +``` + +The run uses `checkpoint_dir` and `resume=True`, which is useful for paid or rate-limited LLM calls. If a run is interrupted, re-run the same command to continue from the saved checkpoints. + +The main example reads from Hugging Face. For a local JSONL corpus, replace `Source.huggingface(...)` with `Source.file(...)` and map your text column to `document` before `add_word_count`. + +## Prompt Variants + +Each step draws from multiple prompt files stored under `docs/cookbook/assets/`. See the [asset index](assets/index.md) for the full list. + +- **Text-to-Persona:** 3 variants (`text_to_persona_v1.txt`, `v2`, `v3`) +- **Persona-to-Persona:** 3 variants (`persona_to_persona_v1.txt`, `v2`, `v3`) + +## Research Basis + +The Persona Hub paper introduces Text-to-Persona and Persona-to-Persona as scalable methods for building personas from web text. The paper states that its published prompts are simplified, not the exact experiment strings. This cookbook treats them as paper-aligned adaptations. It does not reuse any Persona Hub code. + +## Output Fields + +- `id` — generated row UUID +- `source_id` — original XSum record identifier +- `summary` — original article summary +- `document` — source article text +- `word_count` — whitespace token count +- `life_stage` — randomly selected life stage for the inferred persona +- `persona_description` — inferred persona +- `relationship_type` — link between the two personas +- `related_life_stage` — randomly selected life stage for the expanded persona +- `related_persona_description` — the expanded related persona diff --git a/docs/cookbook/space_text_generation.md b/docs/cookbook/space_text_generation.md new file mode 100644 index 0000000..92c55dc --- /dev/null +++ b/docs/cookbook/space_text_generation.md @@ -0,0 +1,103 @@ +# Space Engineering Text Generation + +Build a raw technical text corpus across document types, topics, expertise levels, +languages, and model choices. + +## Source + +- **Script:** `examples/scripts/44_cookbook_space_text_generation.py` +- **Prompt assets:** [asset index](assets/index.md) +- **Local output:** `examples/outputs/44_space_text_generation_cookbook.jsonl` +- **Checkpoints:** `examples/checkpoints/44_space_text_generation_cookbook` +- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1` + +## Pipeline + +1. Create a seed grid with `Seed.product`. +2. Cross document types, topics, and expertise levels explicitly. +3. Generate one section per seed and language with `LLMStep`. +4. Let the prompt variables drive the corpus variation. +5. Parse `title` and `text` from JSON mode. +6. Keep publication fields, add a row UUID, write JSONL, checkpoint progress, + and optionally push to Hugging Face Hub. + +The default model is `nvidia/nemotron-3-super-120b-a12b:nitro` through +OpenRouter. + +```text +document_type x topic x expertise_level + | + v +LLMStep language expansion: English and French + | + v +JSON fields: title, text + | + v +examples/outputs/44_space_text_generation_cookbook.jsonl +``` + +## Row Count + +The default script generates: + +```text +3 document types x 8 topics x 3 expertise levels x 2 languages +x 1 generated output x 1 model = 144 rows +``` + +To use several models, add provider IDs to `MODEL_IDS`. `LLMStep` will run each +seed-language combination through every model and the row count will multiply by +the number of models. + +## Run + +Prerequisites: + +- `OPENROUTER_API_KEY` set in a `.env` file +- Base dependencies from `pyproject.toml` installed +- Hugging Face authentication only if publishing + +```bash +python examples/scripts/44_cookbook_space_text_generation.py +``` + +To publish, replace `HF_REPO_ID` in the script with a repository under your own +Hugging Face username or organization, then run: + +```bash +DATAFAST_PUSH_TO_HUB=1 python examples/scripts/44_cookbook_space_text_generation.py +``` + +The run uses `checkpoint_dir` and `resume=True`. If generation is interrupted, +run the command again to continue from saved checkpoints. + +## Prompt + +The script uses one compact prompt file: + +```text +Write one {document_type} excerpt about {topic} for {expertise_level} in {language_name}. +``` + +## Generation Controls + +- `MODEL_IDS` controls which models generate each record. +- `LANGUAGES` controls language expansion and writes the emitted language code to + the `language` field. +- `NUM_OUTPUTS` controls how many generated rows are created for each + seed, language, and model combination. +- `PROMPT_PATH` controls the prompt file used for generation. +- `SEED` controls deterministic dataset splitting when publishing. +- `HF_REPO_ID` controls the optional Hugging Face Hub destination. + +## Output Fields + +- `id` - generated row UUID +- `document_type` - requested document style +- `topic` - space engineering topic +- `expertise_level` - intended reader level +- `language` - language code emitted by `LLMStep` +- `model` - model ID emitted by `LLMStep` +- `title` - generated section title +- `text` - generated corpus text diff --git a/docs/cookbook/text_classification.md b/docs/cookbook/text_classification.md new file mode 100644 index 0000000..819523d --- /dev/null +++ b/docs/cookbook/text_classification.md @@ -0,0 +1,118 @@ +# Text Classification + +Build a multilingual trail-conditions classification dataset with `datafast`. + +## Source + +- **Script:** `examples/scripts/45_cookbook_text_classification.py` +- **Prompt assets:** [asset index](assets/index.md) +- **Local output:** `examples/outputs/45_text_classification_cookbook.jsonl` +- **Checkpoints:** `examples/checkpoints/45_text_classification_cookbook` +- **Hub output:** optional, controlled by `DATAFAST_PUSH_TO_HUB=1` + +## Use Case + +This cookbook generates short hiker reports across four trail-condition labels +so teams can monitor trail quality and surface issues quickly. + +The default setup is: + +- multi-class: 4 trail-condition labels +- multi-lingual: English and French +- multi-model: two generation models by default +- publishable: optional push to Hugging Face Hub + +## Pipeline + +1. Create a seed grid from labels, trail types, and writing styles. +2. Generate one short hiker report for each seed across all configured models + and languages. +3. Keep the label and prompt-variation provenance in flat output columns. +4. Add a UUID, write JSONL locally, and optionally push to Hugging Face Hub. + +Variation is modeled explicitly through `Seed.product(...)`, which keeps the +generation axes inspectable and easy to count. + +```text +label x trail_type x style + | + v +LLMStep language expansion: English and French + | + v +LLMStep model expansion + | + v +examples/outputs/45_text_classification_cookbook.jsonl +``` + +## Row Count + +The default script generates: + +```text +4 labels x 3 trail types x 2 styles x 2 languages +x 2 models = 96 rows +``` + +Each extra model in `MODEL_IDS` multiplies the total row count. + +## Run + +Prerequisites: + +- `OPENROUTER_API_KEY` set in a `.env` file +- Base dependencies from `pyproject.toml` installed +- Hugging Face authentication only if publishing + +```bash +python examples/scripts/45_cookbook_text_classification.py +``` + +To publish, replace `HF_REPO_ID` in the script with a repository under your own +Hugging Face username or organization, then run: + +```bash +DATAFAST_PUSH_TO_HUB=1 python examples/scripts/45_cookbook_text_classification.py +``` + +The run uses `checkpoint_dir` and `resume=True`. If generation is interrupted, +run the command again to continue from saved checkpoints. + +If you want to use provider-specific clients directly, replace `MODEL_IDS` or +the `model=MODELS` argument in `LLMStep` with providers such as `openai(...)` +or `anthropic(...)`. The default setup uses multiple OpenRouter-backed models +so it works with one API key. + +## Prompt + +The cookbook uses one prompt file and drives diversity through seed dimensions: + +```text +Write one realistic hiker report in {language_name}. +``` + +See [text_classification_generation.txt](assets/text_classification_generation.txt) +for the full prompt. + +## Generation Controls + +- `LABELS` defines the target classes and their prompt descriptions. +- `TRAIL_TYPES` controls the trail settings used in generation. +- `STYLES` controls the voice and format of each report. +- `LANGUAGES` controls language expansion. +- `MODEL_IDS` controls which models generate records. +- `HF_REPO_ID` controls the optional Hugging Face Hub destination. + +If you want an extra quality-control pass, add a downstream `Classify` and +`Filter` stage to verify that generated reports match their intended label. + +## Output Fields + +- `id` - generated row UUID +- `label` - target trail-condition label +- `trail_type` - prompt expansion axis for the trail setting +- `style` - prompt expansion axis for the report style +- `language` - language code emitted by `LLMStep` +- `model` - model ID emitted by `LLMStep` +- `text` - generated hiker report diff --git a/docs/guides/building_pipelines.md b/docs/guides/building_pipelines.md index 64aaaf2..b755410 100644 --- a/docs/guides/building_pipelines.md +++ b/docs/guides/building_pipelines.md @@ -3,11 +3,12 @@ ## Minimal Pipeline ```python -from datafast import Map, Sink, Source +from datafast import AddUUID, Map, Sink, Source pipeline = ( Source.list([{"text": "hello"}]) >> Map(lambda r: {**r, "length": len(r["text"])}) + >> AddUUID() >> Sink.list() ) @@ -38,6 +39,7 @@ seed = Seed.product( ## Core Data Operations +- `AddUUID`: add a UUID field to each record - `Map`: one record in, one record out - `FlatMap`: one record in, many records out - `Filter`: keep or drop records diff --git a/examples/scripts/43_cookbook_persona_generation.py b/examples/scripts/43_cookbook_persona_generation.py new file mode 100644 index 0000000..ac4f718 --- /dev/null +++ b/examples/scripts/43_cookbook_persona_generation.py @@ -0,0 +1,137 @@ +"""Persona-generation cookbook: XSum article -> personas -> related personas. + +Demonstrates: Source.huggingface, Map, Filter, Sample, JSON-mode LLMSteps, +and prompt assets stored under docs/cookbook/assets. + +Requires: +- OPENROUTER_API_KEY +- Hugging Face authentication via HF_TOKEN or a cached `huggingface_hub` login +- network access to Hugging Face and OpenRouter +""" + +import random + +from dotenv import load_dotenv + +from datafast import AddUUID, Filter, LLMStep, Map, Sample, Sink, Source, openrouter + +import litellm + +load_dotenv() + +litellm.suppress_debug_info = True + + +MODEL_ID = "nvidia/nemotron-3-super-120b-a12b:nitro" +OUTPUT_PATH = "examples/outputs/43_persona_cookbook.jsonl" +CHECKPOINT_DIR = "examples/checkpoints/43_persona_cookbook" +HF_REPO_ID = "patrickfleith/new-persona-cookbook-dataset" +TEXT_TO_PERSONA_PROMPTS = [ + "docs/cookbook/assets/text_to_persona_v1.txt", + "docs/cookbook/assets/text_to_persona_v2.txt", + "docs/cookbook/assets/text_to_persona_v3.txt", +] +PERSONA_TO_PERSONA_PROMPTS = [ + "docs/cookbook/assets/persona_to_persona_v1.txt", + "docs/cookbook/assets/persona_to_persona_v2.txt", + "docs/cookbook/assets/persona_to_persona_v3.txt", +] +LIFE_STAGES = [ + "a teenager", + "a young adult", + "an adult (30s/40s)", + "a middle-aged person (in their 50s/60s)", + "a senior person (in their 70s/80s)", +] + + +def add_word_count(record: dict) -> dict: + return {**record, "word_count": len(record["document"].split())} + + +def assign_life_stage(record: dict) -> dict: + return {**record, "life_stage": random.choice(LIFE_STAGES)} + + +def assign_related_life_stage(record: dict) -> dict: + return {**record, "related_life_stage": random.choice(LIFE_STAGES)} + + +def keep_output_fields(record: dict) -> dict: + return { + "source_id": record["id"], + "summary": record["summary"], + "document": record["document"], + "word_count": record["word_count"], + "life_stage": record["life_stage"], + "persona_description": record["persona_description"], + "relationship_type": record["relationship_type"], + "related_life_stage": record["related_life_stage"], + "related_persona_description": record["related_persona_description"], + } + + +def build_pipeline(): + model = openrouter(MODEL_ID, temperature=0.7) + + return ( + Source.huggingface( + "xsum", + split="validation", + columns=["id", "document", "summary"], + ) + # For a local JSONL corpus, replace the Hugging Face source with something + # like Source.file("data/articles.jsonl") and map your text field to + # "document" before add_word_count. + >> Map(add_word_count).as_step("add_word_count") + >> Filter(fn=lambda r: 300 <= r["word_count"] <= 500).as_step("filter_word_count") + >> Sample(n=10, strategy="first").as_step("take_first_100") + >> Map(assign_life_stage).as_step("assign_life_stage") + >> LLMStep( + prompt=Sample(TEXT_TO_PERSONA_PROMPTS, n=1), + input_columns=["document", "life_stage"], + output_columns=["persona_description"], + model=model, + parse_mode="json", + on_parse_error="raise", + ).as_step("text_to_persona") + >> Map(assign_related_life_stage).as_step("assign_related_life_stage") + >> LLMStep( + prompt=Sample(PERSONA_TO_PERSONA_PROMPTS, n=1), + input_columns=["persona_description", "related_life_stage"], + output_columns=["relationship_type", "related_persona_description"], + model=model, + parse_mode="json", + on_parse_error="raise", + ).as_step("persona_to_persona") + >> Map(keep_output_fields).as_step("keep_output_fields") + >> AddUUID(column="id", overwrite=True).as_step("add_uuid") + >> Sink.jsonl(OUTPUT_PATH) + >> Sink.hub(HF_REPO_ID, private=True) +) + + +def push_records_to_hub(records: list[dict]) -> None: + repo_id = "patrickfleith/datafast-persona-cookbook" + private = False + + list( + Sink.hub( + repo_id=repo_id, + private=private, + commit_message=f"Publish cookbook 43 persona dataset with {MODEL_ID}", + ).process(records) + ) + + +def main() -> None: + records = build_pipeline().run( + batch_size=1, + checkpoint_dir=CHECKPOINT_DIR, + resume=False, + ) + push_records_to_hub(records) + + +if __name__ == "__main__": + main() diff --git a/examples/scripts/44_cookbook_space_text_generation.py b/examples/scripts/44_cookbook_space_text_generation.py new file mode 100644 index 0000000..6c5d2cb --- /dev/null +++ b/examples/scripts/44_cookbook_space_text_generation.py @@ -0,0 +1,143 @@ +"""Space text-generation cookbook: seed grid -> technical text corpus. + +Demonstrates: Seed.product, LLMStep JSON mode, multi-language generation, +num_outputs, checkpointing, JSONL output, and optional Hub push. + +Requires: +- OPENROUTER_API_KEY +- Hugging Face authentication only if DATAFAST_PUSH_TO_HUB=1 +- network access to OpenRouter, and to Hugging Face when publishing +""" + +from __future__ import annotations + +import os + +import litellm +from dotenv import load_dotenv + +from datafast import AddUUID, LLMStep, Map, Seed, Sink, openrouter + +load_dotenv() +litellm.suppress_debug_info = True + + +SEED = 20250304 +MODEL_IDS = ["nvidia/nemotron-3-super-120b-a12b:nitro"] +OUTPUT_PATH = "examples/outputs/44_space_text_generation_cookbook.jsonl" +CHECKPOINT_DIR = "examples/checkpoints/44_space_text_generation_cookbook" +HF_REPO_ID = "patrickfleith/datafast-space-text-generation-cookbook" +NUM_OUTPUTS = 1 +PROMPT_PATH = "docs/cookbook/assets/space_text_generation.txt" + +DOCUMENT_TYPES = [ + "space engineering textbook", + "spacecraft design justification document", + "personal blog of a space engineer", +] + +TOPICS = [ + "Microgravity", + "Vacuum", + "Heavy Ions", + "Thermal Extremes", + "Atomic Oxygen", + "Debris Impact", + "Electrostatic Charging", + "Propellant Boil-off", +] + +EXPERTISE_LEVELS = [ + "executives", + "senior engineers", + "PhD candidates", +] + +LANGUAGES = { + "en": "English", + "fr": "French", +} + + +def make_models(): + return [openrouter(model_id, temperature=0.7) for model_id in MODEL_IDS] + + +def expected_row_count(model_count: int | None = None) -> int: + """Return the number of rows this configuration should generate.""" + model_total = len(MODEL_IDS) if model_count is None else model_count + return ( + len(DOCUMENT_TYPES) + * len(TOPICS) + * len(EXPERTISE_LEVELS) + * len(LANGUAGES) + * NUM_OUTPUTS + * model_total + ) + + +def finalize_record(record: dict) -> dict: + """Keep the columns meant for publication.""" + return { + "document_type": record["document_type"], + "topic": record["topic"], + "expertise_level": record["expertise_level"], + "language": record.get("_language", ""), + "model": record.get("_model", ""), + "title": record["title"], + "text": record["text"], + } + + +def build_pipeline(): + return ( + Seed.product( + Seed.values("document_type", DOCUMENT_TYPES), + Seed.values("topic", TOPICS), + Seed.values("expertise_level", EXPERTISE_LEVELS), + ).as_step("seed_space_text_grid") + >> LLMStep( + prompt=PROMPT_PATH, + input_columns=["document_type", "topic", "expertise_level"], + output_columns=["title", "text"], + parse_mode="json", + model=make_models(), + language=LANGUAGES, + num_outputs=NUM_OUTPUTS, + on_parse_error="raise", + ).as_step("generate_space_text") + >> Map(finalize_record).as_step("finalize_record") + >> AddUUID(column="id", overwrite=True).as_step("add_uuid") + >> Sink.jsonl(OUTPUT_PATH) + ) + + +def push_records_to_hub(records: list[dict]) -> None: + list( + Sink.hub( + repo_id=HF_REPO_ID, + private=True, + train_size=0.8, + seed=SEED, + shuffle=True, + commit_message=f"Publish cookbook 44 text dataset with {', '.join(MODEL_IDS)}", + ).process(records) + ) + + +def main() -> None: + print(f"Expected rows: {expected_row_count()}") + records = build_pipeline().run( + batch_size=4, + checkpoint_dir=CHECKPOINT_DIR, + resume=True, + ) + + if os.getenv("DATAFAST_PUSH_TO_HUB") == "1": + push_records_to_hub(records) + + print(f"Wrote {len(records)} records to {OUTPUT_PATH}") + + +if __name__ == "__main__": + main() diff --git a/examples/scripts/45_cookbook_text_classification.py b/examples/scripts/45_cookbook_text_classification.py new file mode 100644 index 0000000..01b7e5a --- /dev/null +++ b/examples/scripts/45_cookbook_text_classification.py @@ -0,0 +1,158 @@ +"""Text-classification cookbook: seed grid -> multilingual trail reports. + +Demonstrates: Seed.product, prompt expansion via seed dimensions, multi-model +generation, multi-language generation, checkpointing, JSONL output, and +optional Hugging Face Hub publishing. + +Requires: +- OPENROUTER_API_KEY +- Hugging Face authentication only if DATAFAST_PUSH_TO_HUB=1 +- network access to OpenRouter, and to Hugging Face when publishing +""" + +from __future__ import annotations + +import os + +import litellm +from dotenv import load_dotenv + +from datafast import AddUUID, LLMStep, Map, Seed, SeedDimension, Sink, openrouter + +load_dotenv() +litellm.suppress_debug_info = True + + +SEED = 20250611 +MODEL_IDS = [ + "nvidia/nemotron-3-super-120b-a12b:nitro", + "mistralai/ministral-14b-2512", +] +OUTPUT_PATH = "examples/outputs/45_text_classification_cookbook.jsonl" +CHECKPOINT_DIR = "examples/checkpoints/45_text_classification_cookbook" +HF_REPO_ID = "patrickfleith/datafast-text-classification-cookbook" +PROMPT_PATH = "docs/cookbook/assets/text_classification_generation.txt" + +LABELS = [ + { + "label": "trail_obstruction", + "label_description": ( + "The trail is partially or fully blocked by obstacles such as " + "fallen trees, landslides, snow, flooding, erosion, or dense " + "vegetation." + ), + }, + { + "label": "infrastructure_issues", + "label_description": ( + "The report is about damaged or missing bridges, signs, stairs, " + "handrails, markers, boardwalks, or similar trail infrastructure." + ), + }, + { + "label": "hazards", + "label_description": ( + "The trail has immediate safety risks such as slippery surfaces, " + "dangerous crossings, unstable terrain, wildlife threats, or " + "other hazardous conditions." + ), + }, + { + "label": "positive_conditions", + "label_description": ( + "The report highlights clear, safe, enjoyable trail conditions " + "such as good maintenance, solid infrastructure, clear signage, " + "or scenic features." + ), + }, +] + +TRAIL_TYPES = [ + "mountain trail", + "coastal path", + "forest walk", +] + +STYLES = [ + "a brief social media post", + "a hiking review", +] + +LANGUAGES = { + "en": "English", + "fr": "French", +} + +MODELS = [openrouter(model_id, temperature=0.8) for model_id in MODEL_IDS] +EXPECTED_ROWS = ( + len(LABELS) + * len(TRAIL_TYPES) + * len(STYLES) + * len(LANGUAGES) + * len(MODELS) +) + + +def keep_output_fields(record: dict) -> dict: + """Keep only the fields meant for publication.""" + return { + "label": record["label"], + "trail_type": record["trail_type"], + "style": record["style"], + "language": record.get("_language", ""), + "model": record.get("_model", ""), + "text": record["text"], + } + + +pipeline = ( + Seed.product( + SeedDimension( + columns=["label", "label_description"], + values=LABELS, + ), + Seed.values("trail_type", TRAIL_TYPES), + Seed.values("style", STYLES), + ).as_step("seed_trail_report_grid") + >> LLMStep( + prompt=PROMPT_PATH, + input_columns=["label", "label_description", "trail_type", "style"], + output_column="text", + parse_mode="text", + model=MODELS, + language=LANGUAGES, + ).as_step("generate_trail_reports") + >> Map(keep_output_fields).as_step("keep_output_fields") + >> AddUUID(column="id", overwrite=True).as_step("add_uuid") + >> Sink.jsonl(OUTPUT_PATH) +) + + +def main() -> None: + print(f"Expected rows: {EXPECTED_ROWS}") + records = pipeline.run( + batch_size=4, + checkpoint_dir=CHECKPOINT_DIR, + resume=True, + ) + + if os.getenv("DATAFAST_PUSH_TO_HUB") == "1": + list( + Sink.hub( + repo_id=HF_REPO_ID, + private=False, + train_size=0.8, + seed=SEED, + shuffle=True, + commit_message=( + "Publish cookbook 45 classification dataset with " + f"{', '.join(MODEL_IDS)}" + ), + ).process(records) + ) + + print(f"Wrote {len(records)} records to {OUTPUT_PATH}") + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml index 87e795a..131400c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,6 +43,11 @@ nav: - LLM Steps: guides/llm_steps.md - Checkpointing: guides/checkpointing.md - Langfuse Tracing: guides/langfuse_tracing.md + - Cookbook: + - cookbook/index.md + - Text Classification: cookbook/text_classification.md + - Persona Generation: cookbook/persona_generation.md + - Space Engineering Text Generation: cookbook/space_text_generation.md - Providers: llms.md - Models: models.md - API: api.md diff --git a/tests/test_add_uuid.py b/tests/test_add_uuid.py new file mode 100644 index 0000000..e89f837 --- /dev/null +++ b/tests/test_add_uuid.py @@ -0,0 +1,78 @@ +import uuid + +from datafast import AddUUID, LLMStep, Sink, Source + + +def assert_valid_uuid(value: str) -> None: + parsed = uuid.UUID(value) + assert str(parsed) == value + + +def test_add_uuid_adds_id_when_missing(): + records = list(AddUUID().process([{"text": "hello"}])) + + assert records[0]["text"] == "hello" + assert_valid_uuid(records[0]["id"]) + + +def test_add_uuid_preserves_existing_id_by_default(): + records = list(AddUUID().process([{"id": "source-1", "text": "hello"}])) + + assert records == [{"id": "source-1", "text": "hello"}] + + +def test_add_uuid_overwrites_existing_id_when_requested(): + records = list( + AddUUID(overwrite=True).process([{"id": "source-1", "text": "hello"}]) + ) + + assert records[0]["text"] == "hello" + assert records[0]["id"] != "source-1" + assert_valid_uuid(records[0]["id"]) + + +def test_add_uuid_generates_distinct_ids_for_multiple_records(): + records = list(AddUUID().process([{"text": "a"}, {"text": "b"}])) + ids = [record["id"] for record in records] + + assert len(set(ids)) == 2 + for value in ids: + assert_valid_uuid(value) + + +def test_add_uuid_supports_custom_column_name(): + records = list(AddUUID(column="example_id").process([{"text": "hello"}])) + + assert "id" not in records[0] + assert_valid_uuid(records[0]["example_id"]) + + +def test_add_uuid_assigns_unique_ids_to_llm_num_outputs_pipeline(): + class FakeModel: + model_id = "fake-model" + provider_name = "fake" + + def generate(self, messages, metadata=None): + return '{"title": "Generated", "text": "Body"}' + + pipeline = ( + Source.list([{"topic": "vacuum"}]) + >> LLMStep( + prompt="Write about {topic}.", + input_columns=["topic"], + output_columns=["title", "text"], + parse_mode="json", + model=FakeModel(), + num_outputs=2, + ) + >> AddUUID() + >> Sink.list() + ) + + records = pipeline.run() + ids = [record["id"] for record in records] + + assert len(records) == 2 + assert len(set(ids)) == 2 + for value in ids: + assert_valid_uuid(value) diff --git a/tests/test_llms_unit.py b/tests/test_llms_unit.py new file mode 100644 index 0000000..a6e5674 --- /dev/null +++ b/tests/test_llms_unit.py @@ -0,0 +1,80 @@ +import datafast.llms as llms_module +from datafast.llms import OpenRouterProvider + + +class _DummyMessage: + def __init__(self, content: str) -> None: + self.content = content + + +class _DummyChoice: + def __init__(self, content: str) -> None: + self.message = _DummyMessage(content) + + +class _DummyResponse: + def __init__(self, content: str) -> None: + self.choices = [_DummyChoice(content)] + + +def test_openrouter_single_messages_use_completion(monkeypatch): + monkeypatch.setattr(llms_module, "load_env_once", lambda: None) + monkeypatch.setattr( + llms_module, + "maybe_configure_langfuse_tracing", + lambda load_env=False: False, + ) + + calls = {"completion": 0, "batch_completion": 0} + + def fake_completion(**kwargs): + calls["completion"] += 1 + assert kwargs["messages"] == [{"role": "user", "content": "ping"}] + return _DummyResponse("ok") + + def fake_batch_completion(**kwargs): + calls["batch_completion"] += 1 + raise AssertionError("single-message requests should not use batch_completion") + + monkeypatch.setattr(llms_module.litellm, "completion", fake_completion) + monkeypatch.setattr(llms_module.litellm, "batch_completion", fake_batch_completion) + + provider = OpenRouterProvider(model_id="demo-model", api_key="test-key") + + response = provider.generate(messages=[{"role": "user", "content": "ping"}]) + + assert response == "ok" + assert calls == {"completion": 1, "batch_completion": 0} + + +def test_openrouter_batch_messages_use_batch_completion(monkeypatch): + monkeypatch.setattr(llms_module, "load_env_once", lambda: None) + monkeypatch.setattr( + llms_module, + "maybe_configure_langfuse_tracing", + lambda load_env=False: False, + ) + + calls = {"completion": 0, "batch_completion": 0} + + def fake_completion(**kwargs): + calls["completion"] += 1 + raise AssertionError("batched requests should not use completion") + + def fake_batch_completion(**kwargs): + calls["batch_completion"] += 1 + assert len(kwargs["messages"]) == 2 + return [_DummyResponse("first"), _DummyResponse("second")] + + monkeypatch.setattr(llms_module.litellm, "completion", fake_completion) + monkeypatch.setattr(llms_module.litellm, "batch_completion", fake_batch_completion) + + provider = OpenRouterProvider(model_id="demo-model", api_key="test-key") + + response = provider.generate(messages=[ + [{"role": "user", "content": "one"}], + [{"role": "user", "content": "two"}], + ]) + + assert response == ["first", "second"] + assert calls == {"completion": 0, "batch_completion": 1} diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 7eaf787..ac56477 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -1,4 +1,5 @@ from datafast import ( + AddUUID, Branch, Classify, Compare, @@ -70,6 +71,7 @@ def test_factory_exports_are_available(monkeypatch): assert Sink is not None assert Seed is not None assert Sample is not None + assert AddUUID is not None assert Map is not None assert FlatMap is not None assert Filter is not None diff --git a/tests/test_runner_llm_messages.py b/tests/test_runner_llm_messages.py new file mode 100644 index 0000000..d870093 --- /dev/null +++ b/tests/test_runner_llm_messages.py @@ -0,0 +1,47 @@ +from datafast import LLMStep, ListSink, Source + + +def test_runner_passes_llm_messages_by_keyword(): + class FakeModel: + provider_name = "fake" + model_id = "fake-model" + + def __init__(self) -> None: + self.calls: list[dict] = [] + + def generate( + self, + prompt=None, + messages=None, + metadata=None, + response_format=None, + ): + self.calls.append({ + "prompt": prompt, + "messages": messages, + "metadata": metadata, + }) + return "done" + + model = FakeModel() + sink = ListSink() + + pipeline = ( + Source.list([{"topic": "robotics"}]) + >> LLMStep( + prompt="Write one short line about {topic}.", + input_columns=["topic"], + output_column="result", + model=model, + ).as_step("generate_copy") + >> sink + ) + + output = pipeline.run() + + assert output == [{"topic": "robotics", "result": "done", "_model": "fake-model"}] + assert len(model.calls) == 1 + assert model.calls[0]["prompt"] is None + assert model.calls[0]["messages"] == [ + {"role": "user", "content": "Write one short line about robotics."} + ]