From 6912f3e3ce4805f6833577cbb40375410eb9a2d3 Mon Sep 17 00:00:00 2001 From: svonava Date: Tue, 23 Jun 2026 08:27:23 +0000 Subject: [PATCH] examples(contract-review-agent): multi-model contract review with the OpenAI Agents SDK An OpenAI Agents SDK agent whose every model call is served by one SIE cluster. An autonomous 'investigator' agent fans out across the SIE catalog (no structured output_type, so it must use its tools) and a 'synthesizer' agent produces a grounded, structured ContractReview: - triage Qwen3-0.6B, orchestration Qwen3-4B-Instruct, vision Qwen3.5-4B, risk-analysis sub-agent Qwen3-4B-Instruct (newer Qwen3.5-4B / stronger Qwen3.6-27B where the cluster serves them), text-to-SQL sqlcoder-7b-2, OCR LightOnOCR-2-1B, embeddings bge-m3, rerank Qwen3-Reranker-4B, entities gliner_large; granite-guardian input guardrail. - Real contracts from CUAD (CC BY 4.0), with a synthetic offline fallback. - Per-model observability: cold-start warm-up vs warm throughput, per call. - Resilient: fail-open guardrail, graceful tool degradation, and provisioning retries for cold/evicted models. Validated end-to-end against a GPU SIE cluster. --- examples/README.md | 1 + examples/contract-review-agent/.env.example | 4 + examples/contract-review-agent/.gitignore | 8 + examples/contract-review-agent/README.md | 111 ++++++ examples/contract-review-agent/config.yaml | 36 ++ .../contract_review_agent/__init__.py | 6 + .../contract_review_agent/app.py | 114 +++++++ .../contract_review_agent/cli.py | 270 +++++++++++++++ .../contract_review_agent/config.py | 27 ++ .../contract_review_agent/data/__init__.py | 15 + .../data/fetch_contracts.py | 205 +++++++++++ .../contract_review_agent/data/make_sample.py | 290 ++++++++++++++++ .../contract_review_agent/data/render.py | 53 +++ .../contract_review_agent/guardrails.py | 60 ++++ .../contract_review_agent/runtime.py | 235 +++++++++++++ .../contract_review_agent/tools.py | 321 ++++++++++++++++++ examples/contract-review-agent/pyproject.toml | 38 +++ 17 files changed, 1794 insertions(+) create mode 100644 examples/contract-review-agent/.env.example create mode 100644 examples/contract-review-agent/.gitignore create mode 100644 examples/contract-review-agent/README.md create mode 100644 examples/contract-review-agent/config.yaml create mode 100644 examples/contract-review-agent/contract_review_agent/__init__.py create mode 100644 examples/contract-review-agent/contract_review_agent/app.py create mode 100644 examples/contract-review-agent/contract_review_agent/cli.py create mode 100644 examples/contract-review-agent/contract_review_agent/config.py create mode 100644 examples/contract-review-agent/contract_review_agent/data/__init__.py create mode 100644 examples/contract-review-agent/contract_review_agent/data/fetch_contracts.py create mode 100644 examples/contract-review-agent/contract_review_agent/data/make_sample.py create mode 100644 examples/contract-review-agent/contract_review_agent/data/render.py create mode 100644 examples/contract-review-agent/contract_review_agent/guardrails.py create mode 100644 examples/contract-review-agent/contract_review_agent/runtime.py create mode 100644 examples/contract-review-agent/contract_review_agent/tools.py create mode 100644 examples/contract-review-agent/pyproject.toml diff --git a/examples/README.md b/examples/README.md index 980d6a7c..567b849e 100644 --- a/examples/README.md +++ b/examples/README.md @@ -20,6 +20,7 @@ service keys. | [Swap an OCR model with one identifier change](./document-ocr) | Driving recognition (VLM-OCR), structured extraction (Donut), and zero-shot NER (GLiNER) through the same `extract` call by swapping the model ID | `extract` | Docker Compose plus Node UI, no API key required, hosted version on [Hugging Face Spaces](https://huggingface.co/spaces/superlinked/document-ocr) | Runnable demo | | [A Stripe Link checkout with an SIE fraud-risk gate](./stripe-link-fraud) | Wiring all three SIE primitives into a pre-authorization fraud-risk gate that runs in the same round-trip as the Stripe PaymentIntent | `extract`, `encode`, `score` | Docker Compose plus Node UI; Stripe test-mode keys optional (runs in mock mode without them) | Runnable demo | | [Vision-first document RAG](./vision-doc-rag) | Retrieving and answering questions over a multi-tenant page corpus by looking at page images — including scanned drawings — with OCR kept out of the score path | `encode`, `chat/completions`, `score` (optional) | GPU SIE deployment required: ColQwen2.5 retriever + Qwen3.5-4B answer model (runs on the generation bundle) | Runnable demo | +| [Multi-model contract review with the OpenAI Agents SDK](./contract-review-agent) | Running an OpenAI Agents SDK agent whose every model call — triage, orchestration, vision, OCR, embeddings, rerank, entity extraction, text-to-SQL, reasoning, and a safety guardrail — is served by one SIE cluster, each step on the right catalog model, with per-model observability | `chat/completions`, `encode`, `score`, `extract` | GPU SIE deployment required; standalone `uv` project; real contracts fetched from CUAD (CC BY 4.0) | Runnable demo | For docs publishing, lead with the quickest runnable demos, then use the benchmark and evaluation examples for deeper technical users. diff --git a/examples/contract-review-agent/.env.example b/examples/contract-review-agent/.env.example new file mode 100644 index 00000000..b99497bb --- /dev/null +++ b/examples/contract-review-agent/.env.example @@ -0,0 +1,4 @@ +# Point these at any SIE deployment that serves /v1/chat/completions (a GPU +# cluster — see README "Run it"). Defaults match a local CUDA container. +SIE_CLUSTER_URL=http://localhost:8080 +SIE_API_KEY= diff --git a/examples/contract-review-agent/.gitignore b/examples/contract-review-agent/.gitignore new file mode 100644 index 00000000..ea00f029 --- /dev/null +++ b/examples/contract-review-agent/.gitignore @@ -0,0 +1,8 @@ +.venv/ +__pycache__/ +.env +*.log +uv.lock +.ruff_cache/ +# Generated sample artifacts (recreate with `uv run make-sample`) +contract_review_agent/data/generated/ diff --git a/examples/contract-review-agent/README.md b/examples/contract-review-agent/README.md new file mode 100644 index 00000000..db9b8ab7 --- /dev/null +++ b/examples/contract-review-agent/README.md @@ -0,0 +1,111 @@ +# Contract review with the OpenAI Agents SDK, on one SIE cluster + +A multi-agent contract reviewer built with the [OpenAI Agents SDK](https://openai.github.io/openai-agents-python/) where **every model call is served by SIE** — no `api.openai.com`, no per-token bill. An **investigator** agent autonomously calls tools to gather grounded facts, then a **synthesizer** agent turns them into a structured review — each step running on the **right model from the SIE catalog**: a fast triage model, a vision model that reads the scanned signature page, a reasoning sub-agent for clause risk, a text-to-SQL specialist, an OCR model, embedding + reranker models for clause search, a zero-shot entity extractor, and a safety guardrail. Ten specialized jobs, one cluster, one request. + +This is the "one cluster powers every model your agent calls" idea from the [SIE landing page](https://superlinked.com), made real and runnable. + +## The catalog: the right model for each job + +Every value below is a real model in the [SIE catalog](https://superlinked.com/models). Swap any line in `config.yaml` to try another — nothing else changes. + +| Role in the agent | SIE model | SIE function | +|---|---|---| +| Triage — classify the document type | `Qwen/Qwen3-0.6B` | chat | +| **Orchestrator** — plan, call tools, assemble the review | `Qwen/Qwen3-4B-Instruct-2507` (alias `code`) | chat + tools + JSON schema | +| Vision — read the scanned signature page | `Qwen/Qwen3.5-4B` | chat + image | +| Reasoning sub-agent — clause-risk analysis | `Qwen/Qwen3-4B-Instruct-2507` (↑ `Qwen3.5-4B` / `Qwen3.6-27B` where served) | chat | +| Text-to-SQL — query the obligations DB | `defog/sqlcoder-7b-2` | completions | +| Guardrail — safety / prompt-injection | `ibm-granite/granite-guardian-3.0-2b` (alias `guard`) | chat | +| OCR — scanned page → markdown | `lightonai/LightOnOCR-2-1B` | extract | +| Clause search — dense embeddings | `BAAI/bge-m3` | encode | +| Clause rerank — cross-encoder | `Qwen/Qwen3-Reranker-4B` | score | +| Entity extraction — parties, dates, amounts | `urchade/gliner_large-v2.1` | extract | + +## How it works + +The whole trick is one idea: **the Agents SDK speaks the OpenAI wire protocol, and SIE serves an OpenAI-compatible `/v1` endpoint.** So we point the SDK at SIE and force chat completions (`contract_review_agent/runtime.py`): + +```python +client = AsyncOpenAI(base_url="http://localhost:8080/v1", api_key="not-needed") +set_default_openai_client(client) # every agent talks to SIE... +set_default_openai_api("chat_completions") # ...over chat completions, not the Responses API... +set_tracing_disabled(True) # ...and we never phone home with traces. +``` + +After that, each `Agent` just names the SIE model it should run on: + +```python +Agent(name="Risk Analyst", model=OpenAIChatCompletionsModel("Qwen/Qwen3-4B-Instruct-2507", openai_client=client), ...) +``` + +The flow is **two agents** (which is what keeps a small open model reliable): + +1. An **investigator** (on `Qwen3-4B-Instruct`) with seven tools and **no** structured `output_type` — so it can't short-circuit to a hallucinated answer and instead must call tools to learn anything about the contract: + - `classify_document` (triage) · `read_signature_page` (vision) · `analyze_clause_risks` (delegates to the reasoning **sub-agent**) — generative LLMs + - `ocr_signature_page` · `extract_entities` (`extract`), `search_clauses` (`encode` + `score`), `query_obligations_db` (`completions`) — retrieval & extraction + - a `granite-guardian` **input guardrail** screens the request first (and fails open, logged, if the guard model is unavailable). +2. A **synthesizer** (structured `output_type=ContractReview`, no tools) turns the investigator's grounded findings into the final review — parties, dates, governing law, executed?, key obligations, risk flags with severity + redlines, recommendation — via SIE's JSON-schema-constrained generation. + +> Why two agents? With a structured `output_type`, a small model tends to emit the schema immediately and skip the tools (it will even hallucinate the fields). Splitting "gather with tools" from "format the result" keeps the fan-out real and the output grounded. + +## Run it + +You need Python 3.12 and a **GPU-backed SIE deployment** — the generative models run on SIE's generation bundle (CUDA), so the `latest-cpu-default` image can't serve them. + +```bash +# 1. SIE on a local NVIDIA GPU, or point SIE_CLUSTER_URL / SIE_API_KEY at a managed GPU cluster. +docker run --gpus all -p 8080:8080 -v sie-hf-cache:/app/.cache/huggingface \ + ghcr.io/superlinked/sie-server:latest-cuda12-default + +cd examples/contract-review-agent +cp .env.example .env # edit SIE_CLUSTER_URL / SIE_API_KEY if not localhost +uv sync + +# 2. Fetch a handful of real contracts from CUAD (CC BY 4.0). Downloads a ~18 MB archive once. +uv run fetch-contracts # or: uv run make-sample (offline synthetic contracts) + +# 3. Review the first contract and watch the model fan-out. +uv run review # uv run review --list to see available contracts +uv run review --contract # review a specific one +``` + +> **GPU sizing.** `reasoning` defaults to `Qwen/Qwen3-4B-Instruct-2507` (reliable, fast) so the demo +> runs on a single mid-size GPU; swap in the newer `Qwen/Qwen3.5-4B` or the stronger `Qwen/Qwen3.6-27B` (H100/RTX PRO 6000) where the cluster serves them. A cold +> cluster pays a one-time load per model on first use; the agent retries the "still +> provisioning" responses under `cluster.provision_timeout_s`. Keep bundles warm +> (`minReplicas: 1`) to skip the wait — and any model the cluster can't serve degrades +> gracefully (logged in the ledger) instead of failing the run. + +## What you'll see + +`uv run review` prints the model catalog, runs the agent, then prints the structured review **plus a per-model observability ledger** — each step's model, SIE function, **cold-start warm-up**, warm latency, data sent, and **warm throughput (tokens/s)** — so you can watch one cluster fan a single request across the catalog and see how each model performed. (Warm-up is shown separately from throughput for the generative calls; the `encode`/`score`/`extract` calls go through the SIE SDK, which provisions internally, so those show total latency.) Try `--instruction "..."` to change the ask, or feed the guardrail a malicious prompt to watch `granite-guardian` trip the tripwire. + +## Swapping models (the point of the catalog) + +`config.yaml` maps each role to a model id. Change a string, rerun — no code edits: + +```yaml +models: + reasoning: "Qwen/Qwen3.6-27B" # default 4B runs anywhere; bump to 27B on an H100-class cluster + ocr: "opendatalab/MinerU2.5-Pro-2604-1.2B" # try a different OCR model +``` + +Alternatively, resolve roles **server-side** with SIE's gateway aliases — set +`SIE_GATEWAY_MODEL_ALIASES='{"vision":"Qwen/Qwen3.5-4B","ocr":"lightonai/LightOnOCR-2-1B"}'` +and reference `vision` / `ocr` (the built-ins `code`, `sql`, `guard` already ship). + +## Data + +The default corpus is **[CUAD](https://www.atticusprojectai.org/cuad/)** (Contract Understanding Atticus Dataset) — 510 real commercial contracts filed with the SEC, released by The Atticus Project under **CC BY 4.0**. `fetch-contracts` downloads CUAD's ~18 MB archive once (from the [Atticus Project repo](https://github.com/TheAtticusProject/cuad)), parses the SQuAD-format contract text, writes a curated handful as the corpus, renders one page to an image for the OCR/vision step, and seeds a small SQLite obligations database that references the contracts pulled. + +> CUAD: An Expert-Annotated NLP Dataset for Legal Contract Review. Dan Hendrycks, Collin Burns, Anya Chen, Spencer Ball. arXiv:2103.06268. Licensed CC BY 4.0. + +`uv run make-sample` builds a fully synthetic, offline alternative (an Acme MSA, an NDA, and an SOW) so the demo runs with no network. + +## Notes + +- Chat completions, tool calling, JSON-schema structured output, vision, and `/v1/completions` (for `sqlcoder`) are all served over SIE's OpenAI-compatible API. +- `sqlcoder-7b-2` is a completion model used with its native text-to-SQL template; for higher accuracy you can instead point `sql` at the `code`-aliased instruct model. +- This is a demo of inference orchestration, **not legal advice**. + +Apache-2.0, like the rest of SIE. diff --git a/examples/contract-review-agent/config.yaml b/examples/contract-review-agent/config.yaml new file mode 100644 index 00000000..fc16f707 --- /dev/null +++ b/examples/contract-review-agent/config.yaml @@ -0,0 +1,36 @@ +# SIE deployment. /v1/chat/completions is served by SIE's generation bundle +# (GPU), so use a CUDA image locally: +# docker run --gpus all -p 8080:8080 ghcr.io/superlinked/sie-server:latest-cuda12-default +# or override with SIE_CLUSTER_URL / SIE_API_KEY to target a managed GPU cluster. +cluster: + url: "http://localhost:8080" + api_key: "" + gpu: "" # only set for managed multi-GPU clusters (e.g. "l4-spot"); ignored locally + provision_timeout_s: 900 # cold clusters scale from zero — first call to each model pays a load + +# ── The heart of the demo: one cluster, many models — the right one for each job. ── +# Every value is a real model in the SIE catalog (https://superlinked.com/models). +# Swap any line to try a different model; no other code changes. +models: + # Generative LLMs — the agent "brains" (chat / tools / structured output): + triage: "Qwen/Qwen3-0.6B" # fast, cheap doc-type classifier (no tools) + orchestrator: "Qwen/Qwen3-4B-Instruct-2507" # plans, calls tools, assembles output (alias: code) + vision: "Qwen/Qwen3.5-4B" # reads scanned / signature pages (text + image) + reasoning: "Qwen/Qwen3-4B-Instruct-2507" # clause-risk analysis (reliable, fast); swap to newer Qwen/Qwen3.5-4B or stronger Qwen/Qwen3.6-27B where the cluster serves them + sql: "defog/sqlcoder-7b-2" # text-to-SQL specialist (completions + native template) + guard: "ibm-granite/granite-guardian-3.0-2b" # safety / prompt-injection guardrail (alias: guard) + # Retrieval + extraction tools (encode / score / extract): + ocr: "lightonai/LightOnOCR-2-1B" # scanned PDF / image -> markdown (latest OCR model) + embed: "BAAI/bge-m3" # dense embeddings for clause search + rerank: "Qwen/Qwen3-Reranker-4B" # cross-encoder rerank of retrieved clauses + entities: "urchade/gliner_large-v2.1" # zero-shot entity extraction (parties, dates, amounts) + +# Tunables for the SIE-backed tools. +search: + top_k_candidates: 12 # clauses retrieved by embedding similarity + top_k_results: 4 # clauses kept after rerank + +guard: + # granite-guardian emits a "yes" (unsafe) / "no" (safe) verdict. Trip the + # guardrail when P(unsafe) clears this threshold. 0.5 is recall-biased. + threshold: 0.5 diff --git a/examples/contract-review-agent/contract_review_agent/__init__.py b/examples/contract-review-agent/contract_review_agent/__init__.py new file mode 100644 index 00000000..af59174f --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/__init__.py @@ -0,0 +1,6 @@ +"""Contract review with the OpenAI Agents SDK, served entirely by SIE. + +One orchestrator agent drives specialist sub-agents and SIE-backed tools, each +running on a different model from the SIE catalog — the "one cluster powers +every model your agent calls" story, made runnable. +""" diff --git a/examples/contract-review-agent/contract_review_agent/app.py b/examples/contract-review-agent/contract_review_agent/app.py new file mode 100644 index 00000000..837de71d --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/app.py @@ -0,0 +1,114 @@ +"""Assemble the multi-agent app: an orchestrator on one model, a risk-analyst +sub-agent on another, SIE-backed tools, a safety guardrail, and a structured +output type.""" + +from __future__ import annotations + +from typing import Any + +from agents import Agent, RunResult, Runner +from pydantic import BaseModel + +from .guardrails import safety_guardrail +from .runtime import AppContext, model_for +from .tools import ALL_TOOLS + + +class RiskFlag(BaseModel): + clause: str + issue: str + severity: str # low | medium | high + suggested_redline: str + + +class ContractReview(BaseModel): + """The structured deliverable the orchestrator must produce.""" + + document_type: str + parties: list[str] + effective_date: str # "unknown" if not stated + renewal_terms: str + governing_law: str # "unknown" if not stated + executed: bool # is the signature page signed and dated? + key_obligations: list[str] + risk_flags: list[RiskFlag] + recommendation: str + + +# The investigator has NO output_type on purpose: a structured output_type gives a +# weak model an escape hatch to emit the schema immediately instead of using tools. +# With only tools available, it must call them to do its job. +_INVESTIGATOR_INSTRUCTIONS = """\ +You are a contract investigator. You have NO prior knowledge of this contract — the +ONLY way to learn anything is to CALL YOUR TOOLS. Investigate thoroughly: call EVERY +one of these tools, one after another, before you write anything. + +- classify_document() — the document type +- ocr_signature_page() — read the executed signature page (signatories, titles, date) +- extract_entities() — parties, dates, amounts, governing law +- read_signature_page("Are both parties' signatures present and dated?") — visual execution check +- search_clauses("automatic renewal"), then search_clauses("limitation of liability"), + then search_clauses("indemnification"), then search_clauses("termination") +- analyze_clause_risks() — risk analysis with severities +- query_obligations_db("upcoming obligations with due dates and amounts") — deadlines + +Do NOT write your report until you have called them all. Then write a thorough, +factual findings report that cites ONLY what the tools returned. Never invent a party, +date, number, or clause — if a tool failed, say so.""" + +_SYNTHESIZER_INSTRUCTIONS = """\ +You turn a contract investigator's findings into a structured ContractReview. Use +ONLY the findings provided — never add facts. If the findings don't establish a +field, use "unknown" (or false for `executed`). Make key_obligations and risk_flags +specific and grounded in the findings, and give a clear recommendation.""" + + +def build_reasoning_agent(cfg: dict[str, Any], client: Any) -> Agent: + return Agent( + name="Risk Analyst", + instructions=( + "You are a senior contracts attorney. Given contract clauses, identify " + "risks to the Customer. For each, state the clause, the issue, a severity " + "(low/medium/high), and a concrete one-line redline. Be specific and brief." + ), + model=model_for(cfg["models"]["reasoning"], client), + ) + + +def build_investigator(cfg: dict[str, Any], client: Any) -> Agent: + """Autonomous tool-using agent (no output_type) that gathers grounded findings.""" + return Agent( + name="Contract Investigator", + instructions=_INVESTIGATOR_INSTRUCTIONS, + model=model_for(cfg["models"]["orchestrator"], client), + tools=ALL_TOOLS, + input_guardrails=[safety_guardrail], + ) + + +def build_synthesizer(cfg: dict[str, Any], client: Any) -> Agent: + """Structured-output agent (no tools) that formats the findings into a review.""" + return Agent( + name="Contract Reviewer", + instructions=_SYNTHESIZER_INSTRUCTIONS, + model=model_for(cfg["models"]["orchestrator"], client), + output_type=ContractReview, + ) + + +async def run_review( + app: AppContext, investigator: Agent, synthesizer: Agent, instruction: str +) -> tuple[RunResult, RunResult]: + """Investigate with tools (autonomous fan-out), then synthesize the structured review.""" + gather = await Runner.run( + investigator, + f"{instruction}\n\nInvestigate the contract using your tools, then report your findings.", + context=app, + max_turns=20, + ) + synth = await Runner.run( + synthesizer, + f"Investigator findings:\n\n{gather.final_output}\n\nProduce the ContractReview.", + context=app, + ) + return gather, synth diff --git a/examples/contract-review-agent/contract_review_agent/cli.py b/examples/contract-review-agent/contract_review_agent/cli.py new file mode 100644 index 00000000..eeb310ce --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/cli.py @@ -0,0 +1,270 @@ +"""Run the contract-review agent over one contract and show the model fan-out.""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import time +from pathlib import Path + +from agents import InputGuardrailTripwireTriggered +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from sie_sdk import SIEAsyncClient + +from .app import ContractReview, build_investigator, build_reasoning_agent, build_synthesizer, run_review +from .config import load_config +from .data import CUAD_DIR, GENERATED_DIR, MANIFEST_PATH, make_sample +from .runtime import AppContext, Ledger, chat_once, configure_runtime, make_openai_client + +console = Console() + +# (config role, human job, SIE function) — drives the catalog table. +ROLE_INFO = [ + ("orchestrator", "Plan, call tools, assemble the review", "chat + tools"), + ("triage", "Classify the document type (fast)", "chat"), + ("vision", "Read the scanned signature page", "chat + image"), + ("reasoning", "Clause-risk specialist (sub-agent)", "chat"), + ("sql", "Text-to-SQL over the obligations DB", "completions"), + ("guard", "Safety / prompt-injection guardrail", "chat"), + ("ocr", "Scanned page → markdown", "extract"), + ("embed", "Clause search (embeddings)", "encode"), + ("rerank", "Rerank retrieved clauses", "score"), + ("entities", "Entity extraction (parties, dates, $)", "extract"), +] + + +def _print_catalog(cfg: dict) -> None: + table = Table(title="One SIE cluster · the right model for each job", title_style="bold") + table.add_column("Role", style="cyan") + table.add_column("SIE catalog model", style="green") + table.add_column("SIE function", style="magenta") + table.add_column("Job") + for role, job, fn in ROLE_INFO: + table.add_row(role, cfg["models"][role], fn, job) + console.print(table) + + +def _print_ledger(ledger: Ledger) -> None: + table = Table(title="Per-model observability (in call order)", title_style="bold") + table.add_column("#", justify="right") + table.add_column("Step") + table.add_column("Model", style="green") + table.add_column("SIE fn", style="magenta") + table.add_column("Warm-up", justify="right") + table.add_column("Latency", justify="right") + table.add_column("Sent", justify="right") + table.add_column("Got", justify="right") + table.add_column("Throughput", justify="right") + warmup_total = 0.0 + call_total = 0.0 + for i, e in enumerate(ledger.entries, 1): + warmup_total += e.warmup_s + call_total += e.latency_s + table.add_row( + str(i), e.step, e.model.split("/")[-1], e.sie_fn, + f"{e.warmup_s:.1f}s" if e.warmup_s else "—", + f"{e.latency_s:.2f}s" if e.latency_s else "—", + e.sent or "—", e.got or "—", e.throughput or "—", + ) + console.print(table) + used = {e.model for e in ledger.entries} + console.print(f"[bold]{len(used)} distinct SIE models[/] handled this request — " + f"{warmup_total:.0f}s cold-start (model warm-up) + {call_total:.0f}s warm calls.") + + +def _print_summary(cfg: dict, usage, wall_s: float) -> None: + parts = [f"end-to-end wall time [bold]{wall_s:.1f}s[/]"] + reqs = getattr(usage, "requests", None) + if reqs is not None: + it = getattr(usage, "input_tokens", 0) or 0 + ot = getattr(usage, "output_tokens", 0) or 0 + orch = cfg["models"]["orchestrator"].split("/")[-1] + parts.append(f"investigator {orch}: {reqs} LLM calls, {it:,} in / {ot:,} out tok") + console.print("Run summary — " + " · ".join(parts)) + + +def _print_review(review: ContractReview) -> None: + lines = [ + f"[bold]Document type[/]: {review.document_type}", + f"[bold]Parties[/]: {', '.join(review.parties) or '—'}", + f"[bold]Effective date[/]: {review.effective_date or '—'}", + f"[bold]Governing law[/]: {review.governing_law or '—'}", + f"[bold]Executed[/]: {review.executed}", + f"[bold]Renewal terms[/]: {review.renewal_terms}", + "", + "[bold]Key obligations[/]:", + *[f" • {o}" for o in review.key_obligations], + "", + "[bold]Recommendation[/]:", + f" {review.recommendation}", + ] + console.print(Panel("\n".join(lines), title="Contract review", border_style="blue")) + + if review.risk_flags: + risks = Table(title="Risk flags", title_style="bold red") + risks.add_column("Severity", style="bold") + risks.add_column("Clause") + risks.add_column("Issue") + risks.add_column("Suggested redline") + sev_color = {"high": "red", "medium": "yellow", "low": "green"} + for f in review.risk_flags: + color = sev_color.get(f.severity.lower(), "white") + risks.add_row(f"[{color}]{f.severity}[/]", f.clause, f.issue, f.suggested_redline) + console.print(risks) + + +def _resolve_corpus(args) -> tuple[str, str, str, str]: + """Return (contract_text, scan_path, db_path, label).""" + # Explicit file path wins. + if args.contract and Path(args.contract).is_file(): + p = Path(args.contract) + scan = args.scan or str(GENERATED_DIR / "acme-msa-signature.png") + return p.read_text(), scan, str(GENERATED_DIR / "obligations.db"), p.name + + if MANIFEST_PATH.exists(): # real CUAD corpus + manifest = json.loads(MANIFEST_PATH.read_text()) + slug = args.contract or manifest["primary"] + text = (CUAD_DIR / f"{slug}.txt").read_text() + scan = args.scan or str(GENERATED_DIR / manifest["scan_path"]) + return text, scan, str(GENERATED_DIR / manifest["db_path"]), f"CUAD · {slug}" + + # Offline fallback: synthetic corpus (generate it if missing). + if not (GENERATED_DIR / "acme-msa.md").exists(): + console.print("[yellow]No corpus found — generating the synthetic one. " + "Run `uv run fetch-contracts` for real CUAD contracts.[/]") + make_sample.main() + name = args.contract or "acme-msa" + text = (GENERATED_DIR / f"{name}.md").read_text() + scan = args.scan or str(GENERATED_DIR / "acme-msa-signature.png") + return text, scan, str(GENERATED_DIR / "obligations.db"), f"synthetic · {name}" + + +def _list_contracts() -> None: + if MANIFEST_PATH.exists(): + manifest = json.loads(MANIFEST_PATH.read_text()) + console.print(f"[bold]CUAD corpus[/] ({manifest['license']}):") + for c in manifest["contracts"]: + console.print(f" {c['slug']} [dim]{c['type']} · {c['char_len']:,} chars[/]") + elif (GENERATED_DIR / "acme-msa.md").exists(): + console.print("[bold]Synthetic corpus[/]: acme-msa, mutual-nda, acme-sow") + else: + console.print("No corpus yet. Run `uv run fetch-contracts` or `uv run make-sample`.") + + +async def _warm(app: AppContext) -> None: + """Provision the generative models before the run. + + The orchestrator and reasoning sub-agent call models through the Agents SDK, + which has no cold-start retry — so on a scale-from-zero cluster the first + call would fail. Touch each model once (our helpers retry while it loads). + """ + m = app.cfg["models"] + # Only the orchestrator and reasoning sub-agent run through the Agents SDK, + # which has no cold-start retry — so only these must be warm before the run. + # Triage, vision, guard, SQL, and the encode/score/extract tools all retry + # while their model provisions, so they load lazily on first use. + for model in dict.fromkeys([m["orchestrator"], m["reasoning"]]): + with console.status(f"Warming {model} (first call provisions it on a cold cluster)..."): + try: + await chat_once(app, model, [{"role": "user", "content": "ok"}], max_tokens=1) + except Exception as exc: # warm-up is best-effort; the run will retry + console.print(f"[yellow]warm-up: {model} not ready ({type(exc).__name__}); will retry during the run.[/]") + console.print("[green]Warm-up done.[/]\n") + + +async def _run(args) -> None: + cfg = load_config() + text, scan_path, db_path, label = _resolve_corpus(args) + + # Tool calls use our own provisioning-retry, so their client shouldn't also retry + # (max_retries=0). Agents-SDK calls can't be wrapped, so that client retries hard + # to survive a model being evicted/reloaded mid-run on a busy cluster. + tool_client = make_openai_client(cfg["cluster"]["url"], cfg["cluster"]["api_key"], max_retries=0) + agent_client = make_openai_client(cfg["cluster"]["url"], cfg["cluster"]["api_key"], max_retries=12, timeout_s=180) + configure_runtime(agent_client) + + _print_catalog(cfg) + console.print(f"Reviewing [bold]{label}[/] against SIE at " + f"[bold]{cfg['cluster']['url']}[/]\n") + + async with SIEAsyncClient(cfg["cluster"]["url"], api_key=cfg["cluster"]["api_key"] or None) as sie: + ledger = Ledger() + app = AppContext( + sie=sie, + oai=tool_client, + cfg=cfg, + ledger=ledger, + contract_text=text, + scan_path=scan_path, + db_path=db_path, + reasoning_agent=build_reasoning_agent(cfg, agent_client), + ) + investigator = build_investigator(cfg, agent_client) + synthesizer = build_synthesizer(cfg, agent_client) + if not args.no_warm: + await _warm(app) + t0 = time.monotonic() + try: + gather, result = await run_review(app, investigator, synthesizer, args.instruction) + except InputGuardrailTripwireTriggered: + console.print(Panel("Request blocked by the granite-guardian safety guardrail.", + border_style="red", title="Guardrail tripped")) + _print_ledger(ledger) + await agent_client.close() + await tool_client.close() + return + except Exception as exc: # a model the SDK calls (investigator/sub-agent) was unreachable + console.print(Panel(f"{type(exc).__name__}: {exc}", border_style="red", + title="Run failed (model unavailable)")) + _print_ledger(ledger) + await agent_client.close() + await tool_client.close() + return + wall = time.monotonic() - t0 + await agent_client.close() + await tool_client.close() + + try: + review = result.final_output_as(ContractReview) + except Exception: + review = result.final_output + + console.print() + if isinstance(review, ContractReview): + _print_review(review) + else: + console.print(Panel(str(review), title="Agent output (unstructured)")) + console.print() + _print_ledger(ledger) + usage = getattr(getattr(gather, "context_wrapper", None), "usage", None) + _print_summary(cfg, usage, wall) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Review a contract with a multi-model SIE agent.") + parser.add_argument("--contract", default=None, + help="contract slug (CUAD), synthetic name, or a path to a .txt/.md file") + parser.add_argument("--scan", default=None, help="path to a signature-page image (png/jpg)") + parser.add_argument( + "--instruction", + default="Review this contract. Identify the parties and key terms, flag the " + "biggest risks to the Customer with severity and redlines, confirm it is " + "executed, and surface upcoming obligations and deadlines.", + help="what to ask the agent to do", + ) + parser.add_argument("--list", action="store_true", help="list available contracts and exit") + parser.add_argument("--no-warm", action="store_true", + help="skip pre-warming models (faster when the cluster is already warm)") + args = parser.parse_args() + + if args.list: + _list_contracts() + return + asyncio.run(_run(args)) + + +if __name__ == "__main__": + main() diff --git a/examples/contract-review-agent/contract_review_agent/config.py b/examples/contract-review-agent/contract_review_agent/config.py new file mode 100644 index 00000000..068526f4 --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/config.py @@ -0,0 +1,27 @@ +"""Load `config.yaml` and resolve the SIE endpoint from env / `.env`. + +Precedence for the cluster URL and key: real env var > `.env` file > `config.yaml`. +""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any + +import yaml +from dotenv import dotenv_values + +PROJECT_ROOT = Path(__file__).resolve().parent.parent + + +def load_config() -> dict[str, Any]: + cfg = yaml.safe_load((PROJECT_ROOT / "config.yaml").read_text()) + env = dotenv_values(PROJECT_ROOT / ".env") + + cluster = cfg.setdefault("cluster", {}) + cluster["url"] = ( + os.environ.get("SIE_CLUSTER_URL") or env.get("SIE_CLUSTER_URL") or cluster.get("url") or "http://localhost:8080" + ) + cluster["api_key"] = os.environ.get("SIE_API_KEY") or env.get("SIE_API_KEY") or cluster.get("api_key") or "" + return cfg diff --git a/examples/contract-review-agent/contract_review_agent/data/__init__.py b/examples/contract-review-agent/contract_review_agent/data/__init__.py new file mode 100644 index 00000000..202a8791 --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/data/__init__.py @@ -0,0 +1,15 @@ +"""Self-contained sample data for the contract-review demo. + +Run ``uv run make-sample`` (or ``python -m contract_review_agent.data.make_sample``) +to generate everything under ``data/generated/``: the contract markdown, the +"scanned" signature-page image, and the SQLite obligations database. Nothing is +downloaded — it is all synthetic and safe to ship. +""" + +from pathlib import Path + +DATA_DIR = Path(__file__).resolve().parent +GENERATED_DIR = DATA_DIR / "generated" +CUAD_DIR = GENERATED_DIR / "cuad" # real contracts fetched from CUAD +MANIFEST_PATH = GENERATED_DIR / "manifest.json" +DB_PATH = GENERATED_DIR / "obligations.db" diff --git a/examples/contract-review-agent/contract_review_agent/data/fetch_contracts.py b/examples/contract-review-agent/contract_review_agent/data/fetch_contracts.py new file mode 100644 index 00000000..189d9309 --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/data/fetch_contracts.py @@ -0,0 +1,205 @@ +"""Fetch real contracts from CUAD to build the corpus the agent reviews. + +CUAD (Contract Understanding Atticus Dataset) is 510 real commercial contracts +filed with the SEC, released by The Atticus Project under CC BY 4.0 — the +dataset built specifically for contract review. We download the dataset's small +(~18 MB) archive once, parse the SQuAD-format JSON, write a curated handful of +full contracts to ``data/generated/cuad/``, render one page to an image for the +OCR/vision step, and seed an obligations database that references them. + + Dan Hendrycks, Collin Burns, Anya Chen, Spencer Ball. "CUAD: An + Expert-Annotated NLP Dataset for Legal Contract Review." arXiv:2103.06268. + Dataset: https://www.atticusprojectai.org/cuad — CC BY 4.0. + +Run: ``uv run fetch-contracts`` (offline alternative: ``uv run make-sample``). +""" + +from __future__ import annotations + +import argparse +import json +import re +import sqlite3 +import sys +import tempfile +import zipfile +from pathlib import Path + +import httpx + +from . import CUAD_DIR, DB_PATH, GENERATED_DIR, MANIFEST_PATH +from .make_sample import SCHEMA_DDL +from .render import render_text_page + +CUAD_ZIP_URL = "https://raw.githubusercontent.com/TheAtticusProject/cuad/main/data.zip" +CITATION = ( + "CUAD (Contract Understanding Atticus Dataset), The Atticus Project, " + "CC BY 4.0. Hendrycks et al., arXiv:2103.06268." +) + +_TYPE_KEYWORDS = [ + ("LICENSE", "License"), + ("RESELLER", "Reseller"), + ("DISTRIBUTOR", "Distribution"), + ("HOSTING", "Hosting"), + ("MAINTENANCE", "Maintenance"), + ("SUPPLY", "Supply"), + ("SERVICE", "Services"), + ("CONSULTING", "Consulting"), + ("DEVELOPMENT", "Development"), + ("MARKETING", "Marketing"), +] + + +def _slug(title: str) -> str: + s = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-") + return (s[:60] or "contract").rstrip("-") + + +def _infer_type(title: str) -> str: + upper = title.upper() + for needle, label in _TYPE_KEYWORDS: + if needle in upper: + return label + return "Agreement" + + +def _short_name(title: str) -> str: + # CUAD titles are long EDGAR filenames; keep a readable counterparty-ish label. + return re.sub(r"[_\-]+", " ", title).strip()[:48] + + +def _download_archive() -> Path: + """Download CUAD's data.zip once, cached under data/generated/.cache/.""" + dest = GENERATED_DIR / ".cache" / "cuad-data.zip" + if dest.exists(): + return dest + dest.parent.mkdir(parents=True, exist_ok=True) + print(f"Downloading CUAD archive (~18 MB) from {CUAD_ZIP_URL} ...", file=sys.stderr) + with tempfile.NamedTemporaryFile(delete=False, dir=dest.parent, suffix=".tmp") as tmp: + tmp_path = Path(tmp.name) + try: + with httpx.stream("GET", CUAD_ZIP_URL, follow_redirects=True, timeout=180) as resp: + resp.raise_for_status() + for chunk in resp.iter_bytes(): + tmp.write(chunk) + except BaseException: + tmp_path.unlink(missing_ok=True) + raise + tmp_path.rename(dest) + return dest + + +def fetch(n: int, match: str | None) -> list[dict]: + """Return up to ``n`` full contracts from CUAD as {title, text, ...} dicts.""" + archive = _download_archive() + with zipfile.ZipFile(archive) as z: + names = z.namelist() + json_name = next((m for m in names if m.lower().endswith("cuadv1.json")), None) or next( + m for m in names if m.lower().endswith(".json") + ) + squad = json.loads(z.read(json_name)) + + contracts: list[dict] = [] + for entry in squad["data"]: + title = (entry.get("title") or "").strip() + paragraphs = entry.get("paragraphs") or [] + if not title or not paragraphs: + continue + if match and match.lower() not in title.lower(): + continue + text = (paragraphs[0].get("context") or "").strip() + if not text: + continue + contracts.append( + { + "title": title, + "slug": _slug(title), + "type": _infer_type(title), + "counterparty": _short_name(title), + "text": text, + "char_len": len(text), + } + ) + if len(contracts) >= n: + break + return contracts + + +# Synthetic obligations attached to the real contracts, so the text-to-SQL tool +# has deadlines/payments to query about the contracts actually in the corpus. +_OBLIGATION_PLANS = [ + ("Send renewal / non-renewal notice before term end", "us", "2026-09-15", None, "open"), + ("Annual subscription / license fee", "us", "2026-07-01", 120000.0, "open"), + ("Quarterly compliance attestation", "counterparty", "2026-06-30", None, "open"), + ("Prior-period true-up payment", "us", "2026-04-30", 45000.0, "done"), + ("Audit / records inspection response", "counterparty", "2026-06-10", None, "overdue"), +] + + +def build_obligations(contracts: list[dict]) -> int: + DB_PATH.unlink(missing_ok=True) + rows = [] + for i, c in enumerate(contracts): + for j in range(3): + obligation, owner, due, amount, status = _OBLIGATION_PLANS[(i + j) % len(_OBLIGATION_PLANS)] + rows.append((c["counterparty"], c["type"], obligation, owner, due, amount, status)) + conn = sqlite3.connect(DB_PATH) + try: + conn.execute(SCHEMA_DDL) + conn.executemany( + "INSERT INTO obligations " + "(counterparty, contract_type, obligation, owner, due_date, amount_usd, status) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + rows, + ) + conn.commit() + finally: + conn.close() + return len(rows) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Fetch real contracts from CUAD (CC BY 4.0).") + parser.add_argument("--contracts", type=int, default=4, help="how many contracts to pull") + parser.add_argument("--match", default=None, help="only titles containing this substring") + args = parser.parse_args() + + CUAD_DIR.mkdir(parents=True, exist_ok=True) + contracts = fetch(args.contracts, args.match) + if not contracts: + raise SystemExit("No contracts matched. Try a different --match or run `uv run make-sample`.") + + for c in contracts: + (CUAD_DIR / f"{c['slug']}.txt").write_text(c["text"]) + + primary = contracts[0] + scan_path = CUAD_DIR / f"{primary['slug']}-page.png" + render_text_page(primary["text"], scan_path, title=primary["title"]) + + n_obligations = build_obligations(contracts) + + manifest = { + "source": "CUAD v1", + "license": "CC BY 4.0", + "citation": CITATION, + "primary": primary["slug"], + "scan_path": str(scan_path.relative_to(GENERATED_DIR)), + "db_path": str(DB_PATH.relative_to(GENERATED_DIR)), + "contracts": [ + {k: c[k] for k in ("title", "slug", "type", "counterparty", "char_len")} for c in contracts + ], + } + MANIFEST_PATH.write_text(json.dumps(manifest, indent=2) + "\n") + + print(f"Wrote {len(contracts)} contracts to {CUAD_DIR}:") + for c in contracts: + flag = " (primary)" if c["slug"] == primary["slug"] else "" + print(f" {c['type']:12s} {c['slug']}.txt ({c['char_len']:,} chars){flag}") + print(f"Rendered scan page : {scan_path.name}") + print(f"Obligations DB : {DB_PATH.name} ({n_obligations} rows)") + print(f"Source: {CITATION}") + + +if __name__ == "__main__": + main() diff --git a/examples/contract-review-agent/contract_review_agent/data/make_sample.py b/examples/contract-review-agent/contract_review_agent/data/make_sample.py new file mode 100644 index 00000000..9a1bd991 --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/data/make_sample.py @@ -0,0 +1,290 @@ +"""Generate the self-contained sample corpus for the demo. + +Produces, under ``data/generated/``: + * ``acme-msa.md``, ``mutual-nda.md``, ``acme-sow.md`` — contracts to review/index + * ``acme-msa-signature.png`` — the executed signature page, as a "scan" image + * ``obligations.db`` — a SQLite database the text-to-SQL tool queries + +Everything here is fictional. The MSA is seeded with genuinely risk-bearing +clauses (auto-renewal, uncapped indemnity, vendor-friendly termination) so the +risk-analysis agent has real findings to surface. +""" + +from __future__ import annotations + +import sqlite3 +import textwrap + +from PIL import Image, ImageDraw, ImageFont + +from contract_review_agent.data import GENERATED_DIR + +# Treat this as "today" for the seeded obligation due dates. +TODAY = "2026-06-22" + +# Single source of truth for the obligations schema: used both to create the +# table and to prompt the text-to-SQL model (the inline comments help it). +SCHEMA_DDL = """\ +CREATE TABLE obligations ( + id INTEGER PRIMARY KEY, + counterparty TEXT NOT NULL, -- legal name of the other party + contract_type TEXT NOT NULL, -- one of: MSA, NDA, SOW + obligation TEXT NOT NULL, -- description of what is owed/required + owner TEXT NOT NULL, -- who must act: 'us' or 'counterparty' + due_date TEXT NOT NULL, -- ISO-8601 date (YYYY-MM-DD) + amount_usd REAL, -- dollar amount if a payment, else NULL + status TEXT NOT NULL -- one of: open, done, overdue +)""" + +ACME_MSA = """\ +# Master Services Agreement + +This Master Services Agreement (the "Agreement") is entered into between +Northwind Analytics, Inc. ("Customer") and Acme Cloud Services, LLC ("Provider"). + +## 1. Definitions +"Services" means the cloud data-processing services described in one or more +Order Forms. "Order Form" means an ordering document executed by both parties +that references this Agreement. + +## 2. Term and Renewal +The initial term of this Agreement is twelve (12) months from the Effective +Date. Thereafter, this Agreement automatically renews for successive twelve (12) +month terms unless Customer provides written notice of non-renewal at least +ninety (90) days before the end of the then-current term. Fees for each renewal +term may increase by up to ten percent (10%) over the prior term. + +## 3. Fees and Payment +Customer shall pay all fees within thirty (30) days of the invoice date. +Late amounts accrue interest at 1.5% per month. All fees are non-refundable, +including fees for partially used subscription periods. + +## 4. Termination +Provider may terminate this Agreement or any Order Form for convenience upon +thirty (30) days written notice. Customer may terminate only for Provider's +uncured material breach, after a sixty (60) day cure period. Upon termination, +Customer shall pay all fees due through the end of the then-current term. + +## 5. Limitation of Liability +EXCEPT FOR CUSTOMER'S PAYMENT OBLIGATIONS AND CUSTOMER'S INDEMNIFICATION +OBLIGATIONS, EACH PARTY'S TOTAL AGGREGATE LIABILITY SHALL NOT EXCEED THE FEES +PAID BY CUSTOMER IN THE THREE (3) MONTHS PRECEDING THE CLAIM. PROVIDER SHALL NOT +BE LIABLE FOR ANY INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES. + +## 6. Indemnification +Customer shall defend, indemnify, and hold harmless Provider from any and all +claims arising out of Customer's use of the Services, without limitation. This +indemnity is uncapped and survives termination of this Agreement. + +## 7. Confidentiality +Each party shall protect the other's Confidential Information using at least a +reasonable standard of care, and shall not disclose it except to personnel with +a need to know who are bound by confidentiality obligations. + +## 8. Data Protection +Provider shall process Customer personal data only on Customer's documented +instructions and shall maintain appropriate technical and organizational +security measures. Provider may engage sub-processors, provided Provider remains +liable for their performance. + +## 9. Service Levels +Provider targets 99.9% monthly availability. If availability falls below the +target, Customer's sole and exclusive remedy is a service credit equal to 5% of +the monthly fee per 0.1% below target, capped at 25% of the monthly fee. + +## 10. Governing Law +This Agreement is governed by the laws of the State of Delaware, without regard +to its conflict-of-laws principles. The parties consent to the exclusive +jurisdiction of the state and federal courts located in Wilmington, Delaware. + +## 11. Assignment +Customer may not assign this Agreement without Provider's prior written consent. +Provider may assign this Agreement freely, including to an acquirer of all or +substantially all of its assets. + +## 12. Entire Agreement +This Agreement, together with all Order Forms, constitutes the entire agreement +between the parties and supersedes all prior agreements on its subject matter. +""" + +MUTUAL_NDA = """\ +# Mutual Non-Disclosure Agreement + +This Mutual Non-Disclosure Agreement is entered into between Northwind +Analytics, Inc. and Globex Corporation (each a "Party"). + +## 1. Purpose +The Parties wish to explore a potential business relationship and may disclose +confidential information to one another for that purpose. + +## 2. Confidential Information +"Confidential Information" means any non-public information disclosed by a Party, +whether orally, in writing, or by inspection of tangible objects, that is +designated confidential or that reasonably should be understood to be +confidential. + +## 3. Obligations +The receiving Party shall use Confidential Information solely for the Purpose, +shall protect it with the same degree of care it uses for its own confidential +information (but no less than reasonable care), and shall not disclose it to +third parties without the disclosing Party's prior written consent. + +## 4. Term +This Agreement remains in effect for two (2) years from the Effective Date. +Confidentiality obligations survive for three (3) years after disclosure. + +## 5. Return of Materials +Upon written request, the receiving Party shall promptly return or destroy all +Confidential Information in its possession. + +## 6. Governing Law +This Agreement is governed by the laws of the State of New York. +""" + +ACME_SOW = """\ +# Statement of Work No. 1 + +This Statement of Work ("SOW") is issued under and incorporates the Master +Services Agreement between Northwind Analytics, Inc. and Acme Cloud Services, +LLC. + +## 1. Scope +Provider will deliver a managed data-ingestion pipeline, including connectors, +transformation jobs, and a monitoring dashboard, as further described in +Exhibit A. + +## 2. Deliverables and Milestones +Milestone 1 (design) is due 30 days after the SOW Effective Date. Milestone 2 +(implementation) is due 90 days after. Milestone 3 (acceptance) is due 120 days +after. + +## 3. Fees +Fees for this SOW total USD 240,000, invoiced 25% per milestone and 25% on final +acceptance. Travel expenses are billed at cost with prior written approval. + +## 4. Acceptance +Customer has fifteen (15) business days after delivery of each milestone to +accept or reject deliverables in writing. Deliverables are deemed accepted if +Customer does not respond within that period. + +## 5. Personnel +Provider shall assign a named technical lead for the duration of this SOW and +shall not reassign that person without Customer's consent, not to be +unreasonably withheld. + +## 6. Governing Law +This SOW is governed by the Master Services Agreement, including its governing +law provision. +""" + +CONTRACTS = { + "acme-msa": ACME_MSA, + "mutual-nda": MUTUAL_NDA, + "acme-sow": ACME_SOW, +} + +# The executed signature page — rendered to an image so the OCR and vision +# models have a real "scan" to read (these executed details are intentionally +# NOT in the template body above; the agent must read them off the scan). +SIGNATURE_PAGE = """\ +EXECUTION PAGE + +MASTER SERVICES AGREEMENT + +Effective Date: March 1, 2026 + +IN WITNESS WHEREOF, the parties have executed this Agreement as of the +Effective Date. + +CUSTOMER: PROVIDER: +Northwind Analytics, Inc. Acme Cloud Services, LLC + +By: /s/ Dana Whitfield By: /s/ Marcus Reyes +Name: Dana Whitfield Name: Marcus Reyes +Title: Chief Operating Officer Title: VP, Customer Success +Date: March 1, 2026 Date: March 1, 2026 + +Governing Law: State of Delaware +Notices to Customer: legal@northwind-analytics.example +""" + +# (counterparty, contract_type, obligation, owner, due_date, amount_usd, status) +OBLIGATIONS = [ + ("Acme Cloud Services, LLC", "MSA", "Send non-renewal notice (90 days before term end) to avoid auto-renewal", "us", "2026-11-30", None, "open"), + ("Acme Cloud Services, LLC", "MSA", "Annual subscription fee (renewal term)", "us", "2026-03-01", 180000.0, "done"), + ("Acme Cloud Services, LLC", "SOW", "Milestone 1 (design) payment", "us", "2026-04-15", 60000.0, "done"), + ("Acme Cloud Services, LLC", "SOW", "Milestone 2 (implementation) payment", "us", "2026-07-14", 60000.0, "open"), + ("Acme Cloud Services, LLC", "SOW", "Milestone 3 (acceptance) payment", "us", "2026-08-13", 60000.0, "open"), + ("Acme Cloud Services, LLC", "MSA", "Quarterly security attestation delivery", "counterparty", "2026-06-30", None, "open"), + ("Globex Corporation", "NDA", "Return or destroy confidential materials on request", "us", "2026-05-10", None, "overdue"), + ("Globex Corporation", "NDA", "Confidentiality survival period ends", "us", "2027-09-01", None, "open"), + ("Initech LLC", "MSA", "Send non-renewal notice (60 days before term end)", "us", "2026-07-02", None, "open"), + ("Initech LLC", "MSA", "Annual subscription fee", "us", "2026-09-01", 95000.0, "open"), + ("Umbrella Health, Inc.", "MSA", "Data processing audit response", "counterparty", "2026-06-18", None, "overdue"), + ("Umbrella Health, Inc.", "MSA", "Annual subscription fee", "us", "2026-12-01", 220000.0, "open"), +] + + +def _load_font(size: int) -> ImageFont.ImageFont: + """A scalable font with no external file dependency (Pillow >= 10.1).""" + try: + return ImageFont.load_default(size=size) + except TypeError: # very old Pillow without sized default + return ImageFont.load_default() + + +def render_signature_page(out_path) -> None: + width, height, margin = 1000, 1300, 70 + img = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(img) + font = _load_font(22) + y = margin + for raw_line in SIGNATURE_PAGE.splitlines(): + # Preserve the two-column layout by not wrapping; just render each line. + draw.text((margin, y), raw_line, fill="black", font=font) + y += 34 + img.save(out_path) + + +def build_obligations_db(out_path) -> None: + out_path.unlink(missing_ok=True) + conn = sqlite3.connect(out_path) + try: + conn.execute(SCHEMA_DDL) + conn.executemany( + "INSERT INTO obligations " + "(counterparty, contract_type, obligation, owner, due_date, amount_usd, status) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + OBLIGATIONS, + ) + conn.commit() + finally: + conn.close() + + +def main() -> None: + GENERATED_DIR.mkdir(parents=True, exist_ok=True) + + for name, body in CONTRACTS.items(): + (GENERATED_DIR / f"{name}.md").write_text(body) + + sig_path = GENERATED_DIR / "acme-msa-signature.png" + render_signature_page(sig_path) + + db_path = GENERATED_DIR / "obligations.db" + build_obligations_db(db_path) + + print( + textwrap.dedent( + f"""\ + Sample corpus written to {GENERATED_DIR}: + contracts : {", ".join(f"{n}.md" for n in CONTRACTS)} + scan : {sig_path.name} + database : {db_path.name} ({len(OBLIGATIONS)} obligations) + """ + ) + ) + + +if __name__ == "__main__": + main() diff --git a/examples/contract-review-agent/contract_review_agent/data/render.py b/examples/contract-review-agent/contract_review_agent/data/render.py new file mode 100644 index 00000000..b98f2b9e --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/data/render.py @@ -0,0 +1,53 @@ +"""Render contract text to a page image. + +Used to produce the "scanned page" the OCR and vision models read. (For a true +scan, point ``--scan`` at a real PDF/PNG instead — see the README.) +""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +from PIL import Image, ImageDraw, ImageFont + + +def load_font(size: int) -> ImageFont.ImageFont: + """A scalable font with no external file dependency (Pillow >= 10.1).""" + try: + return ImageFont.load_default(size=size) + except TypeError: # very old Pillow without sized default + return ImageFont.load_default() + + +def render_text_page( + text: str, + out_path: Path, + *, + title: str | None = None, + width: int = 1000, + line_height: int = 30, + font_size: int = 20, + margin: int = 60, + max_lines: int = 46, + wrap: int = 92, +) -> None: + lines: list[str] = [] + for raw in text.splitlines(): + lines.extend(textwrap.wrap(raw, width=wrap) or [""]) + lines = lines[:max_lines] + + n = len(lines) + (2 if title else 0) + height = max(margin * 2 + line_height * n, 400) + img = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(img) + body_font = load_font(font_size) + + y = margin + if title: + draw.text((margin, y), title[:80], fill="black", font=load_font(font_size + 4)) + y += line_height * 2 + for line in lines: + draw.text((margin, y), line, fill="black", font=body_font) + y += line_height + img.save(out_path) diff --git a/examples/contract-review-agent/contract_review_agent/guardrails.py b/examples/contract-review-agent/contract_review_agent/guardrails.py new file mode 100644 index 00000000..94bebbd3 --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/guardrails.py @@ -0,0 +1,60 @@ +"""Input guardrail backed by ibm-granite/granite-guardian. + +Before the orchestrator sees a request, granite-guardian screens it for unsafe +content / prompt-injection. The model emits a "yes" (unsafe) / "no" (safe) +verdict; we trip the guardrail on "yes". This is the Agents SDK's guardrail +hook wired to a dedicated safety model in the SIE catalog — the same "guard +content" job the landing page describes. +""" + +from __future__ import annotations + +import time +from typing import Any + +from agents import Agent, GuardrailFunctionOutput, RunContextWrapper, input_guardrail + +from .runtime import AppContext, chat_once + + +def _input_text(data: Any) -> str: + """Flatten the guardrail input (a string, or a list of input items) to text.""" + if isinstance(data, str): + return data + parts: list[str] = [] + for item in data or []: + content = item.get("content") if isinstance(item, dict) else None + if isinstance(content, str): + parts.append(content) + elif isinstance(content, list): + parts.extend(c["text"] for c in content if isinstance(c, dict) and isinstance(c.get("text"), str)) + return "\n".join(parts) + + +@input_guardrail +async def safety_guardrail( + ctx: RunContextWrapper[AppContext], agent: Agent, data: Any +) -> GuardrailFunctionOutput: + app = ctx.context + model = app.cfg["models"]["guard"] + t0 = time.monotonic() + try: + res = await chat_once(app, model, [{"role": "user", "content": _input_text(data)[:6000]}], max_tokens=8, timeout_s=25) + except Exception as exc: + # Guard model unavailable: fail OPEN (allow the run) but make it visible. + # A stricter deployment might fail closed — that's a policy choice. + app.ledger.record("Safety guardrail (granite-guardian)", model, "chat", + warmup_s=time.monotonic() - t0, got=f"unavailable: {type(exc).__name__}") + return GuardrailFunctionOutput(output_info={"error": str(exc), "model": model}, tripwire_triggered=False) + verdict = res.text.strip() + app.ledger.record( + "Safety guardrail (granite-guardian)", model, "chat", + warmup_s=res.provision_s, latency_s=res.gen_s, + sent=f"{res.prompt_tokens:,} tok" if res.prompt_tokens else "—", + got=verdict or "—", + ) + unsafe = verdict.lower().startswith("yes") + return GuardrailFunctionOutput( + output_info={"verdict": verdict, "model": model}, + tripwire_triggered=unsafe, + ) diff --git a/examples/contract-review-agent/contract_review_agent/runtime.py b/examples/contract-review-agent/contract_review_agent/runtime.py new file mode 100644 index 00000000..68fbdd7c --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/runtime.py @@ -0,0 +1,235 @@ +"""Wire the OpenAI Agents SDK to a SIE cluster. + +The one idea that makes this whole example work: the Agents SDK speaks the +OpenAI wire protocol, and SIE serves an OpenAI-compatible ``/v1`` endpoint. So +we hand every agent an ``AsyncOpenAI`` client whose ``base_url`` points at SIE, +force the *chat completions* API (SIE doesn't implement the newer Responses +API), and disable tracing (so nothing is shipped to api.openai.com). After that, +each agent just names the SIE catalog model it should run on. +""" + +from __future__ import annotations + +import asyncio +import time +from dataclasses import dataclass, field +from typing import Any, Awaitable, Callable, TypeVar + +from agents import ( + OpenAIChatCompletionsModel, + set_default_openai_api, + set_default_openai_client, + set_tracing_disabled, +) +from openai import APIConnectionError, APIStatusError, AsyncOpenAI +from sie_sdk import SIEAsyncClient + +T = TypeVar("T") + + +def make_openai_client( + cluster_url: str, api_key: str, *, max_retries: int = 2, timeout_s: float | None = None +) -> AsyncOpenAI: + """An OpenAI client pointed at SIE's OpenAI-compatible endpoint. + + ``max_retries`` matters because Agents-SDK-driven calls can't be wrapped in our + own provisioning retry — a client with generous retries survives a model being + evicted and reloaded mid-run on a busy cluster. + """ + base_url = cluster_url.rstrip("/") + "/v1" + kwargs: dict[str, Any] = {"base_url": base_url, "api_key": api_key or "not-needed", "max_retries": max_retries} + if timeout_s is not None: + kwargs["timeout"] = timeout_s + # SIE ignores the key locally; a managed cluster reads it as a Bearer token. + return AsyncOpenAI(**kwargs) + + +def configure_runtime(client: AsyncOpenAI) -> None: + """Point the whole Agents SDK at SIE instead of api.openai.com.""" + set_default_openai_client(client) # every agent talks to SIE... + set_default_openai_api("chat_completions") # ...over chat completions, not Responses... + set_tracing_disabled(True) # ...and we never phone home with traces. + + +def model_for(model_id: str, client: AsyncOpenAI) -> OpenAIChatCompletionsModel: + """Bind one SIE catalog model id to an Agents-SDK model an Agent can use.""" + return OpenAIChatCompletionsModel(model=model_id, openai_client=client) + + +@dataclass +class GenResult: + """One generation's text plus the metrics we log for observability. + + ``provision_s`` is time spent waiting for the model to be ready — the cold + start, measured as the failed 503/504 retries before the request that + actually ran. ``gen_s`` is the duration of that successful (warm) call, so + throughput is measured warm instead of being blended with the cold start. + """ + + text: str + provision_s: float + gen_s: float + prompt_tokens: int | None = None + completion_tokens: int | None = None + + @property + def latency_s(self) -> float: + return self.provision_s + self.gen_s + + @property + def tokens_per_s(self) -> float | None: + if self.completion_tokens and self.gen_s > 0: + return self.completion_tokens / self.gen_s + return None + + +@dataclass +class LedgerEntry: + step: str + model: str + sie_fn: str + warmup_s: float = 0.0 # cold-start / provisioning wait (0 if warm or N/A) + latency_s: float = 0.0 # the call itself (warm), excluding warm-up + sent: str = "" + got: str = "" + throughput: str = "" + + +@dataclass +class Ledger: + """Per-call observability for one agent run. + + Every tool, guardrail, and sub-agent records the model it used plus latency, + how much data it sent, and throughput — so a normal run prints not just + *which* models the cluster fanned the request across, but how each performed. + """ + + entries: list[LedgerEntry] = field(default_factory=list) + + def record( + self, + step: str, + model: str, + sie_fn: str, + *, + warmup_s: float = 0.0, + latency_s: float = 0.0, + sent: str = "", + got: str = "", + throughput: str = "", + ) -> None: + self.entries.append(LedgerEntry(step, model, sie_fn, warmup_s, latency_s, sent, got, throughput)) + + +@dataclass +class AppContext: + """Shared dependencies handed to every tool via ``RunContextWrapper``.""" + + sie: SIEAsyncClient + oai: AsyncOpenAI + cfg: dict[str, Any] + ledger: Ledger + contract_text: str # the contract body we have on file (template/clauses) + scan_path: str # the executed signature page, delivered as a scan image + db_path: str # the SQLite obligations database the SQL tool queries + reasoning_agent: Any = None # the risk-analyst sub-agent (set during build) + # Cache the clause embeddings in a shared mutable dict, not a reassigned + # attribute: the Agents SDK hands each tool call a shallow copy of the context, + # so mutating a shared object persists but reassigning `app.x = ...` does not. + clause_cache: dict[str, Any] = field(default_factory=dict) + + @property + def provision_timeout_s(self) -> float: + return float(self.cfg["cluster"].get("provision_timeout_s", 900)) + + +async def with_provisioning_retry( + make_call: Callable[[], Awaitable[T]], deadline: float +) -> tuple[T, float, float]: + """Retry an OpenAI-client call while SIE scales a model from zero. + + A cold cluster answers 503/504/202 until the model is resident; we retry until + ``deadline`` (monotonic seconds) before giving up. Returns + ``(result, provision_s, call_s)``: ``provision_s`` is the time spent in failed + retries before the attempt that succeeded (the cold start), ``call_s`` is the + duration of that successful call — so callers can report warm-up and warm + throughput separately. + """ + start = time.monotonic() + while True: + attempt_start = time.monotonic() + try: + result = await make_call() + return result, attempt_start - start, time.monotonic() - attempt_start + except APIConnectionError: + if time.monotonic() < deadline: + await asyncio.sleep(5) + continue + raise + except APIStatusError as exc: + # 202 accepted (warming), 502/503 unavailable, 504 first_chunk_timeout — + # all transient while SIE scales a model from zero. + if exc.status_code in (202, 502, 503, 504) and time.monotonic() < deadline: + await asyncio.sleep(5) + continue + raise + + +async def chat_once( + app: AppContext, + model: str, + messages: list[dict[str, Any]], + *, + max_tokens: int = 512, + temperature: float = 0.0, + timeout_s: float | None = None, + **extra: Any, +) -> GenResult: + """One OpenAI-compatible chat completion against SIE, with cold-start retry. + + ``timeout_s`` overrides how long to keep retrying a provisioning model (the + guardrail uses a short budget so it fails open fast rather than blocking). + """ + deadline = time.monotonic() + (timeout_s if timeout_s is not None else app.provision_timeout_s) + resp, provision_s, gen_s = await with_provisioning_retry( + lambda: app.oai.chat.completions.create( + model=model, messages=messages, max_tokens=max_tokens, temperature=temperature, **extra + ), + deadline, + ) + usage = resp.usage + return GenResult( + text=resp.choices[0].message.content or "", + provision_s=provision_s, + gen_s=gen_s, + prompt_tokens=getattr(usage, "prompt_tokens", None) if usage else None, + completion_tokens=getattr(usage, "completion_tokens", None) if usage else None, + ) + + +async def complete_once( + app: AppContext, + model: str, + prompt: str, + *, + max_tokens: int = 256, + temperature: float = 0.0, + stop: list[str] | None = None, +) -> GenResult: + """One OpenAI-compatible *text completion* against SIE (for completion-only + models like sqlcoder that expect a raw prompt, not a chat transcript).""" + deadline = time.monotonic() + app.provision_timeout_s + resp, provision_s, gen_s = await with_provisioning_retry( + lambda: app.oai.completions.create( + model=model, prompt=prompt, max_tokens=max_tokens, temperature=temperature, stop=stop + ), + deadline, + ) + usage = resp.usage + return GenResult( + text=resp.choices[0].text or "", + provision_s=provision_s, + gen_s=gen_s, + prompt_tokens=getattr(usage, "prompt_tokens", None) if usage else None, + completion_tokens=getattr(usage, "completion_tokens", None) if usage else None, + ) diff --git a/examples/contract-review-agent/contract_review_agent/tools.py b/examples/contract-review-agent/contract_review_agent/tools.py new file mode 100644 index 00000000..014e6ff0 --- /dev/null +++ b/examples/contract-review-agent/contract_review_agent/tools.py @@ -0,0 +1,321 @@ +"""The agent's tools — each one a different model from the SIE catalog. + +Every tool takes the shared :class:`AppContext` (injected by the Agents SDK via +``RunContextWrapper``, never shown to the model) and records to the ledger the +model it used, the latency, how much data it sent, and the throughput — so a +normal end-to-end run doubles as per-model observability. +""" + +from __future__ import annotations + +import base64 +import re +import sqlite3 +import time +from pathlib import Path + +import numpy as np +from agents import RunContextWrapper, Runner, function_tool +from sie_sdk import Item + +from .data.make_sample import SCHEMA_DDL, TODAY +from .runtime import AppContext, GenResult, chat_once, complete_once + + +def _tok(n: int | None) -> str: + return f"{n:,} tok" if n else "—" + + +def _tps(res: GenResult) -> str: + return f"{res.tokens_per_s:.0f} tok/s" if res.tokens_per_s else "—" + + +def _split_clauses(text: str, target: int = 900) -> list[str]: + """Chunk a contract into ~``target``-char passages for retrieval. + + Works on real contract text (plain, no markdown) as well as the synthetic + markdown body: split on blank-line paragraphs (falling back to lines), greedily + pack to ``target`` chars, then hard-split anything still oversized. + """ + text = text.replace("\r\n", "\n") + blocks = [b.strip() for b in re.split(r"\n\s*\n", text) if b.strip()] + if len(blocks) < 3: # e.g. single-spaced contract text + blocks = [ln.strip() for ln in text.split("\n") if ln.strip()] + + chunks: list[str] = [] + current = "" + for block in blocks: + if current and len(current) + len(block) + 2 > target: + chunks.append(current) + current = block + else: + current = f"{current}\n\n{block}" if current else block + if current: + chunks.append(current) + + out: list[str] = [] + for chunk in chunks: + while len(chunk) > target * 1.6: + out.append(chunk[:target]) + chunk = chunk[target:] + out.append(chunk) + return out + + +async def _clause_index(app: AppContext, embed_model: str) -> tuple[list[str], np.ndarray]: + """Embed the contract's clauses once and cache (clauses, matrix).""" + if "matrix" in app.clause_cache: + return app.clause_cache["clauses"], app.clause_cache["matrix"] + clauses = _split_clauses(app.contract_text) + t0 = time.monotonic() + results = await app.sie.encode( + embed_model, + [Item(id=str(i), text=c) for i, c in enumerate(clauses)], + output_types=["dense"], + wait_for_capacity=True, + provision_timeout_s=app.provision_timeout_s, + ) + dt = time.monotonic() - t0 + matrix = np.vstack([np.asarray(r["dense"], dtype=np.float32) for r in results]) + app.ledger.record( + "Embed clauses (index)", embed_model, "encode", + latency_s=dt, sent=f"{len(clauses)} clauses", got=f"{matrix.shape[1]}-dim", + throughput=f"{len(clauses) / dt:.1f} items/s" if dt > 0 else "—", + ) + app.clause_cache["clauses"] = clauses + app.clause_cache["matrix"] = matrix + return clauses, matrix + + +# ────────────────────────────────────────────────────────────────────────── +# Generative-LLM tools +# ────────────────────────────────────────────────────────────────────────── +@function_tool +async def classify_document(ctx: RunContextWrapper[AppContext]) -> str: + """Classify the contract under review as MSA, NDA, SOW, or Other, with a + one-line reason. A fast first-pass triage over the loaded contract.""" + app = ctx.context + model = app.cfg["models"]["triage"] + res = await chat_once( + app, model, + [ + {"role": "system", "content": "You label legal documents. Reply with the type only " + "(MSA, NDA, SOW, or Other) followed by a short reason."}, + {"role": "user", "content": app.contract_text[:1500]}, + ], + max_tokens=60, + ) + app.ledger.record("Triage: classify document", model, "chat", + warmup_s=res.provision_s, latency_s=res.gen_s, + sent=_tok(res.prompt_tokens), got=_tok(res.completion_tokens), throughput=_tps(res)) + return res.text.strip() + + +@function_tool +async def read_signature_page(ctx: RunContextWrapper[AppContext], question: str) -> str: + """Ask a vision model a question about the scanned signature-page image + (e.g. 'Are both parties' signatures present and dated?'). Use this for + visual checks that OCR text alone cannot answer.""" + app = ctx.context + model = app.cfg["models"]["vision"] + data = base64.b64encode(Path(app.scan_path).read_bytes()).decode() + res = await chat_once( + app, model, + [ + {"role": "system", "content": "You are a meticulous contracts paralegal. Answer only from what is visible in the image."}, + {"role": "user", "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{data}"}}, + ]}, + ], + max_tokens=220, + ) + app.ledger.record("Read signature page (vision)", model, "chat + image", + warmup_s=res.provision_s, latency_s=res.gen_s, + sent=f"{Path(app.scan_path).stat().st_size // 1024} KB img + {_tok(res.prompt_tokens)}", + got=_tok(res.completion_tokens), throughput=_tps(res)) + return res.text.strip() + + +@function_tool +async def analyze_clause_risks(ctx: RunContextWrapper[AppContext], clauses: str) -> str: + """Delegate deep legal risk analysis of specific clauses to a specialist + reasoning agent (the largest model). Pass the clause text to analyze; get + back per-clause issues with severity and suggested redlines.""" + app = ctx.context + model = app.cfg["models"]["reasoning"] + t0 = time.monotonic() + result = await Runner.run( + app.reasoning_agent, + "Analyze the following contract clauses for risk to the Customer. For each " + "risk, give: the clause, the issue, a severity (low/medium/high), and a " + f"one-line suggested redline.\n\n{clauses}", + context=app, + ) + dt = time.monotonic() - t0 + usage = getattr(getattr(result, "context_wrapper", None), "usage", None) + out_tok = getattr(usage, "output_tokens", None) + app.ledger.record("Clause-risk analysis (specialist sub-agent)", model, "chat", + latency_s=dt, sent=_tok(getattr(usage, "input_tokens", None)), got=_tok(out_tok), + throughput=f"{out_tok / dt:.0f} tok/s" if out_tok and dt > 0 else "—") + return str(result.final_output) + + +# ────────────────────────────────────────────────────────────────────────── +# Retrieval / extraction tools (encode · score · extract) +# ────────────────────────────────────────────────────────────────────────── +@function_tool +async def ocr_signature_page(ctx: RunContextWrapper[AppContext]) -> str: + """OCR the executed signature page (a scanned image) into markdown text. + Use this to recover who signed, their titles, and the execution date — + details that exist only on the scan, not in the contract body.""" + app = ctx.context + model = app.cfg["models"]["ocr"] + t0 = time.monotonic() + res = await app.sie.extract( + model, Item(images=[app.scan_path]), wait_for_capacity=True, provision_timeout_s=app.provision_timeout_s + ) + dt = time.monotonic() - t0 + entities = res.get("entities") or [] + markdown = entities[0]["text"] if entities else "(no text recognized)" + app.ledger.record("OCR signature page → markdown", model, "extract", + latency_s=dt, sent=f"{Path(app.scan_path).stat().st_size // 1024} KB img", + got=f"{len(markdown):,} chars md", throughput=f"{len(markdown) / dt:.0f} chars/s" if dt > 0 else "—") + return markdown + + +@function_tool +async def extract_entities(ctx: RunContextWrapper[AppContext]) -> str: + """Extract structured entities (parties, dates, monetary amounts, governing + law, notice periods) from the loaded contract using zero-shot NER.""" + app = ctx.context + model = app.cfg["models"]["entities"] + labels = ["party", "effective date", "renewal date", "termination notice period", + "monetary amount", "governing law", "term length"] + payload = app.contract_text[:6000] + t0 = time.monotonic() + res = await app.sie.extract( + model, Item(text=payload), labels=labels, wait_for_capacity=True, provision_timeout_s=app.provision_timeout_s + ) + dt = time.monotonic() - t0 + entities = res.get("entities") or [] + app.ledger.record("Extract entities (zero-shot NER)", model, "extract", + latency_s=dt, sent=f"{len(payload):,} chars", got=f"{len(entities)} entities", + throughput=f"{len(entities) / dt:.1f} ent/s" if dt > 0 else "—") + lines = [f"- {e['label']}: {e['text']} (score {e.get('score', 0):.2f})" for e in entities] + return "\n".join(lines) if lines else "(no entities found)" + + +@function_tool +async def search_clauses(ctx: RunContextWrapper[AppContext], query: str) -> str: + """Find the clauses most relevant to a topic (e.g. 'automatic renewal', + 'limitation of liability', 'indemnification'). Dense-embedding retrieval + followed by a cross-encoder rerank; returns the top clauses verbatim.""" + app = ctx.context + embed_model = app.cfg["models"]["embed"] + rerank_model = app.cfg["models"]["rerank"] + k_cand = int(app.cfg["search"]["top_k_candidates"]) + k_res = int(app.cfg["search"]["top_k_results"]) + + clauses, matrix = await _clause_index(app, embed_model) + q = await app.sie.encode( + embed_model, Item(text=query), output_types=["dense"], is_query=True, + wait_for_capacity=True, provision_timeout_s=app.provision_timeout_s, + ) + qv = np.asarray(q["dense"], dtype=np.float32) + denom = np.linalg.norm(matrix, axis=1) * (np.linalg.norm(qv) + 1e-9) + 1e-9 + sims = (matrix @ qv) / denom + candidate_idx = np.argsort(-sims)[: min(k_cand, len(clauses))] + candidates = [clauses[i] for i in candidate_idx] + + t0 = time.monotonic() + scored = await app.sie.score( + rerank_model, Item(text=query), [Item(id=str(i), text=c) for i, c in enumerate(candidates)], + wait_for_capacity=True, provision_timeout_s=app.provision_timeout_s, + ) + dt = time.monotonic() - t0 + app.ledger.record("Rerank candidate clauses", rerank_model, "score", + latency_s=dt, sent=f"{len(candidates)} docs", got=f"top {k_res}", + throughput=f"{len(candidates) / dt:.1f} docs/s" if dt > 0 else "—") + ranked = sorted(scored["scores"], key=lambda s: s["rank"])[:k_res] + top = [candidates[int(s["item_id"])] for s in ranked] + return "\n\n---\n\n".join(top) if top else "(no relevant clauses found)" + + +# ────────────────────────────────────────────────────────────────────────── +# Text-to-SQL tool (completion-only specialist model) +# ────────────────────────────────────────────────────────────────────────── +_SQLCODER_PROMPT = """### Task +Generate a SQLite SQL query to answer [QUESTION]{question}[/QUESTION] + +### Instructions +- Use SQLite syntax only. Today's date is {today}. +- Dates are ISO-8601 text; use SQLite date functions (e.g. date('now'), date(due_date)). + +### Database Schema +The query will run on a database with this schema: +{schema} + +### Answer +Given the database schema, here is the SQLite query that answers [QUESTION]{question}[/QUESTION]: +""" + + +def _clean_sql(raw: str) -> str: + sql = raw.strip() + for fence in ("```sql", "```sqlite", "```"): + if sql.startswith(fence): + sql = sql[len(fence) :].strip() + sql = sql.split("```")[0] + if "[SQL]" in sql: + sql = sql.split("[SQL]", 1)[1] + return sql.strip().rstrip(";").strip() + + +def _run_select(db_path: str, sql: str) -> tuple[list[str], list[tuple]]: + conn = sqlite3.connect(db_path) + try: + cur = conn.execute(sql) + cols = [d[0] for d in cur.description] if cur.description else [] + return cols, cur.fetchmany(50) + finally: + conn.close() + + +@function_tool +async def query_obligations_db(ctx: RunContextWrapper[AppContext], question: str) -> str: + """Answer a question about tracked contract obligations and deadlines by + generating SQL (a text-to-SQL specialist model) and running it against the + obligations database. Good for 'which obligations are due soon?' or 'total + outstanding payments by counterparty'.""" + app = ctx.context + model = app.cfg["models"]["sql"] + prompt = _SQLCODER_PROMPT.format(question=question, schema=SCHEMA_DDL, today=TODAY) + res = await complete_once(app, model, prompt, max_tokens=256, stop=[";", "```", "\n\n\n"]) + app.ledger.record("Text-to-SQL", model, "completions", + warmup_s=res.provision_s, latency_s=res.gen_s, + sent=_tok(res.prompt_tokens), got=_tok(res.completion_tokens), throughput=_tps(res)) + + sql = _clean_sql(res.text) + if not sql.lower().startswith("select"): + return f"Generated query was not a SELECT, so it was not run:\n{sql}" + try: + cols, rows = _run_select(app.db_path, sql) + except sqlite3.Error as exc: + return f"SQL error: {exc}\nQuery was:\n{sql}" + if not rows: + return f"Query ran but returned no rows.\nSQL: {sql}" + header = " | ".join(cols) + body = "\n".join(" | ".join("" if v is None else str(v) for v in row) for row in rows) + return f"SQL: {sql}\n\n{header}\n{body}" + + +ALL_TOOLS = [ + classify_document, + ocr_signature_page, + extract_entities, + search_clauses, + read_signature_page, + query_obligations_db, + analyze_clause_risks, +] diff --git a/examples/contract-review-agent/pyproject.toml b/examples/contract-review-agent/pyproject.toml new file mode 100644 index 00000000..9761bc1f --- /dev/null +++ b/examples/contract-review-agent/pyproject.toml @@ -0,0 +1,38 @@ +[project] +name = "contract-review-agent" +version = "0.1.0" +description = "Contract review with the OpenAI Agents SDK, every model served from one SIE cluster" +readme = "README.md" +requires-python = ">=3.12,<3.13" +dependencies = [ + # openai-agents >=0.14 pins websockets>=15 (for its realtime/voice features, + # unused here) which conflicts with sie-sdk's websockets<15; 0.13.x is the + # newest line compatible with the SIE SDK and has the same agent API. + "openai-agents>=0.13,<0.14", + "openai>=1.40", + "sie-sdk>=0.6.8", + "httpx>=0.27", + "numpy>=1.26", + "pydantic>=2.7", + "python-dotenv>=1.0", + "pyyaml>=6.0", + "pillow>=10.3", + "rich>=13.7", +] + +[project.scripts] +fetch-contracts = "contract_review_agent.data.fetch_contracts:main" +make-sample = "contract_review_agent.data.make_sample:main" +review = "contract_review_agent.cli:main" + +[dependency-groups] +dev = [ + "ruff>=0.11.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.ruff] +target-version = "py312"