diff --git a/.env.example b/.env.example index 733b9cf..472b5a1 100644 --- a/.env.example +++ b/.env.example @@ -18,3 +18,11 @@ MDB_MCP_CONNECTION_STRING=mongodb+srv://USER:PASS@cluster0.xxxxx.mongodb.net/?re # Atlas Service Account (OAuth client creds) → MCP atlas-* tools (atlas-get-performance-advisor, mongodb-logs) MDB_MCP_API_CLIENT_ID= MDB_MCP_API_CLIENT_SECRET= + +# ===== Auth & write gate (two-persona console) ===== +# Signs/verifies the HS256 session token shared by the read API and the dashboard — must be the +# SAME value in both. Generate with: openssl rand -hex 32. Never commit the real value. +SESSION_SECRET= +# Shared secret gating the write endpoints (POST /run, /packs/{id}/decision). Held server-side only +# (read API + the dashboard proxies); never exposed to the browser. Generate: openssl rand -hex 16. +RUN_API_TOKEN= diff --git a/.gitignore b/.gitignore index 93df23c..313757a 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ mongo-build-plan.md .claude/ .codex/ .gstack/ +.agents/ +skills-lock.json # local design reference images (visual specs, not pushed) design-refs/ diff --git a/README.md b/README.md index 5c58957..795e6aa 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,77 @@ -# googlecloudhack — Evidence-Driven DBRE Agent +# Evidence-Driven DBRE Agent -A Gemini-powered MongoDB performance engineer: detects slow queries, proposes ESR-correct -indexes from real `explain` evidence, gates `apply` behind human approval, verifies the -result, and ships a hashed evidence pack plus an internal event ledger for every fix. +Two personas, one MongoDB performance loop: -> **Status:** Day-5 — live Cloud Run demo with split Agent Engine diagnosis roles, -> deterministic validation, human-gated apply/verify, and Evidence Ledger collections. +- **Users** run real query workloads against a live Atlas collection from a guided console. Each + query's real `explain` evidence is captured and attributed to whoever ran it. +- A **DBRE** triages the *actual* slowest captured queries — ranked by explain evidence (blocking + sort, collection scan, over-scan ratio), not wall-clock — diagnoses one, and approves an + ESR-correct index fix. The controller applies it behind a hash-bound human gate, then verifies it. -## Architecture - -Five demo stages (Detect → Diagnose → Test → Approve → Verify) over a deterministic -three-phase safety engine (DIAGNOSE → APPROVE → VERIFY). +There is no hardcoded demo query: the queries the DBRE fixes are the ones users really ran. -Current production path: +## Flow ```text -Dashboard -> FastAPI Cloud Run (opens approval gate) - -> Diagnose Agent Engine -> Candidate Agent Engine -> Rationale Agent Engine - -> deterministic controller validates + emits DIAGNOSED EvidencePack - -> /packs/{id}/decision hash-bound approval ticket - -> apply + verify +USER ─ login ─> Workload Console ─ guided query ─> read API ─ explain ─> Atlas + └─ capture (attributed) ─> query_log +DBRE ─ login ─> Slow-Query Queue (ranked by evidence) + └─ Diagnose ─> deterministic ESR diagnosis ─> DIAGNOSED EvidencePack + └─ hash-bound Approve ─> apply index + re-explain ─> VERIFIED ``` -The three Agent Engine resources perform read-only Mongo diagnosis, candidate testing, -and rationale generation with Python-native tools. -Deterministic Python remains the safety authority: it recomputes the ESR winner, evidence -hash, phase transitions, index apply, and verification. `/run` opens the approval gate -before diagnosis and remains read-only; `/packs/{run_id}/decision` is the only path that -can issue the one-time approval ticket required for mutation. +## Architecture + +- **Dashboard** — Next.js (App Router). Seeded role-based login backed by an httpOnly session + cookie; the user persona is confined to the workload console, the DBRE to the triage + review + planes. The read API is the security authority — it re-verifies the session bearer on every data + call, and the approver identity always comes from the verified session, never the browser. +- **Read API** — FastAPI on Cloud Run. Guided, validated, read-only workload queries; evidence + capture; the evidence-ranked queue; and the DIAGNOSE → (human APPROVE) → VERIFY remediation flow. + Index mutation happens only after a matching hash-bound approval. +- **Diagnosis** — a pure, deterministic ESR analyzer derives the correct index key order + (Equality → Sort → Range) from each query's own structure. In production three read-only Vertex AI + Agent Engine roles narrate the diagnosis; locally the controller runs deterministically. +- **State** — MongoDB Atlas. `dbre_state` holds `users`, `query_log`, `evidence_packs`, and the + internal ledger collections; the demo workload runs against `sample_supplies.sales_agent_demo`. -The dashboard reads only `EvidencePack` JSON, including `approval_gate` state and -`agent_trace` proof that the gate opened before Agent Engine participation. Internally, -MongoDB persists ledger collections for `slow_queries`, `candidates`, `experiments`, -`decisions`, `evidence_packs`, `approvals`, `applications`, and `verifications`. +The dashboard reads only `EvidencePack` v1 JSON; that contract is frozen in `contracts/`. ## Quickstart ```bash uv sync --dev -cp .env.example .env # fill GCP + MongoDB values -uv run pytest -q +cp .env.example .env # fill MongoDB + (prod) Vertex values; set SESSION_SECRET + RUN_API_TOKEN + +# one-time data + accounts (against your Atlas cluster) +uv run python seed/seed_demo_fixture.py seed # 300k demo docs +uv run python seed/seed_workload.py verify # baseline indexes + prove the trap presets +uv run python seed/seed_users.py # Dev Trivedi, Aakash Singh, DBRE — prints passwords once + +uv run pytest -q # unit + contract (live integration auto-skips with no conn) ``` +Run the full stack locally (deterministic controller, no Vertex needed): + +```bash +SS=$(openssl rand -hex 32); RT=$(openssl rand -hex 16) +# read API (reads Atlas via MDB_MCP_CONNECTION_STRING from .env) +SESSION_SECRET=$SS RUN_API_TOKEN=$RT uv run uvicorn api.server:app --port 8000 +# dashboard — SAME SESSION_SECRET + RUN_API_TOKEN, API_URL -> the read API +cd dashboard && npm install && \ + API_URL=http://127.0.0.1:8000 SESSION_SECRET=$SS RUN_API_TOKEN=$RT npm run dev +``` + +Re-run `seed/seed_workload.py reset` between demos — an approved fix removes the trap for a whole +store/method class. + +## Safety + +- Agents and tools are read-only; only the deterministic controller mutates, and only after a + matching hash-bound approval. +- The approver identity comes from the verified DBRE session — never from the browser. +- Secrets live in `.env` (local) / Secret Manager (prod); none are committed. + ## License Apache-2.0 — see [`LICENSE`](LICENSE). diff --git a/api/auth.py b/api/auth.py new file mode 100644 index 0000000..946d309 --- /dev/null +++ b/api/auth.py @@ -0,0 +1,139 @@ +"""Web auth: user store, login route, and role guards. Credentials are verified against the +`users` collection, a short-lived HS256 session token is issued, and protected endpoints are +gated by role. SESSION_SECRET (env) signs/verifies tokens — it is never returned to clients +or logged. The dashboard holds the token in an httpOnly cookie and forwards it as a bearer.""" + +import os +from dataclasses import dataclass +from typing import Annotated, Protocol + +from fastapi import APIRouter, Depends, Header, HTTPException +from pydantic import BaseModel + +from controller.auth import ( + Identity, + TokenError, + hash_password, + make_session_token, + read_session_token, + verify_password, +) + +ROLE_USER = "user" +ROLE_DBRE = "dbre" +VALID_ROLES = (ROLE_USER, ROLE_DBRE) + +# Equalises login timing whether or not the username exists (reduces enumeration signal). +_DUMMY_HASH = hash_password("\x00 absent-account placeholder \x00") + + +@dataclass(frozen=True) +class UserRecord: + username: str + display_name: str + role: str + password_hash: str + + +class AuthStore(Protocol): + def get_user(self, username: str) -> UserRecord | None: ... + + +class MongoUserStore: + def __init__(self, collection) -> None: + self._col = collection + + def get_user(self, username: str) -> UserRecord | None: + doc = self._col.find_one({"username": username}, {"_id": False}) + if not doc: + return None + return UserRecord( + username=doc["username"], + display_name=doc.get("display_name", doc["username"]), + role=doc["role"], + password_hash=doc["password_hash"], + ) + + +def get_auth_store() -> AuthStore: + raise HTTPException(status_code=503, detail="authentication is not configured") + + +AuthStoreDep = Annotated[AuthStore, Depends(get_auth_store)] + + +def session_secret() -> str: + secret = os.environ.get("SESSION_SECRET") + if not secret: + raise HTTPException(status_code=503, detail="SESSION_SECRET is not configured") + return secret + + +class LoginRequest(BaseModel): + username: str + password: str + + +class LoginResponse(BaseModel): + token: str + role: str + username: str + display_name: str + + +auth_router = APIRouter() + + +@auth_router.post("/auth/login", response_model=LoginResponse) +def login(body: LoginRequest, store: AuthStoreDep) -> LoginResponse: + user = store.get_user(body.username) + # Always run a hash verification so the response time does not reveal whether the + # username exists. compare_digest inside verify_password keeps this constant-time. + if user is None: + verify_password(body.password, _DUMMY_HASH) + raise HTTPException(status_code=401, detail="invalid credentials") + if not verify_password(body.password, user.password_hash): + raise HTTPException(status_code=401, detail="invalid credentials") + identity = Identity(username=user.username, display_name=user.display_name, role=user.role) + token = make_session_token(identity, session_secret()) + return LoginResponse( + token=token, role=user.role, username=user.username, display_name=user.display_name + ) + + +def _identity_from_authorization(authorization: str | None) -> Identity: + if not authorization or not authorization.lower().startswith("bearer "): + raise HTTPException(status_code=401, detail="missing bearer token") + token = authorization.split(" ", 1)[1].strip() + try: + return read_session_token(token, session_secret()) + except TokenError as exc: + raise HTTPException(status_code=401, detail="invalid session token") from exc + + +def require_role(*roles: str): + """FastAPI dependency factory: returns the verified Identity, or 401 (no/invalid token) / + 403 (role not permitted). With no roles given, any valid session passes.""" + allowed = roles or VALID_ROLES + + def _dependency(authorization: Annotated[str | None, Header()] = None) -> Identity: + identity = _identity_from_authorization(authorization) + if identity.role not in allowed: + raise HTTPException(status_code=403, detail="insufficient role") + return identity + + return _dependency + + +def optional_dbre_identity( + authorization: Annotated[str | None, Header()] = None, +) -> Identity | None: + """DBRE role guard that is active only when SESSION_SECRET is configured — mirrors + require_write_token's conditional gating, so local/CI flows stay open while production + enforces the role. Returns the verified DBRE identity, or None when auth is unconfigured.""" + if not os.environ.get("SESSION_SECRET"): + return None + identity = _identity_from_authorization(authorization) + if identity.role != ROLE_DBRE: + raise HTTPException(status_code=403, detail="DBRE role required") + return identity diff --git a/api/routes.py b/api/routes.py index 2f3f60b..0ba90f4 100644 --- a/api/routes.py +++ b/api/routes.py @@ -1,5 +1,6 @@ import os import secrets +from dataclasses import dataclass from typing import Annotated, Literal, Protocol from uuid import uuid4 @@ -7,7 +8,11 @@ from fastapi.responses import JSONResponse from pydantic import BaseModel +from api.auth import optional_dbre_identity +from api.workload import WorkloadService, get_workload_service_optional +from controller.auth import Identity from controller.orchestrator import ApprovalTicket, issue_approval_ticket +from controller.workload import WorkloadSpecError, assert_safe_query from controller.schemas import ApprovalGateState, EvidencePack, PackStatus router = APIRouter() @@ -27,11 +32,20 @@ def get_pack(self, run_id: str) -> EvidencePack | None: ... def save_pack(self, pack: EvidencePack) -> None: ... +@dataclass(frozen=True) +class QueryInput: + """A real captured query to diagnose, in place of the preset demo fixture.""" + + query_filter: dict + query_sort: list[tuple[str, int]] + limit: int + + class Engine(Protocol): """Executes the remediation phases against the live target. diagnose() is read-only; apply_and_verify() performs the human-approved index mutation; reject() is a no-op record.""" - async def diagnose(self, run_id: str) -> EvidencePack: ... + async def diagnose(self, run_id: str, query: QueryInput | None = None) -> EvidencePack: ... async def apply_and_verify( self, pack: EvidencePack, ticket: ApprovalTicket ) -> EvidencePack: ... @@ -72,16 +86,44 @@ def get_pack(run_id: str, store: StoreDep) -> dict: class RunRequest(BaseModel): run_id: str | None = None + captured_query_id: str | None = None + + +WorkloadOptionalDep = Annotated[WorkloadService | None, Depends(get_workload_service_optional)] @router.post("/run", dependencies=[Depends(require_write_token)]) -async def trigger_run(store: StoreDep, engine: EngineDep, body: RunRequest | None = None) -> dict: - """Trigger a DIAGNOSE-only live run over the preset demo fixture (#37). Returns a - DIAGNOSED pack — NO database mutation happens here. The recommended index is applied only - after a human approves via POST /packs/{run_id}/decision. Synchronous (a few seconds, - longer on a cold start); pass {"run_id": "..."} only to pin the id.""" +async def trigger_run( + store: StoreDep, + engine: EngineDep, + workload: WorkloadOptionalDep, + _identity: Annotated[Identity | None, Depends(optional_dbre_identity)], + body: RunRequest | None = None, +) -> dict: + """Trigger a DIAGNOSE-only run. With a captured_query_id, diagnoses that real captured query + against its natural plan; otherwise the preset demo fixture. NO mutation happens here — a human + approves via POST /packs/{run_id}/decision before any index is applied. The DBRE role is + enforced when SESSION_SECRET is configured.""" run_id = (body.run_id if body else None) or f"run-{uuid4().hex[:8]}" - pack = await engine.diagnose(run_id) + captured_id = body.captured_query_id if body else None + query: QueryInput | None = None + if captured_id is not None: + if workload is None: + raise HTTPException(status_code=503, detail="workload service not configured") + captured = workload.get_captured(captured_id) + if captured is None: + raise HTTPException(status_code=404, detail=f"captured query '{captured_id}' not found") + spec = captured["query"] + query = QueryInput( + query_filter=dict(spec["filter"]), + query_sort=[(field, int(direction)) for field, direction in spec["sort"]], + limit=int(spec["limit"]), + ) + try: + assert_safe_query(query.query_filter, query.query_sort) + except WorkloadSpecError as exc: + raise HTTPException(status_code=422, detail=str(exc)) from exc + pack = await engine.diagnose(run_id, query) store.save_pack(pack) return pack.model_dump(mode="json") @@ -94,7 +136,13 @@ class DecisionRequest(BaseModel): @router.post("/packs/{run_id}/decision", dependencies=[Depends(require_write_token)]) -async def decide_pack(run_id: str, body: DecisionRequest, store: StoreDep, engine: EngineDep): +async def decide_pack( + run_id: str, + body: DecisionRequest, + store: StoreDep, + engine: EngineDep, + identity: Annotated[Identity | None, Depends(optional_dbre_identity)], +): """The dashboard's approve/reject endpoint. On approve the recommended index is applied and the fix verified (the human-gated mutation); on reject the decision is recorded with no mutation. Returns the updated pack; 404 not_found / 409 already_decided | @@ -119,12 +167,15 @@ async def decide_pack(run_id: str, body: DecisionRequest, store: StoreDep, engin status_code=409, content={"error": "approval_gate", "detail": "pending approval gate required"}, ) + # Authoritative approver: the verified DBRE session when configured, else the request body + # (local/CI). The dashboard never decides who approved. + approver = identity.display_name if identity is not None else body.approver if body.decision == "approve": try: ticket = issue_approval_ticket( pack, evidence_hash=body.evidence_hash, - approver=body.approver, + approver=approver, note=body.note, ) except ValueError as exc: @@ -134,7 +185,7 @@ async def decide_pack(run_id: str, body: DecisionRequest, store: StoreDep, engin updated = await engine.apply_and_verify(pack, ticket) else: try: - updated = engine.reject(pack, approver=body.approver, note=body.note) + updated = engine.reject(pack, approver=approver, note=body.note) except ValueError as exc: return JSONResponse( status_code=409, content={"error": "approval_gate", "detail": str(exc)} diff --git a/api/server.py b/api/server.py index a1e4b1f..a9edb25 100644 --- a/api/server.py +++ b/api/server.py @@ -5,7 +5,16 @@ from fastapi import FastAPI from api.agent_engine import AgentDiagnosisParseError, diagnosis_agent_from_env -from api.routes import Engine, PackStore, get_engine, get_store, router +from api.auth import AuthStore, MongoUserStore, auth_router, get_auth_store +from api.routes import Engine, PackStore, QueryInput, get_engine, get_store, router +from api.workload import ( + MongoWorkloadService, + WorkloadService, + get_workload_service, + get_workload_service_optional, + workload_router, +) +from controller.workload import NAMESPACE_COLL, NAMESPACE_DB from controller.ledger_store import LedgerStore, MongoLedgerStore from controller.orchestrator import ApprovalTicket, DiagnosisAdvice, DiagnosisAgent from controller.persistence import load_pack, read_pack, save_pack, write_pack @@ -116,9 +125,17 @@ def _backend(self): return PymongoBackend(self._conn, DB, COLL) - async def diagnose(self, run_id: str) -> EvidencePack: + async def diagnose(self, run_id: str, query: QueryInput | None = None) -> EvidencePack: from controller.demo_fixture import COLL, DB, LIMIT, QUERY_FILTER, QUERY_SORT - from controller.orchestrator import run_agent_diagnosis, run_diagnosis + from controller.orchestrator import INDEX_B_NAME, run_agent_diagnosis, run_diagnosis + + namespace = f"{DB}.{COLL}" + if query is not None: + query_filter, query_sort, limit = query.query_filter, query.query_sort, query.limit + current_index = None # real captured query: diagnose its natural plan, no forced hint + else: + query_filter, query_sort, limit = QUERY_FILTER, QUERY_SORT, LIMIT + current_index = INDEX_B_NAME agent_failure: Exception | None = None if self._diagnosis_agent is not None: @@ -126,11 +143,12 @@ async def diagnose(self, run_id: str) -> EvidencePack: return await run_agent_diagnosis( self._diagnosis_agent, run_id=run_id, - namespace=f"{DB}.{COLL}", - query_filter=QUERY_FILTER, - query_sort=QUERY_SORT, - limit=LIMIT, + namespace=namespace, + query_filter=query_filter, + query_sort=query_sort, + limit=limit, ledger=self._ledger, + current_index=current_index, ) except AgentDiagnosisParseError as exc: if not self._allow_agent_fallback: @@ -142,10 +160,11 @@ async def diagnose(self, run_id: str) -> EvidencePack: return await run_diagnosis( backend, run_id=run_id, - namespace=f"{DB}.{COLL}", - query_filter=QUERY_FILTER, - query_sort=QUERY_SORT, - limit=LIMIT, + namespace=namespace, + query_filter=query_filter, + query_sort=query_sort, + limit=limit, + current_index=current_index, advisor=( _AgentFailureAdvisor(self._diagnosis_agent, agent_failure) if self._diagnosis_agent is not None and agent_failure is not None @@ -157,17 +176,24 @@ async def diagnose(self, run_id: str) -> EvidencePack: backend.close() async def apply_and_verify(self, pack: EvidencePack, ticket: ApprovalTicket) -> EvidencePack: - from controller.demo_fixture import LIMIT, QUERY_FILTER, QUERY_SORT from controller.orchestrator import apply_and_verify + # Re-run the SAME query the pack was diagnosed with (captured or fixture). model_dump thaws + # the frozen before-evidence: pack.before.query holds mappingproxy nodes that pymongo cannot + # deep-copy into a filter, so we read the serialized (plain dict/list) form instead. + spec = pack.before.model_dump(mode="python")["query"] + query_filter = dict(spec.get("filter", {})) + query_sort = [(field, int(direction)) for field, direction in spec.get("sort", [])] + limit = int(spec.get("limit", 20)) + backend = self._backend() try: return await apply_and_verify( backend, pack, - query_filter=QUERY_FILTER, - query_sort=QUERY_SORT, - limit=LIMIT, + query_filter=query_filter, + query_sort=query_sort, + limit=limit, approval_ticket=ticket, ledger=self._ledger, ) @@ -182,29 +208,51 @@ def reject( return reject_pack(pack, approver=approver, note=note, ledger=self._ledger) -def create_app(store: PackStore | None = None, engine: Engine | None = None) -> FastAPI: +def create_app( + store: PackStore | None = None, + engine: Engine | None = None, + auth_store: AuthStore | None = None, + workload_service: WorkloadService | None = None, +) -> FastAPI: app = FastAPI(title="GCRAH Evidence Pack API") if store is None: - if os.getenv("MONGO_SECRET_NAME"): # pragma: no cover - live - if not os.getenv("RUN_API_TOKEN"): + secret_mode = bool(os.getenv("MONGO_SECRET_NAME")) + local_mode = bool(os.getenv("MDB_MCP_CONNECTION_STRING")) + if secret_mode or local_mode: # pragma: no cover - live + if secret_mode and not os.getenv("RUN_API_TOKEN"): raise RuntimeError( "RUN_API_TOKEN is required when MONGO_SECRET_NAME enables live Mongo mode" ) + if secret_mode and not os.getenv("SESSION_SECRET"): + # Fail closed: without it, optional_dbre_identity no-ops and the approver would + # fall back to the request body — role + approver authority must be structural. + raise RuntimeError( + "SESSION_SECRET is required in production to enforce the DBRE role and the " + "session-derived approver on the mutating endpoints" + ) from pymongo import MongoClient # noqa: PLC0415 from api.secrets import get_mongo_connection_string # noqa: PLC0415 conn = get_mongo_connection_string() - state_db = MongoClient(conn)["dbre_state"] - collection = state_db["evidence_packs"] - store = MongoPackStore(collection) + client = MongoClient(conn) + state_db = client["dbre_state"] + store = MongoPackStore(state_db["evidence_packs"]) + if auth_store is None: + auth_store = MongoUserStore(state_db["users"]) + if workload_service is None: + target = client[NAMESPACE_DB][NAMESPACE_COLL] + workload_service = MongoWorkloadService(target, state_db["query_log"]) if engine is None: - engine = _LiveEngine( - conn, - diagnosis_agent_from_env(require_split=True, allow_legacy=False), - MongoLedgerStore(state_db), + # Production requires the split Agent Engine; a local connection runs the + # deterministic controller (no Vertex), which still produces a valid pack. + agent = ( + diagnosis_agent_from_env(require_split=True, allow_legacy=False) + if secret_mode + else None ) + engine = _LiveEngine(conn, agent, MongoLedgerStore(state_db)) else: packs_dir = Path(os.getenv("PACKS_DIR", "runs")) store = LocalFilePackStore(packs_dir) if packs_dir.exists() else _EmptyPackStore() @@ -212,7 +260,14 @@ def create_app(store: PackStore | None = None, engine: Engine | None = None) -> app.dependency_overrides[get_store] = lambda: store if engine is not None: app.dependency_overrides[get_engine] = lambda: engine + if auth_store is not None: + app.dependency_overrides[get_auth_store] = lambda: auth_store + if workload_service is not None: + app.dependency_overrides[get_workload_service] = lambda: workload_service + app.dependency_overrides[get_workload_service_optional] = lambda: workload_service app.include_router(router) + app.include_router(auth_router) + app.include_router(workload_router) return app diff --git a/api/workload.py b/api/workload.py new file mode 100644 index 0000000..6dd1a6e --- /dev/null +++ b/api/workload.py @@ -0,0 +1,183 @@ +"""Workload execution + capture (user persona) and the DBRE slow-query queue. + +POST /workload/query runs a guided, validated, read-only query against the demo collection, +captures its real explain evidence, persists an attributed record to `query_log`, and returns a +result preview. GET /workload/slow-queries serves the evidence-ranked triage queue to the DBRE. +Every workload query is attributed to the authenticated user; only the `user` role can run them. +""" + +from datetime import datetime, timezone +from typing import Annotated, Protocol +from uuid import uuid4 + +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel + +from api.auth import ROLE_DBRE, ROLE_USER, require_role +from controller.auth import Identity +from controller.explain import capture_evidence +from controller.workload import ( + DEFAULT_MAX_TIME_MS, + PRESET_BY_KEY, + PRESETS, + QuerySpec, + WorkloadSpecError, + build_capture_record, + build_query, +) + +PREVIEW_MAX = 8 +SLOW_QUEUE_LIMIT = 100 + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def public_record(record: dict) -> dict: + """Rename the internal _id to captured_id for API responses.""" + out = {key: value for key, value in record.items() if key != "_id"} + out["captured_id"] = record["_id"] + return out + + +class WorkloadService(Protocol): + def run_query( + self, spec: QuerySpec, *, username: str, display_name: str, preset: str | None + ) -> dict: ... + def list_slow_queries(self) -> list[dict]: ... + def get_captured(self, captured_id: str) -> dict | None: ... + + +class MongoWorkloadService: + def __init__(self, target_collection, query_log_collection) -> None: + self._target = target_collection + self._log = query_log_collection + + def run_query(self, spec, *, username, display_name, preset=None) -> dict: + query_filter, query_sort, limit = build_query(spec) + evidence = capture_evidence( + self._target, query_filter, query_sort, limit, max_time_ms=DEFAULT_MAX_TIME_MS + ) + record = build_capture_record( + captured_id=uuid4().hex, + username=username, + display_name=display_name, + spec=spec, + evidence=evidence, + captured_at=_now_iso(), + preset=preset, + ) + self._log.insert_one(dict(record)) + return { + "captured": public_record(record), + "preview": self._preview(query_filter, query_sort, limit), + } + + def _preview(self, query_filter, query_sort, limit) -> list[dict]: + cursor = self._target.find( + dict(query_filter), + projection={ + "_id": False, + "storeLocation": True, + "saleDate": True, + "customer.age": True, + "purchaseMethod": True, + }, + sort=list(query_sort) or None, + limit=min(limit, PREVIEW_MAX), + ).max_time_ms(DEFAULT_MAX_TIME_MS) + rows = [] + for doc in cursor: + sale = doc.get("saleDate") + rows.append( + { + "storeLocation": doc.get("storeLocation"), + "saleDate": sale.isoformat() if hasattr(sale, "isoformat") else sale, + "age": (doc.get("customer") or {}).get("age"), + "purchaseMethod": doc.get("purchaseMethod"), + } + ) + return rows + + def list_slow_queries(self) -> list[dict]: + # Rank + cap in Mongo (bounded memory + maxTimeMS) rather than loading the whole log. + records = ( + self._log.find({"signal.is_slow": True}) + .sort("signal.score", -1) + .limit(SLOW_QUEUE_LIMIT) + .max_time_ms(DEFAULT_MAX_TIME_MS) + ) + return [public_record(r) for r in records] + + def get_captured(self, captured_id: str) -> dict | None: + return self._log.find_one({"_id": captured_id}) + + +def get_workload_service() -> WorkloadService: + raise HTTPException(status_code=503, detail="workload service is not configured") + + +def get_workload_service_optional() -> WorkloadService | None: + """Like get_workload_service but returns None instead of 503 when unconfigured, so /run can + resolve it eagerly and only fail when a captured_query_id is actually supplied.""" + return None + + +WorkloadServiceDep = Annotated[WorkloadService, Depends(get_workload_service)] + +workload_router = APIRouter() + + +class WorkloadQueryRequest(BaseModel): + preset: str | None = None + store_location: str | None = None + purchase_method: str | None = None + age_min: int | None = None + age_max: int | None = None + sort_field: str | None = None + sort_dir: int = -1 + limit: int = 20 + + +@workload_router.get("/workload/presets") +def list_presets(_: Annotated[Identity, Depends(require_role())]) -> list[dict]: + return [{"key": p.key, "label": p.label, "intent": p.intent} for p in PRESETS] + + +@workload_router.post("/workload/query") +def run_workload_query( + body: WorkloadQueryRequest, + identity: Annotated[Identity, Depends(require_role(ROLE_USER))], + service: WorkloadServiceDep, +) -> dict: + if body.preset is not None: + preset = PRESET_BY_KEY.get(body.preset) + if preset is None: + raise HTTPException(status_code=404, detail=f"unknown preset: {body.preset}") + spec, preset_key = preset.spec, preset.key + else: + spec = QuerySpec( + store_location=body.store_location, + purchase_method=body.purchase_method, + age_min=body.age_min, + age_max=body.age_max, + sort_field=body.sort_field, + sort_dir=body.sort_dir, + limit=body.limit, + ) + preset_key = None + try: + return service.run_query( + spec, username=identity.username, display_name=identity.display_name, preset=preset_key + ) + except WorkloadSpecError as exc: + raise HTTPException(status_code=422, detail=str(exc)) from exc + + +@workload_router.get("/workload/slow-queries") +def list_slow_queries( + _: Annotated[Identity, Depends(require_role(ROLE_DBRE))], + service: WorkloadServiceDep, +) -> list[dict]: + return service.list_slow_queries() diff --git a/controller/auth.py b/controller/auth.py new file mode 100644 index 0000000..184cbb2 --- /dev/null +++ b/controller/auth.py @@ -0,0 +1,151 @@ +"""Password hashing (scrypt) and HS256 session tokens — pure stdlib, no third-party +dependencies, no I/O. Shared by the seed script (hashing), the login route (verify + issue), +and the role guards (verify). The dashboard verifies the same HS256 token with `jose`, so the +token format here is standards-compliant JWT.""" + +import base64 +import hashlib +import hmac +import json +import secrets +import time +from dataclasses import dataclass + +# scrypt (RFC 7914) interactive-login profile. memory ≈ 128 * N * r ≈ 16 MiB. +_SCRYPT_N = 2**14 +_SCRYPT_R = 8 +_SCRYPT_P = 1 +_SCRYPT_DKLEN = 32 +_SCRYPT_MAXMEM = 2**26 # 64 MiB — headroom over the working set across OpenSSL builds +_SALT_BYTES = 16 + +_DEFAULT_TTL_SECONDS = 12 * 3600 + + +class TokenError(Exception): + """Raised when a session token is malformed, unsigned, tampered, or expired.""" + + +@dataclass(frozen=True) +class Identity: + username: str + display_name: str + role: str + + +def _b64e(raw: bytes) -> str: + return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii") + + +def _b64d(text: str) -> bytes: + return base64.urlsafe_b64decode(text + "=" * (-len(text) % 4)) + + +def hash_password(password: str) -> str: + if not password: + raise ValueError("password must be non-empty") + salt = secrets.token_bytes(_SALT_BYTES) + derived = hashlib.scrypt( + password.encode("utf-8"), + salt=salt, + n=_SCRYPT_N, + r=_SCRYPT_R, + p=_SCRYPT_P, + dklen=_SCRYPT_DKLEN, + maxmem=_SCRYPT_MAXMEM, + ) + return "$".join( + ["scrypt", str(_SCRYPT_N), str(_SCRYPT_R), str(_SCRYPT_P), _b64e(salt), _b64e(derived)] + ) + + +def verify_password(password: str, stored: str) -> bool: + try: + scheme, n_s, r_s, p_s, salt_b64, dk_b64 = stored.split("$") + if scheme != "scrypt": + return False + salt = _b64d(salt_b64) + expected = _b64d(dk_b64) + derived = hashlib.scrypt( + password.encode("utf-8"), + salt=salt, + n=int(n_s), + r=int(r_s), + p=int(p_s), + dklen=len(expected), + maxmem=_SCRYPT_MAXMEM, + ) + except (ValueError, TypeError): + return False + return hmac.compare_digest(derived, expected) + + +def encode_jwt(payload: dict, secret: str) -> str: + header = {"alg": "HS256", "typ": "JWT"} + segments = [ + _b64e(json.dumps(header, separators=(",", ":"), sort_keys=True).encode("utf-8")), + _b64e(json.dumps(payload, separators=(",", ":"), sort_keys=True).encode("utf-8")), + ] + signing_input = ".".join(segments).encode("ascii") + signature = hmac.new(secret.encode("utf-8"), signing_input, hashlib.sha256).digest() + segments.append(_b64e(signature)) + return ".".join(segments) + + +def decode_jwt(token: str, secret: str) -> dict: + parts = token.split(".") + if len(parts) != 3: + raise TokenError("malformed token") + header_b64, payload_b64, signature_b64 = parts + try: + header = json.loads(_b64d(header_b64)) + except (ValueError, TypeError) as exc: + raise TokenError("bad header") from exc + if header.get("alg") != "HS256": + raise TokenError("unsupported algorithm") + signing_input = f"{header_b64}.{payload_b64}".encode("ascii") + expected = hmac.new(secret.encode("utf-8"), signing_input, hashlib.sha256).digest() + try: + provided = _b64d(signature_b64) + except (ValueError, TypeError) as exc: + raise TokenError("bad signature encoding") from exc + if not hmac.compare_digest(expected, provided): + raise TokenError("bad signature") + try: + payload = json.loads(_b64d(payload_b64)) + except (ValueError, TypeError) as exc: + raise TokenError("bad payload") from exc + exp = payload.get("exp") + if exp is not None and time.time() > float(exp): + raise TokenError("token expired") + return payload + + +def make_session_token( + identity: Identity, + secret: str, + *, + ttl_seconds: int = _DEFAULT_TTL_SECONDS, + now: float | None = None, +) -> str: + issued = time.time() if now is None else now + payload = { + "sub": identity.username, + "name": identity.display_name, + "role": identity.role, + "iat": int(issued), + "exp": int(issued + ttl_seconds), + } + return encode_jwt(payload, secret) + + +def read_session_token(token: str, secret: str) -> Identity: + payload = decode_jwt(token, secret) + try: + return Identity( + username=payload["sub"], + display_name=payload.get("name", payload["sub"]), + role=payload["role"], + ) + except KeyError as exc: + raise TokenError(f"missing claim: {exc}") from exc diff --git a/controller/explain.py b/controller/explain.py index db9c580..e22eec5 100644 --- a/controller/explain.py +++ b/controller/explain.py @@ -68,10 +68,13 @@ def capture_evidence( query_sort: Sequence[tuple[str, int]], limit: int, hint: Any | None = None, + max_time_ms: int | None = None, ) -> Evidence: cursor = collection.find(dict(query_filter), sort=list(query_sort), limit=limit) if hint is not None: cursor = cursor.hint(hint) + if max_time_ms is not None: + cursor = cursor.max_time_ms(max_time_ms) explained = cursor.explain() winning = explained["queryPlanner"]["winningPlan"] diff --git a/controller/ledger_store.py b/controller/ledger_store.py index 1b036ea..b7775c0 100644 --- a/controller/ledger_store.py +++ b/controller/ledger_store.py @@ -99,7 +99,7 @@ def write_diagnosis_records( query_filter: dict[str, Any], query_sort: list[tuple[str, int]], limit: int, - current_index: str, + current_index: str | None, source: str = "deterministic_esr", ) -> None: if ledger is None: diff --git a/controller/orchestrator.py b/controller/orchestrator.py index 998eb63..fb13196 100644 --- a/controller/orchestrator.py +++ b/controller/orchestrator.py @@ -390,7 +390,7 @@ async def run_diagnosis( narrator: Narrator | None = None, advisor: DiagnosisAdvisor | None = None, ledger: LedgerStore | None = None, - current_index: str = INDEX_B_NAME, + current_index: str | None = INDEX_B_NAME, ) -> EvidencePack: """Read-only DIAGNOSE phase. Returns a DIAGNOSED pack with NO decision and NO mutation — the human approves (via the API) before anything is applied. The before-explain hints the @@ -482,7 +482,7 @@ async def run_agent_diagnosis( created_at: str | None = None, narrator: Narrator | None = None, ledger: LedgerStore | None = None, - current_index: str = INDEX_B_NAME, + current_index: str | None = INDEX_B_NAME, ) -> EvidencePack: created_at = created_at or _now() write_gate_opened_record( diff --git a/controller/workload.py b/controller/workload.py new file mode 100644 index 0000000..5cac328 --- /dev/null +++ b/controller/workload.py @@ -0,0 +1,272 @@ +"""Guided workload query contract + evidence-based slow-query ranking. + +Pure and offline-testable. The query a user can run is built ONLY from validated parameters — +equality on storeLocation/purchaseMethod, a customer.age range, a sort on saleDate/customer.age, +and a capped limit — never a raw Mongo filter. That keeps every workload query injection-safe +and strictly read-only. ESR-trap presets force a blocking in-memory SORT against the baseline +index set ({storeLocation:1} + {purchaseMethod:1}); healthy presets are served in index order. +Ranking is by explain evidence (blocking sort, COLLSCAN, docs-examined/returned ratio, keys), +never wall-clock time, which is noisy on shared clusters. +""" + +from dataclasses import dataclass + +NAMESPACE_DB = "sample_supplies" +NAMESPACE_COLL = "sales_agent_demo" +NAMESPACE = f"{NAMESPACE_DB}.{NAMESPACE_COLL}" + +STORE_LOCATIONS = ("Austin", "Denver", "London", "New York", "San Diego", "Seattle") +PURCHASE_METHODS = ("In store", "Online", "Phone") +SORT_FIELDS = ("saleDate", "customer.age") +AGE_MIN, AGE_MAX = 16, 75 +MAX_LIMIT = 200 +DEFAULT_MAX_TIME_MS = 5000 + +# docs examined per doc returned above which a sort-free query is still "slow" (over-scan) +SLOW_RATIO = 25.0 + +# Baseline index set for the workload demo — equality serving only, NO saleDate ordering, so a +# query that sorts on saleDate is forced into a blocking SORT and the ESR fix is a real change. +BASELINE_INDEXES = ( + ([("storeLocation", 1)], "store_eq"), + ([("purchaseMethod", 1)], "method_eq"), +) +# Legacy fixture indexes dropped so they can't pre-serve the trap (esr_right_C is the ESR answer). +LEGACY_INDEX_NAMES = ("esr_wrong_B", "esr_right_C") +APPLIED_INDEX_PREFIX = "gcrah_rec_" + + +class WorkloadSpecError(ValueError): + """Invalid guided-query parameters — rejected before any database access.""" + + +@dataclass(frozen=True) +class QuerySpec: + store_location: str | None = None + purchase_method: str | None = None + age_min: int | None = None + age_max: int | None = None + sort_field: str | None = None + sort_dir: int = -1 + limit: int = 20 + + +@dataclass(frozen=True) +class Preset: + key: str + label: str + intent: str # "trap" | "healthy" + spec: QuerySpec + + +PRESETS: tuple[Preset, ...] = ( + Preset( + "denver_recent", + "Denver buyers 30–50, newest first", + "trap", + QuerySpec("Denver", None, 30, 50, "saleDate", -1, 20), + ), + Preset( + "seattle_recent", + "Seattle buyers 25–45, newest first", + "trap", + QuerySpec("Seattle", None, 25, 45, "saleDate", -1, 20), + ), + Preset( + "online_recent", + "Online orders 18–35, newest first", + "trap", + QuerySpec(None, "Online", 18, 35, "saleDate", -1, 25), + ), + Preset( + "austin_oldest", + "Austin buyers 40–60, oldest first", + "trap", + QuerySpec("Austin", None, 40, 60, "saleDate", 1, 20), + ), + Preset( + "phone_seniors", + "Phone orders 55–75, newest first", + "trap", + QuerySpec(None, "Phone", 55, 75, "saleDate", -1, 15), + ), + Preset( + "ny_recent", + "New York buyers 30–45, newest first", + "trap", + QuerySpec("New York", None, 30, 45, "saleDate", -1, 20), + ), + Preset( + "denver_lookup", + "Denver buyers — lookup, no sort", + "healthy", + QuerySpec("Denver", None, None, None, None, -1, 10), + ), + Preset( + "online_lookup", + "Online orders — lookup, no sort", + "healthy", + QuerySpec(None, "Online", None, None, None, -1, 10), + ), + Preset( + "london_browse", + "London buyers — browse, no sort", + "healthy", + QuerySpec("London", None, None, None, None, -1, 25), + ), +) +PRESET_BY_KEY = {p.key: p for p in PRESETS} + + +def validate_spec(spec: QuerySpec) -> QuerySpec: + if spec.store_location is not None and spec.store_location not in STORE_LOCATIONS: + raise WorkloadSpecError(f"unknown storeLocation: {spec.store_location!r}") + if spec.purchase_method is not None and spec.purchase_method not in PURCHASE_METHODS: + raise WorkloadSpecError(f"unknown purchaseMethod: {spec.purchase_method!r}") + for bound in (spec.age_min, spec.age_max): + if bound is not None and not (AGE_MIN <= bound <= AGE_MAX): + raise WorkloadSpecError(f"age out of range [{AGE_MIN},{AGE_MAX}]: {bound}") + if spec.age_min is not None and spec.age_max is not None and spec.age_min > spec.age_max: + raise WorkloadSpecError("age_min must be <= age_max") + if spec.sort_field is not None and spec.sort_field not in SORT_FIELDS: + raise WorkloadSpecError(f"unsupported sort field: {spec.sort_field!r}") + if spec.sort_dir not in (1, -1): + raise WorkloadSpecError("sort_dir must be 1 or -1") + if not (1 <= spec.limit <= MAX_LIMIT): + raise WorkloadSpecError(f"limit must be in [1,{MAX_LIMIT}]") + return spec + + +def build_query(spec: QuerySpec) -> tuple[dict, list[tuple[str, int]], int]: + """Validated spec -> (filter, sort, limit). Raises WorkloadSpecError on any bad input.""" + validate_spec(spec) + query_filter: dict = {} + if spec.store_location is not None: + query_filter["storeLocation"] = spec.store_location + if spec.purchase_method is not None: + query_filter["purchaseMethod"] = spec.purchase_method + age: dict = {} + if spec.age_min is not None: + age["$gte"] = spec.age_min + if spec.age_max is not None: + age["$lte"] = spec.age_max + if age: + query_filter["customer.age"] = age + query_sort = [(spec.sort_field, spec.sort_dir)] if spec.sort_field is not None else [] + return query_filter, query_sort, spec.limit + + +_ALLOWED_FILTER_KEYS = frozenset({"storeLocation", "purchaseMethod", "customer.age"}) +_ALLOWED_RANGE_OPS = frozenset({"$gte", "$lte"}) + + +def assert_safe_query(query_filter: dict, query_sort: list[tuple[str, int]]) -> None: + """Re-validate a query loaded from query_log before it reaches the backend. query_log is only + written through the validated run_query path; re-checking here stops a future writer from + smuggling an operator (e.g. $where) the guided builder could never produce — defense in depth.""" + for key, value in query_filter.items(): + if key not in _ALLOWED_FILTER_KEYS: + raise WorkloadSpecError(f"disallowed filter field: {key!r}") + if key == "customer.age": + if not isinstance(value, dict) or not set(value).issubset(_ALLOWED_RANGE_OPS): + raise WorkloadSpecError("customer.age must be a {$gte,$lte} range") + for bound in value.values(): + if ( + isinstance(bound, bool) + or not isinstance(bound, int) + or not (AGE_MIN <= bound <= AGE_MAX) + ): + raise WorkloadSpecError( + f"customer.age bound out of range [{AGE_MIN},{AGE_MAX}]: {bound!r}" + ) + elif not isinstance(value, str): + raise WorkloadSpecError(f"{key} must be an equality value") + for field, direction in query_sort: + if field not in SORT_FIELDS or direction not in (1, -1): + raise WorkloadSpecError(f"disallowed sort: {field!r}") + + +@dataclass(frozen=True) +class SlowSignal: + is_slow: bool + severity: str # "high" | "medium" | "low" + score: float + ratio: float + blocking_sort: bool + collscan: bool + + +def _ratio(docs_examined: int, docs_returned: int) -> float: + return docs_examined / max(docs_returned, 1) + + +def slow_signal(metrics) -> SlowSignal: + """Evidence-only slowness verdict + a deterministic ranking score. `metrics` is an + EvidenceMetrics (or any object with docs_examined/docs_returned/total_keys_examined/ + has_blocking_sort/stages).""" + stages = tuple(metrics.stages) + collscan = "COLLSCAN" in stages + blocking_sort = bool(metrics.has_blocking_sort) + ratio = _ratio(metrics.docs_examined, metrics.docs_returned) + is_slow = blocking_sort or collscan or ratio >= SLOW_RATIO + severity = "high" if (blocking_sort or collscan) else ("medium" if ratio >= 10 else "low") + score = ( + (1_000_000.0 if blocking_sort else 0.0) + + (500_000.0 if collscan else 0.0) + + min(ratio, 10_000.0) * 50.0 + + min(float(metrics.total_keys_examined), 1_000_000.0) / 100.0 + ) + return SlowSignal( + is_slow=is_slow, + severity=severity, + score=score, + ratio=ratio, + blocking_sort=blocking_sort, + collscan=collscan, + ) + + +def build_capture_record( + *, + captured_id: str, + username: str, + display_name: str, + spec: QuerySpec, + evidence, + captured_at: str, + preset: str | None = None, +) -> dict: + """Assemble the attributed query_log document from a spec + its live explain Evidence. + Stores the query SPEC and ranking metrics only — never threaded into a v1 EvidencePack; + the diagnose pack re-explains fresh.""" + query_filter, query_sort, limit = build_query(spec) + metrics = evidence.metrics + signal = slow_signal(metrics) + return { + "_id": captured_id, + "namespace": NAMESPACE, + "preset": preset, + "user": {"username": username, "display_name": display_name}, + "query": { + "filter": query_filter, + "sort": [[field, direction] for field, direction in query_sort], + "limit": limit, + }, + "metrics": { + "docs_examined": metrics.docs_examined, + "docs_returned": metrics.docs_returned, + "total_keys_examined": metrics.total_keys_examined, + "millis": metrics.millis, + "stages": list(metrics.stages), + "has_blocking_sort": metrics.has_blocking_sort, + }, + "signal": { + "is_slow": signal.is_slow, + "severity": signal.severity, + "score": signal.score, + "ratio": signal.ratio, + "blocking_sort": signal.blocking_sort, + "collscan": signal.collscan, + }, + "captured_at": captured_at, + } diff --git a/dashboard/DEPLOY.md b/dashboard/DEPLOY.md index f89c166..a44f0c9 100644 --- a/dashboard/DEPLOY.md +++ b/dashboard/DEPLOY.md @@ -1,33 +1,33 @@ -# Dashboard deploy → Cloud Run (#32) +# Dashboard deploy → Cloud Run Ship the Next.js DBRE Console to a public URL pointed at the live read API. Mirrors the read-API runbook (`../deploy/cloudrun.md`). -> **Who runs this:** @d3v07 runs the `gcloud` deploy (the dashboard author has no -> `gcloud` auth). Everything else — the standalone build, Dockerfile, and the -> env contract below — is ready in this branch. +> **Who runs this:** @d3v07 runs the `gcloud` deploy. The standalone build, Dockerfile, +> and the env contract below are ready in this branch. ## Service - Service: **gcrah-dashboard** - Project: **performer-497915** · Region: **us-central1** -- Build: `Dockerfile` (multi-stage, Next standalone output) — runs `node server.js`, honors Cloud Run's `PORT`. +- Build: `Dockerfile` (multi-stage, Next standalone output) — runs `node server.js`, honors `PORT`. ## Required server env (set on the Cloud Run service) | Var | Value | Why | |-----|-------|-----| -| `API_URL` | `https://gcrah-read-api-2vbnam7yma-uc.a.run.app` | the proxy routes forward here | -| `RUN_API_TOKEN` | *(the read API's write token)* | injected by the `/api/run` + `/api/decision` proxies as `X-API-Token`; **server-only, never in the client bundle** | -| `NEXT_PUBLIC_API_URL` | `https://gcrah-read-api-2vbnam7yma-uc.a.run.app` | browser-side reads (`/packs`, `/packs/{id}`) — unauthenticated, safe to expose | +| `API_URL` | the read API base URL | server proxies + server components fetch here | +| `SESSION_SECRET` | **byte-identical to the read API's `SESSION_SECRET`** | middleware verifies the HS256 session cookie with it. If absent or mismatched, `verifyToken` returns null and **every request bounces to `/login` even after a successful sign-in** — login silently fails | +| `RUN_API_TOKEN` | the read API's write token | injected by the `/api/diagnose` + `/api/decision` proxies as `X-API-Token`; **server-only, never in the client bundle** | -`RUN_API_TOKEN` is the same secret already set on the read API. It must **not** -use the `NEXT_PUBLIC_` prefix (that would ship it to the browser). Reads need no -token; only the write proxies use it. +`SESSION_SECRET` and `RUN_API_TOKEN` must match the read API exactly and must **not** use the +`NEXT_PUBLIC_` prefix (that ships them to the browser). Do **not** set `NEXT_PUBLIC_API_URL`: +server components read the API via `API_URL` at runtime, and the public prefix would inline the +internal URL into the client bundle. ## Deploy -From `dashboard/`: +From `dashboard/` — use the SAME `SESSION_SECRET` + `RUN_API_TOKEN` set on the read API: ```bash gcloud run deploy gcrah-dashboard \ @@ -36,13 +36,12 @@ gcloud run deploy gcrah-dashboard \ --region us-central1 \ --allow-unauthenticated \ --port 8080 \ - --set-env-vars "API_URL=https://gcrah-read-api-2vbnam7yma-uc.a.run.app,NEXT_PUBLIC_API_URL=https://gcrah-read-api-2vbnam7yma-uc.a.run.app" \ - --set-env-vars "RUN_API_TOKEN=" + --set-env-vars "API_URL=https://" \ + --set-secrets "SESSION_SECRET=gcrah-session-secret:latest,RUN_API_TOKEN=gcrah-run-token:latest" ``` -`--source .` triggers Cloud Build to build the `Dockerfile`. (For a real secret, -prefer Secret Manager + `--set-secrets RUN_API_TOKEN=gcrah-run-token:latest` -instead of `--set-env-vars`, matching how the read API handles its Mongo URI.) +`--source .` triggers Cloud Build on the `Dockerfile`. Prefer Secret Manager (as shown) over +`--set-env-vars` for the two secrets. ## Smoke tests @@ -50,32 +49,20 @@ instead of `--set-env-vars`, matching how the read API handles its Mongo URI.) URL=$(gcloud run services describe gcrah-dashboard \ --region us-central1 --project performer-497915 --format "value(status.url)") -curl -sf "$URL/" | grep -q "DBRE Console" && echo "homepage ok" - -# the write proxy injects the token server-side; client sends none: -curl -sf -X POST "$URL/api/run" -H 'content-type: application/json' -d '{}' \ - | python3 -c "import json,sys; print('run ->', json.load(sys.stdin)['status'])" +# unauthenticated request redirects to login (middleware) +curl -s -o /dev/null -w "%{http_code}\n" "$URL/" # expect 307 +curl -sf "$URL/login" | grep -q "DBRE Console" && echo "login ok" ``` -Expected: `homepage ok`, then `run -> diagnosed`. - -In the rendered page, the first viewport should show the **Approval Gate**. A live -run should move the gate from collecting evidence to `pending approval`, show the -`evidence hash`, and keep the live/ledger persisted footer. Approving the run should -return a `verified` pack and close the gate as verified. - -## Local verification (already done) +Then in a browser (seed accounts first: `uv run python seed/seed_users.py`): -The standalone server was run exactly as the container does it -(`node .next/standalone/server.js`, `PORT=8080`, env from `.env.local`): -homepage `200`, the ask-the-agent button renders, and `/api/run` returned a live -`diagnosed` pack with the ESR index C — proving the image + proxy work before the -cloud deploy. +1. Sign in as a user (Dev Trivedi / Aakash Singh) → run a guided workload from the console. +2. Sign in as the DBRE → the Slow-Query Queue shows the captured query, ranked by evidence. +3. Diagnose → review the DIAGNOSED pack → Approve → the pack moves to `verified`. ## Build locally ```bash npm ci npm run build # emits .next/standalone (output: "standalone") -# container entrypoint is: node server.js (with static + public copied alongside) ``` diff --git a/dashboard/app/api/auth/login/route.ts b/dashboard/app/api/auth/login/route.ts new file mode 100644 index 0000000..cec40b8 --- /dev/null +++ b/dashboard/app/api/auth/login/route.ts @@ -0,0 +1,64 @@ +/** + * Server-only login proxy. Forwards credentials to the read API's /auth/login, and on success + * stores the issued token in an httpOnly cookie — the token never reaches the browser JS. The + * JSON returned to the client carries only role/identity, never the token. + */ + +import { NextResponse } from "next/server"; + +import { SESSION_COOKIE } from "@/lib/session"; + +export const runtime = "nodejs"; + +function apiBase(): string | null { + const raw = (process.env.API_URL ?? process.env.NEXT_PUBLIC_API_URL)?.trim(); + return raw ? raw.replace(/\/+$/, "") : null; +} + +export async function POST(req: Request) { + const base = apiBase(); + if (!base) { + return NextResponse.json({ error: "auth_unconfigured" }, { status: 503 }); + } + + const body = await req.text().catch(() => "{}"); + let upstream: Response; + try { + upstream = await fetch(`${base}/auth/login`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: body || "{}", + }); + } catch { + return NextResponse.json({ error: "upstream_unreachable" }, { status: 502 }); + } + + if (!upstream.ok) { + // Pass through 401/422/503 without leaking internal detail. + return new NextResponse(await upstream.text(), { + status: upstream.status, + headers: { "content-type": "application/json" }, + }); + } + + const data = (await upstream.json()) as { + token: string; + role: string; + username: string; + display_name: string; + }; + + const res = NextResponse.json({ + role: data.role, + username: data.username, + display_name: data.display_name, + }); + res.cookies.set(SESSION_COOKIE, data.token, { + httpOnly: true, + secure: process.env.NODE_ENV === "production", + sameSite: "lax", + path: "/", + maxAge: 12 * 60 * 60, + }); + return res; +} diff --git a/dashboard/app/api/auth/logout/route.ts b/dashboard/app/api/auth/logout/route.ts new file mode 100644 index 0000000..2615a62 --- /dev/null +++ b/dashboard/app/api/auth/logout/route.ts @@ -0,0 +1,11 @@ +import { NextResponse } from "next/server"; + +import { SESSION_COOKIE } from "@/lib/session"; + +export const runtime = "nodejs"; + +export async function POST() { + const res = NextResponse.json({ ok: true }); + res.cookies.set(SESSION_COOKIE, "", { httpOnly: true, path: "/", maxAge: 0 }); + return res; +} diff --git a/dashboard/app/api/decision/route.ts b/dashboard/app/api/decision/route.ts index 0b8b696..2bf3f53 100644 --- a/dashboard/app/api/decision/route.ts +++ b/dashboard/app/api/decision/route.ts @@ -10,6 +10,8 @@ import { NextResponse } from "next/server"; +import { getSessionToken } from "@/lib/auth"; + export const runtime = "nodejs"; function apiBase(): string | null { @@ -41,10 +43,15 @@ export async function POST(req: Request) { ); } + const sessionToken = await getSessionToken(); try { const upstream = await fetch(`${base}/packs/${encodeURIComponent(runId)}/decision`, { method: "POST", - headers: { "content-type": "application/json", "x-api-token": token }, + headers: { + "content-type": "application/json", + "x-api-token": token, + ...(sessionToken ? { authorization: `Bearer ${sessionToken}` } : {}), + }, body: JSON.stringify(payload), }); return new NextResponse(await upstream.text(), { diff --git a/dashboard/app/api/diagnose/route.ts b/dashboard/app/api/diagnose/route.ts new file mode 100644 index 0000000..f1a55dc --- /dev/null +++ b/dashboard/app/api/diagnose/route.ts @@ -0,0 +1,59 @@ +/** + * Server-only proxy for the DBRE "Diagnose" action. Forwards a captured_query_id to the read + * API's POST /run with BOTH the write token (RUN_API_TOKEN, server-only) and the DBRE session + * bearer (from the httpOnly cookie). The backend loads the captured query and runs the existing + * diagnose pipeline on it. Returns the DIAGNOSED pack (with its run_id). + */ + +import { NextResponse } from "next/server"; + +import { getSessionToken } from "@/lib/auth"; + +export const runtime = "nodejs"; + +function apiBase(): string | null { + const raw = (process.env.API_URL ?? process.env.NEXT_PUBLIC_API_URL)?.trim(); + return raw ? raw.replace(/\/+$/, "") : null; +} + +export async function POST(req: Request) { + const base = apiBase(); + const apiToken = process.env.RUN_API_TOKEN; + if (!base || !apiToken) { + return NextResponse.json({ error: "diagnose_unconfigured" }, { status: 503 }); + } + const sessionToken = await getSessionToken(); + if (!sessionToken) { + return NextResponse.json({ error: "unauthenticated" }, { status: 401 }); + } + + let capturedId: string; + try { + const parsed = (await req.json()) as { captured_query_id?: string }; + if (!parsed.captured_query_id) throw new Error("missing captured_query_id"); + capturedId = parsed.captured_query_id; + } catch { + return NextResponse.json( + { error: "bad_request", message: "Expected { captured_query_id }." }, + { status: 400 }, + ); + } + + try { + const upstream = await fetch(`${base}/run`, { + method: "POST", + headers: { + "content-type": "application/json", + "x-api-token": apiToken, + authorization: `Bearer ${sessionToken}`, + }, + body: JSON.stringify({ captured_query_id: capturedId }), + }); + return new NextResponse(await upstream.text(), { + status: upstream.status, + headers: { "content-type": "application/json" }, + }); + } catch { + return NextResponse.json({ error: "upstream_unreachable" }, { status: 502 }); + } +} diff --git a/dashboard/app/api/run/route.ts b/dashboard/app/api/run/route.ts deleted file mode 100644 index 4de3034..0000000 --- a/dashboard/app/api/run/route.ts +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Server-only write proxy for POST /run (#37). - * - * The read API gates /run behind a shared secret (backend #58). That secret must - * never reach the browser, so the dashboard calls THIS same-origin route, which - * holds the token in server-only env (RUN_API_TOKEN) and forwards to the read API - * with the X-API-Token header. - * - * When the backend is NOT configured, this route does NOT 503 — it returns a - * locally-generated DIAGNOSED EvidencePack (the simulation fixture) with a - * `simulated: true` flag, so the demo still works. The token is never involved - * on that path. The simulation is read-only: it never applies anything. - * - * Env (server-only — never NEXT_PUBLIC_*): - * API_URL base URL of the read API - * RUN_API_TOKEN shared secret for the gated write endpoints - */ - -import { NextResponse } from "next/server"; - -import { FIXTURE_SIMULATION } from "@/lib/fixtures"; - -export const runtime = "nodejs"; - -function apiBase(): string | null { - const raw = (process.env.API_URL ?? process.env.NEXT_PUBLIC_API_URL)?.trim(); - return raw ? raw.replace(/\/+$/, "") : null; -} - -export async function POST(req: Request) { - const base = apiBase(); - const token = process.env.RUN_API_TOKEN; - if (!base || !token) { - // No backend: return a locally-generated DIAGNOSED pack, flagged simulated. - // Shape mirrors the configured passthrough (a bare EvidencePack) plus the - // sibling `simulated` flag; the token is never touched on this path. - return NextResponse.json({ ...FIXTURE_SIMULATION, simulated: true }, { status: 200 }); - } - - // Body is optional ({} or {"run_id":"..."}); forward whatever the client sent. - const body = await req.text().catch(() => "{}"); - - try { - const upstream = await fetch(`${base}/run`, { - method: "POST", - headers: { "content-type": "application/json", "x-api-token": token }, - body: body || "{}", - }); - // Pass through status + body verbatim so the client sees the real result. - return new NextResponse(await upstream.text(), { - status: upstream.status, - headers: { "content-type": "application/json" }, - }); - } catch { - return NextResponse.json( - { error: "upstream_unreachable", message: "Run API unreachable." }, - { status: 502 }, - ); - } -} diff --git a/dashboard/app/api/workload/query/route.ts b/dashboard/app/api/workload/query/route.ts new file mode 100644 index 0000000..5f37fe5 --- /dev/null +++ b/dashboard/app/api/workload/query/route.ts @@ -0,0 +1,42 @@ +/** + * Server-only proxy for running a guided workload query. Reads the httpOnly session cookie and + * forwards it to the read API as a bearer — the token never touches the browser. The backend + * enforces the `user` role and attributes the capture to the authenticated identity. + */ + +import { NextResponse } from "next/server"; + +import { getSessionToken } from "@/lib/auth"; + +export const runtime = "nodejs"; + +function apiBase(): string | null { + const raw = (process.env.API_URL ?? process.env.NEXT_PUBLIC_API_URL)?.trim(); + return raw ? raw.replace(/\/+$/, "") : null; +} + +export async function POST(req: Request) { + const base = apiBase(); + if (!base) { + return NextResponse.json({ error: "workload_unconfigured" }, { status: 503 }); + } + const token = await getSessionToken(); + if (!token) { + return NextResponse.json({ error: "unauthenticated" }, { status: 401 }); + } + + const body = await req.text().catch(() => "{}"); + try { + const upstream = await fetch(`${base}/workload/query`, { + method: "POST", + headers: { "content-type": "application/json", authorization: `Bearer ${token}` }, + body: body || "{}", + }); + return new NextResponse(await upstream.text(), { + status: upstream.status, + headers: { "content-type": "application/json" }, + }); + } catch { + return NextResponse.json({ error: "upstream_unreachable" }, { status: 502 }); + } +} diff --git a/dashboard/app/console/page.tsx b/dashboard/app/console/page.tsx new file mode 100644 index 0000000..a911d9f --- /dev/null +++ b/dashboard/app/console/page.tsx @@ -0,0 +1,30 @@ +import { getSession, getSessionToken } from "@/lib/auth"; +import { WorkloadConsole } from "@/components/WorkloadConsole"; + +interface Preset { + key: string; + label: string; + intent: string; +} + +async function loadPresets(): Promise { + const base = (process.env.API_URL ?? process.env.NEXT_PUBLIC_API_URL)?.trim(); + const token = await getSessionToken(); + if (!base || !token) return []; + try { + const res = await fetch(`${base.replace(/\/+$/, "")}/workload/presets`, { + headers: { authorization: `Bearer ${token}`, accept: "application/json" }, + cache: "no-store", + }); + if (!res.ok) return []; + return (await res.json()) as Preset[]; + } catch { + return []; + } +} + +export default async function ConsolePage() { + const session = await getSession(); + const presets = await loadPresets(); + return ; +} diff --git a/dashboard/app/dbre/page.tsx b/dashboard/app/dbre/page.tsx new file mode 100644 index 0000000..22f4744 --- /dev/null +++ b/dashboard/app/dbre/page.tsx @@ -0,0 +1,23 @@ +import { getSessionToken } from "@/lib/auth"; +import { SlowQueryQueue, type SlowQuery } from "@/components/SlowQueryQueue"; + +async function loadSlowQueries(): Promise<{ rows: SlowQuery[]; error: string | null }> { + const base = (process.env.API_URL ?? process.env.NEXT_PUBLIC_API_URL)?.trim(); + const token = await getSessionToken(); + if (!base || !token) return { rows: [], error: "Backend not configured." }; + try { + const res = await fetch(`${base.replace(/\/+$/, "")}/workload/slow-queries`, { + headers: { authorization: `Bearer ${token}`, accept: "application/json" }, + cache: "no-store", + }); + if (!res.ok) return { rows: [], error: `Read API returned ${res.status}.` }; + return { rows: (await res.json()) as SlowQuery[], error: null }; + } catch { + return { rows: [], error: "Read API unreachable." }; + } +} + +export default async function DbreQueuePage() { + const { rows, error } = await loadSlowQueries(); + return ; +} diff --git a/dashboard/app/layout.module.css b/dashboard/app/layout.module.css index 7274230..075344a 100644 --- a/dashboard/app/layout.module.css +++ b/dashboard/app/layout.module.css @@ -2,6 +2,11 @@ min-height: 100vh; } +/* unauthenticated routes (login) render full-width with no sidebar */ +.bare { + min-height: 100vh; +} + /* content sits to the right of the fixed 248px sidebar on desktop */ .content { margin-left: 248px; diff --git a/dashboard/app/layout.tsx b/dashboard/app/layout.tsx index 947e660..996dc3c 100644 --- a/dashboard/app/layout.tsx +++ b/dashboard/app/layout.tsx @@ -2,6 +2,7 @@ import type { Metadata } from "next"; import { JetBrains_Mono, IBM_Plex_Sans } from "next/font/google"; import "./globals.css"; import { SidebarNav } from "@/components/SidebarNav"; +import { getSession } from "@/lib/auth"; import layout from "./layout.module.css"; // Non-default typography (see globals.css rationale). @@ -21,21 +22,26 @@ const sans = IBM_Plex_Sans({ export const metadata: Metadata = { title: "DBRE Console — Evidence-Driven Agent", description: - "Operator console for the Evidence-Driven DBRE agent: review the evidence pack and approve the index fix.", + "Operator console for the Evidence-Driven DBRE agent: run workloads, triage slow queries, and approve the index fix.", }; -export default function RootLayout({ +export default async function RootLayout({ children, }: { children: React.ReactNode; }) { + const session = await getSession(); return ( -
- -
{children}
-
+ {session ? ( +
+ +
{children}
+
+ ) : ( +
{children}
+ )} ); diff --git a/dashboard/app/login/login.module.css b/dashboard/app/login/login.module.css new file mode 100644 index 0000000..1b8c61b --- /dev/null +++ b/dashboard/app/login/login.module.css @@ -0,0 +1,93 @@ +.wrap { + min-height: 100vh; + display: flex; + align-items: center; + justify-content: center; + padding: 2rem; +} + +.card { + width: 100%; + max-width: 380px; + background: var(--bg-panel); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 2rem 1.75rem; + display: flex; + flex-direction: column; + gap: 1rem; +} + +.brand { + display: flex; + align-items: center; + gap: 0.6rem; + font-family: var(--mono); + font-weight: 700; + font-size: 1.15rem; + color: var(--text); +} + +.brand svg { + color: var(--cyan); +} + +.tagline { + color: var(--text-dim); + font-size: 0.9rem; + margin-bottom: 0.25rem; +} + +.field { + display: flex; + flex-direction: column; + gap: 0.35rem; +} + +.field span { + font-size: 0.8rem; + color: var(--text-dim); + font-family: var(--mono); +} + +.field input { + background: var(--bg); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 0.6rem 0.7rem; + color: var(--text); + font-family: var(--mono); + font-size: 0.95rem; +} + +.field input:focus { + outline: none; + border-color: var(--cyan); + box-shadow: 0 0 0 2px rgba(92, 200, 230, 0.18); +} + +.error { + color: var(--red); + font-size: 0.85rem; +} + +.submit { + margin-top: 0.5rem; + display: inline-flex; + align-items: center; + justify-content: center; + gap: 0.5rem; + background: var(--cyan); + color: var(--bg); + border: none; + border-radius: var(--radius); + padding: 0.65rem 1rem; + font-weight: 700; + font-family: var(--sans); + cursor: pointer; +} + +.submit:disabled { + opacity: 0.6; + cursor: not-allowed; +} diff --git a/dashboard/app/login/page.tsx b/dashboard/app/login/page.tsx new file mode 100644 index 0000000..52a6c11 --- /dev/null +++ b/dashboard/app/login/page.tsx @@ -0,0 +1,90 @@ +"use client"; + +import { useState } from "react"; +import { useRouter } from "next/navigation"; +import { Database, SignIn } from "@phosphor-icons/react/dist/ssr"; + +import styles from "./login.module.css"; + +export default function LoginPage() { + const router = useRouter(); + const [username, setUsername] = useState(""); + const [password, setPassword] = useState(""); + const [error, setError] = useState(null); + const [pending, setPending] = useState(false); + + async function onSubmit(e: React.FormEvent) { + e.preventDefault(); + setError(null); + setPending(true); + try { + const res = await fetch("/api/auth/login", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ username, password }), + }); + if (!res.ok) { + setError( + res.status === 401 + ? "Invalid username or password." + : "Sign-in is unavailable right now.", + ); + setPending(false); + return; + } + const data = (await res.json()) as { role: "user" | "dbre" }; + router.replace(data.role === "user" ? "/console" : "/dbre"); + router.refresh(); + } catch { + setError("Network error — please try again."); + setPending(false); + } + } + + return ( +
+
+
+ + DBRE Console +
+

+ Sign in to run database workloads, or to triage the slowest queries. +

+ + + + + + {error && ( +

+ {error} +

+ )} + + +
+
+ ); +} diff --git a/dashboard/components/AgentRunView.tsx b/dashboard/components/AgentRunView.tsx index e10ca79..0c13f63 100644 --- a/dashboard/components/AgentRunView.tsx +++ b/dashboard/components/AgentRunView.tsx @@ -1,15 +1,12 @@ "use client"; import { useState } from "react"; -import { usePathname, useRouter } from "next/navigation"; import { MagnifyingGlass, GitBranch, WifiHigh, WifiSlash, Flask, - Sparkle, - CircleNotch, } from "@phosphor-icons/react/dist/ssr"; import { StageIndicator } from "@/components/StageIndicator"; import { PlanPanel } from "@/components/PlanPanel"; @@ -20,15 +17,13 @@ import { TracePanel } from "@/components/TracePanel"; import type { EvidencePack } from "@/lib/evidence"; import { displayStatus, isVerificationFailed } from "@/lib/evidence"; import type { PackSource } from "@/lib/api"; -import { askTheAgent } from "@/lib/run"; import styles from "@/app/run-review.module.css"; /** - * The interactive run view (#37). Seeded with the server-loaded pack; the "Ask - * the agent" button triggers a diagnosis (POST /api/run) and navigates to - * /runs/ for the freshly-produced run, so the destination's loader - * supplies the durable source (live vs simulation) — we never assert "live" on - * the client. The 5-stage indicator shows a running state while it works. + * Run review for a DBRE-selected diagnosis. The run is produced from the slow-query queue (the + * "Diagnose" action runs POST /run on the captured query); this view renders the resulting pack + * and the approval gate. Approving applies + verifies the index server-side — the dashboard + * never asserts a verified result on the client. */ export function AgentRunView({ initialPack, @@ -39,41 +34,12 @@ export function AgentRunView({ initialSource: PackSource; initialNotice?: string; }) { - const router = useRouter(); - const pathname = usePathname(); const [pack, setPack] = useState(initialPack); - // source/notice are fixed for this rendered run: "Ask" navigates to a fresh - // run rather than mutating in place, and approving a pack never changes its - // source. setPack still drives in-place approval updates from the gate panel. const source = initialSource; const notice = initialNotice; - const [running, setRunning] = useState(false); - const [error, setError] = useState(null); const ds = displayStatus(pack); const verificationFailed = isVerificationFailed(pack); - async function onAsk() { - setRunning(true); - setError(null); - const res = await askTheAgent(); - if (res.ok && res.pack) { - // Navigate to the produced run; loadPack on that page resolves the honest - // source (live from the read API, or simulation when no backend is set). - const target = `/runs/${encodeURIComponent(res.pack.run_id)}`; - if (pathname === target) { - // Same run id (e.g. re-running the simulation while already here): a push - // is a no-op and would never clear the spinner — refresh in place instead. - router.refresh(); - setRunning(false); - } else { - router.push(target); - } - return; - } - setRunning(false); - setError(res.message ?? "Could not run the agent."); - } - return (
@@ -100,26 +66,9 @@ export function AgentRunView({
- - -
- - - {running - ? "Diagnosis running over the Denver/ESR query…" - : "Triggers a diagnosis run and opens the resulting evidence pack."} - - {error && {error}} -
+ - + (null); const firstLinkRef = useRef(null); const wasOpen = useRef(false); + const nav = session.role === "dbre" ? DBRE_NAV : USER_NAV; + function isActive(href: string): boolean { if (href === "/") return pathname === "/"; // /runs/ is the canonical run view; keep "Run Review" active there too. - if (href === "/run-review") return pathname.startsWith("/run-review") || pathname.startsWith("/runs"); + if (href === "/run-review") + return pathname.startsWith("/run-review") || pathname.startsWith("/runs"); return pathname.startsWith(href); } @@ -58,6 +69,15 @@ export function SidebarNav() { return () => window.removeEventListener("keydown", onKey); }, [open]); + async function logout() { + try { + await fetch("/api/auth/logout", { method: "POST" }); + } finally { + router.replace("/login"); + router.refresh(); + } + } + return ( <> {/* mobile top bar with hamburger */} @@ -83,12 +103,14 @@ export function SidebarNav() {
DBRE Console - operator control plane + + {session.role === "dbre" ? "operator control plane" : "workload console"} +