From 24d31c7df11c0f5bb21332eedfb47912b4385ad6 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 24 Jun 2026 22:35:59 -0700 Subject: [PATCH 1/3] wip: postfilter rematch I want to support having other filters (extension points) running with fluence, which means that fluence might make a scheduling decision that is then rejected by some other plugin. If this happens we need to cancel the allocation and record the node names to give as constraints the next time around. This will likely take me a few days to work on, and I am thinking we want to replicate our original experiments in this paper to demonstrate the basic gang scheduling before we go into quantum / custom resources. Signed-off-by: vsoch --- .github/workflows/e2e-tests.yaml | 14 +- Makefile | 8 +- cmd/webhook/main.go | 25 ++ deploy/fluence-pull-test.yaml | 277 +++++++++++++++++++++ deploy/kind-config.yaml | 1 + docs/handlers.md | 83 ++++++ examples/multi-gang-contention.yaml | 40 +++ examples/multi-gang-requeue.yaml | 48 ++++ examples/multi-gang.yaml | 29 +++ examples/test/e2e/quantum-split-pods.yaml | 55 ++++ pkg/fluence/enqueue.go | 86 +++++++ pkg/fluence/enqueue_test.go | 87 +++++++ pkg/fluence/fluence.go | 98 +++++++- pkg/fluence/fluence_test.go | 108 +++++++- pkg/placement/placement.go | 41 ++- pkg/placement/placement_test.go | 93 ++++++- pkg/webhook/handler.go | 87 +++++-- pkg/webhook/handlers/gang.go | 76 +++++- pkg/webhook/handlers/gang_mincount_test.go | 154 ++++++++++++ pkg/webhook/handlers/handlers_test.go | 8 +- pkg/webhook/handlers/leader.go | 65 +++++ pkg/webhook/handlers/quantum.go | 103 +++++++- pkg/webhook/handlers/quantum_split_test.go | 117 +++++++++ pkg/webhook/handlers/registry_test.go | 82 ++++++ pkg/webhook/handlers/sidecar.go | 56 +++++ pkg/webhook/webhook.go | 72 ++---- pkg/webhook/webhook_test.go | 2 +- python/fluence/sidecar.py | 19 +- test/e2e/01-classical-gang.sh | 4 + test/e2e/05-postfilter-rematch.sh | 112 +++++++++ test/e2e/06-multi-gang.sh | 75 ++++++ 
test/e2e/07-quantum-split.sh | 54 ++++ test/e2e/08-requeue-on-capacity.sh | 63 +++++ 33 files changed, 2119 insertions(+), 123 deletions(-) create mode 100644 deploy/fluence-pull-test.yaml create mode 100644 docs/handlers.md create mode 100644 examples/multi-gang-contention.yaml create mode 100644 examples/multi-gang-requeue.yaml create mode 100644 examples/multi-gang.yaml create mode 100644 examples/test/e2e/quantum-split-pods.yaml create mode 100644 pkg/fluence/enqueue.go create mode 100644 pkg/fluence/enqueue_test.go create mode 100644 pkg/webhook/handlers/gang_mincount_test.go create mode 100644 pkg/webhook/handlers/leader.go create mode 100644 pkg/webhook/handlers/quantum_split_test.go create mode 100644 pkg/webhook/handlers/registry_test.go create mode 100644 pkg/webhook/handlers/sidecar.go create mode 100644 test/e2e/05-postfilter-rematch.sh create mode 100755 test/e2e/06-multi-gang.sh create mode 100755 test/e2e/07-quantum-split.sh create mode 100644 test/e2e/08-requeue-on-capacity.sh diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml index a6c1266..f27fa92 100644 --- a/.github/workflows/e2e-tests.yaml +++ b/.github/workflows/e2e-tests.yaml @@ -90,6 +90,15 @@ jobs: - name: E2E - classical gang run: bash test/e2e/01-classical-gang.sh + - name: E2E - multi-pod gang (all-or-nothing + contention) + run: bash test/e2e/06-multi-gang.sh + + - name: E2E - requeue on capacity (EventsToRegister) + run: bash test/e2e/08-requeue-on-capacity.sh + + - name: E2E - PostFilter re-match (taint-rejected allocation) + run: bash test/e2e/05-postfilter-rematch.sh + - name: Deploy quantum add-on run: | # Includes the device plugin and oriented to testing container @@ -140,8 +149,11 @@ jobs: - name: E2E - sidecar ungate run: bash test/e2e/04-sidecar-ungate.sh + - name: E2E - quantum two-group split (leader=1, workers=N-1) + run: bash test/e2e/07-quantum-split.sh + - name: Dump diagnostics on failure if: failure() run: | kubectl get pods -A -o wide - 
kubectl logs -n kube-system deployment/fluence + kubectl logs -n kube-system deployment/fluence \ No newline at end of file diff --git a/Makefile b/Makefile index 1160cb4..08d15b3 100644 --- a/Makefile +++ b/Makefile @@ -55,13 +55,17 @@ test-image-deploy: test-image kubectl patch podgroup training -n default --type=merge -p '{"metadata":{"finalizers":null}}' || true kubectl delete deployments --all kubectl delete pods --all - kubectl delete -f deploy/fluence-test.yaml + kubectl delete -f deploy/fluence-test.yaml || true kubectl delete pods --all +.PHONY: test-deploy-recreate +test-deploy-recreate: test-image-deploy + kubectl apply -f deploy/fluence-pull-test.yaml + kubectl apply -f deploy/device-plugin.yaml .PHONY: deploy deploy: ## Install RBAC + scheduler into kube-system - kubectl apply -f deploy/fluence.yaml + kubectl apply -f deploy/fluence.yaml .PHONY: help help: diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index ea2669a..4169e8a 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -12,9 +12,11 @@ package main import ( "context" "crypto/tls" + "flag" "log" "net/http" "os" + "strings" "time" "github.com/converged-computing/fluence/pkg/cluster" @@ -38,6 +40,29 @@ func main() { cfgName := env("WEBHOOK_CONFIG", "fluence-webhook") addr := env("WEBHOOK_ADDR", ":8443") + // Handler selection. By default ALL registered handlers are enabled. The + // operator may restrict the active set with --handlers (comma-separated) or + // the FLUENCE_HANDLERS env var, e.g. --handlers=fluxion,gang to run without + // quantum. An empty value means all enabled. Unknown names are warned about + // but not fatal (so config survives a handler being renamed/removed). + handlersFlag := flag.String("handlers", env("FLUENCE_HANDLERS", ""), + "comma-separated handlers in dispatch order (default: fluxion,quantum,gang). e.g. 
fluxion,gang disables quantum") + flag.Parse() + + var requested []string + if *handlersFlag != "" { + for _, n := range strings.Split(*handlersFlag, ",") { + if n = strings.TrimSpace(n); n != "" { + requested = append(requested, n) + } + } + } + active, unknown := webhook.SetActiveHandlers(requested) + for _, n := range unknown { + log.Printf("[fluence-webhook] WARNING: unknown handler %q — ignoring", n) + } + log.Printf("[fluence-webhook] active handlers (in dispatch order): %v", active) + dnsNames := []string{ svc + "." + ns + ".svc", svc + "." + ns + ".svc.cluster.local", diff --git a/deploy/fluence-pull-test.yaml b/deploy/fluence-pull-test.yaml new file mode 100644 index 0000000..0fc642f --- /dev/null +++ b/deploy/fluence-pull-test.yaml @@ -0,0 +1,277 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluence + namespace: kube-system +--- +# Bind the built-in scheduler roles so fluence (a full kube-scheduler build) has +# every list/watch the scheduling framework needs (nodes, pods, PV/PVC, CSI, +# storageclasses, resourceclaims/slices, volumeattachments, events, etc.). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-kube-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:kube-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-volume-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:volume-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Delegated authentication: read the auth configmap in kube-system. This is the +# fix for the "extension-apiserver-authentication ... forbidden" errors. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluence-extension-apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Extras the built-in scheduler role does not grant: the alpha PodGroup/Workload +# API (gang), and leader-election leases under our scheduler name. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fluence-extra +rules: + - apiGroups: ["scheduling.k8s.io"] + resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create", "get", "update", "list", "watch"] + # PreBind stamps the allocated backend onto the pod as an annotation; the + # built-in system:kube-scheduler role only allows patching pods/status, not + # the pod object, so grant it here. + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch", "patch", "update"] + # The webhook self-manages its TLS by patching its own config's caBundle. + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["get", "list", "watch", "patch"] + # The webhook creates per-namespace sidecar RBAC on demand when a leader + # pod is admitted, so users do not need to apply RBAC manually. 
+ - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get", "create"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "create"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-extra +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluence-extra +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-scheduler-config + namespace: kube-system +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: false + profiles: + - schedulerName: fluence + plugins: + # multiPoint wires Fluence into every extension point its Go type + # implements: PreFilter, Filter, and PreBind (which stamps the backend + # annotation). Listing points individually risks omitting one — that is + # exactly what left PreBind unwired and the backend annotation unset. + multiPoint: + enabled: [{name: Fluence}] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence + namespace: kube-system + labels: {app: fluence} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence} + template: + metadata: + labels: {app: fluence} + spec: + serviceAccountName: fluence + containers: + - name: fluence + image: vanessa/fluence:test + # Allows for kind load + imagePullPolicy: Always + command: + - /bin/fluence + - --config=/etc/fluence/scheduler-config.yaml + # fluence is its own scheduler binary, so it needs the gang gates set + # here (the cluster-level kube-scheduler gates don't apply to it). + # Without these its PodGroup/GangScheduling plugin is inactive, pods + # schedule with no gang semantics, and PodGroup status stays Pending. 
+ - --feature-gates=GenericWorkload=true,GangScheduling=true + - --v=4 + env: + # Path to the resources config (e.g. quantum backends). Unset/empty + # file -> classical-only graph. Supplied by the quantum add-on. + - name: FLUENCE_RESOURCES + value: /etc/fluence/resources.yaml + volumeMounts: + - name: config + mountPath: /etc/fluence + volumes: + - name: config + projected: + sources: + - configMap: {name: fluence-scheduler-config} + - configMap: {name: fluence-resources, optional: true} +--- +# Mutating webhook: injects scheduler-chosen values into pods at creation time +# (currently a downward-API QRMI_BACKEND env for quantum pods). It self-manages +# TLS — generates a CA + serving cert at startup and patches the caBundle below — +# so no cert-manager and no committed keys. failurePolicy Ignore keeps a webhook +# outage from blocking pod creation cluster-wide. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence-webhook + namespace: kube-system + labels: {app: fluence-webhook} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence-webhook} + template: + metadata: + labels: {app: fluence-webhook} + spec: + serviceAccountName: fluence + containers: + - name: webhook + image: vanessa/fluence:test + # Allows for kind load + imagePullPolicy: Always + command: ["/bin/fluence-webhook"] + env: + # Use busybox as sidecar image in tests — avoids pulling the real + # sidecar image which is large and not cached in CI. 
+ - name: FLUENCE_SIDECAR_IMAGE + value: "busybox:latest" + ports: + - containerPort: 8443 + readinessProbe: + httpGet: {path: /healthz, port: 8443, scheme: HTTPS} + initialDelaySeconds: 2 +--- +apiVersion: v1 +kind: Service +metadata: + name: fluence-webhook + namespace: kube-system +spec: + selector: {app: fluence-webhook} + ports: + - port: 443 + targetPort: 8443 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: fluence-webhook +webhooks: + - name: pods.fluence.flux-framework.org + admissionReviewVersions: ["v1"] + sideEffects: None + failurePolicy: Ignore # never block pod creation if the webhook is down + # caBundle is filled in at runtime by the webhook patching this object. + clientConfig: + service: + name: fluence-webhook + namespace: kube-system + path: /mutate + port: 443 + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + scope: Namespaced + # Don't intercept system pods (and avoid bootstrap coupling). + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: ["kube-system"] +--- +# fluence-sidecar.yaml +# +# RBAC and supporting resources for the Fluence quantum sidecar. +# +# The sidecar runs inside a leader pod and needs: +# - patch/annotate on pods in its own namespace (to ungate workers and +# propagate the task ARN annotation) +# +# The sidecar ServiceAccount is namespace-scoped — it only has permissions +# in the namespace where the workflow runs. The webhook sets +# spec.serviceAccountName on the leader pod to fluence-sidecar. +# +# The fluence Python package is staged into user containers by an init +# container (Model C): the webhook injects an init container from the +# sidecar image that copies the package + sitecustomize into a shared +# volume on the user container's PYTHONPATH. No ConfigMap, no user install. 
+# +# Apply with: +# kubectl apply -f deploy/fluence-sidecar.yaml + + +--- +# PriorityClass for classical pods paired with quantum work. +# Applied to worker pods by the webhook when they are gated. +# When ungated, high priority triggers preemption of lower-priority work +# so workers get nodes immediately as the QPU result arrives. +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: fluence-quantum-classical + labels: + app: fluence +value: 1000000 +globalDefault: false +preemptionPolicy: PreemptLowerPriority +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." diff --git a/deploy/kind-config.yaml b/deploy/kind-config.yaml index c94e070..a13acf5 100644 --- a/deploy/kind-config.yaml +++ b/deploy/kind-config.yaml @@ -33,3 +33,4 @@ nodes: value: "GenericWorkload=true" - role: worker - role: worker + - role: worker diff --git a/docs/handlers.md b/docs/handlers.md new file mode 100644 index 0000000..1da169a --- /dev/null +++ b/docs/handlers.md @@ -0,0 +1,83 @@ +# Webhook handlers & sidecar architecture + +Fluence's value is not creating gangs (Kubernetes 1.36 native gang scheduling +already does that). It is **customizing the gang on the fly based on the +resources a pod requests** — e.g. a quantum leader/worker workload becomes a +size-1 leader gang plus a size-(N-1) worker gang, with the leader running a +sidecar that ungates its workers when the quantum task is ready. + +## Handlers + +Each handler is an interface implementation (`pkg/webhook/handler.go`): + +```go +type Handler interface { + Name() string + Applies(ctx, m MutatorAPI, pod) bool + Mutate(ctx, m MutatorAPI, pod) []spec.Op +} +``` + +Handlers self-register by name (`init()` -> `webhook.Register`); a blank import +of the handlers package makes them AVAILABLE. The core never names a handler. + +**Ordering = the active list.** There is no per-handler priority. 
The active +handler list is BOTH the selection and the dispatch order: + +```go +var DefaultHandlerOrder = []string{"fluxion", "quantum", "gang"} +``` + +Dispatch walks this list in order. `gang` is last because it is last in the +list — the fallback that applies common defaults (honor `group-size`, else +owner-derived N) only if no earlier handler already shaped the gang. A +custom-resource handler is inserted into the list before `gang` to shape its own +gang first. To change the order, or disable a handler, pass a different list. + +## Enabling/disabling handlers + +By default ALL registered handlers are enabled. Restrict the active set on the +webhook command: + +``` +fluence-webhook --handlers=fluxion,gang # run without quantum +FLUENCE_HANDLERS=fluxion,quantum,gang fluence-webhook +``` + +Empty = the default list. The list is the order: `--handlers=gang,fluxion` runs +gang first; omitting a name disables it. Unknown names are warned and dropped. + +(The handler set lives in the WEBHOOK, which mutates pods. `cmd/fluence` is the +scheduler plugin and runs no handlers.) + +## Sidecar interface + +The coordination sidecar is a handler-owned capability, not a core one. Handlers +that need a sidecar use `handlers.Sidecar`: + +```go +type Sidecar interface { + EnsureRBAC(ctx, namespace) + InterceptorOps(pod) []spec.Op + ContainerOps(pod, observe bool) []spec.Op +} +``` + +The default `coreSidecar` delegates to the core's staging primitives. The quantum +handler uses it today; a custom handler can supply its own implementation +(different image, env, gating) without touching the core or other handlers. The +core's `MutatorAPI` keeps the staging primitives only so the default +implementation can delegate — handlers do not call them directly. + +## Group size resolution (the default gang handler) + +`minCount` (the atomic-schedule count) resolves as: + +1. explicit `fluence.flux-framework.org/group-size` annotation — honored verbatim + (the override; e.g. 
a quantum split sets it directly); +2. else the owning indexed Job's `parallelism` (== MiniCluster size N); +3. else 1, logged. + +This is a common default available to every gang; handler-specific annotations +(quantum role, expected-workers, etc.) live in their handlers and are not +required by the core. diff --git a/examples/multi-gang-contention.yaml b/examples/multi-gang-contention.yaml new file mode 100644 index 0000000..bf5c74e --- /dev/null +++ b/examples/multi-gang-contention.yaml @@ -0,0 +1,40 @@ +# Two gangs that cannot both place: fluxion allocates one core per slot, so two +# 2-pod gangs need 4 cores, but the cluster graphs ~3 (3 workers, ~1 core each). One gang places entirely; the loser stays FULLY pending +# (all-or-nothing), never partial. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang-a +spec: + replicas: 2 + selector: {matchLabels: {app: gang-a}} + template: + metadata: + labels: {app: gang-a, fluence.flux-framework.org/group: gang-a} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + containers: + - name: w + image: busybox + command: ["sleep", "3600"] + resources: {requests: {cpu: "100m"}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang-b +spec: + replicas: 2 + selector: {matchLabels: {app: gang-b}} + template: + metadata: + labels: {app: gang-b, fluence.flux-framework.org/group: gang-b} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + containers: + - name: w + image: busybox + command: ["sleep", "3600"] + resources: {requests: {cpu: "100m"}} diff --git a/examples/multi-gang-requeue.yaml b/examples/multi-gang-requeue.yaml new file mode 100644 index 0000000..1bbaf02 --- /dev/null +++ b/examples/multi-gang-requeue.yaml @@ -0,0 +1,48 @@ +# Requeue-on-capacity test for EventsToRegister. +# gang-win: a 2-pod gang that runs a SHORT job and COMPLETES (pods -> Succeeded), +# freeing its nodes. 
Pod completion is the capacity-free event. +# gang-wait: a 2-pod gang needing the same nodes; loses the initial race and sits +# Unschedulable. When gang-win completes, fluence's QueueingHint must +# wake gang-wait so it schedules and runs — with NO manual nudge. +# On a 3-worker (~3-core) cluster the two 2-pod gangs (4 cores) cannot co-run. +apiVersion: batch/v1 +kind: Job +metadata: + name: gang-win +spec: + completions: 2 + parallelism: 2 + completionMode: Indexed + template: + metadata: + labels: {fluence.flux-framework.org/group: gang-win} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","sleep 30"] # completes, frees nodes + resources: {requests: {cpu: "1"}} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: gang-wait +spec: + completions: 2 + parallelism: 2 + completionMode: Indexed + template: + metadata: + labels: {fluence.flux-framework.org/group: gang-wait} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","sleep 10"] + resources: {requests: {cpu: "1"}} \ No newline at end of file diff --git a/examples/multi-gang.yaml b/examples/multi-gang.yaml new file mode 100644 index 0000000..5b9f58e --- /dev/null +++ b/examples/multi-gang.yaml @@ -0,0 +1,29 @@ +# Multi-pod gang via the WEBHOOK path (the path the experiments use): pods carry +# the group LABEL + group-size annotation; the fluence webhook creates the +# PodGroup with minCount = group-size (3). All 3 must place or none. +# The CI cluster has 3 workers; fluxion graphs ~1 core per node, so a 3-pod +# gang needs all three. minCount=3 enforces all-or-none. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang3 +spec: + replicas: 3 + selector: + matchLabels: {app: gang3} + template: + metadata: + labels: + app: gang3 + fluence.flux-framework.org/group: gang3 + annotations: + fluence.flux-framework.org/group-size: "3" + spec: + schedulerName: fluence + containers: + - name: worker + image: busybox + command: ["sleep", "3600"] + resources: + requests: + cpu: "100m" diff --git a/examples/test/e2e/quantum-split-pods.yaml b/examples/test/e2e/quantum-split-pods.yaml new file mode 100644 index 0000000..0fcd296 --- /dev/null +++ b/examples/test/e2e/quantum-split-pods.yaml @@ -0,0 +1,55 @@ +# Heterogeneous quantum gang for the two-group split e2e. Leader requests qpu and +# is role=leader; two workers are role=worker. group-size on the leader makes the +# leader group minCount deterministic (1); the worker group is derived as N-1. +# (Owner-derived N from an indexed Job is exercised by the unit tests; a raw-pod +# mock has no owning Job, so we make the split observable via explicit roles.) 
+apiVersion: v1 +kind: Pod +metadata: + name: qsplit-leader + labels: + app: qsplit + fluence.flux-framework.org/group: qsplit + annotations: + fluence.flux-framework.org/role: leader + fluence.flux-framework.org/group-size: "3" +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh","-c","echo leader; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} +--- +apiVersion: v1 +kind: Pod +metadata: + name: qsplit-worker-0 + labels: {app: qsplit, fluence.flux-framework.org/group: qsplit} + annotations: {fluence.flux-framework.org/role: worker, fluence.flux-framework.org/group-size: "3"} +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","echo worker; sleep 30"] + resources: {requests: {cpu: "100m"}} +--- +apiVersion: v1 +kind: Pod +metadata: + name: qsplit-worker-1 + labels: {app: qsplit, fluence.flux-framework.org/group: qsplit} + annotations: {fluence.flux-framework.org/role: worker, fluence.flux-framework.org/group-size: "3"} +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","echo worker; sleep 30"] + resources: {requests: {cpu: "100m"}} diff --git a/pkg/fluence/enqueue.go b/pkg/fluence/enqueue.go new file mode 100644 index 0000000..f04e574 --- /dev/null +++ b/pkg/fluence/enqueue.go @@ -0,0 +1,86 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +package fluence + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + fwk "k8s.io/kube-scheduler/framework" +) + +// Fluence implements EnqueueExtensions so that a gang rejected as Unschedulable +// (because the node pool was full) is RE-ATTEMPTED when capacity frees up. 
+// +// Without this, a losing gang sits in the unschedulable queue until the +// scheduler's periodic backoff flush — it is NOT woken when another gang +// finishes and releases nodes. For the contention experiment (submit more gang +// demand than the cluster holds, watch gangs drain as others complete) that +// means contended gangs stall instead of draining promptly. The capacity-freeing +// events are: a pod terminating (Succeeded/Failed — batch apps Complete and +// linger before deletion, so Update catches it earlier than Delete), a pod being +// deleted, and node capacity appearing/growing. +var _ fwk.EnqueueExtensions = (*Fluence)(nil) + +// EventsToRegister declares the cluster events that may let a previously +// Unschedulable Fluence gang schedule, each with a QueueingHint that filters out +// events which cannot plausibly free capacity (so we do not churn the queue). +func (f *Fluence) EventsToRegister(_ context.Context) ([]fwk.ClusterEventWithHint, error) { + return []fwk.ClusterEventWithHint{ + // A pod going terminal (Succeeded/Failed) frees its node BEFORE deletion; + // this is the event that actually fires when a batch gang completes. + {Event: fwk.ClusterEvent{Resource: fwk.Pod, ActionType: fwk.Update}, + QueueingHintFn: f.isPodCapacityChange}, + // A pod being deleted frees its node. + {Event: fwk.ClusterEvent{Resource: fwk.Pod, ActionType: fwk.Delete}, + QueueingHintFn: f.isPodCapacityChange}, + // New node, or a node's allocatable growing, adds capacity. + {Event: fwk.ClusterEvent{Resource: fwk.Node, + ActionType: fwk.Add | fwk.UpdateNodeAllocatable}}, + }, nil +} + +// isPodCapacityChange returns Queue when the pod event plausibly frees node +// capacity for a waiting gang — i.e. another pod terminated or was deleted. +// Anything else (a pod being created, an unrelated label change) returns +// QueueSkip so the waiting gang is not retried pointlessly. 
+// +// We do not try to be clever about which specific nodes freed: any capacity +// release can change a Fluxion match, and PreFilter re-matches the whole graph +// on retry. The hint just suppresses the obviously-irrelevant events. +func (f *Fluence) isPodCapacityChange( + logger klog.Logger, _ *corev1.Pod, oldObj, newObj interface{}, +) (fwk.QueueingHint, error) { + // Delete event: newObj is nil, oldObj is the deleted pod -> frees capacity. + if newObj == nil { + if _, ok := oldObj.(*corev1.Pod); ok { + return fwk.Queue, nil + } + return fwk.QueueSkip, nil + } + // Update event: queue only when the pod BECOMES terminal (was running, now + // Succeeded/Failed) — that is the moment its node frees. + newPod, ok := newObj.(*corev1.Pod) + if !ok { + return fwk.QueueSkip, nil + } + if !isTerminalPhase(newPod.Status.Phase) { + return fwk.QueueSkip, nil + } + // If we can see the old object, only fire on the transition INTO terminal + // (avoid re-queuing on every update of an already-finished pod). + if oldPod, ok := oldObj.(*corev1.Pod); ok && isTerminalPhase(oldPod.Status.Phase) { + return fwk.QueueSkip, nil + } + return fwk.Queue, nil +} + +// isTerminalPhase reports whether a pod phase means its node capacity is released. +func isTerminalPhase(p corev1.PodPhase) bool { + return p == corev1.PodSucceeded || p == corev1.PodFailed +} \ No newline at end of file diff --git a/pkg/fluence/enqueue_test.go b/pkg/fluence/enqueue_test.go new file mode 100644 index 0000000..a99c1c5 --- /dev/null +++ b/pkg/fluence/enqueue_test.go @@ -0,0 +1,87 @@ +//go:build cgo + +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Unit tests for the requeue QueueingHint. Tagged cgo because the package links +// the Fluxion matcher; runs in CI (fluence-base) via `make test`. 
+package fluence + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + fwk "k8s.io/kube-scheduler/framework" +) + +func pod(phase corev1.PodPhase) *corev1.Pod { + return &corev1.Pod{Status: corev1.PodStatus{Phase: phase}} +} + +func TestQueueingHint(t *testing.T) { + f := &Fluence{} + lg := klog.Background() + waiting := pod(corev1.PodPending) // the rejected gang pod (unused by the hint) + + cases := []struct { + name string + oldObj interface{} + newObj interface{} + want fwk.QueueingHint + }{ + {"pod deleted frees capacity", + pod(corev1.PodRunning), nil, fwk.Queue}, + {"pod became Succeeded frees capacity", + pod(corev1.PodRunning), pod(corev1.PodSucceeded), fwk.Queue}, + {"pod became Failed frees capacity", + pod(corev1.PodRunning), pod(corev1.PodFailed), fwk.Queue}, + {"already-terminal update does not re-fire", + pod(corev1.PodSucceeded), pod(corev1.PodSucceeded), fwk.QueueSkip}, + {"pod still running is irrelevant", + pod(corev1.PodRunning), pod(corev1.PodRunning), fwk.QueueSkip}, + {"pod created (pending) does not free capacity", + nil, pod(corev1.PodPending), fwk.QueueSkip}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := f.isPodCapacityChange(lg, waiting, c.oldObj, c.newObj) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != c.want { + t.Errorf("hint = %v, want %v", got, c.want) + } + }) + } +} + +// The plugin must advertise the capacity-freeing events. 
+func TestEventsToRegister(t *testing.T) { + f := &Fluence{} + evts, err := f.EventsToRegister(context.Background()) + if err != nil { + t.Fatalf("EventsToRegister error: %v", err) + } + if len(evts) == 0 { + t.Fatal("no events registered — unschedulable gangs would never wake on capacity change") + } + var podUpdate, podDelete, node bool + for _, e := range evts { + switch { + case e.Event.Resource == fwk.Pod && e.Event.ActionType&fwk.Update != 0: + podUpdate = true + case e.Event.Resource == fwk.Pod && e.Event.ActionType&fwk.Delete != 0: + podDelete = true + case e.Event.Resource == fwk.Node: + node = true + } + } + if !podUpdate || !podDelete || !node { + t.Errorf("missing events: podUpdate=%v podDelete=%v node=%v", podUpdate, podDelete, node) + } +} diff --git a/pkg/fluence/fluence.go b/pkg/fluence/fluence.go index a1a10e1..45c72cd 100644 --- a/pkg/fluence/fluence.go +++ b/pkg/fluence/fluence.go @@ -77,12 +77,21 @@ type Fluence struct { mu sync.Mutex // placement maps a group key to its allocation (nodes, backend, jobids). placement map[string]groupAlloc + // excludedNodes maps a group key to the set of node names that have been + // rejected for that group by other scheduler plugins (taints, affinity, + // volume topology that Fluxion's graph does not model). PostFilter adds the + // whole failed allocation's nodes here; PreFilter feeds them back as an RFC 31 + // negated-hostlist constraint so the re-match is forced onto untried nodes. + // The set only grows for a group, guaranteeing the retry converges (finite + // node pool) and is cleared on teardown. Guarded by mu. 
+ excludedNodes map[string]map[string]bool } var ( - _ fwk.PreFilterPlugin = (*Fluence)(nil) - _ fwk.FilterPlugin = (*Fluence)(nil) - _ fwk.PreBindPlugin = (*Fluence)(nil) + _ fwk.PreFilterPlugin = (*Fluence)(nil) + _ fwk.FilterPlugin = (*Fluence)(nil) + _ fwk.PostFilterPlugin = (*Fluence)(nil) + _ fwk.PreBindPlugin = (*Fluence)(nil) ) // New builds the plugin: discover cluster nodes, optionally inject quantum @@ -161,10 +170,11 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error fluxion.Init(tmp.Name(), os.Getenv("FLUENCE_MATCH_POLICY"), "") f := &Fluence{ - handle: h, - matcher: fluxion, - knownDevices: knownDevices, - placement: map[string]groupAlloc{}, + handle: h, + matcher: fluxion, + knownDevices: knownDevices, + placement: map[string]groupAlloc{}, + excludedNodes: map[string]map[string]bool{}, } f.registerCancelHandlers() // Periodic + startup reconcile of completed Fluence-created PodGroups, so a @@ -251,7 +261,15 @@ func (f *Fluence) PreFilter( return nil, fwk.AsStatus(err) } - specs, err := placement.JobspecsForGroup(group, pods, f.knownDevices) + f.mu.Lock() + excluded := make([]string, 0, len(f.excludedNodes[group])) + for n := range f.excludedNodes[group] { + excluded = append(excluded, n) + } + f.mu.Unlock() + sort.Strings(excluded) // deterministic constraint for stable matching/logs + + specs, err := placement.JobspecsForGroup(group, pods, f.knownDevices, excluded) if err != nil { return nil, fwk.AsStatus(err) } @@ -390,6 +408,69 @@ func (f *Fluence) Filter( return fwk.NewStatus(fwk.Unschedulable, "node not in fluxion allocation for this group") } +// PostFilter runs when a pod could not be scheduled after Filter — for a Fluence +// group, this means the cached Fluxion allocation's nodes did not all survive +// the other scheduler plugins' Filter checks (a taint, node affinity, or volume +// topology constraint that Fluxion's resource graph does not model rejected one +// or more of them). 
Without intervention the group would retry forever against +// the same cached allocation while the Fluxion reservation leaked, because +// PreFilter short-circuits on the cache and nothing else releases it on a +// scheduling failure. +// +// We react by abandoning the failed allocation: the ENTIRE cached node set is +// added to the group's exclusion set, the Fluxion jobids are cancelled, and the +// cached placement is deleted. The next PreFilter for the group re-matches with +// an RFC 31 negated-hostlist constraint over the accumulated exclusion set, so +// Fluxion is forced onto untried nodes. We exclude the whole set (not just the +// individually-rejected nodes) deliberately: if the group as a whole could not +// be admitted, a node that happened to survive this round carries no guarantee +// for the next, and excluding the whole set makes each retry a strictly smaller, +// monotonic search that converges — either to a feasible allocation on untried +// nodes, or to a clean no-match (Unschedulable) once the graph is exhausted, at +// which point the pod waits for a cluster-state change rather than busy-looping. +func (f *Fluence) PostFilter( + ctx context.Context, + state fwk.CycleState, + pod *corev1.Pod, + filteredNodeStatusMap fwk.NodeToStatusReader, +) (*fwk.PostFilterResult, *fwk.Status) { + group := groupKey(pod) + + f.mu.Lock() + alloc, ok := f.placement[group] + if !ok { + // No cached allocation for this group — nothing of ours to reconcile. + // (Another plugin's PostFilter, or a non-group pod.) + f.mu.Unlock() + return nil, fwk.NewStatus(fwk.Unschedulable) + } + // Accumulate the whole failed allocation's nodes into the exclusion set. 
+ if f.excludedNodes[group] == nil { + f.excludedNodes[group] = map[string]bool{} + } + for _, n := range alloc.place.Nodes { + f.excludedNodes[group][n] = true + } + excludedCount := len(f.excludedNodes[group]) + jobids := alloc.jobids + delete(f.placement, group) + f.mu.Unlock() + + // Release the Fluxion reservation for the abandoned allocation so the graph + // does not leak it while the group retries. + f.cancelJobids(jobids) + + log.Printf("[fluence] group %s unschedulable: abandoning allocation (nodes %v, "+ + "jobids %v); %d node(s) now excluded, will re-match on next cycle", + group, alloc.place.Nodes, jobids, excludedCount) + + // Returning Unschedulable (no nominated node) lets the pod be requeued; the + // next PreFilter re-matches with the enlarged exclusion set. We do not + // nominate a node — Fluxion, not PostFilter preemption, chooses the next + // placement. + return nil, fwk.NewStatus(fwk.Unschedulable) +} + // PreBindPreFlight runs before PreBind. It returns Success when we have a cached // allocation for the pod's group (so PreBind can record the jobid, and stamp the // backend for a quantum pod), and Skip otherwise. 
@@ -718,6 +799,7 @@ func (f *Fluence) cancelGroup(key string, ann map[string]string) { f.mu.Lock() delete(f.placement, key) + delete(f.excludedNodes, key) // drop accumulated exclusions so a future group reusing the name starts clean f.mu.Unlock() } diff --git a/pkg/fluence/fluence_test.go b/pkg/fluence/fluence_test.go index 998e1a7..6a53b56 100644 --- a/pkg/fluence/fluence_test.go +++ b/pkg/fluence/fluence_test.go @@ -1,6 +1,7 @@ package fluence import ( + "context" "errors" "testing" @@ -12,6 +13,7 @@ import ( schedv1a2 "k8s.io/api/scheduling/v1alpha2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/cache" + fwk "k8s.io/kube-scheduler/framework" ) // fakeMatcher records Cancel calls so cancel behavior can be asserted without @@ -46,7 +48,11 @@ func (m *fakeMatcher) Cancel(jobid uint64) error { } func newTestFluence(m matcher) *Fluence { - return &Fluence{matcher: m, placement: map[string]groupAlloc{}} + return &Fluence{ + matcher: m, + placement: map[string]groupAlloc{}, + excludedNodes: map[string]map[string]bool{}, + } } func ann(jobid string) map[string]string { @@ -345,3 +351,103 @@ func twoSpecs() []*jobspec.Jobspec { {Version: 9999}, } } + +// --- PostFilter allocation reconciliation ----------------------------------- + +// PostFilter must abandon a group's failed allocation: add the WHOLE cached node +// set to the exclusion set, cancel the Fluxion jobids, and delete the cache, so +// the next PreFilter re-matches onto untried nodes. 
+func TestPostFilterAbandonsAndExcludesWholeNodeSet(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{ + place: placement.Placement{Nodes: []string{"node-a", "node-b", "node-c"}}, + jobids: []uint64{11, 12}, + } + pod := groupedPod("default", "training-0", "training", nil) + + _, status := f.PostFilter(context.Background(), nil, pod, nil) + if status == nil || status.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable status, got %v", status) + } + // cache cleared + if _, still := f.placement[key]; still { + t.Fatal("placement cache should be deleted after PostFilter") + } + // jobids cancelled + if len(m.cancelled) != 2 { + t.Fatalf("expected both jobids cancelled, got %v", m.cancelled) + } + // the WHOLE node set excluded + excl := f.excludedNodes[key] + for _, n := range []string{"node-a", "node-b", "node-c"} { + if !excl[n] { + t.Fatalf("expected %s excluded, set=%v", n, excl) + } + } + if len(excl) != 3 { + t.Fatalf("expected exactly 3 excluded nodes, got %v", excl) + } +} + +// Repeated failures accumulate monotonically: a second abandoned allocation adds +// its nodes to the existing exclusion set (the set only grows -> convergence). 
+func TestPostFilterAccumulatesAcrossAttempts(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + pod := groupedPod("default", "training-0", "training", nil) + + // attempt 1 fails on {a,b} + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{1}} + f.PostFilter(context.Background(), nil, pod, nil) + // attempt 2 (re-matched elsewhere) fails on {c,d} + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-c", "node-d"}}, jobids: []uint64{2}} + f.PostFilter(context.Background(), nil, pod, nil) + + excl := f.excludedNodes[key] + for _, n := range []string{"node-a", "node-b", "node-c", "node-d"} { + if !excl[n] { + t.Fatalf("expected %s in accumulated exclusion set, got %v", n, excl) + } + } + if len(excl) != 4 { + t.Fatalf("exclusion set should accumulate to 4, got %v", excl) + } +} + +// PostFilter on a group with no cached allocation (not ours, or already cleared) +// is a safe no-op: no panic, no cancel, returns Unschedulable. +func TestPostFilterUnknownGroupNoop(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + pod := groupedPod("default", "stranger-0", "stranger", nil) + + _, status := f.PostFilter(context.Background(), nil, pod, nil) + if status == nil || status.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable, got %v", status) + } + if len(m.cancelled) != 0 { + t.Fatalf("nothing should be cancelled for an unknown group, got %v", m.cancelled) + } + if len(f.excludedNodes) != 0 { + t.Fatalf("no exclusion set should be created for an unknown group, got %v", f.excludedNodes) + } +} + +// Teardown (cancelGroup) must clear the exclusion set so a future group reusing +// the same key does not inherit stale exclusions. 
+func TestCancelGroupClearsExclusions(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{jobids: []uint64{9}} + f.excludedNodes[key] = map[string]bool{"node-a": true} + + f.cancelGroup(key, ann("9")) + + if _, still := f.excludedNodes[key]; still { + t.Fatal("exclusion set should be cleared on teardown") + } +} diff --git a/pkg/placement/placement.go b/pkg/placement/placement.go index 554f319..c7f76de 100644 --- a/pkg/placement/placement.go +++ b/pkg/placement/placement.go @@ -214,14 +214,36 @@ func withEntries(counts map[string]int) []jobspec.Resource { // allocation (duration 0 runs to graph end) plus an RFC 31 property constraint // selecting the eligible node set. properties is the AND-set of composed // key=value property strings a matched node must carry. -func systemAttributes(properties []string) map[string]interface{} { +func systemAttributes(properties []string, excludeNodes []string) map[string]interface{} { + // Base property constraint (the eligible-node property AND-set). + constraints := map[string]interface{}{ + "properties": properties, + } + // When a group has had a placement rejected by other scheduler plugins + // (taints, affinity, volume topology that Fluxion's graph does not model), + // PostFilter accumulates the rejected hostnames and we AND in an RFC 31 + // negated hostlist so the re-match is forced onto untried nodes. RFC 31 is + // JsonLogic-style ({operator:[values]}, one operator per object), so to AND + // two operators we nest them under an explicit `and`. We only do this when + // there is something to exclude, so the no-exclusion jobspec is byte-for-byte + // what it was before (and existing tests/behavior are unchanged). 
+ if len(excludeNodes) > 0 { + constraints = map[string]interface{}{ + "and": []interface{}{ + map[string]interface{}{"properties": properties}, + map[string]interface{}{ + "not": []interface{}{ + map[string]interface{}{"hostlist": excludeNodes}, + }, + }, + }, + } + } return map[string]interface{}{ "system": map[string]interface{}{ // duration 0 => hold the allocation until we explicitly Cancel. - "duration": 0, - "constraints": map[string]interface{}{ - "properties": properties, - }, + "duration": 0, + "constraints": constraints, }, } } @@ -229,7 +251,7 @@ func systemAttributes(properties []string) map[string]interface{} { // computeJobspec builds the physical-compute jobspec for a group: one slot per // pod holding the compute resources, constrained to virtual=false nodes. This is // the only jobspec for a group that requests no virtual devices. -func computeJobspec(groupName string, slots int, compute map[string]int) *jobspec.Jobspec { +func computeJobspec(groupName string, slots int, compute map[string]int, excludeNodes []string) *jobspec.Jobspec { return &jobspec.Jobspec{ Version: 9999, Resources: []jobspec.Resource{{ @@ -238,7 +260,7 @@ func computeJobspec(groupName string, slots int, compute map[string]int) *jobspe Label: "default", With: withEntries(compute), }}, - Attributes: systemAttributes([]string{VirtualPropertyFalse}), + Attributes: systemAttributes([]string{VirtualPropertyFalse}, excludeNodes), Tasks: []jobspec.Task{{ Command: []string{groupName}, Slot: "default", @@ -272,7 +294,7 @@ func deviceJobspec(groupName, deviceType string, count int, extraProps []string) Label: "device", With: []jobspec.Resource{{Type: "node", Count: count}}, }}, - Attributes: systemAttributes(props), + Attributes: systemAttributes(props, nil), Tasks: []jobspec.Task{{ Command: []string{groupName}, Slot: "device", @@ -299,6 +321,7 @@ func JobspecsForGroup( groupName string, pods []corev1.Pod, knownDevices map[string]bool, + excludeNodes []string, ) ([]*jobspec.Jobspec, 
error) { if len(pods) == 0 { return nil, fmt.Errorf("pod group %q has no pods", groupName) @@ -321,7 +344,7 @@ func JobspecsForGroup( } } - specs := []*jobspec.Jobspec{computeJobspec(groupName, len(pods), compute)} + specs := []*jobspec.Jobspec{computeJobspec(groupName, len(pods), compute, excludeNodes)} // Deterministic device order for stable output. deviceTypes := make([]string, 0, len(devices)) diff --git a/pkg/placement/placement_test.go b/pkg/placement/placement_test.go index 33786c8..fe68917 100644 --- a/pkg/placement/placement_test.go +++ b/pkg/placement/placement_test.go @@ -64,7 +64,7 @@ func TestClassicalSingleMatch(t *testing.T) { podWith("p0", corev1.ResourceList{corev1.ResourceCPU: qty(4), "nvidia.com/gpu": qty(1)}), podWith("p1", corev1.ResourceList{corev1.ResourceCPU: qty(4), "nvidia.com/gpu": qty(1)}), } - specs, err := JobspecsForGroup("grp", pods, nil) + specs, err := JobspecsForGroup("grp", pods, nil, nil) if err != nil { t.Fatal(err) } @@ -101,7 +101,7 @@ func TestGroupDeviceMatchWhenLeaderNotFirst(t *testing.T) { }) // Leader deliberately placed last. pods := []corev1.Pod{worker, worker, leader} - specs, err := JobspecsForGroup("qgrp", pods, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("qgrp", pods, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -132,7 +132,7 @@ func qpuPodWithRequires(name string, requires map[string]string) corev1.Pod { // constraints, nothing extra (over-constraining would break unconstrained runs). func TestNoRequireAnnotationsAddsNoConstraints(t *testing.T) { p := qpuPodWithRequires("q", nil) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -145,7 +145,7 @@ func TestNoRequireAnnotationsAddsNoConstraints(t *testing.T) { // Exactly one require- constraint. 
func TestSingleRequireConstraint(t *testing.T) { p := qpuPodWithRequires("q", map[string]string{"qrmi_type": "braket-gate"}) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -169,7 +169,7 @@ func TestMultipleRequireConstraintsAreDeduped(t *testing.T) { // a worker that happens to repeat one of the same require- annotations worker := qpuPodWithRequires("w0", map[string]string{"vendor": "amazon"}) specs, err := JobspecsForGroup("g", []corev1.Pod{leader, worker}, - map[string]bool{"qpu": true}) + map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -211,7 +211,7 @@ func TestRequireAnnotationConstrainsDevice(t *testing.T) { leader.Annotations[RequireAnnotationPrefix+"vendor"] = "amazon" specs, err := JobspecsForGroup("qgrp", []corev1.Pod{leader}, - map[string]bool{"qpu": true}) + map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -232,7 +232,7 @@ func TestDeviceProducesSecondMatch(t *testing.T) { FluxionResourcePrefix + "qpu": qty(1), }) known := map[string]bool{"qpu": true} - specs, err := JobspecsForGroup("qgrp", []corev1.Pod{p}, known) + specs, err := JobspecsForGroup("qgrp", []corev1.Pod{p}, known, nil) if err != nil { t.Fatal(err) } @@ -274,7 +274,7 @@ func TestDeviceProducesSecondMatch(t *testing.T) { // node), so there are two matches: compute (core=1, virtual=false) and device. func TestDeviceOnlyStillForcesCompute(t *testing.T) { p := podWith("q", corev1.ResourceList{FluxionResourcePrefix + "qpu": qty(1)}) - specs, err := JobspecsForGroup("qonly", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("qonly", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -289,7 +289,7 @@ func TestDeviceOnlyStillForcesCompute(t *testing.T) { // Requesting a device type the graph does not model is a hard error. 
func TestUnknownDeviceErrors(t *testing.T) { p := podWith("q", corev1.ResourceList{FluxionResourcePrefix + "fpga": qty(1)}) - _, err := JobspecsForGroup("grp", []corev1.Pod{p}, map[string]bool{"qpu": true}) + _, err := JobspecsForGroup("grp", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err == nil { t.Fatal("expected an error for an unmodeled device type") } @@ -301,7 +301,7 @@ func TestHoldDurationZero(t *testing.T) { corev1.ResourceCPU: qty(1), FluxionResourcePrefix + "qpu": qty(1), }) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -366,3 +366,76 @@ func TestPlacementUnmarkedNodeIsCompute(t *testing.T) { t.Fatalf("unmarked node should not be a backend, got %q", p.Backend) } } + +// When excludeNodes is non-empty, the compute jobspec's constraint must AND the +// base properties with an RFC 31 negated hostlist, so a re-match avoids the +// rejected nodes. When empty, the constraint must be the plain properties form +// (byte-for-byte the pre-exclusion behavior). 
+func TestExcludeNodesAddsNegatedHostlist(t *testing.T) { + p := podWith("p", corev1.ResourceList{corev1.ResourceCPU: qty(1)}) + + // no exclusion -> plain properties, no `and`/`not` + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, nil, nil) + if err != nil { + t.Fatal(err) + } + cons := computeConstraints(t, specs[0]) + if _, hasAnd := cons["and"]; hasAnd { + t.Fatalf("no-exclusion constraint must not use `and`: %#v", cons) + } + if _, hasProps := cons["properties"]; !hasProps { + t.Fatalf("no-exclusion constraint must have plain properties: %#v", cons) + } + + // with exclusion -> and[ properties, not[ hostlist ] ] + specs, err = JobspecsForGroup("g", []corev1.Pod{p}, nil, []string{"node-b", "node-c"}) + if err != nil { + t.Fatal(err) + } + cons = computeConstraints(t, specs[0]) + andTerms, ok := cons["and"].([]interface{}) + if !ok || len(andTerms) != 2 { + t.Fatalf("exclusion constraint must be `and` of 2 terms: %#v", cons) + } + // find the not/hostlist term + foundHostlist := false + for _, term := range andTerms { + tm, _ := term.(map[string]interface{}) + notTerm, ok := tm["not"].([]interface{}) + if !ok || len(notTerm) == 0 { + continue + } + inner, _ := notTerm[0].(map[string]interface{}) + hl, ok := inner["hostlist"].([]string) + if !ok { + // json round-trip may make it []interface{}; accept both + if hlAny, ok2 := inner["hostlist"].([]interface{}); ok2 { + if len(hlAny) == 2 { + foundHostlist = true + } + } + continue + } + if len(hl) == 2 { + foundHostlist = true + } + } + if !foundHostlist { + t.Fatalf("exclusion constraint must contain not[hostlist[2 nodes]]: %#v", cons) + } +} + +// computeConstraints digs out attributes.system.constraints from the compute +// jobspec (the first spec; device specs do not carry node exclusions). 
+func computeConstraints(t *testing.T, spec *jobspec.Jobspec) map[string]interface{} { + t.Helper() + sys, ok := spec.Attributes["system"].(map[string]interface{}) + if !ok { + t.Fatalf("no system attributes: %#v", spec.Attributes) + } + cons, ok := sys["constraints"].(map[string]interface{}) + if !ok { + t.Fatalf("no constraints: %#v", sys) + } + return cons +} diff --git a/pkg/webhook/handler.go b/pkg/webhook/handler.go index 82a1227..ddf1c84 100644 --- a/pkg/webhook/handler.go +++ b/pkg/webhook/handler.go @@ -25,21 +25,20 @@ type MutatorAPI interface { // InjectedEnv is the FLUXION_* env contract the scheduler/webhook supplies. InjectedEnv() []corev1.EnvVar - // PodGroup operations (gang scheduling). Group identity is the value of the - // group label, which the core treats as an opaque string. - PodGroupLeader(ctx context.Context, namespace, group string) string - EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string) - RecordLeader(ctx context.Context, namespace, group, leaderPod string) - - // EnsureSidecarRBAC provisions the per-namespace ServiceAccount/Role/Binding - // the sidecar needs. - EnsureSidecarRBAC(ctx context.Context, namespace string) + // EnsurePodGroup creates the group's PodGroup with the given gang minCount if + // it does not already exist (idempotent). Group identity is the opaque value + // of the group label. Leader election is NOT here — it is a leader/worker + // concern owned by the handlers that need it (see handlers/leader.go). + EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string, minCount int32) - // InterceptorOps stages the fluence package into the quantum container via an - // init container + shared volume on PYTHONPATH (Model C). SidecarContainerOps - // adds the sidecar container (observe=true => observe-only telemetry mode). + // Sidecar staging primitives. 
These remain on the core because the default + // Sidecar implementation (coreSidecar) delegates to them, but handlers do + // NOT use them directly — they go through the handlers.Sidecar interface, + // which is the customization seam. Kept here (not removed) so the concrete + // *Mutator continues to satisfy both this interface and coreSidecar's needs. + EnsureSidecarRBAC(ctx context.Context, namespace string) InterceptorOps(pod *corev1.Pod) []spec.Op - SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op + SidecarContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op } // Handler inspects a pod and, when it applies, contributes JSON patch ops. A pod @@ -53,6 +52,14 @@ type Handler interface { Mutate(ctx context.Context, m MutatorAPI, pod *corev1.Pod) []spec.Op } +// DefaultHandlerOrder is the active set AND the dispatch order when the operator +// passes no --handlers flag. Order matters: specific handlers run before the +// generic gang fallback, so "gang" is LAST — it applies default gang sizing +// (group-size annotation or owner-derived N) only if no earlier handler already +// shaped the gang. To change the order or disable a handler, pass a different +// list (e.g. --handlers=fluxion,gang drops quantum). +var DefaultHandlerOrder = []string{"fluxion", "quantum", "gang"} + // ── registration ──────────────────────────────────────────────────────────────── // // Handlers self-register via Register() from their package's init(). The core @@ -60,15 +67,57 @@ type Handler interface { // webhook server wiring) is what populates the registry. This keeps the core // domain-agnostic: adding or removing a handler does not touch core code. -var registry []Handler +// available maps a handler's Name() to the handler. Populated by Register() from +// each handler package's init(). This is the set of handlers that EXIST; which +// ones actually run, and in what order, is decided by activeOrder. 
+var available = map[string]Handler{} + +// activeOrder is the ordered list of handler names to dispatch. It is BOTH the +// selection (names not present are disabled) and the order (dispatch follows the +// slice). Defaults to DefaultHandlerOrder; overridden by SetActiveHandlers. +var activeOrder = append([]string(nil), DefaultHandlerOrder...) -// Register adds a handler to the global set. Called from handler packages' -// init(). Order of registration is the order handlers run. +// Register adds a handler to the available set under its Name(). Called from +// handler packages' init(). func Register(h Handler) { - registry = append(registry, h) + available[h.Name()] = h +} + +// SetActiveHandlers sets the active, ordered handler list (the --handlers value). +// Empty/nil restores DefaultHandlerOrder. Names with no registered handler are +// dropped and returned as `unknown` so the caller can warn. Order is preserved +// exactly as given — the list is the dispatch order. +func SetActiveHandlers(names []string) (active, unknown []string) { + if len(names) == 0 { + activeOrder = append([]string(nil), DefaultHandlerOrder...) + return activeOrder, nil + } + var ordered []string + for _, n := range names { + if _, ok := available[n]; ok { + ordered = append(ordered, n) + } else { + unknown = append(unknown, n) + } + } + activeOrder = ordered + return activeOrder, unknown +} + +// ActiveHandlerNames returns the active dispatch order (for logging at startup). +func ActiveHandlerNames() []string { + return append([]string(nil), activeOrder...) } -// registered returns the registered handlers (the live registry). +// registered returns the active handlers, resolved from activeOrder, in order. +// Names in the order with no registered handler are skipped (already warned at +// SetActiveHandlers time). 
func registered() []Handler { - return registry + out := make([]Handler, 0, len(activeOrder)) + for _, n := range activeOrder { + if h, ok := available[n]; ok { + out = append(out, h) + } + } + return out } diff --git a/pkg/webhook/handlers/gang.go b/pkg/webhook/handlers/gang.go index a6c6126..b1db6e3 100644 --- a/pkg/webhook/handlers/gang.go +++ b/pkg/webhook/handlers/gang.go @@ -2,11 +2,14 @@ package handlers import ( "context" + "log" + "strconv" "github.com/converged-computing/fluence/pkg/webhook" "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func init() { @@ -28,15 +31,76 @@ func (h *gangHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *co func (h *gangHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { g := webhook.GroupName(pod) - // First pod admitted in the group creates the PodGroup and is recorded as - // the admission-order leader. All pods are linked to the group. - if m.PodGroupLeader(ctx, pod.Namespace, g) == "" { - m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name) - m.RecordLeader(ctx, pod.Namespace, g, pod.Name) - } + // Ensure the group's PodGroup exists with the resolved gang size, and link + // this pod to it. EnsurePodGroup is idempotent (no-ops if the PodGroup + // already exists — e.g. created by an earlier, more specific handler), so we + // call it unconditionally. The gang handler knows nothing about leaders or + // roles; that is a leader/worker concern handled by the quantum handler. + // minCount = full gang size N (group-size annotation, else owner-derived); + // see resolveMinCount. + m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, resolveMinCount(ctx, m, pod)) return schedulingGroupOps(pod, g) } +// resolveMinCount determines the gang's atomic-schedule size N: +// 1. explicit group-size annotation -> honor it verbatim. 
This is the override +// for when minCount must differ from the parent's replica count (e.g. the +// quantum leader/worker split, where the gang's N is expressed directly). +// 2. otherwise derive from the OWNING object: a Flux Operator MiniCluster pod +// is owned by an indexed Job whose parallelism == completions == size == N. +// (The operator sets Parallelism = Completions = MiniCluster.Spec.Size.) +// 3. otherwise default to 1, logged — never silently size a multi-pod gang to 1. +// +// The leader/worker (quantum) split is orthogonal and unchanged: it is driven by +// RoleAnnotation / QuantumResource in the quantum handler. minCount is always the +// FULL gang N regardless of which pods get gated. +func resolveMinCount(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + // 1. explicit override + if pod.Annotations != nil { + if n := pod.Annotations[webhook.GroupSizeAnnotation]; n != "" { + if v, err := strconv.Atoi(n); err == nil && v > 0 { + return int32(v) + } + } + } + // 2. derive from the owning Job's parallelism + if n := ownerJobN(ctx, m, pod); n > 0 { + return n + } + // 3. no signal: a single-pod gang. Log so a missing size on a multi-pod + // workload is visible rather than a silent gang-of-1. + log.Printf("[fluence-webhook] group %s: no group-size annotation and no owning Job parallelism; defaulting minCount=1", webhook.GroupName(pod)) + return 1 +} + +// ownerJobN returns the parallelism (== size N) of the indexed Job that owns the +// pod, or 0 if there is no such owner. The Flux Operator sets a MiniCluster's +// Job Parallelism == Completions == size, so this is the full gang size N. +// Shared by the gang handler (classical: minCount = N) and the quantum handler +// (split: leader group = 1, worker group = N-1). 
+func ownerJobN(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + c := m.Client() + if c == nil { + return 0 + } + for _, ref := range pod.OwnerReferences { + if ref.Kind != "Job" { + continue + } + job, err := c.BatchV1().Jobs(pod.Namespace).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return 0 + } + if job.Spec.Parallelism != nil && *job.Spec.Parallelism > 0 { + return *job.Spec.Parallelism + } + if job.Spec.Completions != nil && *job.Spec.Completions > 0 { + return *job.Spec.Completions + } + } + return 0 +} + // schedulingGroupOps links a pod to its PodGroup via the native 1.36 field // spec.schedulingGroup.podGroupName. Idempotent if already linked. func schedulingGroupOps(pod *corev1.Pod, group string) []spec.Op { diff --git a/pkg/webhook/handlers/gang_mincount_test.go b/pkg/webhook/handlers/gang_mincount_test.go new file mode 100644 index 0000000..77f7f46 --- /dev/null +++ b/pkg/webhook/handlers/gang_mincount_test.go @@ -0,0 +1,154 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Tests for gang PodGroup minCount: the whole gang (full N) must schedule +// atomically. Regression guard for the bug where every PodGroup was created +// with minCount=1, so a multi-pod gang was "satisfied" by a single pod and the +// rest were stranded (partial placement). +package handlers + +import ( + "context" + "testing" + + "strconv" + + "github.com/converged-computing/fluence/pkg/webhook" + + corev1 "k8s.io/api/core/v1" + + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" +) + +// minCountOf runs the gang handler for the leader pod of a group and returns the +// minCount of the PodGroup the webhook created. 
+func minCountOf(t *testing.T, pod *corev1.Pod) int32 { + t.Helper() + m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()} + m.Mutate(context.Background(), pod) + pg, err := m.Clientset.SchedulingV1alpha2(). + PodGroups(pod.Namespace).Get(context.Background(), webhook.GroupName(pod), metav1.GetOptions{}) + if err != nil { + t.Fatalf("PodGroup not created: %v", err) + } + if pg.Spec.SchedulingPolicy.Gang == nil { + t.Fatal("PodGroup has no gang scheduling policy") + } + return pg.Spec.SchedulingPolicy.Gang.MinCount +} + +// minCountWithClient runs the gang handler with a pre-seeded clientset (so the +// owning Job exists) and returns the created PodGroup's minCount. +func minCountWithClient(t *testing.T, pod *corev1.Pod, objs ...interface{}) int32 { + t.Helper() + cs := fake.NewSimpleClientset(toRuntime(objs)...) + m := &webhook.Mutator{Clientset: cs} + m.Mutate(context.Background(), pod) + pg, err := cs.SchedulingV1alpha2().PodGroups(pod.Namespace). + Get(context.Background(), webhook.GroupName(pod), metav1.GetOptions{}) + if err != nil { + t.Fatalf("PodGroup not created: %v", err) + } + return pg.Spec.SchedulingPolicy.Gang.MinCount +} + +func jobWithParallelism(ns, name string, n int32) *batchv1.Job { + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &n, Completions: &n}, + } +} + +func ownedBy(pod *corev1.Pod, kind, name string) { + pod.OwnerReferences = append(pod.OwnerReferences, + metav1.OwnerReference{Kind: kind, Name: name}) +} + +// No annotation, but the pod is owned by an indexed Job with parallelism N +// (the Flux Operator MiniCluster case: Parallelism == Completions == size == N). +// minCount must come from the Job. 
+func TestGangMinCountDerivedFromOwningJob(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "mc-gang"} + ownedBy(pod, "Job", "mc-gang-job") + got := minCountWithClient(t, pod, jobWithParallelism("default", "mc-gang-job", 4)) + if got != 4 { + t.Errorf("owner-derived: minCount=%d, want 4 (from Job parallelism)", got) + } +} + +// The explicit annotation OVERRIDES the owning Job's parallelism (the override +// exists precisely because minCount may differ from the parent replica count). +func TestGangMinCountAnnotationOverridesOwner(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "ovr-gang"} + pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: "2"} + ownedBy(pod, "Job", "ovr-gang-job") + got := minCountWithClient(t, pod, jobWithParallelism("default", "ovr-gang-job", 8)) + if got != 2 { + t.Errorf("annotation override: minCount=%d, want 2 (annotation wins over Job=8)", got) + } +} + +// A classical gang of size N must get minCount = N so the whole group schedules +// atomically (this is the core multi-gang fix). +func atoi32(s string) int32 { v, _ := strconv.Atoi(s); return int32(v) } + +func toRuntime(objs []interface{}) []runtime.Object { + out := make([]runtime.Object, 0, len(objs)) + for _, o := range objs { + if ro, ok := o.(runtime.Object); ok { + out = append(out, ro) + } + } + return out +} + +func TestGangMinCountEqualsGroupSize(t *testing.T) { + for _, n := range []string{"2", "4", "8"} { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "g-" + n} + pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: n} + got := minCountOf(t, pod) + want := atoi32(n) + if got != want { + t.Errorf("group-size=%s: minCount=%d, want %d", n, got, want) + } + } +} + +// No group-size annotation -> minCount falls back to 1 (single-pod gang). 
+func TestGangMinCountDefaultsToOne(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "g-default"} + if got := minCountOf(t, pod); got != 1 { + t.Errorf("absent group-size: minCount=%d, want 1", got) + } +} + +// Quantum distinction: a gang of full size N=4 that ALSO carries +// expected-workers=3 (the N-1 workers the sidecar ungates) must still get +// minCount=4 (the whole gang), NOT 3. minCount comes from group-size, not +// expected-workers. +func TestGangMinCountHonorsFullNWithQuantumSplit(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "q-gang"} + pod.Annotations = map[string]string{ + webhook.GroupSizeAnnotation: "4", // full N (leader + workers) + webhook.ExpectedWorkersAnnotation: "3", // N-1 workers to ungate + } + if got := minCountOf(t, pod); got != 4 { + t.Errorf("quantum gang: minCount=%d, want 4 (full N, not N-1)", got) + } +} diff --git a/pkg/webhook/handlers/handlers_test.go b/pkg/webhook/handlers/handlers_test.go index 04d0e02..1322ec2 100644 --- a/pkg/webhook/handlers/handlers_test.go +++ b/pkg/webhook/handlers/handlers_test.go @@ -166,7 +166,7 @@ func quantumGroupFixture(ns, group, leaderName string) *fake.Clientset { pg := &schedulingv1alpha2.PodGroup{ ObjectMeta: metav1.ObjectMeta{ Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leaderName}, + Annotations: map[string]string{LeaderAnnotation: leaderName}, }, } leaderPod := qpuPod("fluence") @@ -198,7 +198,7 @@ func TestClassicalGangWorkerNotGated(t *testing.T) { ns, group, leader := "default", "classical", "classical-leader" pg := &schedulingv1alpha2.PodGroup{ ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leader}}, + Annotations: map[string]string{LeaderAnnotation: leader}}, } leaderPod := cpuPod("fluence") leaderPod.Name = leader @@ 
-222,7 +222,7 @@ func TestPodTemplateGangSecondPodIsWorker(t *testing.T) { ns, group, leader := "default", "qaoa", "qaoa-abc123" pg := &schedulingv1alpha2.PodGroup{ ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leader}}, + Annotations: map[string]string{LeaderAnnotation: leader}}, } leaderPod := qpuPod("fluence") leaderPod.Name = leader @@ -294,7 +294,7 @@ func TestExplicitWorkerIsGatedRegardlessOfAdmission(t *testing.T) { worker := roledQPUPod(ns, group, "qaoa-worker-0", RoleWorker) pg := &schedulingv1alpha2.PodGroup{ ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: worker.Name}}, + Annotations: map[string]string{LeaderAnnotation: worker.Name}}, } m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} diff --git a/pkg/webhook/handlers/leader.go b/pkg/webhook/handlers/leader.go new file mode 100644 index 0000000..7408204 --- /dev/null +++ b/pkg/webhook/handlers/leader.go @@ -0,0 +1,65 @@ +package handlers + +import ( + "context" + "fmt" + "time" + + "github.com/converged-computing/fluence/pkg/webhook" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// Leader election is a LEADER/WORKER concern, not a core gang concern, so it +// lives with the handlers that need it (quantum) rather than on the webhook +// core's MutatorAPI. It records/reads the admission-order leader on the group's +// PodGroup via an annotation, used only when a workload does NOT declare an +// explicit role (RoleAnnotation). A purely classical gang never touches this. + +// LeaderAnnotation records the admission-order leader on a PodGroup. +const LeaderAnnotation = "fluence.flux-framework.org/leader" + +// podGroupLeader returns the recorded admission-order leader for the group, or +// "". Retries briefly to absorb the concurrent leader/worker admission race. 
+func podGroupLeader(ctx context.Context, m webhook.MutatorAPI, namespace, group string) string { + c := m.Client() + if c == nil || group == "" { + return "" + } + for i := 0; i < 3; i++ { + pg, err := c.SchedulingV1alpha2().PodGroups(namespace).Get(ctx, group, metav1.GetOptions{}) + if err != nil { + return "" + } + if pg.Annotations != nil && pg.Annotations[LeaderAnnotation] != "" { + return pg.Annotations[LeaderAnnotation] + } + if i < 2 { + time.Sleep(100 * time.Millisecond) + } + } + return "" +} + +// recordLeaderIfUnset records leaderPod as the group's admission-order leader if +// none is set yet. Best-effort; safe to call on every quantum admission. +func recordLeaderIfUnset(ctx context.Context, m webhook.MutatorAPI, namespace, group, leaderPod string) { + c := m.Client() + if c == nil || group == "" { + return + } + if podGroupLeader(ctx, m, namespace, group) != "" { + return + } + patch := fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, LeaderAnnotation, leaderPod) + if _, err := c.SchedulingV1alpha2().PodGroups(namespace).Patch( + ctx, group, types.MergePatchType, []byte(patch), metav1.PatchOptions{}); err != nil { + // best-effort; the explicit RoleAnnotation path does not need this + _ = err + } +} + +// leaderName is a tiny helper so callers read naturally. +func leaderName(pod *corev1.Pod) string { return pod.Name } diff --git a/pkg/webhook/handlers/quantum.go b/pkg/webhook/handlers/quantum.go index 97fbfa6..8a0527b 100644 --- a/pkg/webhook/handlers/quantum.go +++ b/pkg/webhook/handlers/quantum.go @@ -4,6 +4,8 @@ import ( "context" "fmt" "log" + "strconv" + "strings" "github.com/converged-computing/fluence/pkg/webhook" "github.com/converged-computing/fluence/pkg/webhook/spec" @@ -35,6 +37,13 @@ const ( // Role values for webhook.RoleAnnotation. 
RoleLeader = "leader" RoleWorker = "worker" + + // WorkerGroupSuffix: a quantum gang of size N is split into TWO PodGroups — + // the leader keeps (minCount 1) and the workers move to + // -workers (minCount N-1, all gated). This suffix MUST match what the + // sidecar uses to discover workers (FLUENCE_WORKER_GROUP env, set on the + // leader's sidecar by the webhook). + WorkerGroupSuffix = "-workers" ) // quantumHandler coordinates quantum-classical workflows. It applies to a pod @@ -73,7 +82,7 @@ func (h *quantumHandler) isWorkerOfQuantumGroup(ctx context.Context, m webhook.M if g == "" || m.Client() == nil { return false } - leader := m.PodGroupLeader(ctx, pod.Namespace, g) + leader := podGroupLeader(ctx, m, pod.Namespace, g) if leader == "" || leader == pod.Name { return false } @@ -105,15 +114,24 @@ func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod * isWorker = false default: if g != "" { - leader := m.PodGroupLeader(ctx, pod.Namespace, g) + leader := podGroupLeader(ctx, m, pod.Namespace, g) isWorker = leader != "" && leader != pod.Name } } if g != "" && isWorker { - log.Printf("[fluence-webhook] quantum worker %s/%s (role=%q) — gating", - pod.Namespace, pod.Name, role) - ops := gateOps(pod) + // Two-group split: workers live in -workers with minCount = N-1 + // (the leader is the other group, size 1). N is the full gang size from + // the owning Job. The worker is RE-LINKED from to the worker + // group, and the worker PodGroup is created (idempotent) with minCount + // N-1 so the worker set schedules atomically among itself. 
+ wg := g + WorkerGroupSuffix + workerMin := workerCount(ctx, m, pod) // N-1: the worker subgroup schedules atomically among itself + m.EnsurePodGroup(ctx, pod.Namespace, wg, pod.Name, workerMin) + log.Printf("[fluence-webhook] quantum worker %s/%s (role=%q) — group %s minCount=%d, gating", + pod.Namespace, pod.Name, role, wg, workerMin) + ops := relinkGroupOps(pod, wg) // move label + schedulingGroup to -workers + ops = append(ops, gateOps(pod)...) ops = append(ops, roleEnvOps(pod, RoleWorker)...) return ops } @@ -128,11 +146,35 @@ func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod * log.Printf("[fluence-webhook] quantum pod %s/%s — interceptor (leader=%v role=%q observe=%v)", pod.Namespace, pod.Name, isLeader, role, observe) - ops := m.InterceptorOps(pod) + if isLeader { + // Leader is its own group of 1 (the workers are -workers). Create + // the leader PodGroup with minCount=1 so the last-running gang handler + // (which would otherwise parent-derive N) finds it already present and + // leaves it alone. Also record the admission-order leader so a worker + // admitted without an explicit role can resolve its role by membership. + m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, 1) + recordLeaderIfUnset(ctx, m, pod.Namespace, g, leaderName(pod)) + } + sc := sidecarFor(m) + ops := sc.InterceptorOps(pod) ops = append(ops, roleEnvOps(pod, RoleLeader)...) if isLeader || observe { - m.EnsureSidecarRBAC(ctx, pod.Namespace) - ops = append(ops, m.SidecarContainerOps(pod, observe)...) + sc.EnsureRBAC(ctx, pod.Namespace) + // Leader/worker sidecar env is supplied HERE (the quantum handler owns the + // split), keeping the core domain-agnostic. 
FLUENCE_EXPECTED_WORKERS is + // copied verbatim from the expected-workers ANNOTATION: env var values + // cannot be computed-and-patched dynamically at admission, so the workload + // declares the count as an annotation and the webhook propagates it to the + // env var the sidecar reads — annotation and env var are the same value in + // two representations. + var extra []corev1.EnvVar + if isLeader { + if n := spec.Annotation(pod, webhook.ExpectedWorkersAnnotation); n != "" { + extra = append(extra, corev1.EnvVar{Name: "FLUENCE_EXPECTED_WORKERS", Value: n}) + } + extra = append(extra, corev1.EnvVar{Name: "FLUENCE_WORKER_GROUP_BASE", Value: g}) + } + ops = append(ops, sc.ContainerOps(pod, observe, extra)...) } return ops } @@ -167,6 +209,51 @@ func roleEnvOps(pod *corev1.Pod, effectiveRole string) []spec.Op { return ops } +// relinkGroupOps moves a worker pod into the worker group: it overwrites the +// group label and the schedulingGroup.podGroupName link to point at wg +// (-workers). This is what puts the worker into the size-(N-1) PodGroup +// instead of the leader's size-1 group. +func relinkGroupOps(pod *corev1.Pod, wg string) []spec.Op { + var ops []spec.Op + // label (the value the sidecar lists workers by) — escape "/" and "~" per JSON Pointer + labelPath := "/metadata/labels/" + escapeJSONPointer(webhook.GroupLabel) + ops = append(ops, spec.Op{Op: "add", Path: labelPath, Value: wg}) + // the native gang link + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGroup", + Value: map[string]string{"podGroupName": wg}}) + return ops +} + +// escapeJSONPointer escapes "~" and "/" for use in a JSON Pointer path segment. +func escapeJSONPointer(s string) string { + s = strings.ReplaceAll(s, "~", "~0") + s = strings.ReplaceAll(s, "/", "~1") + return s +} + +// workerCount returns N-1, the size of the worker subgroup in a quantum gang of +// full size N (N from the group-size annotation, else the owning Job's +// parallelism). 
Used for the worker PodGroup's gang minCount so the workers +// schedule atomically among themselves. (The sidecar's FLUENCE_EXPECTED_WORKERS +// is a SEPARATE value, copied from the expected-workers annotation — env vars +// cannot be patched dynamically, so the workload declares that count explicitly.) +// Minimum 1. +func workerCount(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + n := int32(0) + if pod.Annotations != nil { + if v, err := strconv.Atoi(pod.Annotations[webhook.GroupSizeAnnotation]); err == nil && v > 0 { + n = int32(v) + } + } + if n == 0 { + n = ownerJobN(ctx, m, pod) + } + if n > 1 { + return n - 1 + } + return 1 +} + // gateOps adds the quantum scheduling gate (idempotent). const QuantumClassicalPriorityClass = "fluence-quantum-classical" diff --git a/pkg/webhook/handlers/quantum_split_test.go b/pkg/webhook/handlers/quantum_split_test.go new file mode 100644 index 0000000..4bf1160 --- /dev/null +++ b/pkg/webhook/handlers/quantum_split_test.go @@ -0,0 +1,117 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Two-group quantum split: a quantum gang of size N becomes a leader PodGroup +// (minCount 1) and a worker PodGroup -workers (minCount N-1). +// minCount is derived from the owning Job's parallelism (N). 
+package handlers + +import ( + "context" + "testing" + + "github.com/converged-computing/fluence/pkg/webhook" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +func qpuLeader(ns, group, name, job string) *corev1.Pod { + p := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, Namespace: ns, + Labels: map[string]string{webhook.GroupLabel: group}, + Annotations: map[string]string{webhook.RoleAnnotation: RoleLeader}, + OwnerReferences: []metav1.OwnerReference{{Kind: "Job", Name: job}}, + }, + Spec: corev1.PodSpec{ + SchedulerName: webhook.SchedulerName, + Containers: []corev1.Container{{Name: "app", Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{QuantumResource: *resource.NewQuantity(1, resource.DecimalSI)}}}}, + }, + } + return p +} + +func qpuWorker(ns, group, name, job string) *corev1.Pod { + p := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, Namespace: ns, + Labels: map[string]string{webhook.GroupLabel: group}, + Annotations: map[string]string{webhook.RoleAnnotation: RoleWorker}, + OwnerReferences: []metav1.OwnerReference{{Kind: "Job", Name: job}}, + }, + Spec: corev1.PodSpec{ + SchedulerName: webhook.SchedulerName, + Containers: []corev1.Container{{Name: "w"}}, + }, + } + return p +} + +func mincount(t *testing.T, cs *fake.Clientset, ns, group string) (int32, bool) { + t.Helper() + pg, err := cs.SchedulingV1alpha2().PodGroups(ns).Get(context.Background(), group, metav1.GetOptions{}) + if err != nil { + return 0, false + } + if pg.Spec.SchedulingPolicy.Gang == nil { + return 0, false + } + return pg.Spec.SchedulingPolicy.Gang.MinCount, true +} + +// Quantum gang of size N=4 owned by a Job(parallelism=4): leader group minCount +// 1, worker group -workers minCount 3. 
+func TestQuantumSplitLeaderOneWorkersNMinus1(t *testing.T) { + ns, group, job := "default", "qg", "qg-job" + par := int32(4) + jobObj := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}, + } + cs := fake.NewSimpleClientset(jobObj) + m := &webhook.Mutator{Clientset: cs} + + // leader admitted first + m.Mutate(context.Background(), qpuLeader(ns, group, "qg-0", job)) + // then a worker + m.Mutate(context.Background(), qpuWorker(ns, group, "qg-1", job)) + + if mc, ok := mincount(t, cs, ns, group); !ok || mc != 1 { + t.Errorf("leader group %q minCount=%d (ok=%v), want 1", group, mc, ok) + } + wg := group + WorkerGroupSuffix + if mc, ok := mincount(t, cs, ns, wg); !ok || mc != 3 { + t.Errorf("worker group %q minCount=%d (ok=%v), want 3 (N-1)", wg, mc, ok) + } +} + +// The worker is relinked into -workers (label + schedulingGroup op). +func TestQuantumWorkerRelinkedToWorkerGroup(t *testing.T) { + ns, group, job := "default", "qg2", "qg2-job" + par := int32(3) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + m.Mutate(context.Background(), qpuLeader(ns, group, "qg2-0", job)) + + ops := m.Mutate(context.Background(), qpuWorker(ns, group, "qg2-1", job)) + wg := group + WorkerGroupSuffix + var relinked bool + for _, op := range ops { + if v, ok := op.Value.(map[string]string); ok && v["podGroupName"] == wg { + relinked = true + } + } + if !relinked { + t.Errorf("worker not relinked to %q (ops: %+v)", wg, ops) + } +} diff --git a/pkg/webhook/handlers/registry_test.go b/pkg/webhook/handlers/registry_test.go new file mode 100644 index 0000000..346d786 --- /dev/null +++ b/pkg/webhook/handlers/registry_test.go @@ -0,0 +1,82 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. 
AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Registry behavior: dispatch order comes from the active handler list (not a +// per-handler Order), and the list both selects and orders handlers. +package handlers + +import ( + "context" + "testing" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + "k8s.io/client-go/kubernetes/fake" +) + +// The default active order ships gang LAST so it only applies default gang +// sizing when no earlier handler shaped the gang. +func TestDefaultOrderGangLast(t *testing.T) { + defer webhook.SetActiveHandlers(nil) + active, _ := webhook.SetActiveHandlers(nil) // restore + read default + if len(active) == 0 { + t.Fatal("no active handlers") + } + if active[len(active)-1] != "gang" { + t.Errorf("gang must be last in default order; got %v", active) + } + // default order is exactly fluxion, quantum, gang + want := []string{"fluxion", "quantum", "gang"} + if len(active) != len(want) { + t.Fatalf("default order = %v, want %v", active, want) + } + for i := range want { + if active[i] != want[i] { + t.Errorf("default order = %v, want %v", active, want) + break + } + } +} + +// The active list IS the order: passing a custom order reorders dispatch, and +// unknown names are reported, not silently kept. +func TestActiveListSetsOrderAndReportsUnknown(t *testing.T) { + defer webhook.SetActiveHandlers(nil) + active, unknown := webhook.SetActiveHandlers([]string{"gang", "fluxion", "bogus"}) + if len(active) != 2 || active[0] != "gang" || active[1] != "fluxion" { + t.Errorf("active = %v, want [gang fluxion] in that order", active) + } + if len(unknown) != 1 || unknown[0] != "bogus" { + t.Errorf("unknown = %v, want [bogus]", unknown) + } +} + +// Dropping a handler from the list disables it: a quantum pod with quantum +// omitted gets no interceptor ops (only fluxion/gang act). 
+func TestOmittedHandlerDoesNotDispatch(t *testing.T) { + defer webhook.SetActiveHandlers(nil) + m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()} + + webhook.SetActiveHandlers(nil) // default: quantum present + if !hasInterceptor(m.Mutate(context.Background(), qpuPod("fluence"))) { + t.Fatal("with quantum active, expected interceptor (init container) ops") + } + + webhook.SetActiveHandlers([]string{"fluxion", "gang"}) // quantum omitted + if hasInterceptor(m.Mutate(context.Background(), qpuPod("fluence"))) { + t.Error("with quantum omitted, interceptor ops must NOT be present") + } +} + +func hasInterceptor(ops []spec.Op) bool { + for _, op := range ops { + if op.Path == "/spec/initContainers" || op.Path == "/spec/initContainers/-" { + return true + } + } + return false +} diff --git a/pkg/webhook/handlers/sidecar.go b/pkg/webhook/handlers/sidecar.go new file mode 100644 index 0000000..19b6569 --- /dev/null +++ b/pkg/webhook/handlers/sidecar.go @@ -0,0 +1,56 @@ +package handlers + +import ( + "context" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + corev1 "k8s.io/api/core/v1" +) + +// Sidecar is the capability a handler uses to attach a coordination sidecar to a +// pod. It is NOT part of the webhook core's MutatorAPI: only handlers that need +// a sidecar (today, quantum) depend on it, and a handler may supply its own +// implementation to customize delivery. The default implementation +// (coreSidecar) delegates to the webhook core's interceptor/sidecar ops, which +// remain the staging mechanism shared by any sidecar-using handler. +// +// This is the seam your design calls for: "a general sidecar interface that can +// be used across handlers and customized by the quantum [handler]". A future +// custom-resource handler can implement Sidecar differently (different image, +// env, gating) without touching the core or other handlers. 
+type Sidecar interface { + // EnsureRBAC provisions the per-namespace ServiceAccount/Role/Binding the + // sidecar needs to read/patch pods and podgroups. + EnsureRBAC(ctx context.Context, namespace string) + // InterceptorOps stages the in-pod interceptor (Model C) into the workload + // containers (init container + shared volume on PYTHONPATH). + InterceptorOps(pod *corev1.Pod) []spec.Op + // ContainerOps adds the sidecar container. observe=true selects observe-only + // telemetry mode (no ungating). extraEnv carries handler-computed, + // domain-specific env (e.g. the quantum handler's FLUENCE_EXPECTED_WORKERS = + // N-1 and FLUENCE_WORKER_GROUP_BASE) so the core never has to know about + // leader/worker concepts — the handler that owns the split owns those values. + ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op +} + +// coreSidecar is the default Sidecar, delegating to the webhook core. It is the +// shared, generic staging path; the quantum handler uses it as-is today and a +// custom handler could wrap or replace it. +type coreSidecar struct{ m webhook.MutatorAPI } + +func (s coreSidecar) EnsureRBAC(ctx context.Context, namespace string) { + s.m.EnsureSidecarRBAC(ctx, namespace) +} +func (s coreSidecar) InterceptorOps(pod *corev1.Pod) []spec.Op { + return s.m.InterceptorOps(pod) +} +func (s coreSidecar) ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { + return s.m.SidecarContainerOps(pod, observe, extraEnv) +} + +// sidecarFor returns the Sidecar a handler should use. Centralized so the choice +// of implementation (and any future per-handler customization) lives in one +// place. Today every sidecar-using handler gets the core-backed default. 
+func sidecarFor(m webhook.MutatorAPI) Sidecar { return coreSidecar{m: m} } diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 20a7288..bce6e8a 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -52,9 +52,6 @@ const ( // meaning to it (a handler decides what a group means). GroupLabel = "fluence.flux-framework.org/group" - // LeaderAnnotation records the admission-order leader on a PodGroup. - LeaderAnnotation = "fluence.flux-framework.org/leader" - // RoleAnnotation, set by the workload on each pod, explicitly declares the // pod's gang role ("leader" or "worker"). When present it is AUTHORITATIVE: // the quantum handler gates workers and gives the leader the sidecar based @@ -71,6 +68,13 @@ const ( // opaque string and ascribes no meaning to it beyond propagation. ExpectedWorkersAnnotation = "fluence.flux-framework.org/expected-workers" + // GroupSizeAnnotation is the FULL gang member count N (leader + workers), + // set by the workload on each pod. It drives the PodGroup gang minCount so the + // whole group schedules atomically. This is distinct from + // ExpectedWorkersAnnotation (N-1: the workers the sidecar ungates; the leader + // is not gated). For a classical gang with no leader/worker split, N = size. + GroupSizeAnnotation = "fluence.flux-framework.org/group-size" + // Sidecar/staging infrastructure (generic — not quantum-specific). SidecarImage = "ghcr.io/converged-computing/fluence-sidecar:latest" SidecarServiceAccount = "fluence-sidecar" @@ -138,29 +142,13 @@ func (m *Mutator) EnvVarNames() []string { return names } -// PodGroupLeader returns the recorded admission-order leader for the group, or -// "". Retries briefly to absorb the concurrent leader/worker admission race. 
-func (m *Mutator) PodGroupLeader(ctx context.Context, namespace, group string) string { - if m.Clientset == nil || group == "" { - return "" - } - for i := 0; i < 3; i++ { - pg, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Get(ctx, group, metav1.GetOptions{}) - if err != nil { - return "" - } - if pg.Annotations != nil && pg.Annotations[LeaderAnnotation] != "" { - return pg.Annotations[LeaderAnnotation] - } - if i < 2 { - time.Sleep(100 * time.Millisecond) - } +// EnsurePodGroup creates a Fluence-owned PodGroup with gang minCount = the full +// gang size N (the whole group schedules atomically) if absent. minCount<=0 +// falls back to 1. +func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string, minCount int32) { + if minCount <= 0 { + minCount = 1 } - return "" -} - -// EnsurePodGroup creates a Fluence-owned PodGroup (minCount:1) if absent. -func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string) { if m.Clientset == nil { return } @@ -179,26 +167,14 @@ func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPo }, Spec: schedulingv1alpha2.PodGroupSpec{ SchedulingPolicy: schedulingv1alpha2.PodGroupSchedulingPolicy{ - Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: 1}, + Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: minCount}, }, }, } if _, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Create(ctx, pg, metav1.CreateOptions{}); err != nil { log.Printf("[fluence-webhook] could not create PodGroup %s/%s: %v", namespace, group, err) } else { - log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=1)", namespace, group) - } -} - -// RecordLeader records leaderPod as the group's admission-order leader. 
-func (m *Mutator) RecordLeader(ctx context.Context, namespace, group, leaderPod string) { - if m.Clientset == nil || group == "" { - return - } - patch := fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, LeaderAnnotation, leaderPod) - if _, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Patch( - ctx, group, types.MergePatchType, []byte(patch), metav1.PatchOptions{}); err != nil { - log.Printf("[fluence-webhook] could not record leader on PodGroup %s/%s: %v", namespace, group, err) + log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=%d)", namespace, group, minCount) } } @@ -312,7 +288,7 @@ func (m *Mutator) InterceptorOps(pod *corev1.Pod) []spec.Op { // SidecarContainerOps adds the fluence-sidecar container and sets its // ServiceAccount. observe=true selects observe-only telemetry mode. -func (m *Mutator) SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op { +func (m *Mutator) SidecarContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { var ops []spec.Op // The sidecar resolves its vendor provider at runtime from the backend the // scheduler chose. It gets the same FLUXION_* contract as the workload @@ -329,17 +305,11 @@ func (m *Mutator) SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op { if observe { env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) } - // The gang size is known at admission (the leader carries it), even though - // the worker NAMES are not yet. Propagate the expected worker count to the - // sidecar as a static env var so it can wait until it has discovered that - // many gated workers before ungating, rather than ungating a partial set. - // Read from a generic annotation so the core stays domain-agnostic; the - // workload manifest sets it (e.g. from its own N_WORKERS). 
- if pod.Annotations != nil { - if n := pod.Annotations[ExpectedWorkersAnnotation]; n != "" { - env = append(env, corev1.EnvVar{Name: "FLUENCE_EXPECTED_WORKERS", Value: n}) - } - } + // Handler-supplied, domain-specific env (e.g. quantum's FLUENCE_EXPECTED_WORKERS + // and FLUENCE_WORKER_GROUP_BASE). The core does not know what these mean; the + // handler that owns the gang shape computes and passes them. Appended before + // the credential copy so workload creds still win on name collisions below. + env = append(env, extraEnv...) // The sidecar talks to the same backend the workload does (e.g. to find the // task and read its queue position), so it needs the same credentials. Copy // the workload container's secret/configmap-sourced env onto the sidecar. diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 26983d4..dd32ac6 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -40,7 +40,7 @@ func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) { }}, }, } - ops := m.SidecarContainerOps(pod, false) + ops := m.SidecarContainerOps(pod, false, nil) var sidecar *corev1.Container for _, op := range ops { if c, ok := op.Value.(corev1.Container); ok && c.Name == "fluence-sidecar" { diff --git a/python/fluence/sidecar.py b/python/fluence/sidecar.py index 098574b..9e1184b 100644 --- a/python/fluence/sidecar.py +++ b/python/fluence/sidecar.py @@ -29,6 +29,12 @@ from fluence.providers.base import log from fluence.ungate import ungate_pods, gated_pods_from_env, namespace_from_env, wait_for_gated_pods +# MUST match handlers.WorkerGroupSuffix in the Go webhook. A quantum gang of size +# N is split into the leader group (size 1) and the worker group +# -workers (size N-1, all gated). The sidecar runs in the leader and +# discovers/ungates workers in the WORKER group, not the leader's group. 
+WORKER_GROUP_SUFFIX = "-workers" + def _poll(provider, task, poll_interval, ungate): mode = "gang" if ungate else "observe-only" @@ -52,6 +58,13 @@ def main(): pod_uid = os.environ.get("FLUENCE_POD_UID", "") pod_name = os.environ.get("FLUENCE_POD_NAME", "") group = os.environ.get("FLUENCE_GROUP", "") + # Two-group quantum split: the leader (where this sidecar runs) is in + # ; the gated workers were moved to -workers by the webhook. + # WORKER_GROUP_SUFFIX MUST match handlers.WorkerGroupSuffix in the Go webhook + # (pkg/webhook/handlers/quantum.go). The webhook also passes the base group + # via FLUENCE_WORKER_GROUP_BASE; prefer it, fall back to FLUENCE_GROUP. + worker_group_base = os.environ.get("FLUENCE_WORKER_GROUP_BASE", group) + worker_group = worker_group_base + WORKER_GROUP_SUFFIX if worker_group_base else "" backend = os.environ.get("FLUXION_BACKEND", "") observe = os.environ.get("FLUENCE_OBSERVE", "").lower() == "true" discovery_timeout = int(os.environ.get("FLUENCE_TASK_DISCOVERY_TIMEOUT", 300)) @@ -63,7 +76,7 @@ def main(): log("starting fluence quantum sidecar") log(f" pod_uid={pod_uid} namespace={namespace} group={group} " - f"backend={backend} observe={observe} expected_workers={expected_workers}") + f"backend={backend} observe={observe} expected_workers={expected_workers} worker_group={worker_group}") provider = resolve_from_env() if provider is None: @@ -75,7 +88,7 @@ def main(): if task is None: log("ERROR: could not discover quantum task") if not observe: - ungate_pods(wait_for_gated_pods(namespace, group, expected_workers, + ungate_pods(wait_for_gated_pods(namespace, worker_group, expected_workers, exclude=pod_name, timeout=ungate_timeout), "", namespace) sys.exit(1) @@ -93,7 +106,7 @@ def main(): # together), then ungate them. expected_workers is N-1, propagated by the # webhook from the leader at admission; if unset we ungate whatever is found. 
gated_pods = gated_pods_from_env() or wait_for_gated_pods( - namespace, group, expected_workers, exclude=pod_name, + namespace, worker_group, expected_workers, exclude=pod_name, timeout=ungate_timeout) log(f"ungating {len(gated_pods)} worker(s): {gated_pods}") n_ok = ungate_pods(gated_pods, job_id, namespace) diff --git a/test/e2e/01-classical-gang.sh b/test/e2e/01-classical-gang.sh index d2018ac..ffe8fa8 100644 --- a/test/e2e/01-classical-gang.sh +++ b/test/e2e/01-classical-gang.sh @@ -27,3 +27,7 @@ count="$(kubectl get pods -l app=training --no-headers | wc -l | tr -d ' ')" log "PASS: classical gang placed all $count pods via fluence" kubectl delete -f examples/single-podgroup.yaml --wait=false || true kubectl patch podgroup training --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +# Wait for the pods to actually be gone before the next test runs — otherwise a +# terminating 'training' pod (same name/labels reused by other scenarios) can be +# misread as the next test's placement. +kubectl wait --for=delete pod -l app=training --timeout=60s 2>/dev/null || true diff --git a/test/e2e/05-postfilter-rematch.sh b/test/e2e/05-postfilter-rematch.sh new file mode 100644 index 0000000..1712ea9 --- /dev/null +++ b/test/e2e/05-postfilter-rematch.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# PostFilter re-match: when another scheduler plugin (TaintToleration) rejects a +# node Fluxion allocated, Fluence must abandon that allocation, exclude the node, +# and re-match onto an untainted node. Safety: the gang's RUNNING pod must NEVER +# bind to the tainted node. +# +# This test is self-isolating: it uses its own workload name (pf-rematch) and +# labels, distinct from the other e2e scenarios, and ensures a clean slate first, +# so a pod left over (terminating) from a previous test can never be mistaken for +# this test's placement. It also ignores terminating pods when asserting. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE}/lib.sh" + +NAME=pf-rematch +SEL="app=${NAME}" + +log "TEST 5: PostFilter abandons a taint-rejected allocation and re-matches" + +# --- clean slate: no leftover pods from earlier tests under our name ---------- +kubectl delete deployment "$NAME" --ignore-not-found >/dev/null 2>&1 || true +kubectl delete podgroup "$NAME" --ignore-not-found >/dev/null 2>&1 || true +kubectl patch podgroup "$NAME" --type=merge \ + -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1 || true +kubectl wait --for=delete pod -l "$SEL" --timeout=60s >/dev/null 2>&1 || true + +TAINTED="$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' \ + -o jsonpath='{.items[0].metadata.name}')" +[ -n "$TAINTED" ] || fail "no worker node found to taint" +log "tainting node $TAINTED with fluence-e2e=blocked:NoSchedule" +kubectl taint nodes "$TAINTED" fluence-e2e=blocked:NoSchedule --overwrite + +cleanup() { + kubectl taint nodes "$TAINTED" fluence-e2e- 2>/dev/null || true + kubectl delete deployment "$NAME" --ignore-not-found --wait=false 2>/dev/null || true + kubectl delete podgroup "$NAME" --ignore-not-found --wait=false 2>/dev/null || true + kubectl patch podgroup "$NAME" --type=merge \ + -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +} +trap cleanup EXIT + +# --- our own workload (distinct name/labels; does NOT tolerate the taint) ------ +kubectl apply -f - <" for empty fields, so an empty deletionTimestamp + # shows as "", NOT "". Treat "" as empty for both columns. 
+ if [ "$deleted" != "<none>" ] && [ -n "$deleted" ]; then continue; fi # skip terminating + if [ "$node" = "<none>" ] || [ -z "$node" ]; then continue; fi # skip not-yet-bound + checked=$((checked+1)) + if [ "$node" = "$TAINTED" ]; then + fail "SAFETY VIOLATION: running pod $name is bound to the tainted node $TAINTED" + fi + log "$name correctly placed on $node (not the tainted $TAINTED)" +done < <(kubectl get pods -l "$SEL" \ + -o custom-columns='N:.metadata.name,NODE:.spec.nodeName,DEL:.metadata.deletionTimestamp' \ + --no-headers) + +[ "$checked" -ge 1 ] || fail "no running ${NAME} pod found to check" + +# Informational: did PostFilter actually fire (Fluxion picked the tainted node +# first and we re-matched), or did Fluxion place on the good node directly? +POD="$(kubectl -n kube-system get pods -l app=fluence \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)" +if [ -n "$POD" ] && kubectl -n kube-system logs "$POD" 2>/dev/null \ + | grep -q "unschedulable: abandoning allocation"; then + log "observed PostFilter abandonment in scheduler log (re-match path exercised)" +else + log "note: Fluxion placed on the untainted node directly this run (PostFilter not needed)" +fi + +log "PASS: gang scheduled on an untainted node; no running pod on the tainted node" diff --git a/test/e2e/06-multi-gang.sh b/test/e2e/06-multi-gang.sh new file mode 100755 index 0000000..2bd6a4c --- /dev/null +++ b/test/e2e/06-multi-gang.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Multi-pod gang scheduling on real nodes. Guards the two failures that the +# single-pod 01 test could NOT catch (and that shipped a minCount=1 bug): +# A) a 3-pod gang must place ALL 3 (minCount must equal the gang size, not 1) +# B) under contention, a gang that cannot fully fit stays ENTIRELY pending — +# never partially placed (no stranded pods holding nodes). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE}/lib.sh" + +# ---- A) all-or-nothing placement of a 3-pod gang ------------------------------- +log "TEST 6A: multi-pod gang (3) places all-or-nothing" +kubectl apply -f examples/multi-gang.yaml + +# the webhook must have created the PodGroup with minCount = 3 (the bug set it to 1) +log "checking PodGroup minCount == 3 (set by webhook from group-size)" +for i in $(seq 1 30); do + mc="$(kubectl get podgroup gang3 -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$mc" ] && break; sleep 2 +done +[ "$mc" = "3" ] || fail "PodGroup gang3 minCount=$mc, want 3 (minCount=1 bug -> partial gangs)" + +log "waiting for all 3 gang pods to be Ready" +wait_pods_ready "app=gang3" 3 180 || fail "gang3 did not place all 3 pods (gang scheduling failed)" + +count="$(kubectl get pods -l app=gang3 --field-selector=status.phase=Running --no-headers | wc -l | tr -d ' ')" +[ "$count" = "3" ] || fail "expected 3 Running gang3 pods, got $count (partial placement)" +for p in $(kubectl get pods -l app=gang3 -o name); do + pod="${p#pod/}" + sched="$(kubectl get pod "$pod" -o jsonpath='{.spec.schedulerName}')" + [ "$sched" = "fluence" ] || fail "$pod not scheduled by fluence (got: $sched)" +done +log "PASS 6A: 3-pod gang placed atomically by fluence (minCount=3)" + +kubectl delete -f examples/multi-gang.yaml --wait=false || true +kubectl patch podgroup gang3 --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +kubectl wait --for=delete pod -l app=gang3 --timeout=60s 2>/dev/null || true + +# ---- B) contention: the gang that can't fully fit stays ENTIRELY pending -------- +log "TEST 6B: contention — a gang that cannot fully fit must NOT partially place" +kubectl apply -f examples/multi-gang-contention.yaml + +# wait until the cluster settles. 
Three possible outcomes: +# - one gang fully Running, other fully Pending -> contention; assert no partial +# - BOTH fully Running -> runner big enough, no contention to test (skip) +# - any partial (1 of 2 in a gang scheduled) -> the bug, fail +log "waiting for gangs to settle" +winner=""; loser=""; both="" +for i in $(seq 1 90); do + ra="$(kubectl get pods -l app=gang-a --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + rb="$(kubectl get pods -l app=gang-b --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + if [ "$ra" = "2" ] && [ "$rb" = "2" ]; then both=1; break; fi + if [ "$ra" = "2" ] && [ "$rb" = "0" ]; then winner=gang-a; loser=gang-b; break; fi + if [ "$rb" = "2" ] && [ "$ra" = "0" ]; then winner=gang-b; loser=gang-a; break; fi + sleep 2 +done + +if [ -n "$both" ]; then + log "SKIP 6B: cluster placed both gangs (>=4 schedulable cores) — no contention on this runner" +else + [ -n "$winner" ] || fail "no clean settle: gang-a=$ra gang-b=$rb running (possible PARTIAL placement)" + log "winner=$winner (2 running), loser=$loser (expected 0 running)" + # the loser must have ZERO pods scheduled to a node — the all-or-nothing guarantee. + # A single scheduled loser pod = partial placement = the bug. + scheduled_loser="$(kubectl get pods -l app=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . 
|| true)" +[ "$scheduled_loser" = "0" ] || fail "$loser has $scheduled_loser pod(s) on a node — PARTIAL placement (gang violated)" + log "PASS 6B: $loser stayed entirely pending — no partial placement under contention" +fi + +kubectl delete -f examples/multi-gang-contention.yaml --wait=false || true +for g in gang-a gang-b; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app=gang-a --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l app=gang-b --timeout=60s 2>/dev/null || true +log "PASS: multi-gang all-or-nothing verified" diff --git a/test/e2e/07-quantum-split.sh b/test/e2e/07-quantum-split.sh new file mode 100755 index 0000000..3899f57 --- /dev/null +++ b/test/e2e/07-quantum-split.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Two-group quantum split: a quantum gang of size N is split into a LEADER +# PodGroup <group> (minCount 1) and a WORKER PodGroup <group>-workers +# (minCount N-1). Workers are relinked into the worker group and gated. This is +# the structural guarantee that, combined with the sidecar ungating the worker +# group, makes quantum gangs work. (The runtime ungate is covered by 04; here we +# prove the group SPLIT the ungate path depends on.) +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE}/lib.sh" + +log "TEST 7: quantum two-group split (leader=1, workers=N-1)" +kubectl apply -f examples/test/e2e/quantum-split-pods.yaml + +# leader PodGroup must exist with minCount 1 +log "checking leader group 'qsplit' minCount == 1" +for i in $(seq 1 30); do + lc="$(kubectl get podgroup qsplit -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$lc" ] && break; sleep 2 +done +[ "$lc" = "1" ] || fail "leader group qsplit minCount=$lc, want 1" + +# worker PodGroup -workers must exist with minCount N-1 = 2 +log "checking worker group 'qsplit-workers' minCount == 2 (N-1)" +for i in $(seq 1 30); do + wc="$(kubectl get podgroup qsplit-workers -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$wc" ] && break; sleep 2 +done +[ "$wc" = "2" ] || fail "worker group qsplit-workers minCount=$wc, want 2 (N-1); the split did not happen" + +# workers must be RELINKED into the worker group (label rewritten by webhook) +log "checking workers were relinked into qsplit-workers" +for w in qsplit-worker-0 qsplit-worker-1; do + g="$(kubectl get pod "$w" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" + [ "$g" = "qsplit-workers" ] || fail "$w group label=$g, want qsplit-workers (relink failed)" +done + +# workers must be GATED (scheduling gate held until leader's task is ready) +log "checking workers carry the quantum scheduling gate" +for w in qsplit-worker-0 qsplit-worker-1; do + gate="$(kubectl get pod "$w" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" + [ "$gate" = "quantum.braket/ready" ] || fail "$w not gated (gate=$gate)" +done + +# leader's sidecar must know where to find workers: FLUENCE_WORKER_GROUP_BASE set +log "checking leader sidecar has the worker-group env" +base="$(kubectl get pod qsplit-leader -o jsonpath='{range .spec.containers[*]}{range .env[*]}{.name}={.value}{"\n"}{end}{end}' 2>/dev/null | grep FLUENCE_WORKER_GROUP_BASE || true)" 
+[ -n "$base" ] || fail "leader sidecar missing FLUENCE_WORKER_GROUP_BASE (sidecar would look in the wrong group and never ungate)" + +log "PASS 7: quantum gang split into leader(1) + workers(N-1), relinked + gated" +kubectl delete -f examples/test/e2e/quantum-split-pods.yaml --wait=false || true +for g in qsplit qsplit-workers; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app=qsplit --timeout=60s 2>/dev/null || true diff --git a/test/e2e/08-requeue-on-capacity.sh b/test/e2e/08-requeue-on-capacity.sh new file mode 100644 index 0000000..f67421b --- /dev/null +++ b/test/e2e/08-requeue-on-capacity.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Requeue-on-capacity: a contended gang that loses the initial race must be +# RE-ATTEMPTED when the winner completes and frees nodes — driven by fluence's +# EventsToRegister, with no manual nudge. Guards the gap where Unschedulable +# gangs only woke on the backoff timer (so contended gangs stalled instead of +# draining as capacity freed). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" + +log "TEST 9: contended gang is requeued when capacity frees (EventsToRegister)" +kubectl apply -f examples/multi-gang-requeue.yaml + +# Both gangs want the same nodes; only one fits. Identify winner/loser. 
+log "waiting for one gang to win and the other to be Unschedulable" +winner=""; loser="" +for i in $(seq 1 60); do + rw="$(kubectl get pods -l job-name=gang-win --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + ra="$(kubectl get pods -l job-name=gang-wait --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + if [ "$rw" = "2" ] && [ "$ra" = "0" ]; then winner=gang-win; loser=gang-wait; break; fi + if [ "$ra" = "2" ] && [ "$rw" = "0" ]; then winner=gang-wait; loser=gang-win; break; fi + sleep 2 +done +[ -n "$winner" ] || fail "no gang won a clean 2/0 placement (check capacity/contention)" +log "winner=$winner running; loser=$loser should be Unschedulable" + +# the loser's PodGroup should be Unschedulable (entirely pending — all-or-nothing) +for i in $(seq 1 15); do + st="$(kubectl get podgroup "$loser" -o jsonpath='{.status.conditions[*].type}{" "}{.status}' 2>/dev/null || true)" + echo "$st" | grep -qi "unschedulable\|pending" && break + # status field name varies; also accept: zero loser pods scheduled + sched="$(kubectl get pods -l job-name=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . || true)" + [ "$sched" = "0" ] && break + sleep 2 +done +sched_loser="$(kubectl get pods -l job-name=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . || true)" +[ "$sched_loser" = "0" ] || fail "$loser partially placed ($sched_loser pods on nodes) — gang violated" +log " $loser is entirely pending (no pods placed) — correct" + +# THE KEY ASSERTION: when the winner COMPLETES (frees nodes), the loser must be +# requeued and run — WITHOUT us touching it. The winner sleeps ~30s. 
+log "waiting for winner=$winner to complete and free capacity" +kubectl wait --for=condition=complete job/$winner --timeout=120s \ + || fail "$winner did not complete" +log " $winner completed; capacity freed" + +log "asserting $loser is now requeued and runs (EventsToRegister woke it)" +ok="" +for i in $(seq 1 60); do # up to ~120s; must be well under the 5-min backoff flush + rl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + done_l="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + if [ "$((rl + done_l))" -ge 1 ]; then ok=1; break; fi + sleep 2 +done +[ -n "$ok" ] || fail "$loser was NOT requeued after capacity freed within 120s — \ +EventsToRegister not waking unschedulable gangs (would only recover on the 5-min backoff flush)" +log "PASS 9: $loser was requeued and scheduled after $winner freed capacity" + +kubectl delete -f examples/multi-gang-requeue.yaml --wait=false || true +for g in gang-win gang-wait; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l job-name=gang-win --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l job-name=gang-wait --timeout=60s 2>/dev/null || true \ No newline at end of file From af34af62fe1cc812d272a5724cf6f40a3071b2e0 Mon Sep 17 00:00:00 2001 From: vsoch Date: Fri, 26 Jun 2026 21:05:37 -0700 Subject: [PATCH 2/3] reorg: try to separate tests into two Signed-off-by: vsoch --- .github/workflows/e2e-suite.yaml | 109 ++++++++++++ .github/workflows/e2e-tests.yaml | 157 ++---------------- Makefile | 1 - deploy/fluence-pull-test.yaml | 9 +- deploy/fluence-test.yaml | 9 +- deploy/kind-config.yaml | 3 +- .../e2e/gang}/multi-gang-contention.yaml | 4 +- .../e2e/gang}/multi-gang-requeue.yaml | 8 +- examples/{ => test/e2e/gang}/multi-gang.yaml | 12 +- .../{ => 
test/e2e/gang}/single-podgroup.yaml | 0 .../e2e/{ => quantum}/quantum-pod-mock.yaml | 0 .../e2e/{ => quantum}/quantum-split-pods.yaml | 9 +- .../e2e/{ => quantum}/sidecar-mock-pods.yaml | 3 +- pkg/fluence/enqueue.go | 86 ---------- pkg/fluence/enqueue_test.go | 87 ---------- pkg/fluence/fluence.go | 139 ++++++++++++---- pkg/fluence/fluence_test.go | 156 ++++++++++++++--- test/e2e/03-restart-recovery.sh | 8 +- test/e2e/08-requeue-on-capacity.sh | 63 ------- test/e2e/{ => gang}/01-classical-gang.sh | 4 +- .../02-postfilter-rematch.sh} | 2 +- .../03-multi-gang.sh} | 10 +- test/e2e/gang/04-requeue-on-capacity.sh | 87 ++++++++++ .../01-quantum-placement.sh} | 6 +- .../02-sidecar-ungate.sh} | 6 +- .../03-quantum-split.sh} | 6 +- test/e2e/quantum/04-gang-env-contract.sh | 61 +++++++ 27 files changed, 554 insertions(+), 491 deletions(-) create mode 100644 .github/workflows/e2e-suite.yaml rename examples/{ => test/e2e/gang}/multi-gang-contention.yaml (92%) rename examples/{ => test/e2e/gang}/multi-gang-requeue.yaml (81%) rename examples/{ => test/e2e/gang}/multi-gang.yaml (55%) rename examples/{ => test/e2e/gang}/single-podgroup.yaml (100%) rename examples/test/e2e/{ => quantum}/quantum-pod-mock.yaml (100%) rename examples/test/e2e/{ => quantum}/quantum-split-pods.yaml (84%) rename examples/test/e2e/{ => quantum}/sidecar-mock-pods.yaml (97%) delete mode 100644 pkg/fluence/enqueue.go delete mode 100644 pkg/fluence/enqueue_test.go delete mode 100644 test/e2e/08-requeue-on-capacity.sh rename test/e2e/{ => gang}/01-classical-gang.sh (92%) mode change 100644 => 100755 rename test/e2e/{05-postfilter-rematch.sh => gang/02-postfilter-rematch.sh} (98%) mode change 100644 => 100755 rename test/e2e/{06-multi-gang.sh => gang/03-multi-gang.sh} (91%) create mode 100755 test/e2e/gang/04-requeue-on-capacity.sh rename test/e2e/{02-quantum-placement.sh => quantum/01-quantum-placement.sh} (81%) mode change 100644 => 100755 rename test/e2e/{04-sidecar-ungate.sh => quantum/02-sidecar-ungate.sh} 
(95%) mode change 100644 => 100755 rename test/e2e/{07-quantum-split.sh => quantum/03-quantum-split.sh} (92%) create mode 100755 test/e2e/quantum/04-gang-env-contract.sh diff --git a/.github/workflows/e2e-suite.yaml b/.github/workflows/e2e-suite.yaml new file mode 100644 index 0000000..dce5b8a --- /dev/null +++ b/.github/workflows/e2e-suite.yaml @@ -0,0 +1,109 @@ +# Reusable e2e workflow (workflow_call): shared setup (build image, kind, deploy +# fluence base), then run ONE test suite — a directory under test/e2e/. The +# suite's tests are DISCOVERED (every NN-*.sh, run in sorted order); adding a test +# is just dropping a file in the directory, no workflow edit. If the suite needs +# special preparation it provides a setup.sh in its directory, which is run before +# the tests (the gang suite has none; the quantum suite installs the qpu add-on). +name: e2e-suite +on: + workflow_call: + inputs: + suite: + description: "test suite directory name under test/e2e/ (e.g. gang, quantum)" + required: true + type: string + +env: + IMAGE: vanessa/fluence:test + +jobs: + run: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build fluence image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + push: false + load: true + tags: ${{ env.IMAGE }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + version: v0.32.0 # required for gang + node_image: kindest/node:v1.36.1 + config: ./deploy/kind-config.yaml + + - name: Free Disk Space (Ubuntu) + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \ + /opt/hostedtoolcache/CodeQL + sudo apt-get clean + df -h + + - name: Load docker images + run: | + cluster=$(kind get clusters) + kind load --name "$cluster" docker-image ${{ env.IMAGE }} + + - name: Deploy fluence (base) + run: | + kubectl apply -f deploy/fluence-test.yaml + kubectl rollout status -n kube-system deployment/fluence --timeout=180s + POD="" + for i in $(seq 1 60); do + POD=$(kubectl -n kube-system get pods -l app=fluence \ + -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) + [ -n "$POD" ] && break + sleep 2 + done + [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } + echo "Using pod: $POD" + sleep 5 + kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" || true + kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' + + # Per-suite special setup, if the suite directory provides one. + - name: Suite setup (${{ inputs.suite }}) + run: | + s="test/e2e/${{ inputs.suite }}/setup.sh" + if [ -f "$s" ]; then + echo "running $s" + bash "$s" + else + echo "no setup.sh for suite '${{ inputs.suite }}' — skipping" + fi + + # Discover and run every NN-*.sh in the suite directory, in sorted order. 
+ - name: Run suite (${{ inputs.suite }}) + run: | + dir="test/e2e/${{ inputs.suite }}" + [ -d "$dir" ] || { echo "ERROR: no such suite dir: $dir"; exit 1; } + shopt -s nullglob + tests=("$dir"/[0-9]*.sh) + [ ${#tests[@]} -gt 0 ] || { echo "ERROR: no NN-*.sh tests in $dir"; exit 1; } + IFS=$'\n' tests=($(sort <<<"${tests[*]}")); unset IFS + echo "discovered ${#tests[@]} test(s) in $dir:" + printf ' %s\n' "${tests[@]}" + for t in "${tests[@]}"; do + echo "::group::$t" + bash "$t" + echo "::endgroup::" + done + + - name: Dump diagnostics on failure + if: failure() + run: | + kubectl get pods -A -o wide + kubectl logs -n kube-system deployment/fluence || true + kubectl logs -n kube-system deployment/fluence-webhook || true \ No newline at end of file diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml index f27fa92..4b405f6 100644 --- a/.github/workflows/e2e-tests.yaml +++ b/.github/workflows/e2e-tests.yaml @@ -8,152 +8,15 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -env: - KIND_VERSION: v0.32.0 - IMAGE: vanessa/fluence:test - jobs: + # Fan out the suites as parallel jobs, each a call into the reusable workflow. + # The shared setup (build, kind, deploy) lives once in e2e-suite.yaml; the + # matrix runs gang and quantum concurrently. e2e: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build fluence image - uses: docker/build-push-action@v6 - with: - context: . 
- file: ./Dockerfile - push: false - load: true - tags: ${{ env.IMAGE }} - cache-from: type=gha - cache-to: type=gha,mode=max - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - version: v0.32.0 # required for gang - node_image: kindest/node:v1.36.1 - config: ./deploy/kind-config.yaml - - - name: Free Disk Space (Ubuntu) - run: | - echo "=== Disk space before cleanup ===" - df -h - - # Remove large software runtimes and tools - sudo rm -rf /usr/share/dotnet - sudo rm -rf /usr/local/lib/android - sudo rm -rf /opt/ghc - sudo rm -rf /opt/hostedtoolcache/CodeQL - - # Clean package caches - sudo apt-get clean - echo "=== Disk space after cleanup ===" - df -h - - - name: Load docker images - run: | - kind get clusters - cluster=$(kind get clusters) - kind load --name $cluster docker-image vanessa/fluence:test - - - name: Deploy fluence (base) - run: | - kubectl apply -f deploy/fluence-test.yaml - kubectl rollout status -n kube-system deployment/fluence --timeout=180s - # rollout status can return while the OLD ReplicaSet's pod is still - # Running (terminating). Selecting by phase=Running alone can grab that - # stale pod, which then 404s on exec/logs. Wait until exactly one - # fluence pod remains, and require it to be Ready and not terminating. 
- POD="" - for i in $(seq 1 60); do - # names of pods that are Ready AND have no deletionTimestamp (not terminating) - POD=$(kubectl -n kube-system get pods -l app=fluence \ - -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) - [ -n "$POD" ] && break - sleep 2 - done - [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } - echo "Using pod: $POD" - # Brief sleep to let the container runtime stabilize before exec - sleep 5 - kubectl -n kube-system exec "$POD" -- ls /tmp/ - kubectl -n kube-system logs "$POD" - kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' - - - name: E2E - classical gang - run: bash test/e2e/01-classical-gang.sh - - - name: E2E - multi-pod gang (all-or-nothing + contention) - run: bash test/e2e/06-multi-gang.sh - - - name: E2E - requeue on capacity (EventsToRegister) - run: bash test/e2e/08-requeue-on-capacity.sh - - - name: E2E - PostFilter re-match (taint-rejected allocation) - run: bash test/e2e/05-postfilter-rematch.sh - - - name: Deploy quantum add-on - run: | - # Includes the device plugin and oriented to testing container - kubectl apply -f deploy/fluence-resources-test.yaml - kubectl rollout restart -n kube-system deployment/fluence - kubectl rollout status -n kube-system deployment/fluence --timeout=60s - for i in $(seq 1 60); do - kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' - kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' | grep -q 'fluxion.flux-framework.org/qpu' && break - sleep 1 - done - # After 
a rollout restart BOTH the old and new pods are briefly Running. - # Select only a Ready pod with no deletionTimestamp (i.e. the new one, - # not the terminating old one) so exec/logs don't 404. - POD="" - for i in $(seq 1 60); do - POD=$(kubectl -n kube-system get pods -l app=fluence \ - -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) - [ -n "$POD" ] && break - sleep 2 - done - [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found after restart"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } - echo "Using pod: $POD" - # Brief sleep to let the container runtime stabilize before exec - sleep 5 - kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - - - name: Wait for webhook - run: | - - # wait for the deployment AND for the caBundle to be populated on the webhook config - kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s - for i in $(seq 1 30); do - cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \ - -o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null) - [ -n "$cab" ] && break - sleep 2 - done - # let TLS serving settle after caBundle patch - sleep 3 - - - name: E2E - quantum placement - run: bash test/e2e/02-quantum-placement.sh - - #- name: E2E - restart recovery (no double-book) - # run: bash test/e2e/03-restart-recovery.sh - - - name: E2E - sidecar ungate - run: bash test/e2e/04-sidecar-ungate.sh - - - name: E2E - quantum two-group split (leader=1, workers=N-1) - run: bash test/e2e/07-quantum-split.sh - - - name: Dump diagnostics on failure - if: failure() - run: | - kubectl get pods -A -o wide - kubectl logs -n kube-system deployment/fluence \ No newline at end of file + strategy: + fail-fast: false # one suite failing should not 
cancel the other + matrix: + suite: [gang, quantum] + uses: ./.github/workflows/e2e-suite.yaml + with: + suite: ${{ matrix.suite }} \ No newline at end of file diff --git a/Makefile b/Makefile index 08d15b3..5912c5a 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,6 @@ test-image-deploy: test-image .PHONY: test-deploy-recreate test-deploy-recreate: test-image-deploy kubectl apply -f deploy/fluence-pull-test.yaml - kubectl apply -f deploy/device-plugin.yaml .PHONY: deploy deploy: ## Install RBAC + scheduler into kube-system diff --git a/deploy/fluence-pull-test.yaml b/deploy/fluence-pull-test.yaml index 0fc642f..71dc03e 100644 --- a/deploy/fluence-pull-test.yaml +++ b/deploy/fluence-pull-test.yaml @@ -146,6 +146,13 @@ spec: # Without these its PodGroup/GangScheduling plugin is inactive, pods # schedule with no gang semantics, and PodGroup status stays Pending. - --feature-gates=GenericWorkload=true,GangScheduling=true + # Re-attempt unschedulable pods more often than the 5m default. In the + # contention experiment a gang that loses the initial race for nodes is + # marked Unschedulable; this is how soon it is re-tried after capacity + # frees (the event-driven QueueingHint is best-effort; this is the + # backstop that bounds worst-case requeue latency). 30s keeps contended + # gangs draining promptly without thrashing the queue. + - --pod-max-in-unschedulable-pods-duration=30s - --v=4 env: # Path to the resources config (e.g. quantum backends). Unset/empty @@ -274,4 +281,4 @@ metadata: value: 1000000 globalDefault: false preemptionPolicy: PreemptLowerPriority -description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." 
\ No newline at end of file diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml index 6d1dace..48e95a4 100644 --- a/deploy/fluence-test.yaml +++ b/deploy/fluence-test.yaml @@ -146,6 +146,13 @@ spec: # Without these its PodGroup/GangScheduling plugin is inactive, pods # schedule with no gang semantics, and PodGroup status stays Pending. - --feature-gates=GenericWorkload=true,GangScheduling=true + # Re-attempt unschedulable pods more often than the 5m default. In the + # contention experiment a gang that loses the initial race for nodes is + # marked Unschedulable; this is how soon it is re-tried after capacity + # frees (the event-driven QueueingHint is best-effort; this is the + # backstop that bounds worst-case requeue latency). 30s keeps contended + # gangs draining promptly without thrashing the queue. + - --pod-max-in-unschedulable-pods-duration=30s - --v=4 env: # Path to the resources config (e.g. quantum backends). Unset/empty @@ -274,4 +281,4 @@ metadata: value: 1000000 globalDefault: false preemptionPolicy: PreemptLowerPriority -description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." 
\ No newline at end of file diff --git a/deploy/kind-config.yaml b/deploy/kind-config.yaml index a13acf5..ec310bc 100644 --- a/deploy/kind-config.yaml +++ b/deploy/kind-config.yaml @@ -32,5 +32,4 @@ nodes: - name: feature-gates value: "GenericWorkload=true" - role: worker - - role: worker - - role: worker + - role: worker \ No newline at end of file diff --git a/examples/multi-gang-contention.yaml b/examples/test/e2e/gang/multi-gang-contention.yaml similarity index 92% rename from examples/multi-gang-contention.yaml rename to examples/test/e2e/gang/multi-gang-contention.yaml index bf5c74e..14b0fd8 100644 --- a/examples/multi-gang-contention.yaml +++ b/examples/test/e2e/gang/multi-gang-contention.yaml @@ -18,7 +18,7 @@ spec: - name: w image: busybox command: ["sleep", "3600"] - resources: {requests: {cpu: "100m"}} + resources: {requests: {cpu: "1"}} --- apiVersion: apps/v1 kind: Deployment @@ -37,4 +37,4 @@ spec: - name: w image: busybox command: ["sleep", "3600"] - resources: {requests: {cpu: "100m"}} + resources: {requests: {cpu: "1"}} diff --git a/examples/multi-gang-requeue.yaml b/examples/test/e2e/gang/multi-gang-requeue.yaml similarity index 81% rename from examples/multi-gang-requeue.yaml rename to examples/test/e2e/gang/multi-gang-requeue.yaml index 1bbaf02..a8e8636 100644 --- a/examples/multi-gang-requeue.yaml +++ b/examples/test/e2e/gang/multi-gang-requeue.yaml @@ -1,9 +1,9 @@ -# Requeue-on-capacity test for EventsToRegister. +# Requeue-on-capacity + gang-atomicity test (test/e2e/gang/09). # gang-win: a 2-pod gang that runs a SHORT job and COMPLETES (pods -> Succeeded), -# freeing its nodes. Pod completion is the capacity-free event. +# freeing its nodes. # gang-wait: a 2-pod gang needing the same nodes; loses the initial race and sits -# Unschedulable. When gang-win completes, fluence's QueueingHint must -# wake gang-wait so it schedules and runs — with NO manual nudge. +# Unschedulable. 
When gang-win completes, gang-wait must be re-attempted +# (via the shortened unschedulable-recheck timeout) and place atomically. # On a 3-worker (~3-core) cluster the two 2-pod gangs (4 cores) cannot co-run. apiVersion: batch/v1 kind: Job diff --git a/examples/multi-gang.yaml b/examples/test/e2e/gang/multi-gang.yaml similarity index 55% rename from examples/multi-gang.yaml rename to examples/test/e2e/gang/multi-gang.yaml index 5b9f58e..9bfa67c 100644 --- a/examples/multi-gang.yaml +++ b/examples/test/e2e/gang/multi-gang.yaml @@ -1,14 +1,10 @@ -# Multi-pod gang via the WEBHOOK path (the path the experiments use): pods carry -# the group LABEL + group-size annotation; the fluence webhook creates the -# PodGroup with minCount = group-size (3). All 3 must place or none. -# The CI cluster has 3 workers; fluxion graphs ~1 core per node, so a 3-pod -# gang needs all three. minCount=3 enforces all-or-none. +# Multi-pod gang via the WEBHOOK path (the path the experiments use). apiVersion: apps/v1 kind: Deployment metadata: name: gang3 spec: - replicas: 3 + replicas: 2 selector: matchLabels: {app: gang3} template: @@ -17,7 +13,7 @@ spec: app: gang3 fluence.flux-framework.org/group: gang3 annotations: - fluence.flux-framework.org/group-size: "3" + fluence.flux-framework.org/group-size: "2" spec: schedulerName: fluence containers: @@ -26,4 +22,4 @@ spec: command: ["sleep", "3600"] resources: requests: - cpu: "100m" + cpu: "1" diff --git a/examples/single-podgroup.yaml b/examples/test/e2e/gang/single-podgroup.yaml similarity index 100% rename from examples/single-podgroup.yaml rename to examples/test/e2e/gang/single-podgroup.yaml diff --git a/examples/test/e2e/quantum-pod-mock.yaml b/examples/test/e2e/quantum/quantum-pod-mock.yaml similarity index 100% rename from examples/test/e2e/quantum-pod-mock.yaml rename to examples/test/e2e/quantum/quantum-pod-mock.yaml diff --git a/examples/test/e2e/quantum-split-pods.yaml b/examples/test/e2e/quantum/quantum-split-pods.yaml similarity
index 84% rename from examples/test/e2e/quantum-split-pods.yaml rename to examples/test/e2e/quantum/quantum-split-pods.yaml index 0fcd296..8dd0026 100644 --- a/examples/test/e2e/quantum-split-pods.yaml +++ b/examples/test/e2e/quantum/quantum-split-pods.yaml @@ -12,7 +12,6 @@ metadata: fluence.flux-framework.org/group: qsplit annotations: fluence.flux-framework.org/role: leader - fluence.flux-framework.org/group-size: "3" spec: schedulerName: fluence restartPolicy: Never @@ -29,7 +28,7 @@ kind: Pod metadata: name: qsplit-worker-0 labels: {app: qsplit, fluence.flux-framework.org/group: qsplit} - annotations: {fluence.flux-framework.org/role: worker, fluence.flux-framework.org/group-size: "3"} + annotations: {fluence.flux-framework.org/role: worker, fluence.flux-framework.org/group-size: "2"} spec: schedulerName: fluence restartPolicy: Never @@ -37,14 +36,14 @@ spec: - name: w image: busybox command: ["sh","-c","echo worker; sleep 30"] - resources: {requests: {cpu: "100m"}} + resources: {requests: {cpu: "1"}} --- apiVersion: v1 kind: Pod metadata: name: qsplit-worker-1 labels: {app: qsplit, fluence.flux-framework.org/group: qsplit} - annotations: {fluence.flux-framework.org/role: worker, fluence.flux-framework.org/group-size: "3"} + annotations: {fluence.flux-framework.org/role: worker} spec: schedulerName: fluence restartPolicy: Never @@ -52,4 +51,4 @@ spec: - name: w image: busybox command: ["sh","-c","echo worker; sleep 30"] - resources: {requests: {cpu: "100m"}} + resources: {requests: {cpu: "1"}} diff --git a/examples/test/e2e/sidecar-mock-pods.yaml b/examples/test/e2e/quantum/sidecar-mock-pods.yaml similarity index 97% rename from examples/test/e2e/sidecar-mock-pods.yaml rename to examples/test/e2e/quantum/sidecar-mock-pods.yaml index fb223a7..e3c86bb 100644 --- a/examples/test/e2e/sidecar-mock-pods.yaml +++ b/examples/test/e2e/quantum/sidecar-mock-pods.yaml @@ -60,5 +60,4 @@ spec: fieldPath: metadata.annotations['fluence.flux-framework.org/quantum-job-id'] 
resources: requests: - cpu: "100m" - memory: "128Mi" + cpu: "1" \ No newline at end of file diff --git a/pkg/fluence/enqueue.go b/pkg/fluence/enqueue.go deleted file mode 100644 index f04e574..0000000 --- a/pkg/fluence/enqueue.go +++ /dev/null @@ -1,86 +0,0 @@ -/* -Copyright 2024 Lawrence Livermore National Security, LLC - (c.f. AUTHORS, NOTICE.LLNS, COPYING) -SPDX-License-Identifier: Apache-2.0 -*/ - -package fluence - -import ( - "context" - - corev1 "k8s.io/api/core/v1" - "k8s.io/klog/v2" - fwk "k8s.io/kube-scheduler/framework" -) - -// Fluence implements EnqueueExtensions so that a gang rejected as Unschedulable -// (because the node pool was full) is RE-ATTEMPTED when capacity frees up. -// -// Without this, a losing gang sits in the unschedulable queue until the -// scheduler's periodic backoff flush — it is NOT woken when another gang -// finishes and releases nodes. For the contention experiment (submit more gang -// demand than the cluster holds, watch gangs drain as others complete) that -// means contended gangs stall instead of draining promptly. The capacity-freeing -// events are: a pod terminating (Succeeded/Failed — batch apps Complete and -// linger before deletion, so Update catches it earlier than Delete), a pod being -// deleted, and node capacity appearing/growing. -var _ fwk.EnqueueExtensions = (*Fluence)(nil) - -// EventsToRegister declares the cluster events that may let a previously -// Unschedulable Fluence gang schedule, each with a QueueingHint that filters out -// events which cannot plausibly free capacity (so we do not churn the queue). -func (f *Fluence) EventsToRegister(_ context.Context) ([]fwk.ClusterEventWithHint, error) { - return []fwk.ClusterEventWithHint{ - // A pod going terminal (Succeeded/Failed) frees its node BEFORE deletion; - // this is the event that actually fires when a batch gang completes. 
- {Event: fwk.ClusterEvent{Resource: fwk.Pod, ActionType: fwk.Update}, - QueueingHintFn: f.isPodCapacityChange}, - // A pod being deleted frees its node. - {Event: fwk.ClusterEvent{Resource: fwk.Pod, ActionType: fwk.Delete}, - QueueingHintFn: f.isPodCapacityChange}, - // New node, or a node's allocatable growing, adds capacity. - {Event: fwk.ClusterEvent{Resource: fwk.Node, - ActionType: fwk.Add | fwk.UpdateNodeAllocatable}}, - }, nil -} - -// isPodCapacityChange returns Queue when the pod event plausibly frees node -// capacity for a waiting gang — i.e. another pod terminated or was deleted. -// Anything else (a pod being created, an unrelated label change) returns -// QueueSkip so the waiting gang is not retried pointlessly. -// -// We do not try to be clever about which specific nodes freed: any capacity -// release can change a Fluxion match, and PreFilter re-matches the whole graph -// on retry. The hint just suppresses the obviously-irrelevant events. -func (f *Fluence) isPodCapacityChange( - logger klog.Logger, _ *corev1.Pod, oldObj, newObj interface{}, -) (fwk.QueueingHint, error) { - // Delete event: newObj is nil, oldObj is the deleted pod -> frees capacity. - if newObj == nil { - if _, ok := oldObj.(*corev1.Pod); ok { - return fwk.Queue, nil - } - return fwk.QueueSkip, nil - } - // Update event: queue only when the pod BECOMES terminal (was running, now - // Succeeded/Failed) — that is the moment its node frees. - newPod, ok := newObj.(*corev1.Pod) - if !ok { - return fwk.QueueSkip, nil - } - if !isTerminalPhase(newPod.Status.Phase) { - return fwk.QueueSkip, nil - } - // If we can see the old object, only fire on the transition INTO terminal - // (avoid re-queuing on every update of an already-finished pod). - if oldPod, ok := oldObj.(*corev1.Pod); ok && isTerminalPhase(oldPod.Status.Phase) { - return fwk.QueueSkip, nil - } - return fwk.Queue, nil -} - -// isTerminalPhase reports whether a pod phase means its node capacity is released. 
-func isTerminalPhase(p corev1.PodPhase) bool { - return p == corev1.PodSucceeded || p == corev1.PodFailed -} \ No newline at end of file diff --git a/pkg/fluence/enqueue_test.go b/pkg/fluence/enqueue_test.go deleted file mode 100644 index a99c1c5..0000000 --- a/pkg/fluence/enqueue_test.go +++ /dev/null @@ -1,87 +0,0 @@ -//go:build cgo - -/* -Copyright 2024 Lawrence Livermore National Security, LLC - (c.f. AUTHORS, NOTICE.LLNS, COPYING) -SPDX-License-Identifier: Apache-2.0 -*/ - -// Unit tests for the requeue QueueingHint. Tagged cgo because the package links -// the Fluxion matcher; runs in CI (fluence-base) via `make test`. -package fluence - -import ( - "context" - "testing" - - corev1 "k8s.io/api/core/v1" - "k8s.io/klog/v2" - fwk "k8s.io/kube-scheduler/framework" -) - -func pod(phase corev1.PodPhase) *corev1.Pod { - return &corev1.Pod{Status: corev1.PodStatus{Phase: phase}} -} - -func TestQueueingHint(t *testing.T) { - f := &Fluence{} - lg := klog.Background() - waiting := pod(corev1.PodPending) // the rejected gang pod (unused by the hint) - - cases := []struct { - name string - oldObj interface{} - newObj interface{} - want fwk.QueueingHint - }{ - {"pod deleted frees capacity", - pod(corev1.PodRunning), nil, fwk.Queue}, - {"pod became Succeeded frees capacity", - pod(corev1.PodRunning), pod(corev1.PodSucceeded), fwk.Queue}, - {"pod became Failed frees capacity", - pod(corev1.PodRunning), pod(corev1.PodFailed), fwk.Queue}, - {"already-terminal update does not re-fire", - pod(corev1.PodSucceeded), pod(corev1.PodSucceeded), fwk.QueueSkip}, - {"pod still running is irrelevant", - pod(corev1.PodRunning), pod(corev1.PodRunning), fwk.QueueSkip}, - {"pod created (pending) does not free capacity", - nil, pod(corev1.PodPending), fwk.QueueSkip}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - got, err := f.isPodCapacityChange(lg, waiting, c.oldObj, c.newObj) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if got != c.want { - 
t.Errorf("hint = %v, want %v", got, c.want) - } - }) - } -} - -// The plugin must advertise the capacity-freeing events. -func TestEventsToRegister(t *testing.T) { - f := &Fluence{} - evts, err := f.EventsToRegister(context.Background()) - if err != nil { - t.Fatalf("EventsToRegister error: %v", err) - } - if len(evts) == 0 { - t.Fatal("no events registered — unschedulable gangs would never wake on capacity change") - } - var podUpdate, podDelete, node bool - for _, e := range evts { - switch { - case e.Event.Resource == fwk.Pod && e.Event.ActionType&fwk.Update != 0: - podUpdate = true - case e.Event.Resource == fwk.Pod && e.Event.ActionType&fwk.Delete != 0: - podDelete = true - case e.Event.Resource == fwk.Node: - node = true - } - } - if !podUpdate || !podDelete || !node { - t.Errorf("missing events: podUpdate=%v podDelete=%v node=%v", podUpdate, podDelete, node) - } -} diff --git a/pkg/fluence/fluence.go b/pkg/fluence/fluence.go index 45c72cd..c7fca16 100644 --- a/pkg/fluence/fluence.go +++ b/pkg/fluence/fluence.go @@ -77,13 +77,15 @@ type Fluence struct { mu sync.Mutex // placement maps a group key to its allocation (nodes, backend, jobids). placement map[string]groupAlloc - // excludedNodes maps a group key to the set of node names that have been - // rejected for that group by other scheduler plugins (taints, affinity, - // volume topology that Fluxion's graph does not model). PostFilter adds the - // whole failed allocation's nodes here; PreFilter feeds them back as an RFC 31 - // negated-hostlist constraint so the re-match is forced onto untried nodes. - // The set only grows for a group, guaranteeing the retry converges (finite - // node pool) and is cleared on teardown. Guarded by mu. + // excludedNodes maps a group key to the set of nodes that are GENUINELY + // INCOMPATIBLE with that group (PostFilter saw UnschedulableAndUnresolvable + // from another plugin: a taint, affinity, or constraint Fluxion's graph does + // not model). 
PreFilter feeds them back as an RFC 31 negated-hostlist + // constraint so the re-match is steered onto other nodes. Nodes that were + // merely BUSY are deliberately NOT recorded here (excluding them would turn + // transient contention into permanent group failure). The set only grows for a + // group, so the exclusion-driven re-match is finite, and it is cleared on + // teardown. Guarded by mu. excludedNodes map[string]map[string]bool } @@ -94,6 +96,41 @@ var ( _ fwk.PreBindPlugin = (*Fluence)(nil) ) +// schedulableNodes returns only the nodes a normal pod could actually be placed +// on, so the Fluxion graph never offers a node that Kubernetes will then reject +// in Filter. Two kinds are dropped: +// +// - cordoned nodes (spec.unschedulable), and +// - nodes carrying a NoSchedule/NoExecute taint (e.g. the control-plane's +// node-role.kubernetes.io/control-plane:NoSchedule). +// +// Without this, Fluxion can place a gang slot on the control-plane (it looks like +// a valid virtual=false compute node to the graph), the pod is then rejected by +// TaintToleration with UnschedulableAndUnresolvable, and PostFilter abandons the +// whole allocation — on a small cluster that strands the gang permanently. We do +// not attempt to honor specific tolerations here: gang workloads in this setup do +// not tolerate node taints, so any NoSchedule/NoExecute taint means "not for us". 
+func schedulableNodes(nodes []corev1.Node) []corev1.Node { + out := make([]corev1.Node, 0, len(nodes)) + for _, n := range nodes { + if n.Spec.Unschedulable { + continue + } + tainted := false + for _, t := range n.Spec.Taints { + if t.Effect == corev1.TaintEffectNoSchedule || t.Effect == corev1.TaintEffectNoExecute { + tainted = true + break + } + } + if tainted { + continue + } + out = append(out, n) + } + return out +} + // New builds the plugin: discover cluster nodes, optionally inject quantum // resources, write the JGF graph, initialize the Fluxion matcher, and register // the delete handlers that cancel allocations when their owning object is gone. @@ -138,7 +175,7 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error } } - jgfBytes, err := cluster.BuildGraph(nodeList.Items, opts) + jgfBytes, err := cluster.BuildGraph(schedulableNodes(nodeList.Items), opts) if err != nil { return nil, fmt.Errorf("build resource graph: %w", err) } @@ -409,25 +446,33 @@ func (f *Fluence) Filter( } // PostFilter runs when a pod could not be scheduled after Filter — for a Fluence -// group, this means the cached Fluxion allocation's nodes did not all survive -// the other scheduler plugins' Filter checks (a taint, node affinity, or volume -// topology constraint that Fluxion's resource graph does not model rejected one -// or more of them). Without intervention the group would retry forever against -// the same cached allocation while the Fluxion reservation leaked, because -// PreFilter short-circuits on the cache and nothing else releases it on a -// scheduling failure. +// group, this means the cached Fluxion allocation's nodes did not all survive the +// other scheduler plugins' Filter checks. Without intervention the group would +// retry forever against the same cached allocation while the Fluxion reservation +// leaked, because PreFilter short-circuits on the cache and nothing else releases +// it on a scheduling failure. 
+// +// We always abandon the failed allocation here (cancel the Fluxion jobids, drop +// the cached placement) so the next PreFilter re-matches fresh. The careful part +// is WHICH nodes we then permanently exclude from the group's future matches, +// because a group reaches PostFilter for two very different reasons and they must +// be handled oppositely (see fwk.Code docs): +// +// - UnschedulableAndUnresolvable: the node genuinely cannot host this pod and +// re-trying it is pointless (a taint the pod does not tolerate, node affinity +// mismatch, a constraint Fluxion's graph does not model). EXCLUDE it; the +// next PreFilter feeds the exclusion set back as an RFC 31 negated-hostlist +// constraint so Fluxion is steered onto other nodes. // -// We react by abandoning the failed allocation: the ENTIRE cached node set is -// added to the group's exclusion set, the Fluxion jobids are cancelled, and the -// cached placement is deleted. The next PreFilter for the group re-matches with -// an RFC 31 negated-hostlist constraint over the accumulated exclusion set, so -// Fluxion is forced onto untried nodes. We exclude the whole set (not just the -// individually-rejected nodes) deliberately: if the group as a whole could not -// be admitted, a node that happened to survive this round carries no guarantee -// for the next, and excluding the whole set makes each retry a strictly smaller, -// monotonic search that converges — either to a feasible allocation on untried -// nodes, or to a clean no-match (Unschedulable) once the graph is exhausted, at -// which point the pod waits for a cluster-state change rather than busy-looping. +// - Unschedulable (plain): the node could host the pod, just not at this +// instant (it is momentarily full). This is TRANSIENT. 
Do NOT exclude it — +// excluding a merely-busy node converts ordinary contention into permanent +// group failure, and in a saturated cluster (a gang that needs the whole node +// set) it strands the gang forever even though it would fit once a node frees. +// +// So contention excludes nothing and the group recovers by waiting/retrying; +// only durable incompatibility accumulates in excludedNodes (cleared on group +// teardown), which keeps the exclusion-driven re-match finite and correct. func (f *Fluence) PostFilter( ctx context.Context, state fwk.CycleState, @@ -444,12 +489,39 @@ func (f *Fluence) PostFilter( f.mu.Unlock() return nil, fwk.NewStatus(fwk.Unschedulable) } - // Accumulate the whole failed allocation's nodes into the exclusion set. + // Exclude ONLY nodes that are genuinely incompatible with this pod, never + // nodes that were merely busy this cycle. The framework gives us a per-node + // status: UnschedulableAndUnresolvable means the node cannot host the pod and + // re-trying it is pointless (a taint the pod does not tolerate, node affinity + // mismatch, a constraint Fluxion's graph does not model) -> exclude it so the + // re-match is steered elsewhere. A plain Unschedulable means the node could + // host the pod but not right now (it is momentarily full) -> do NOT exclude + // it; it must stay eligible so the group can land there once capacity frees. + // + // This is the whole point: a group enters PostFilter for many reasons, and + // "the cluster is just full at this instant" is the common one. Permanently + // banning the busy nodes (the old whole-allocation exclusion) turned transient + // contention into permanent group failure — exactly backwards. Now contention + // excludes nothing; the group simply abandons this cycle's reservation and + // retries the same nodes when they free. 
if f.excludedNodes[group] == nil { f.excludedNodes[group] = map[string]bool{} } + var incompatible, busy []string for _, n := range alloc.place.Nodes { - f.excludedNodes[group][n] = true + var code fwk.Code + if filteredNodeStatusMap != nil { + if st := filteredNodeStatusMap.Get(n); st != nil { + code = st.Code() + } + } + if code == fwk.UnschedulableAndUnresolvable { + f.excludedNodes[group][n] = true + incompatible = append(incompatible, n) + } else { + // plain Unschedulable, Success, or unknown/nil -> transient, keep. + busy = append(busy, n) + } } excludedCount := len(f.excludedNodes[group]) jobids := alloc.jobids @@ -460,14 +532,13 @@ func (f *Fluence) PostFilter( // does not leak it while the group retries. f.cancelJobids(jobids) - log.Printf("[fluence] group %s unschedulable: abandoning allocation (nodes %v, "+ - "jobids %v); %d node(s) now excluded, will re-match on next cycle", - group, alloc.place.Nodes, jobids, excludedCount) + log.Printf("[fluence] group %s unschedulable: abandoning allocation (jobids %v); "+ + "incompatible(excluded)=%v busy(retryable, NOT excluded)=%v; %d node(s) excluded total", + group, jobids, incompatible, busy, excludedCount) // Returning Unschedulable (no nominated node) lets the pod be requeued; the - // next PreFilter re-matches with the enlarged exclusion set. We do not - // nominate a node — Fluxion, not PostFilter preemption, chooses the next - // placement. + // next PreFilter re-matches (with any incompatible nodes excluded, but busy + // nodes still in play). Fluxion, not PostFilter preemption, chooses placement. 
return nil, fwk.NewStatus(fwk.Unschedulable) } diff --git a/pkg/fluence/fluence_test.go b/pkg/fluence/fluence_test.go index 6a53b56..5228f97 100644 --- a/pkg/fluence/fluence_test.go +++ b/pkg/fluence/fluence_test.go @@ -354,10 +354,25 @@ func twoSpecs() []*jobspec.Jobspec { // --- PostFilter allocation reconciliation ----------------------------------- -// PostFilter must abandon a group's failed allocation: add the WHOLE cached node -// set to the exclusion set, cancel the Fluxion jobids, and delete the cache, so -// the next PreFilter re-matches onto untried nodes. -func TestPostFilterAbandonsAndExcludesWholeNodeSet(t *testing.T) { +// fakeNodeStatus is a minimal fwk.NodeToStatusReader for PostFilter tests: it +// maps node name -> status code so a test can mark some nodes incompatible +// (UnschedulableAndUnresolvable) and others merely busy (Unschedulable). +type fakeNodeStatus map[string]fwk.Code + +func (s fakeNodeStatus) Get(node string) *fwk.Status { + if c, ok := s[node]; ok { + return fwk.NewStatus(c) + } + return nil +} +func (s fakeNodeStatus) NodesForStatusCode(fwk.NodeInfoLister, fwk.Code) ([]fwk.NodeInfo, error) { + return nil, nil +} + +// PostFilter abandons the failed allocation (cancel jobids, drop cache) and +// excludes ONLY genuinely-incompatible nodes (UnschedulableAndUnresolvable). +// A node that was merely busy (plain Unschedulable) MUST stay eligible. +func TestPostFilterExcludesOnlyIncompatibleNodes(t *testing.T) { m := &fakeMatcher{} f := newTestFluence(m) key := "default/training" @@ -367,53 +382,102 @@ func TestPostFilterAbandonsAndExcludesWholeNodeSet(t *testing.T) { } pod := groupedPod("default", "training-0", "training", nil) - _, status := f.PostFilter(context.Background(), nil, pod, nil) - if status == nil || status.Code() != fwk.Unschedulable { - t.Fatalf("expected Unschedulable status, got %v", status) + // node-a incompatible (taint); node-b busy; node-c survived Filter. 
+ status := fakeNodeStatus{ + "node-a": fwk.UnschedulableAndUnresolvable, + "node-b": fwk.Unschedulable, + "node-c": fwk.Success, + } + + _, st := f.PostFilter(context.Background(), nil, pod, status) + if st == nil || st.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable status, got %v", st) } - // cache cleared if _, still := f.placement[key]; still { t.Fatal("placement cache should be deleted after PostFilter") } - // jobids cancelled if len(m.cancelled) != 2 { t.Fatalf("expected both jobids cancelled, got %v", m.cancelled) } - // the WHOLE node set excluded excl := f.excludedNodes[key] - for _, n := range []string{"node-a", "node-b", "node-c"} { - if !excl[n] { - t.Fatalf("expected %s excluded, set=%v", n, excl) - } + if !excl["node-a"] { + t.Fatalf("incompatible node-a should be excluded, set=%v", excl) + } + if excl["node-b"] || excl["node-c"] { + t.Fatalf("busy/ok nodes must NOT be excluded (would strand a saturated gang), set=%v", excl) + } + if len(excl) != 1 { + t.Fatalf("expected exactly 1 excluded node, got %v", excl) + } +} + +// A group blocked purely by contention (every node merely busy) excludes NOTHING +// so it can retry the same nodes once they free — the saturated-cluster property. 
+func TestPostFilterContentionExcludesNothing(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{ + place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, + jobids: []uint64{1}, + } + pod := groupedPod("default", "training-0", "training", nil) + status := fakeNodeStatus{"node-a": fwk.Unschedulable, "node-b": fwk.Unschedulable} + + f.PostFilter(context.Background(), nil, pod, status) + + if len(f.excludedNodes[key]) != 0 { + t.Fatalf("a purely-busy group must exclude no nodes, got %v", f.excludedNodes[key]) } - if len(excl) != 3 { - t.Fatalf("expected exactly 3 excluded nodes, got %v", excl) + if _, still := f.placement[key]; still { + t.Fatal("placement cache should be deleted even when nothing is excluded") + } + if len(m.cancelled) != 1 { + t.Fatalf("expected the jobid cancelled, got %v", m.cancelled) } } -// Repeated failures accumulate monotonically: a second abandoned allocation adds -// its nodes to the existing exclusion set (the set only grows -> convergence). -func TestPostFilterAccumulatesAcrossAttempts(t *testing.T) { +// A nil status map (e.g. all nodes filtered out upstream) must be safe and +// exclude nothing rather than panic or ban the whole allocation. +func TestPostFilterNilStatusMapExcludesNothing(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{7}} + pod := groupedPod("default", "training-0", "training", nil) + + _, st := f.PostFilter(context.Background(), nil, pod, nil) + if st == nil || st.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable, got %v", st) + } + if len(f.excludedNodes[key]) != 0 { + t.Fatalf("nil status map must exclude nothing, got %v", f.excludedNodes[key]) + } +} + +// Incompatible nodes accumulate across attempts; busy ones never do. 
+func TestPostFilterAccumulatesIncompatibleAcrossAttempts(t *testing.T) { m := &fakeMatcher{} f := newTestFluence(m) key := "default/training" pod := groupedPod("default", "training-0", "training", nil) - // attempt 1 fails on {a,b} f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{1}} - f.PostFilter(context.Background(), nil, pod, nil) - // attempt 2 (re-matched elsewhere) fails on {c,d} + f.PostFilter(context.Background(), nil, pod, fakeNodeStatus{"node-a": fwk.UnschedulableAndUnresolvable, "node-b": fwk.Unschedulable}) f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-c", "node-d"}}, jobids: []uint64{2}} - f.PostFilter(context.Background(), nil, pod, nil) + f.PostFilter(context.Background(), nil, pod, fakeNodeStatus{"node-c": fwk.UnschedulableAndUnresolvable, "node-d": fwk.Unschedulable}) excl := f.excludedNodes[key] - for _, n := range []string{"node-a", "node-b", "node-c", "node-d"} { + for _, n := range []string{"node-a", "node-c"} { if !excl[n] { - t.Fatalf("expected %s in accumulated exclusion set, got %v", n, excl) + t.Fatalf("incompatible %s should accumulate, got %v", n, excl) } } - if len(excl) != 4 { - t.Fatalf("exclusion set should accumulate to 4, got %v", excl) + if excl["node-b"] || excl["node-d"] { + t.Fatalf("busy nodes must never accumulate, got %v", excl) + } + if len(excl) != 2 { + t.Fatalf("exclusion set should be the 2 incompatible nodes, got %v", excl) } } @@ -451,3 +515,41 @@ func TestCancelGroupClearsExclusions(t *testing.T) { t.Fatal("exclusion set should be cleared on teardown") } } + +// schedulableNodes must drop control-plane (NoSchedule taint), NoExecute-tainted, +// and cordoned nodes, keeping only nodes a normal gang pod can actually land on. +// This keeps the Fluxion graph from offering nodes Kubernetes will reject in +// Filter (which, with whole-allocation PostFilter exclusion, strands the gang). 
+func TestSchedulableNodesDropsTaintedAndCordoned(t *testing.T) { + node := func(name string, unsched bool, effects ...corev1.TaintEffect) corev1.Node { + n := corev1.Node{} + n.Name = name + n.Spec.Unschedulable = unsched + for _, e := range effects { + n.Spec.Taints = append(n.Spec.Taints, corev1.Taint{Key: "k", Effect: e}) + } + return n + } + in := []corev1.Node{ + node("worker-1", false), + node("worker-2", false), + node("control-plane", false, corev1.TaintEffectNoSchedule), + node("draining", false, corev1.TaintEffectNoExecute), + node("cordoned", true), + node("prefer-only", false, corev1.TaintEffectPreferNoSchedule), // soft taint: keep + } + got := schedulableNodes(in) + gotNames := map[string]bool{} + for _, n := range got { + gotNames[n.Name] = true + } + want := []string{"worker-1", "worker-2", "prefer-only"} + if len(got) != len(want) { + t.Fatalf("expected %d schedulable nodes %v, got %d %v", len(want), want, len(got), gotNames) + } + for _, w := range want { + if !gotNames[w] { + t.Fatalf("expected %s kept, got set %v", w, gotNames) + } + } +} diff --git a/test/e2e/03-restart-recovery.sh b/test/e2e/03-restart-recovery.sh index 20c1be9..c26980f 100644 --- a/test/e2e/03-restart-recovery.sh +++ b/test/e2e/03-restart-recovery.sh @@ -9,7 +9,7 @@ ANN="fluence.flux-framework.org/backend" log "TEST 3: restart does not double-book an exclusive backend" # 1. Schedule the first qpu pod and capture its backend. -kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock.yaml wait_pod_phase sampler-mock "$NS" Running 120 || fail "sampler-mock did not reach Running" backend="$(kubectl get pod sampler-mock -n "$NS" -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" [ -n "$backend" ] || fail "first pod has no backend annotation" @@ -26,7 +26,7 @@ wait_pod_phase sampler-mock "$NS" Running 30 || fail "first pod not Running afte # 4. 
A second pod requesting the same exclusive qpu must NOT get the same backend. # If recovery worked, the backend is occupied and the second pod stays Pending. -kubectl apply -f examples/test/e2e/quantum-pod-mock-2.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock-2.yaml if assert_stays_pending sampler-mock-2 "$NS" 45; then log "PASS: second qpu pod stayed Pending; backend '$backend' was not double-booked" else @@ -38,5 +38,5 @@ else fi fi -kubectl delete -f examples/test/e2e/quantum-pod-mock-2.yaml --wait=false || true -kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock-2.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/08-requeue-on-capacity.sh b/test/e2e/08-requeue-on-capacity.sh deleted file mode 100644 index f67421b..0000000 --- a/test/e2e/08-requeue-on-capacity.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env bash -# Requeue-on-capacity: a contended gang that loses the initial race must be -# RE-ATTEMPTED when the winner completes and frees nodes — driven by fluence's -# EventsToRegister, with no manual nudge. Guards the gap where Unschedulable -# gangs only woke on the backoff timer (so contended gangs stalled instead of -# draining as capacity freed). -set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" - -log "TEST 9: contended gang is requeued when capacity frees (EventsToRegister)" -kubectl apply -f examples/multi-gang-requeue.yaml - -# Both gangs want the same nodes; only one fits. Identify winner/loser. 
-log "waiting for one gang to win and the other to be Unschedulable" -winner=""; loser="" -for i in $(seq 1 60); do - rw="$(kubectl get pods -l job-name=gang-win --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" - ra="$(kubectl get pods -l job-name=gang-wait --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" - if [ "$rw" = "2" ] && [ "$ra" = "0" ]; then winner=gang-win; loser=gang-wait; break; fi - if [ "$ra" = "2" ] && [ "$rw" = "0" ]; then winner=gang-wait; loser=gang-win; break; fi - sleep 2 -done -[ -n "$winner" ] || fail "no gang won a clean 2/0 placement (check capacity/contention)" -log "winner=$winner running; loser=$loser should be Unschedulable" - -# the loser's PodGroup should be Unschedulable (entirely pending — all-or-nothing) -for i in $(seq 1 15); do - st="$(kubectl get podgroup "$loser" -o jsonpath='{.status.conditions[*].type}{" "}{.status}' 2>/dev/null || true)" - echo "$st" | grep -qi "unschedulable\|pending" && break - # status field name varies; also accept: zero loser pods scheduled - sched="$(kubectl get pods -l job-name=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . || true)" - [ "$sched" = "0" ] && break - sleep 2 -done -sched_loser="$(kubectl get pods -l job-name=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . || true)" -[ "$sched_loser" = "0" ] || fail "$loser partially placed ($sched_loser pods on nodes) — gang violated" -log " $loser is entirely pending (no pods placed) — correct" - -# THE KEY ASSERTION: when the winner COMPLETES (frees nodes), the loser must be -# requeued and run — WITHOUT us touching it. The winner sleeps ~30s. 
-log "waiting for winner=$winner to complete and free capacity" -kubectl wait --for=condition=complete job/$winner --timeout=120s \ - || fail "$winner did not complete" -log " $winner completed; capacity freed" - -log "asserting $loser is now requeued and runs (EventsToRegister woke it)" -ok="" -for i in $(seq 1 60); do # up to ~120s; must be well under the 5-min backoff flush - rl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" - done_l="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" - if [ "$((rl + done_l))" -ge 1 ]; then ok=1; break; fi - sleep 2 -done -[ -n "$ok" ] || fail "$loser was NOT requeued after capacity freed within 120s — \ -EventsToRegister not waking unschedulable gangs (would only recover on the 5-min backoff flush)" -log "PASS 9: $loser was requeued and scheduled after $winner freed capacity" - -kubectl delete -f examples/multi-gang-requeue.yaml --wait=false || true -for g in gang-win gang-wait; do - kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true -done -kubectl wait --for=delete pod -l job-name=gang-win --timeout=60s 2>/dev/null || true -kubectl wait --for=delete pod -l job-name=gang-wait --timeout=60s 2>/dev/null || true \ No newline at end of file diff --git a/test/e2e/01-classical-gang.sh b/test/e2e/gang/01-classical-gang.sh old mode 100644 new mode 100755 similarity index 92% rename from test/e2e/01-classical-gang.sh rename to test/e2e/gang/01-classical-gang.sh index ffe8fa8..a854663 --- a/test/e2e/01-classical-gang.sh +++ b/test/e2e/gang/01-classical-gang.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash # Classical gang scheduling: a PodGroup of 2 must be placed all-or-nothing on real nodes. set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" log "TEST 1: classical gang scheduling" -kubectl apply -f examples/single-podgroup.yaml +kubectl apply -f examples/test/e2e/gang/single-podgroup.yaml # All pods in the 'training' deployment must reach Running (scheduled + started). # Wait for the pod to EXIST before waiting for Ready — kubectl wait errors out diff --git a/test/e2e/05-postfilter-rematch.sh b/test/e2e/gang/02-postfilter-rematch.sh old mode 100644 new mode 100755 similarity index 98% rename from test/e2e/05-postfilter-rematch.sh rename to test/e2e/gang/02-postfilter-rematch.sh index 1712ea9..6657a90 --- a/test/e2e/05-postfilter-rematch.sh +++ b/test/e2e/gang/02-postfilter-rematch.sh @@ -9,7 +9,7 @@ # so a pod left over (terminating) from a previous test can never be mistaken for # this test's placement. It also ignores terminating pods when asserting. set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" NAME=pf-rematch SEL="app=${NAME}" diff --git a/test/e2e/06-multi-gang.sh b/test/e2e/gang/03-multi-gang.sh similarity index 91% rename from test/e2e/06-multi-gang.sh rename to test/e2e/gang/03-multi-gang.sh index 2bd6a4c..1301382 100755 --- a/test/e2e/06-multi-gang.sh +++ b/test/e2e/gang/03-multi-gang.sh @@ -5,11 +5,11 @@ # B) under contention, a gang that cannot fully fit stays ENTIRELY pending — # never partially placed (no stranded pods holding nodes). set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" # ---- A) all-or-nothing placement of a 3-pod gang ------------------------------- log "TEST 6A: multi-pod gang (3) places all-or-nothing" -kubectl apply -f examples/multi-gang.yaml +kubectl apply -f examples/test/e2e/gang/multi-gang.yaml # the webhook must have created the PodGroup with minCount = 3 (the bug set it to 1) log "checking PodGroup minCount == 3 (set by webhook from group-size)" @@ -31,13 +31,13 @@ for p in $(kubectl get pods -l app=gang3 -o name); do done log "PASS 6A: 3-pod gang placed atomically by fluence (minCount=3)" -kubectl delete -f examples/multi-gang.yaml --wait=false || true +kubectl delete -f examples/test/e2e/gang/multi-gang.yaml --wait=false || true kubectl patch podgroup gang3 --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true kubectl wait --for=delete pod -l app=gang3 --timeout=60s 2>/dev/null || true # ---- B) contention: the gang that can't fully fit stays ENTIRELY pending -------- log "TEST 6B: contention — a gang that cannot fully fit must NOT partially place" -kubectl apply -f examples/multi-gang-contention.yaml +kubectl apply -f examples/test/e2e/gang/multi-gang-contention.yaml # wait until the cluster settles. 
Three possible outcomes: # - one gang fully Running, other fully Pending -> contention; assert no partial @@ -66,7 +66,7 @@ else log "PASS 6B: $loser stayed entirely pending — no partial placement under contention" fi -kubectl delete -f examples/multi-gang-contention.yaml --wait=false || true +kubectl delete -f examples/test/e2e/gang/multi-gang-contention.yaml --wait=false || true for g in gang-a gang-b; do kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true done diff --git a/test/e2e/gang/04-requeue-on-capacity.sh b/test/e2e/gang/04-requeue-on-capacity.sh new file mode 100755 index 0000000..f41aa71 --- /dev/null +++ b/test/e2e/gang/04-requeue-on-capacity.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# Requeue-on-capacity + gang atomicity under contention. +# +# Two 2-pod gangs contend for a cluster that can only run one at a time. This +# guards two invariants that the GKE contention runs exposed: +# 1. ALL-OR-NOTHING: each gang places ALL its pods or NONE — never a partial +# (e.g. 1-of-2 scheduled). The winner must be a clean 2/2; the loser a clean +# 0/2 while it waits. +# 2. REQUEUE: when the winner completes and frees its nodes, the loser is +# re-attempted on its own (no manual nudge) and then ALSO places atomically +# (2/2), driven by the shortened --pod-max-in-unschedulable-pods-duration. +# +# SCOPE / LIMITATION: this is a 3-node kind cluster with small (1-core) pods. It +# verifies the INVARIANTS on a minimal contention case. It does NOT reproduce the +# GKE-scale dynamics where the bug was first seen — one-pod-per-node (~80-core) +# saturation and ~20 simultaneous mixed-size gangs draining in sequence. That +# scale behavior is validated on the real cluster, not in CI; a pass here means +# the invariants hold on the simple case, not that large-scale draining is proven. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +# running-pod count for a gang (job-name label set by the Job controller) +running() { kubectl get pods -l job-name="$1" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' '; } +# count of a gang's pods actually bound to a node (Running OR already Succeeded) +on_nodes() { kubectl get pods -l job-name="$1" -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' 2>/dev/null | grep -c . || true; } + +log "TEST 9: contended gangs stay all-or-nothing, loser requeues when capacity frees" +kubectl apply -f examples/test/e2e/gang/multi-gang-requeue.yaml + +# ---- 1. one gang wins CLEANLY (2/2); the other places NOTHING (0/2) ------------ +log "waiting for a clean 2/0 split (one whole gang runs, the other entirely waits)" +winner=""; loser="" +for i in $(seq 1 60); do + rw="$(running gang-win)"; ra="$(running gang-wait)" + if [ "$rw" = "2" ] && [ "$ra" = "0" ]; then winner=gang-win; loser=gang-wait; break; fi + if [ "$ra" = "2" ] && [ "$rw" = "0" ]; then winner=gang-wait; loser=gang-win; break; fi + # a 1/x or x/1 state that persists is a PARTIAL gang — fail fast on it + if [ "$rw" = "1" ] || [ "$ra" = "1" ]; then + sleep 6 # allow a transient mid-bind moment to resolve + rw="$(running gang-win)"; ra="$(running gang-wait)" + { [ "$rw" = "1" ] || [ "$ra" = "1" ]; } && \ + fail "PARTIAL gang: gang-win=$rw gang-wait=$ra running (all-or-nothing violated)" + fi + sleep 2 +done +[ -n "$winner" ] || fail "no clean 2/0 split (gang-win=$(running gang-win) gang-wait=$(running gang-wait))" +log " winner=$winner (2/2 running), loser=$loser" + +# loser must have ZERO pods on any node — not even one (that would be a partial) +sl="$(on_nodes "$loser")" +[ "$sl" = "0" ] || fail "$loser has $sl pod(s) bound while it should be entirely pending — PARTIAL placement" +log " $loser entirely pending (0 pods bound) — all-or-nothing holds" + +# ---- 2. 
winner completes -> loser is requeued AND places atomically ------------ +log "waiting for winner=$winner to complete and free its nodes" +kubectl wait --for=condition=complete job/$winner --timeout=120s || fail "$winner did not complete" +log " $winner completed; capacity freed" + +# The loser must now place ALL its pods (2/2), on its own, within a window above +# the 30s recheck flush but below the 5m default — proving the shortened timeout +# is in effect AND that the requeued gang is still atomic (not a partial). +log "asserting $loser requeues and places ATOMICALLY (2/2) within ~75s" +ok="" +for i in $(seq 1 38); do # ~75s + rl="$(running $loser)" + dl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + # both pods accounted for (running and/or already completed) = atomic placement + [ "$((rl + dl))" = "2" ] && { ok=1; break; } + # a lone 1/2 that lingers = partial placement of the requeued gang + if [ "$((rl + dl))" = "1" ]; then + sleep 6 + rl="$(running $loser)"; dl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + [ "$((rl + dl))" = "1" ] && fail "$loser placed 1 of 2 pods — PARTIAL placement of the requeued gang" + fi + sleep 2 +done +[ -n "$ok" ] || fail "$loser did NOT place both pods within 75s of capacity freeing — \ +either the shortened --pod-max-in-unschedulable-pods-duration is not taking effect \ +(gang stuck) or the requeued gang did not assemble" +log "PASS 9: $loser requeued and placed atomically (2/2) after $winner freed capacity" + +kubectl delete -f examples/test/e2e/gang/multi-gang-requeue.yaml --wait=false || true +for g in gang-win gang-wait; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l job-name=gang-win --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l job-name=gang-wait 
--timeout=60s 2>/dev/null || true diff --git a/test/e2e/02-quantum-placement.sh b/test/e2e/quantum/01-quantum-placement.sh old mode 100644 new mode 100755 similarity index 81% rename from test/e2e/02-quantum-placement.sh rename to test/e2e/quantum/01-quantum-placement.sh index 17897a3..cc1bfe2 --- a/test/e2e/02-quantum-placement.sh +++ b/test/e2e/quantum/01-quantum-placement.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash # Quantum placement: a qpu pod is matched to a backend and the webhook injects QRMI_BACKEND. set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" ANN="fluence.flux-framework.org/backend" log "TEST 2: quantum placement and backend handoff" -kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock.yaml wait_pod_phase sampler-mock Running 120 || fail "sampler-mock did not reach Running" @@ -20,4 +20,4 @@ echo "$out" | grep -q "BACKEND=${backend}" \ || (show_webhook sampler-mock && fail "QRMI_BACKEND in container ('$out') does not match annotation ($backend)") log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as QRMI_BACKEND" -kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/04-sidecar-ungate.sh b/test/e2e/quantum/02-sidecar-ungate.sh old mode 100644 new mode 100755 similarity index 95% rename from test/e2e/04-sidecar-ungate.sh rename to test/e2e/quantum/02-sidecar-ungate.sh index 9ffefc8..61abcb0 --- a/test/e2e/04-sidecar-ungate.sh +++ b/test/e2e/quantum/02-sidecar-ungate.sh @@ -11,11 +11,11 @@ # queue position polling). Those require real AWS credentials and are covered # by sidecars/providers/braket/test/integration.sh which is run locally. set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" log "TEST 4: sidecar webhook — RBAC creation, gate injection, sidecar injection" -kubectl apply -f examples/test/e2e/sidecar-mock-pods.yaml +kubectl apply -f examples/test/e2e/quantum/sidecar-mock-pods.yaml # Give webhook time to process the leader pod admission sleep 3 @@ -85,4 +85,4 @@ log " queue polling) is in sidecars/providers/braket/test/integration.sh" # Only clean up pods and PodGroup — RBAC is namespace infrastructure # that persists for future quantum workflows in this namespace -kubectl delete -f examples/test/e2e/sidecar-mock-pods.yaml +kubectl delete -f examples/test/e2e/quantum/sidecar-mock-pods.yaml diff --git a/test/e2e/07-quantum-split.sh b/test/e2e/quantum/03-quantum-split.sh similarity index 92% rename from test/e2e/07-quantum-split.sh rename to test/e2e/quantum/03-quantum-split.sh index 3899f57..c84f2ba 100755 --- a/test/e2e/07-quantum-split.sh +++ b/test/e2e/quantum/03-quantum-split.sh @@ -6,10 +6,10 @@ # group, makes quantum gangs work. (The runtime ungate is covered by 04; here we # prove the group SPLIT the ungate path depends on.) set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" log "TEST 7: quantum two-group split (leader=1, workers=N-1)" -kubectl apply -f examples/test/e2e/quantum-split-pods.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-split-pods.yaml # leader PodGroup must exist with minCount 1 log "checking leader group 'qsplit' minCount == 1" @@ -47,7 +47,7 @@ base="$(kubectl get pod qsplit-leader -o jsonpath='{range .spec.containers[*]}{r [ -n "$base" ] || fail "leader sidecar missing FLUENCE_WORKER_GROUP_BASE (sidecar would look in the wrong group and never ungate)" log "PASS 7: quantum gang split into leader(1) + workers(N-1), relinked + gated" -kubectl delete -f examples/test/e2e/quantum-split-pods.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-split-pods.yaml --wait=false || true for g in qsplit qsplit-workers; do kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true done diff --git a/test/e2e/quantum/04-gang-env-contract.sh b/test/e2e/quantum/04-gang-env-contract.sh new file mode 100755 index 0000000..ced3a3a --- /dev/null +++ b/test/e2e/quantum/04-gang-env-contract.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Env-contract e2e: deploy a mock gang and verify the webhook injects the env the +# real gang workload (gang.py) depends on — IN-CLUSTER, on the real pod specs, +# with no Braket/AWS and WITHOUT requiring the pod to be scheduled. Guards the +# runtime seam that, if broken, makes a gang schedule fine then hang (a leader +# with no FLUENCE_ROLE defaults to worker -> no leader -> deadlock). +# +# This checks the SPEC layer only: the env references the webhook wires onto the +# right container at admission. These are downward-API valueFrom refs (their +# VALUES resolve later, at placement), but their PRESENCE is deterministic at +# admission, so this test needs no scheduling, no qpu add-on, no logs — it cannot +# flake on capacity. 
Injection paths verified in code: +# FLUENCE_ROLE roleEnvOps (quantum handler) -> all workload containers +# FLUENCE_POD_UID, PYTHONPATH InterceptorOps (core) -> fluxion-resource containers +# FLUXION_BACKEND fluxion handler (InjectEnvOps) -> fluxion-resource containers +# The leader requests qpu (so it gets the full contract); the worker only needs +# FLUENCE_ROLE (it requests no fluxion resource, by design). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +log "TEST 8: gang env contract (webhook injects what gang.py reads) — spec layer" +kubectl apply -f examples/test/e2e/quantum/gang-env-mock.yaml + +# does container 'app' of pod $1 have an env entry named $2 ? (spec-level only) +has_env() { + kubectl get pod "$1" -o jsonpath="{.spec.containers[?(@.name=='app')].env[*].name}" \ + 2>/dev/null | tr ' ' '\n' | grep -qx "$2" +} + +# the webhook mutates at admission; poll briefly for the spec to appear +log "checking the webhook wired the contract onto the leader (qpu) container" +for i in $(seq 1 15); do has_env gangenv-leader FLUENCE_ROLE && break; sleep 2; done + +for v in FLUENCE_ROLE FLUENCE_POD_UID PYTHONPATH FLUXION_BACKEND; do + has_env gangenv-leader "$v" \ + || { kubectl get pod gangenv-leader -o yaml | sed -n '/containers:/,/status:/p'; \ + fail "leader container missing env '$v' (webhook did not inject the contract)"; } + log " leader has env: $v" +done + +# the worker carries FLUENCE_ROLE so gang.py selects 'worker' by contract, not luck +has_env gangenv-worker-0 FLUENCE_ROLE \ + || fail "worker container missing FLUENCE_ROLE (gang.py would default to worker by luck)" +log " worker has env: FLUENCE_ROLE" + +# and the role VALUE on the spec is correct per pod (downward-API ref to the +# role annotation, or a literal — either way the resolved fieldRef/value must +# encode leader vs worker). Assert the annotation the ref reads is right. 
+lr="$(kubectl get pod gangenv-leader -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/role}')" +wr="$(kubectl get pod gangenv-worker-0 -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/role}')" +[ "$lr" = "leader" ] || fail "leader role annotation=$lr, want leader" +[ "$wr" = "worker" ] || fail "worker role annotation=$wr, want worker" +log " role annotations correct (leader=$lr worker=$wr)" + +log "PASS 8: webhook injects the gang env contract at admission" + +kubectl delete -f examples/test/e2e/quantum/gang-env-mock.yaml --wait=false || true +for g in gangenv gangenv-workers; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app=gangenv --timeout=60s 2>/dev/null || true From 10dd846593f8bb53b549c2ffcc0c71d6167f97a7 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 27 Jun 2026 06:02:42 -0700 Subject: [PATCH 3/3] feat: refactor interface to submitter pod I do not like the leader/worker design. Instead, let's move the sidecar logic into the handler (quantum specific) and allow for an entire group/gang (e.g., a minicluster/job/deployment or similar abstraction) to be the entire group ungated. We could not do this previously when trying to split apart pods into leader/worker, as they are derived from a common template. 
Signed-off-by: vsoch --- cmd/webhook/main.go | 1 - deploy/fluence-pull-test.yaml | 6 +- deploy/fluence-test.yaml | 6 +- deploy/fluence.yaml | 4 +- examples/quantum-pod.yaml | 4 +- .../test/e2e/quantum/quantum-gang-pods.yaml | 49 ++ .../test/e2e/quantum/quantum-split-pods.yaml | 54 -- .../test/e2e/quantum/sidecar-mock-pods.yaml | 63 -- pkg/fluence/fluence.go | 96 ++- pkg/webhook/handler.go | 19 +- pkg/webhook/handlers/dependency.go | 131 ++++ pkg/webhook/handlers/gang.go | 8 +- .../{gang_mincount_test.go => gang_test.go} | 15 +- pkg/webhook/handlers/handlers_test.go | 239 +----- pkg/webhook/handlers/leader.go | 65 -- pkg/webhook/handlers/quantum.go | 702 +++++++++++++----- pkg/webhook/handlers/quantum_split_test.go | 117 --- pkg/webhook/handlers/quantum_test.go | 448 +++++++++++ pkg/webhook/handlers/sidecar.go | 13 +- pkg/webhook/webhook.go | 235 +----- pkg/webhook/webhook_test.go | 46 -- python/fluence/providers/base.py | 4 +- python/fluence/providers/braket.py | 20 +- python/fluence/sidecar.py | 57 +- python/fluence/ungate.py | 31 +- test/e2e/gang/01-classical-gang.sh | 2 +- test/e2e/gang/02-postfilter-rematch.sh | 5 + test/e2e/gang/03-multi-gang.sh | 18 +- test/e2e/lib.sh | 2 +- test/e2e/quantum/01-quantum-placement.sh | 21 +- test/e2e/quantum/02-sidecar-ungate.sh | 128 ++-- test/e2e/quantum/03-gang-submitter.sh | 63 ++ test/e2e/quantum/03-quantum-split.sh | 54 -- test/e2e/quantum/04-gang-env-contract.sh | 95 ++- test/e2e/quantum/setup.sh | 54 ++ 35 files changed, 1563 insertions(+), 1312 deletions(-) create mode 100644 examples/test/e2e/quantum/quantum-gang-pods.yaml delete mode 100644 examples/test/e2e/quantum/quantum-split-pods.yaml delete mode 100644 examples/test/e2e/quantum/sidecar-mock-pods.yaml create mode 100644 pkg/webhook/handlers/dependency.go rename pkg/webhook/handlers/{gang_mincount_test.go => gang_test.go} (90%) delete mode 100644 pkg/webhook/handlers/leader.go delete mode 100644 pkg/webhook/handlers/quantum_split_test.go create mode 100644 
pkg/webhook/handlers/quantum_test.go create mode 100644 test/e2e/quantum/03-gang-submitter.sh delete mode 100755 test/e2e/quantum/03-quantum-split.sh create mode 100644 test/e2e/quantum/setup.sh diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index 4169e8a..1a6709d 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -112,7 +112,6 @@ func main() { mutator := &webhook.Mutator{ AttributeKeys: attrKeys, Clientset: client, - SidecarImage: env("FLUENCE_SIDECAR_IMAGE", ""), } log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames()) diff --git a/deploy/fluence-pull-test.yaml b/deploy/fluence-pull-test.yaml index 71dc03e..94c2425 100644 --- a/deploy/fluence-pull-test.yaml +++ b/deploy/fluence-pull-test.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] @@ -281,4 +283,4 @@ metadata: value: 1000000 globalDefault: false preemptionPolicy: PreemptLowerPriority -description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." \ No newline at end of file +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml index 48e95a4..ab61a91 100644 --- a/deploy/fluence-test.yaml +++ b/deploy/fluence-test.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. 
- apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] @@ -281,4 +283,4 @@ metadata: value: 1000000 globalDefault: false preemptionPolicy: PreemptLowerPriority -description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." \ No newline at end of file +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." diff --git a/deploy/fluence.yaml b/deploy/fluence.yaml index b856268..7d71386 100644 --- a/deploy/fluence.yaml +++ b/deploy/fluence.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] diff --git a/examples/quantum-pod.yaml b/examples/quantum-pod.yaml index a619df9..b5dfbc9 100644 --- a/examples/quantum-pod.yaml +++ b/examples/quantum-pod.yaml @@ -2,7 +2,7 @@ # via resources (the fluence device plugin advertises fluxion.flux-framework.org/qpu # on every node, so NodeResourcesFit is satisfied). 
Fluence's PreFilter matches # the request against the resource graph and picks a backend, the webhook injects -# QRMI_BACKEND (the allocated backend) automatically, and note we can add other +# FLUXION_BACKEND (the allocated backend) automatically, and note we can add other # envars here in the future. I chose a webhook because I think this is going to # be a requirement, and the pod is immutable after creation. # Then the container submits via qrmi-go (the separate qrmi-sampler image). @@ -27,4 +27,4 @@ spec: requests: fluxion.flux-framework.org/qpu: "1" limits: - fluxion.flux-framework.org/qpu: "1" \ No newline at end of file + fluxion.flux-framework.org/qpu: "1" diff --git a/examples/test/e2e/quantum/quantum-gang-pods.yaml b/examples/test/e2e/quantum/quantum-gang-pods.yaml new file mode 100644 index 0000000..b345398 --- /dev/null +++ b/examples/test/e2e/quantum/quantum-gang-pods.yaml @@ -0,0 +1,49 @@ +# Gang + submitter quantum workload for the e2e (no leader/worker). +# +# Two pods, identical, both requesting the quantum resource, in group "qgang". +# The user authors NO roles and NO submitter — the webhook treats this as a gang +# of full size N=2 (group-size makes N deterministic for raw pods, which have no +# owning Job/Deployment to derive it from), gates every pod, and ADDITIONALLY +# creates the one-off submitter pod "qgang-submitter" (its own group-of-one) that +# runs the real submit and ungates the gang. busybox stands in for the quantum +# app; the interceptor staging fails soft (no python), which is fine for the +# structural assertions in 02/03/04. 
+apiVersion: v1 +kind: Pod +metadata: + name: qgang-0 + labels: + app: qgang + fluence.flux-framework.org/group: qgang + annotations: + fluence.flux-framework.org/group-size: "2" +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh", "-c", "echo gang member; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} +--- +apiVersion: v1 +kind: Pod +metadata: + name: qgang-1 + labels: + app: qgang + fluence.flux-framework.org/group: qgang + annotations: + fluence.flux-framework.org/group-size: "2" +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh", "-c", "echo gang member; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} \ No newline at end of file diff --git a/examples/test/e2e/quantum/quantum-split-pods.yaml b/examples/test/e2e/quantum/quantum-split-pods.yaml deleted file mode 100644 index 8dd0026..0000000 --- a/examples/test/e2e/quantum/quantum-split-pods.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Heterogeneous quantum gang for the two-group split e2e. Leader requests qpu and -# is role=leader; two workers are role=worker. group-size on the leader makes the -# leader group minCount deterministic (1); the worker group is derived as N-1. -# (Owner-derived N from an indexed Job is exercised by the unit tests; a raw-pod -# mock has no owning Job, so we make the split observable via explicit roles.) 
-apiVersion: v1 -kind: Pod -metadata: - name: qsplit-leader - labels: - app: qsplit - fluence.flux-framework.org/group: qsplit - annotations: - fluence.flux-framework.org/role: leader -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: app - image: busybox - command: ["sh","-c","echo leader; sleep 600"] - resources: - requests: {fluxion.flux-framework.org/qpu: "1"} - limits: {fluxion.flux-framework.org/qpu: "1"} ---- -apiVersion: v1 -kind: Pod -metadata: - name: qsplit-worker-0 - labels: {app: qsplit, fluence.flux-framework.org/group: qsplit} - annotations: {fluence.flux-framework.org/role: worker, fluence.flux-framework.org/group-size: "2"} -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: w - image: busybox - command: ["sh","-c","echo worker; sleep 30"] - resources: {requests: {cpu: "1"}} ---- -apiVersion: v1 -kind: Pod -metadata: - name: qsplit-worker-1 - labels: {app: qsplit, fluence.flux-framework.org/group: qsplit} - annotations: {fluence.flux-framework.org/role: worker} -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: w - image: busybox - command: ["sh","-c","echo worker; sleep 30"] - resources: {requests: {cpu: "1"}} diff --git a/examples/test/e2e/quantum/sidecar-mock-pods.yaml b/examples/test/e2e/quantum/sidecar-mock-pods.yaml deleted file mode 100644 index e3c86bb..0000000 --- a/examples/test/e2e/quantum/sidecar-mock-pods.yaml +++ /dev/null @@ -1,63 +0,0 @@ ---- -# Leader pod — first admitted, webhook creates PodGroup, injects sidecar, creates RBAC -# User only needs schedulerName: fluence and the quantum-group label. -# No PodGroup object needed — Fluence creates it. 
-apiVersion: v1 -kind: Pod -metadata: - name: sidecar-test-leader - labels: - app: fluence-sidecar-test - fluence.flux-framework.org/group: sidecar-test-group -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: mock-quantum-app - image: busybox - command: - - sh - - -c - - | - echo "mock-quantum-app: running" - echo "arn:aws:braket:us-east-1:123456:quantum-task/mock-abc123" \ - > /tmp/task-arn - echo "mock-quantum-app: task ARN written" - sleep 3600 - resources: - requests: - fluxion.flux-framework.org/qpu: "1" - limits: - fluxion.flux-framework.org/qpu: "1" - ---- -# Worker pod — classical (no QPU). Gated by the webhook because it is a -# non-leader member of a group whose leader is a quantum pod. -apiVersion: v1 -kind: Pod -metadata: - name: sidecar-test-worker - labels: - app: fluence-sidecar-test - fluence.flux-framework.org/group: sidecar-test-group -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: classical-worker - image: busybox - command: - - sh - - -c - - | - echo "classical-worker: started" - echo "TASK_ARN=$BRAKET_TASK_ARN" - sleep 10 - env: - - name: FLUENCE_QUANTUM_JOB_ID - valueFrom: - fieldRef: - fieldPath: metadata.annotations['fluence.flux-framework.org/quantum-job-id'] - resources: - requests: - cpu: "1" \ No newline at end of file diff --git a/pkg/fluence/fluence.go b/pkg/fluence/fluence.go index c7fca16..fd3b080 100644 --- a/pkg/fluence/fluence.go +++ b/pkg/fluence/fluence.go @@ -93,6 +93,7 @@ var ( _ fwk.PreFilterPlugin = (*Fluence)(nil) _ fwk.FilterPlugin = (*Fluence)(nil) _ fwk.PostFilterPlugin = (*Fluence)(nil) + _ fwk.ReservePlugin = (*Fluence)(nil) _ fwk.PreBindPlugin = (*Fluence)(nil) ) @@ -560,12 +561,59 @@ func (f *Fluence) PreBindPreFlight( return nil, fwk.NewStatus(fwk.Success) } +// Reserve stamps the chosen backend (and matched attributes) onto the pod as +// early as possible — at reservation, in the scheduling cycle — rather than in +// PreBind. 
The webhook injects FLUXION_BACKEND (and FLUXION_<ATTR>) as a +// downward-API env sourced from these annotations; downward-API env is resolved +// by the kubelet when the container starts and is NOT updated afterward, so the +// annotation must be present well before the container starts. PreBind runs in +// the (asynchronous) binding cycle, milliseconds before Bind, which races the +// kubelet — Reserve runs earlier and synchronously, giving the annotation time +// to propagate so the value reliably surfaces in the container. +func (f *Fluence) Reserve( + ctx context.Context, + state fwk.CycleState, + pod *corev1.Pod, + nodeName string, +) *fwk.Status { + if err := f.stampBackend(ctx, pod); err != nil { + return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) + } + return fwk.NewStatus(fwk.Success) +} + +// Unreserve is a no-op: a stale backend annotation from a reservation that was +// later rejected is harmless (it is overwritten on the next attempt and the +// value is correct for the allocation that produced it), and clearing it would +// cost an extra API call. Required to satisfy fwk.ReservePlugin. +func (f *Fluence) Unreserve(ctx context.Context, state fwk.CycleState, pod *corev1.Pod, nodeName string) { +} + +// stampBackend writes the allocated backend name and matched attributes onto the +// pod (idempotent merge patch). No-op when there is no cached allocation or the +// allocation carries no backend (classical, non-quantum gangs).
+func (f *Fluence) stampBackend(ctx context.Context, pod *corev1.Pod) error { + f.mu.Lock() + alloc, ok := f.placement[groupKey(pod)] + f.mu.Unlock() + if !ok || alloc.place.Backend == "" { + return nil + } + ann := map[string]string{placement.BackendAnnotation: alloc.place.Backend} + for k, v := range alloc.place.BackendAttributes { + ann[placement.AttributeAnnotationPrefix+k] = v + } + log.Printf("[fluence] group %s -> backend %q attrs %v (reserve-stamped, nodes %v)", + groupKey(pod), alloc.place.Backend, alloc.place.BackendAttributes, alloc.place.Nodes) + return f.patchPodAnnotations(ctx, pod.Namespace, pod.Name, ann) +} + // PreBind records, in the commit phase, the durable state for this group: -// - the Fluxion jobid onto the owning object (the PodGroup for a gang, else the -// pod) so the allocation can be cancelled when that object is deleted; -// - for a quantum group, the allocated backend onto the pod, which the webhook- -// injected downward-API env surfaces as QRMI_BACKEND (container env is -// immutable post-creation, so the value must travel via an annotation). +// the Fluxion jobid onto the owning object (the PodGroup for a gang, else the +// pod) so the allocation can be cancelled when that object is deleted. The +// backend annotation is stamped earlier, in Reserve (see stampBackend), because +// the webhook-injected downward-API env (FLUXION_BACKEND) must be present before +// the container starts; PreBind is too late and races the kubelet. func (f *Fluence) PreBind( ctx context.Context, state fwk.CycleState, @@ -582,20 +630,10 @@ func (f *Fluence) PreBind( if err := f.recordJobIDs(ctx, pod, alloc.jobids); err != nil { return fwk.AsStatus(fmt.Errorf("record jobids: %w", err)) } - if alloc.place.Backend != "" { - // Stamp the backend name and all matched attributes in one patch. The - // webhook injects a normalized env per annotation so the workload reads - // exactly what it matched (backend + region/qubits/...). 
- ann := map[string]string{placement.BackendAnnotation: alloc.place.Backend} - for k, v := range alloc.place.BackendAttributes { - ann[placement.AttributeAnnotationPrefix+k] = v - } - log.Printf("[fluence] group %s -> backend %q attrs %v (nodes %v, jobids %v)", - groupKey(pod), alloc.place.Backend, alloc.place.BackendAttributes, - alloc.place.Nodes, alloc.jobids) - if err := f.patchPodAnnotations(ctx, pod.Namespace, pod.Name, ann); err != nil { - return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) - } + // Backstop: if Reserve was skipped for any reason, ensure the backend is + // stamped before bind anyway (idempotent). + if err := f.stampBackend(ctx, pod); err != nil { + return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) } return fwk.NewStatus(fwk.Success) } @@ -789,6 +827,20 @@ func (f *Fluence) reconcileGroup(ctx context.Context, namespace, group string) { } log.Printf("fluence: reconciled completed gang %s/%s — deleted Fluence-created PodGroup, allocation freed", namespace, group) + + // Gang+submitter cleanup: the one-off quantum submitter pod and its + // group-of-one PodGroup (<group>-submitter) are not owned by the user's + // workload, so reap them alongside the gang. The submitter pod also carries + // an ownerReference to this gang PodGroup (so its deletion cascades via GC); + // this explicit delete is the backstop and also removes the submitter's own + // PodGroup. Skip when this group is itself a submitter group, to avoid + // recursing on <group>-submitter-submitter.
+ if !strings.HasSuffix(group, submitterGroupSuffix) { + sg := group + submitterGroupSuffix + _ = f.handle.ClientSet().SchedulingV1alpha2().PodGroups(namespace).Delete(ctx, sg, metav1.DeleteOptions{}) + _ = f.handle.ClientSet().CoreV1().Pods(namespace).Delete(ctx, sg, metav1.DeleteOptions{}) + log.Printf("fluence: reaped submitter %s/%s for gang %s", namespace, sg, group) + } } // reconcileGraceForEmpty is how long a Fluence-created PodGroup with no live @@ -800,6 +852,12 @@ const reconcileGraceForEmpty = 2 * time.Minute // package (the scheduler must not depend on the webhook). Kept in sync with it. const webhookGroupLabel = "fluence.flux-framework.org/group" +// submitterGroupSuffix mirrors handlers.SubmitterGroupSuffix: the one-off quantum +// submitter for gang <group> is named <group>-submitter (both the pod and its PodGroup). +// Duplicated here to avoid importing the webhook handlers package into the +// scheduler plugin; keep the two in sync. +const submitterGroupSuffix = "-submitter" + // onPodGroupDeleted frees the gang's allocation when its PodGroup is deleted. func (f *Fluence) onPodGroupDeleted(obj interface{}) { pg, ok := obj.(*schedv1a2.PodGroup) diff --git a/pkg/webhook/handler.go b/pkg/webhook/handler.go index ddf1c84..61b97b1 100644 --- a/pkg/webhook/handler.go +++ b/pkg/webhook/handler.go @@ -27,25 +27,16 @@ type MutatorAPI interface { // EnsurePodGroup creates the group's PodGroup with the given gang minCount if // it does not already exist (idempotent). Group identity is the opaque value - // of the group label. Leader election is NOT here — it is a leader/worker - // concern owned by the handlers that need it (see handlers/leader.go). - EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string, minCount int32) - - // Sidecar staging primitives.
These remain on the core because the default - // Sidecar implementation (coreSidecar) delegates to them, but handlers do - // NOT use them directly — they go through the handlers.Sidecar interface, - // which is the customization seam. Kept here (not removed) so the concrete - // *Mutator continues to satisfy both this interface and coreSidecar's needs. - EnsureSidecarRBAC(ctx context.Context, namespace string) - InterceptorOps(pod *corev1.Pod) []spec.Op - SidecarContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op + // of the group label. creatorPod is recorded only as the PodGroup's creator + // reference; the core ascribes no role semantics to it. + EnsurePodGroup(ctx context.Context, namespace, group, creatorPod string, minCount int32) } // Handler inspects a pod and, when it applies, contributes JSON patch ops. A pod // flows through every registered handler whose Applies returns true; their ops // are concatenated. Applies is fully general — it receives the pod and the -// MutatorAPI, so a handler may consult cluster state (e.g. resolve a group's -// leader) in deciding whether it applies. +// MutatorAPI, so a handler may consult cluster state in deciding whether it +// applies. type Handler interface { Name() string Applies(ctx context.Context, m MutatorAPI, pod *corev1.Pod) bool diff --git a/pkg/webhook/handlers/dependency.go b/pkg/webhook/handlers/dependency.go new file mode 100644 index 0000000..d25d598 --- /dev/null +++ b/pkg/webhook/handlers/dependency.go @@ -0,0 +1,131 @@ +package handlers + +import ( + "github.com/converged-computing/fluence/pkg/webhook/spec" + + corev1 "k8s.io/api/core/v1" +) + +// Dependency is Fluence's GENERAL "this set of pods must wait for a producer to +// be ready" primitive. 
It is deliberately NOT quantum-specific: quantum is the +// first resource type to use it (a gang waits for a quantum submission to reach +// the device queue), but the same primitive applies to any resource type whose +// readiness is produced out-of-band — a license server, a data stage-in job, a +// warmed cache, another gang, etc. +// +// A Dependency has three parts, each carried as a pod annotation so the +// relationship lives at the GROUP level (not duplicated as bespoke per-resource +// fields) and is readable by both the webhook (at admission) and the scheduler +// (in its reconcile loop): +// +// - Kind: what KIND of readiness this is (the resource type's name). The +// producer side knows how to satisfy this kind; the consumer side +// only knows it must wait. Quantum's kind is "quantum-submit". +// - Producer: the identity of the thing that will signal ready. For quantum it +// is the submitter's (base) group; generally it is whatever the +// kind's handler records as the satisfier. +// - Gate: the scheduling gate held on the dependent (consumer) pods until +// the producer signals ready. Removing the gate is the "ungate" +// and is performed by whatever observes the producer's readiness +// (the quantum sidecar for kind=quantum-submit; the scheduler's +// reconcile loop for kinds whose readiness is in-cluster, e.g. +// "another gang is Running"). +// +// The webhook PRODUCES a Dependency (gates the consumers, stamps the +// annotations); REMOVING the gate is owned by the observer best placed to see +// the producer's readiness. That split — declare here, observe elsewhere — is +// what keeps the primitive general: a new resource type adds a Kind and an +// observer and reuses the gating/annotation machinery unchanged. +type Dependency struct { + Kind string // resource-type readiness kind, e.g. "quantum-submit" + Producer string // identity of the readiness producer (e.g. 
the base group) + Gate string // scheduling gate held on dependents until ready +} + +// Dependency annotation keys (stamped on the dependent pods). Generic — no +// quantum in the names, so any resource type reuses them. +const ( + // DependsOnKindAnnotation names the readiness kind the dependent waits for. + DependsOnKindAnnotation = "fluence.flux-framework.org/depends-on-kind" + // DependsOnProducerAnnotation names the producer expected to signal ready. + DependsOnProducerAnnotation = "fluence.flux-framework.org/depends-on-producer" + // DependsOnGateAnnotation records which scheduling gate encodes the wait, so + // an observer knows exactly which gate to remove when the producer is ready. + DependsOnGateAnnotation = "fluence.flux-framework.org/depends-on-gate" +) + +// applyOps gates the dependent pod and stamps the dependency annotations so the +// relationship is self-describing on the pod. It reuses the gate machinery +// (gateWithName) verbatim — the gate is the universal "held until ready" +// mechanism regardless of resource type — so a new Kind costs only its readiness +// observer, not new gating code. +func (d Dependency) applyOps(pod *corev1.Pod) []spec.Op { + ops := gateWithName(pod, d.Gate) + ops = append(ops, annotateOp(pod, DependsOnKindAnnotation, d.Kind)...) + ops = append(ops, annotateOp(pod, DependsOnProducerAnnotation, d.Producer)...) + ops = append(ops, annotateOp(pod, DependsOnGateAnnotation, d.Gate)...) + return ops +} + +// DependencyOf reads a dependent pod's declared Dependency, or ok=false if it +// carries none. The scheduler's reconcile loop and the sidecar use this to learn +// what a gated pod is waiting for without hardcoding a kind. 
+func DependencyOf(pod *corev1.Pod) (Dependency, bool) { + kind := spec.Annotation(pod, DependsOnKindAnnotation) + if kind == "" { + return Dependency{}, false + } + return Dependency{ + Kind: kind, + Producer: spec.Annotation(pod, DependsOnProducerAnnotation), + Gate: spec.Annotation(pod, DependsOnGateAnnotation), + }, true +} + +// annotateOp adds a single metadata annotation (creating the annotations map if +// the pod has none). The key is JSON-Pointer-escaped so slashes are handled. +func annotateOp(pod *corev1.Pod, key, value string) []spec.Op { + if value == "" { + return nil + } + if pod.Annotations == nil { + return []spec.Op{{ + Op: "add", + Path: "/metadata/annotations", + Value: map[string]string{key: value}, + }} + } + return []spec.Op{{ + Op: "add", + Path: "/metadata/annotations/" + escapeJSONPointer(key), + Value: value, + }} +} + +// gateWithName adds a named scheduling gate (idempotent) and raises priority for +// the held pod, generalizing the quantum gating to ANY gate name so the +// dependency primitive is not tied to the quantum gate. +func gateWithName(pod *corev1.Pod, gateName string) []spec.Op { + for _, g := range pod.Spec.SchedulingGates { + if g.Name == gateName { + return nil + } + } + var ops []spec.Op + gate := corev1.PodSchedulingGate{Name: gateName} + if len(pod.Spec.SchedulingGates) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}) + } + // Gated dependents schedule reliably once ungated only if they outrank other + // pending work; priorityClassName is immutable post-creation so it must be + // set now. Don't override a user's explicit class. spec.priority is cleared + // to null so the priority admission controller recomputes it from the class + // (add-null is valid whether the field is absent, 0, or set). 
+ if pod.Spec.PriorityClassName == "" { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/priorityClassName", Value: QuantumClassicalPriorityClass}) + ops = append(ops, spec.Op{Op: "add", Path: "/spec/priority", EmitNull: true}) + } + return ops +} diff --git a/pkg/webhook/handlers/gang.go b/pkg/webhook/handlers/gang.go index b1db6e3..0469c11 100644 --- a/pkg/webhook/handlers/gang.go +++ b/pkg/webhook/handlers/gang.go @@ -17,7 +17,7 @@ func init() { } // gangHandler gang-schedules pods that carry the group label: it creates a -// Fluence-owned PodGroup (first pod admitted becomes the recorded leader) and +// Fluence-owned PodGroup and // links every pod to it via spec.schedulingGroup.podGroupName, which is the // field the scheduler gangs by. It knows nothing about quantum — a purely // classical gang is fully handled here, with no sidecar. @@ -34,8 +34,8 @@ func (h *gangHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *cor // Ensure the group's PodGroup exists with the resolved gang size, and link // this pod to it. EnsurePodGroup is idempotent (no-ops if the PodGroup // already exists — e.g. created by an earlier, more specific handler), so we - // call it unconditionally. The gang handler knows nothing about leaders or - // roles; that is a leader/worker concern handled by the quantum handler. + // call it unconditionally. The gang handler knows nothing about quantum or + // submitters; that is the quantum handler's concern. // minCount = full gang size N (group-size annotation, else owner-derived); // see resolveMinCount. m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, resolveMinCount(ctx, m, pod)) @@ -52,7 +52,7 @@ func (h *gangHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *cor // 3. otherwise default to 1, logged — never silently size a multi-pod gang to 1. // // The leader/worker (quantum) split is orthogonal and unchanged: it is driven by -// RoleAnnotation / QuantumResource in the quantum handler. 
minCount is always the +// QuantumResource in the quantum handler. minCount is always the // FULL gang N regardless of which pods get gated. func resolveMinCount(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { // 1. explicit override diff --git a/pkg/webhook/handlers/gang_mincount_test.go b/pkg/webhook/handlers/gang_test.go similarity index 90% rename from pkg/webhook/handlers/gang_mincount_test.go rename to pkg/webhook/handlers/gang_test.go index 77f7f46..ac027f8 100644 --- a/pkg/webhook/handlers/gang_mincount_test.go +++ b/pkg/webhook/handlers/gang_test.go @@ -136,19 +136,18 @@ func TestGangMinCountDefaultsToOne(t *testing.T) { } } -// Quantum distinction: a gang of full size N=4 that ALSO carries -// expected-workers=3 (the N-1 workers the sidecar ungates) must still get -// minCount=4 (the whole gang), NOT 3. minCount comes from group-size, not -// expected-workers. -func TestGangMinCountHonorsFullNWithQuantumSplit(t *testing.T) { +// group-size is the authoritative gang minCount: a workload that sets it to N +// gets minCount=N (the whole gang schedules atomically), regardless of any owner +// replica count. In the gang+submitter model the full workload IS the gang — +// there is no N-1 worker split. 
+func TestGangMinCountHonorsGroupSize(t *testing.T) { pod := cpuPod("fluence") pod.Namespace = "default" pod.Labels = map[string]string{webhook.GroupLabel: "q-gang"} pod.Annotations = map[string]string{ - webhook.GroupSizeAnnotation: "4", // full N (leader + workers) - webhook.ExpectedWorkersAnnotation: "3", // N-1 workers to ungate + webhook.GroupSizeAnnotation: "4", // full gang size } if got := minCountOf(t, pod); got != 4 { - t.Errorf("quantum gang: minCount=%d, want 4 (full N, not N-1)", got) + t.Errorf("group-size gang: minCount=%d, want 4 (full N)", got) } } diff --git a/pkg/webhook/handlers/handlers_test.go b/pkg/webhook/handlers/handlers_test.go index 1322ec2..4931a8a 100644 --- a/pkg/webhook/handlers/handlers_test.go +++ b/pkg/webhook/handlers/handlers_test.go @@ -9,10 +9,7 @@ import ( "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" - schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" ) // ── fixtures ──────────────────────────────────────────────────────────────────── @@ -87,12 +84,12 @@ func hasSidecarOp(ops []spec.Op) bool { for _, op := range ops { switch v := op.Value.(type) { case corev1.Container: - if v.Name == "fluence-sidecar" { + if v.Name == SidecarContainerName { return true } case []corev1.Container: for _, c := range v { - if c.Name == "fluence-sidecar" { + if c.Name == SidecarContainerName { return true } } @@ -127,238 +124,6 @@ func TestMutateSkipsNonFluxion(t *testing.T) { } } -// ── quantum handler: submitter ────────────────────────────────────────────────── - -func TestSingleQuantumGetsInterceptorNoSidecar(t *testing.T) { - m := &webhook.Mutator{AttributeKeys: []string{"region"}} - ops := m.Mutate(context.Background(), qpuPod("fluence")) - names := opEnvNames(ops) - if !contains(names, "FLUXION_BACKEND") { - t.Errorf("want FLUXION_BACKEND, got %v", names) - } - if 
!contains(names, "PYTHONPATH") || !contains(names, "FLUENCE_POD_UID") { - t.Errorf("want interceptor env (PYTHONPATH, FLUENCE_POD_UID), got %v", names) - } - if hasSidecarOp(ops) { - t.Error("standalone quantum pod should not get a sidecar") - } - if hasGateOp(ops) { - t.Error("standalone quantum pod should not be gated") - } -} - -func TestObserveLabelInjectsSidecar(t *testing.T) { - m := &webhook.Mutator{} - pod := qpuPod("fluence") - pod.Labels = map[string]string{ObserveLabel: "true"} - ops := m.Mutate(context.Background(), pod) - if !hasSidecarOp(ops) { - t.Error("observe-labeled quantum pod should get the sidecar") - } - if hasGateOp(ops) { - t.Error("observe-only pod should not be gated") - } -} - -// ── quantum handler: worker gating ────────────────────────────────────────────── - -func quantumGroupFixture(ns, group, leaderName string) *fake.Clientset { - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Name: group, Namespace: ns, - Annotations: map[string]string{LeaderAnnotation: leaderName}, - }, - } - leaderPod := qpuPod("fluence") - leaderPod.Name = leaderName - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - return fake.NewSimpleClientset(pg, leaderPod) -} - -func TestClassicalWorkerInQuantumGroupIsGated(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-leader" - m := &webhook.Mutator{Clientset: quantumGroupFixture(ns, group, leader)} - - worker := cpuPod("fluence") - worker.Name = "qaoa-worker-0" - worker.Namespace = ns - worker.Labels = map[string]string{webhook.GroupLabel: group} - - ops := m.Mutate(context.Background(), worker) - if !hasGateOp(ops) { - t.Errorf("classical worker in a quantum group should be gated; ops=%v", ops) - } - if hasSidecarOp(ops) { - t.Error("worker should not get a sidecar") - } -} - -func TestClassicalGangWorkerNotGated(t *testing.T) { - ns, group, leader := "default", "classical", "classical-leader" - pg := &schedulingv1alpha2.PodGroup{ - 
ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{LeaderAnnotation: leader}}, - } - leaderPod := cpuPod("fluence") - leaderPod.Name = leader - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg, leaderPod)} - - worker := cpuPod("fluence") - worker.Name = "classical-worker-0" - worker.Namespace = ns - worker.Labels = map[string]string{webhook.GroupLabel: group} - - if hasGateOp(m.Mutate(context.Background(), worker)) { - t.Error("worker in a classical gang must NOT be gated (would deadlock)") - } -} - -// Pod-template gang: every pod requests QPU; only the recorded leader gets the -// sidecar, the rest are gated workers (role by admission order, not request). -func TestPodTemplateGangSecondPodIsWorker(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-abc123" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{LeaderAnnotation: leader}}, - } - leaderPod := qpuPod("fluence") - leaderPod.Name = leader - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg, leaderPod)} - - second := qpuPod("fluence") // identical spec, requests QPU - second.Name = "qaoa-def456" - second.Namespace = ns - second.Labels = map[string]string{webhook.GroupLabel: group} - - ops := m.Mutate(context.Background(), second) - if !hasGateOp(ops) { - t.Error("second pod in a pod-template gang must be gated as a worker") - } - if hasSidecarOp(ops) { - t.Error("second pod must NOT get a sidecar (it is a worker)") - } -} - -// ── quantum handler: explicit role annotation ────────────────────────────────── -// -// These cover the fluence.flux-framework.org/role annotation, which makes the -// leader/worker split EXPLICIT rather than inferred by admission order. 
When the -// annotation is present it is authoritative; the same value is echoed to the -// container as FLUENCE_ROLE so the app reads the role Fluence used. - -// roledQPUPod is a QPU-requesting pod in a group with an explicit role. -func roledQPUPod(ns, group, name, role string) *corev1.Pod { - p := qpuPod("fluence") - p.Name = name - p.Namespace = ns - p.Labels = map[string]string{webhook.GroupLabel: group} - p.Annotations = map[string]string{webhook.RoleAnnotation: role} - return p -} - -// An explicitly-declared leader gets the sidecar and is NOT gated — even though -// no leader is recorded on the PodGroup (admission order never consulted). -func TestExplicitLeaderGetsSidecarNotGated(t *testing.T) { - ns, group := "default", "qaoa" - // fixture with NO LeaderAnnotation recorded — proves we don't rely on it. - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns}, - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - leader := roledQPUPod(ns, group, "qaoa-leader", RoleLeader) - ops := m.Mutate(context.Background(), leader) - if hasGateOp(ops) { - t.Error("explicit leader must NOT be gated") - } - if !hasSidecarOp(ops) { - t.Error("explicit leader must get the sidecar") - } - if !contains(opEnvNames(ops), "FLUENCE_ROLE") { - t.Error("leader must get FLUENCE_ROLE injected for the app to read") - } -} - -// An explicitly-declared worker is gated and gets no sidecar — even if it -// requests the QPU resource itself and even if it (wrongly) appears as the -// recorded leader. The annotation overrides both. -func TestExplicitWorkerIsGatedRegardlessOfAdmission(t *testing.T) { - ns, group := "default", "qaoa" - // Adversarial fixture: record THIS worker's own name as the admission-order - // leader. The explicit role:worker must still win and gate it. 
- worker := roledQPUPod(ns, group, "qaoa-worker-0", RoleWorker) - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{LeaderAnnotation: worker.Name}}, - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - ops := m.Mutate(context.Background(), worker) - if !hasGateOp(ops) { - t.Error("explicit worker must be gated even if mis-recorded as the admission-order leader") - } - if hasSidecarOp(ops) { - t.Error("explicit worker must NOT get a sidecar") - } - if !contains(opEnvNames(ops), "FLUENCE_ROLE") { - t.Error("worker must get FLUENCE_ROLE injected so the app knows it is a worker") - } -} - -// A heterogeneous gang declared with explicit roles resolves to exactly one -// leader (sidecar, ungated) and the rest workers (gated) — independent of the -// order in which the webhook admits the pods. This is the property a -// leader/worker quantum gang needs and that admission order cannot guarantee. -func TestExplicitRolesResolveRegardlessOfOrder(t *testing.T) { - ns, group := "default", "qaoa" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns}, // no recorded leader - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - pods := []*corev1.Pod{ - roledQPUPod(ns, group, "w0", RoleWorker), - roledQPUPod(ns, group, "leader", RoleLeader), - roledQPUPod(ns, group, "w1", RoleWorker), - } - var leaders, workers int - for _, p := range pods { // any admission order - ops := m.Mutate(context.Background(), p) - switch { - case hasSidecarOp(ops) && !hasGateOp(ops): - leaders++ - case hasGateOp(ops) && !hasSidecarOp(ops): - workers++ - default: - t.Fatalf("pod %s resolved to neither a clean leader nor worker", p.Name) - } - } - if leaders != 1 || workers != 2 { - t.Fatalf("want 1 leader + 2 workers, got %d leaders / %d workers", leaders, workers) - } -} - -// Backwards compatibility: with NO role annotation, the leader is still 
chosen -// by admission order (the recorded PodGroup leader), exactly as before. -func TestNoRoleAnnotationFallsBackToAdmissionOrder(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-leader" - m := &webhook.Mutator{Clientset: quantumGroupFixture(ns, group, leader)} - - // a second pod with no role annotation, not the recorded leader -> worker - second := qpuPod("fluence") - second.Name = "qaoa-second" - second.Namespace = ns - second.Labels = map[string]string{webhook.GroupLabel: group} - if !hasGateOp(m.Mutate(context.Background(), second)) { - t.Error("without a role annotation, a non-leader group member must be gated by admission order") - } -} - // ── gang handler: scheduling group linkage ────────────────────────────────────── func TestGangStampsSchedulingGroup(t *testing.T) { diff --git a/pkg/webhook/handlers/leader.go b/pkg/webhook/handlers/leader.go deleted file mode 100644 index 7408204..0000000 --- a/pkg/webhook/handlers/leader.go +++ /dev/null @@ -1,65 +0,0 @@ -package handlers - -import ( - "context" - "fmt" - "time" - - "github.com/converged-computing/fluence/pkg/webhook" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" -) - -// Leader election is a LEADER/WORKER concern, not a core gang concern, so it -// lives with the handlers that need it (quantum) rather than on the webhook -// core's MutatorAPI. It records/reads the admission-order leader on the group's -// PodGroup via an annotation, used only when a workload does NOT declare an -// explicit role (RoleAnnotation). A purely classical gang never touches this. - -// LeaderAnnotation records the admission-order leader on a PodGroup. -const LeaderAnnotation = "fluence.flux-framework.org/leader" - -// podGroupLeader returns the recorded admission-order leader for the group, or -// "". Retries briefly to absorb the concurrent leader/worker admission race. 
-func podGroupLeader(ctx context.Context, m webhook.MutatorAPI, namespace, group string) string { - c := m.Client() - if c == nil || group == "" { - return "" - } - for i := 0; i < 3; i++ { - pg, err := c.SchedulingV1alpha2().PodGroups(namespace).Get(ctx, group, metav1.GetOptions{}) - if err != nil { - return "" - } - if pg.Annotations != nil && pg.Annotations[LeaderAnnotation] != "" { - return pg.Annotations[LeaderAnnotation] - } - if i < 2 { - time.Sleep(100 * time.Millisecond) - } - } - return "" -} - -// recordLeaderIfUnset records leaderPod as the group's admission-order leader if -// none is set yet. Best-effort; safe to call on every quantum admission. -func recordLeaderIfUnset(ctx context.Context, m webhook.MutatorAPI, namespace, group, leaderPod string) { - c := m.Client() - if c == nil || group == "" { - return - } - if podGroupLeader(ctx, m, namespace, group) != "" { - return - } - patch := fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, LeaderAnnotation, leaderPod) - if _, err := c.SchedulingV1alpha2().PodGroups(namespace).Patch( - ctx, group, types.MergePatchType, []byte(patch), metav1.PatchOptions{}); err != nil { - // best-effort; the explicit RoleAnnotation path does not need this - _ = err - } -} - -// leaderName is a tiny helper so callers read naturally. -func leaderName(pod *corev1.Pod) string { return pod.Name } diff --git a/pkg/webhook/handlers/quantum.go b/pkg/webhook/handlers/quantum.go index 8a0527b..47e1714 100644 --- a/pkg/webhook/handlers/quantum.go +++ b/pkg/webhook/handlers/quantum.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "os" "strconv" "strings" @@ -11,6 +12,8 @@ import ( "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -18,209 +21,300 @@ func init() { webhook.Register(&quantumHandler{}) } -// Quantum-specific policy. 
The webhook core knows NONE of these — they live -// only here, in the quantum handler. +// Quantum-specific policy. The webhook core knows NONE of these — they live only +// here, in the quantum handler. +// +// Model (no leader/worker): a workload requesting the quantum resource (Job, +// Deployment, or loose pods — the trigger is the resource, not the kind) becomes +// a GANG of full size N: one PodGroup, every pod fully gated and raised to a +// preempting priority, each staged with the interceptor in FAUX mode (the submit +// is a no-op). Fluence ALSO creates a separate one-off SUBMITTER pod — a +// group-of-one running the SAME application container plus the real sidecar — +// which submits the quantum task for real, tags it, stamps the resulting job-id +// onto the gang, and ungates the gang. There is no leader among the user's pods; +// the submitter is the only submitting pod and Fluence owns it. const ( - // QuantumResource is the Fluxion resource a pod requests when it wants - // Fluence to schedule quantum work. Requesting it is the trigger for sidecar - // + interceptor injection. + // QuantumResource is the Fluxion resource a pod requests to ask Fluence to + // schedule quantum work. Requesting it is the sole trigger for this handler. QuantumResource = "fluxion.flux-framework.org/qpu" - // QuantumGate holds a classical worker until the leader's quantum task is - // ready (the sidecar removes it). + // QuantumGate holds a gang pod unscheduled until the submitter's task is + // ready (the submitter's sidecar removes it). QuantumGate = "quantum.braket/ready" - // ObserveLabel opts a standalone quantum pod into observe-only telemetry: - // the sidecar is injected and polls queue position but ungates nothing. + // ObserveLabel opts a STANDALONE quantum pod (a group of one) into + // observe-only telemetry: the sidecar is injected and polls queue position + // but ungates nothing. 
ObserveLabel = "fluence.flux-framework.org/observe" - // Role values for webhook.RoleAnnotation. - RoleLeader = "leader" - RoleWorker = "worker" + // DependencyKindQuantumSubmit is the readiness Kind for the quantum resource + // type: gang pods wait for a quantum submission to reach the device queue. + // First concrete instance of the general Dependency primitive (dependency.go). + DependencyKindQuantumSubmit = "quantum-submit" + + // SubmitterAnnotation marks the Fluence-created submitter pod so its own + // admission is recognized (real sidecar, real submit, not gated) instead of + // being treated as another gang member. + SubmitterAnnotation = "fluence.flux-framework.org/submitter" + + // GangGroupAnnotation, set on the submitter at creation, names the gang group + // the submitter must ungate. Surfaced to its sidecar as FLUENCE_GANG_GROUP. + GangGroupAnnotation = "fluence.flux-framework.org/gang-group" - // WorkerGroupSuffix: a quantum gang of size N is split into TWO PodGroups — - // the leader keeps (minCount 1) and the workers move to - // -workers (minCount N-1, all gated). This suffix MUST match what the - // sidecar uses to discover workers (FLUENCE_WORKER_GROUP env, set on the - // leader's sidecar by the webhook). - WorkerGroupSuffix = "-workers" + // SubmitterGroupSuffix: the submitter is its own group-of-one named + // -submitter (a distinct PodGroup, minCount 1, so it schedules alone + // and never deadlocks against the gated gang). + SubmitterGroupSuffix = "-submitter" + + // GangGroupEnv tells the submitter's sidecar which gang group label to list + // and ungate when the task is ready. + GangGroupEnv = "FLUENCE_GANG_GROUP" ) -// quantumHandler coordinates quantum-classical workflows. 
It applies to a pod -// in either role: -// - the quantum submitter (requests QuantumResource): inject the interceptor, -// plus the sidecar when there is coordination to do (group leader, or -// observe-only telemetry requested); -// - a classical worker (a non-leader member of a group whose leader is a -// quantum pod): gate it until the leader's task is ready. -// -// This is the only place in the webhook that knows about quantum resources, -// gates, or observe semantics. +// quantumHandler creates, for a quantum workload, a fully-gated faux-submitting +// gang plus a one-off real submitter (see the package-level model comment). It +// is the only place in the webhook that knows about quantum resources, gates, +// submitters, or observe semantics. type quantumHandler struct{} func (h *quantumHandler) Name() string { return "quantum" } +// Applies to any pod requesting the quantum resource. Gang members run the same +// image as the submitter and request it; the submitter (a copy) requests it; a +// standalone quantum pod requests it. Nothing without the resource needs quantum +// handling, so this is the single, unambiguous trigger. func (h *quantumHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { - if spec.PodRequestsResource(pod, QuantumResource) { - return true - } - // An explicitly-declared worker applies (so it gets gated) even if it - // doesn't request the quantum resource and the leader isn't recorded yet — - // this removes the admission-order race for explicitly-roled gangs. - if webhook.Role(pod) == RoleWorker && webhook.GroupName(pod) != "" { - return true - } - return h.isWorkerOfQuantumGroup(ctx, m, pod) -} - -// isWorkerOfQuantumGroup reports whether pod is a non-leader member of a group -// whose recorded leader is a quantum (QuantumResource-requesting) pod. Workers -// are classical and do not request the resource themselves, so their role is a -// property of group membership, resolved against cluster state. 
-func (h *quantumHandler) isWorkerOfQuantumGroup(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { - g := webhook.GroupName(pod) - if g == "" || m.Client() == nil { - return false - } - leader := podGroupLeader(ctx, m, pod.Namespace, g) - if leader == "" || leader == pod.Name { - return false - } - lp, err := m.Client().CoreV1().Pods(pod.Namespace).Get(ctx, leader, metav1.GetOptions{}) - if err != nil { - return false - } - return spec.PodRequestsResource(lp, QuantumResource) + return spec.PodRequestsResource(pod, QuantumResource) } func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { - g := webhook.GroupName(pod) - - // Determine role. An explicit role annotation is AUTHORITATIVE: the workload - // declares which pod leads and which wait, and Fluence honors it directly — - // no admission-order race, and the same value is echoed to the app as - // FLUENCE_ROLE so the webhook's notion of leader and the application's notion - // cannot disagree. When the annotation is absent, fall back to the legacy - // behavior: role is decided by admission order (the first pod admitted in the - // group, recorded on the PodGroup by the gang handler). The admission-order - // path suits a homogeneous pod-template gang where every pod is identical; - // the explicit annotation suits a heterogeneous leader/worker gang. - role := webhook.Role(pod) - var isWorker bool - switch role { - case RoleWorker: - isWorker = true - case RoleLeader: - isWorker = false - default: - if g != "" { - leader := podGroupLeader(ctx, m, pod.Namespace, g) - isWorker = leader != "" && leader != pod.Name - } + // The Fluence-created submitter: real interceptor + real sidecar, its own + // group-of-one, NOT gated. Recognized by the marker set at creation. 
+ if spec.Annotation(pod, SubmitterAnnotation) == "true" { + return h.mutateSubmitter(ctx, m, pod) } - if g != "" && isWorker { - // Two-group split: workers live in -workers with minCount = N-1 - // (the leader is the other group, size 1). N is the full gang size from - // the owning Job. The worker is RE-LINKED from to the worker - // group, and the worker PodGroup is created (idempotent) with minCount - // N-1 so the worker set schedules atomically among itself. - wg := g + WorkerGroupSuffix - workerMin := workerCount(ctx, m, pod) // N-1: the worker subgroup schedules atomically among itself - m.EnsurePodGroup(ctx, pod.Namespace, wg, pod.Name, workerMin) - log.Printf("[fluence-webhook] quantum worker %s/%s (role=%q) — group %s minCount=%d, gating", - pod.Namespace, pod.Name, role, wg, workerMin) - ops := relinkGroupOps(pod, wg) // move label + schedulingGroup to -workers - ops = append(ops, gateOps(pod)...) - ops = append(ops, roleEnvOps(pod, RoleWorker)...) + g := resolveGroup(pod) + observe := spec.Label(pod, ObserveLabel) == "true" + n := resolveGangSize(ctx, m, pod, g) + + // Standalone quantum pod (a group of one): it performs its own real submit. + // No gang, no gating, no faux, no separate submitter. The sidecar is added + // only for observe-only telemetry. + if g == "" || n <= 1 { + ops := interceptorOps(pod) + if observe { + sc := sidecarFor(m) + sc.EnsureRBAC(ctx, pod.Namespace) + ops = append(ops, sc.ContainerOps(pod, true, nil)...) + } + log.Printf("[fluence-webhook] quantum standalone %s/%s (observe=%v)", pod.Namespace, pod.Name, observe) return ops } - // Submitter/leader role: recorded or declared group leader, or a standalone - // quantum pod. Always gets the interceptor (so its task is tagged). It gets - // the SIDECAR only when there is coordination to do: it is a group leader - // (workers to ungate), or observe-only telemetry is requested. 
- isLeader := g != "" - observe := spec.Label(pod, ObserveLabel) == "true" + // Gang member: full gang of N in one PodGroup, fully gated + preempting + // priority + faux interceptor. Fluence also ensures the one-off submitter + // (idempotent) that does the real submit and ungates this gang. + m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, n) + ensureSubmitterPod(ctx, m, pod, g) - log.Printf("[fluence-webhook] quantum pod %s/%s — interceptor (leader=%v role=%q observe=%v)", - pod.Namespace, pod.Name, isLeader, role, observe) + ops := linkGroupOps(pod, g) + // Express the wait as the GENERAL dependency primitive: this gang pod depends + // on the quantum submission produced by -submitter, held by the quantum + // gate. applyOps gates the pod, raises priority, and stamps depends-on-*. + dep := Dependency{Kind: DependencyKindQuantumSubmit, Producer: g + SubmitterGroupSuffix, Gate: QuantumGate} + ops = append(ops, dep.applyOps(pod)...) + // Same interceptor as the submitter, but FAUX mode so the gang pod never + // resubmits; it receives the real task id via FLUENCE_QUANTUM_JOB_ID. + ops = append(ops, interceptorOps(pod)...) + ops = append(ops, fauxSubmitEnvOps(pod)...) + log.Printf("[fluence-webhook] quantum gang member %s/%s — group %s minCount=%d, gated+faux", + pod.Namespace, pod.Name, g, n) + return ops +} - if isLeader { - // Leader is its own group of 1 (the workers are -workers). Create - // the leader PodGroup with minCount=1 so the last-running gang handler - // (which would otherwise parent-derive N) finds it already present and - // leaves it alone. Also record the admission-order leader so a worker - // admitted without an explicit role can resolve its role by membership. 
- m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, 1) - recordLeaderIfUnset(ctx, m, pod.Namespace, g, leaderName(pod)) +// mutateSubmitter wires the Fluence-created submitter pod: its own PodGroup of +// one, the real interceptor (tag mode), RBAC, and the sidecar container told +// which gang group to ungate (FLUENCE_GANG_GROUP). The submitter is never gated. +func (h *quantumHandler) mutateSubmitter(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { + sg := webhook.GroupName(pod) // the submitter's own group: -submitter + gang := spec.Annotation(pod, GangGroupAnnotation) + if sg != "" { + m.EnsurePodGroup(ctx, pod.Namespace, sg, pod.Name, 1) } sc := sidecarFor(m) ops := sc.InterceptorOps(pod) - ops = append(ops, roleEnvOps(pod, RoleLeader)...) - if isLeader || observe { - sc.EnsureRBAC(ctx, pod.Namespace) - // Leader/worker sidecar env is supplied HERE (the quantum handler owns the - // split), keeping the core domain-agnostic. FLUENCE_EXPECTED_WORKERS is - // copied verbatim from the expected-workers ANNOTATION: env var values - // cannot be computed-and-patched dynamically at admission, so the workload - // declares the count as an annotation and the webhook propagates it to the - // env var the sidecar reads — annotation and env var are the same value in - // two representations. - var extra []corev1.EnvVar - if isLeader { - if n := spec.Annotation(pod, webhook.ExpectedWorkersAnnotation); n != "" { - extra = append(extra, corev1.EnvVar{Name: "FLUENCE_EXPECTED_WORKERS", Value: n}) - } - extra = append(extra, corev1.EnvVar{Name: "FLUENCE_WORKER_GROUP_BASE", Value: g}) + sc.EnsureRBAC(ctx, pod.Namespace) + extra := []corev1.EnvVar{{Name: GangGroupEnv, Value: gang}} + ops = append(ops, sc.ContainerOps(pod, false, extra)...) 
+ log.Printf("[fluence-webhook] quantum submitter %s/%s — group %s (ungates gang %q)", + pod.Namespace, pod.Name, sg, gang) + return ops +} + +// resolveGroup returns the gang group identity: the explicit group label, else +// the owning controller's name (Job/ReplicaSet/StatefulSet — a Deployment's pods +// are owned by a ReplicaSet), else "" (a loose quantum pod with no group, which +// is treated as a standalone group of one). +func resolveGroup(pod *corev1.Pod) string { + if g := webhook.GroupName(pod); g != "" { + return g + } + for _, ref := range pod.OwnerReferences { + switch ref.Kind { + case "Job", "ReplicaSet", "StatefulSet": + return ref.Name } - ops = append(ops, sc.ContainerOps(pod, observe, extra)...) } - return ops + return "" } -// roleEnvOps injects FLUENCE_ROLE into every (non-sidecar) container so the -// application reads its gang role from the same source of truth the webhook -// used. effectiveRole is what the webhook decided (leader/worker), used only -// when the pod carries no explicit role annotation; when the annotation is -// present we source the value from it via the downward API so the two always -// agree. Unlike InterceptorOps, this is NOT limited to Fluxion-resource -// containers — worker containers do not request the quantum resource but still -// need to know they are workers. -func roleEnvOps(pod *corev1.Pod, effectiveRole string) []spec.Op { - var value corev1.EnvVar - if webhook.Role(pod) != "" { - value = spec.AnnotationEnv("FLUENCE_ROLE", webhook.RoleAnnotation) - } else { - value = corev1.EnvVar{Name: "FLUENCE_ROLE", Value: effectiveRole} +// resolveGangSize returns the full gang size N: the explicit group-size +// annotation (authoritative override), else the owner's replica count (Job +// parallelism/completions, ReplicaSet replicas), else a count of pods already +// carrying the group label (best-effort for loose grouped pods; admission-order +// dependent, which is why the annotation is preferred), else 1. 
+func resolveGangSize(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) int32 { + if pod.Annotations != nil { + if v, err := strconv.Atoi(pod.Annotations[webhook.GroupSizeAnnotation]); err == nil && v > 0 { + return int32(v) + } } - var ops []spec.Op - for i, c := range pod.Spec.Containers { - if c.Name == "fluence-sidecar" || spec.HasEnv(c, "FLUENCE_ROLE") { + if n := ownerJobN(ctx, m, pod); n > 0 { + return n + } + if n := ownerReplicaSetN(ctx, m, pod); n > 0 { + return n + } + if group != "" { + if n := countGroupPods(ctx, m, pod.Namespace, group); n > 0 { + return n + } + } + return 1 +} + +// ownerReplicaSetN returns the replica count of the ReplicaSet that owns the pod +// (the Deployment case: Deployment -> ReplicaSet -> Pod), or 0 if none. +func ownerReplicaSetN(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + c := m.Client() + if c == nil { + return 0 + } + for _, ref := range pod.OwnerReferences { + if ref.Kind != "ReplicaSet" { continue } - if len(c.Env) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{value}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: value}) + rs, err := c.AppsV1().ReplicaSets(pod.Namespace).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return 0 + } + if rs.Spec.Replicas != nil && *rs.Spec.Replicas > 0 { + return *rs.Spec.Replicas } - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, value) } - return ops + return 0 +} + +// countGroupPods counts pods already carrying the group label (best-effort gang +// size for loose grouped pods that have neither a group-size annotation nor an +// owning controller). Admission-order dependent — prefer the group-size +// annotation when the exact size must be guaranteed. 
+func countGroupPods(ctx context.Context, m webhook.MutatorAPI, namespace, group string) int32 { + c := m.Client() + if c == nil { + return 0 + } + list, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: webhook.GroupLabel + "=" + group, + }) + if err != nil { + return 0 + } + return int32(len(list.Items)) +} + +// SubmitterPodSuffix names the Fluence-created submitter for a group: +// -submitter. It also serves as the submitter's own PodGroup name. +const SubmitterPodSuffix = SubmitterGroupSuffix + +// ensureSubmitterPod creates the one-off quantum submitter pod for a group +// (idempotent create-if-absent — a client side-effect of admission, like +// EnsurePodGroup/EnsureSidecarRBAC; NOT a separate controller). It is built from +// the admitted gang pod so it runs the SAME application + credentials, is its own +// group-of-one (-submitter), is marked the submitter (so its admission +// gets the real sidecar and is not gated), and records the gang group it must +// ungate. An ownerReference to the gang's PodGroup cascades GC: when the gang +// PodGroup is deleted (gang completed/deleted), the submitter is collected too. +func ensureSubmitterPod(ctx context.Context, m webhook.MutatorAPI, gangPod *corev1.Pod, group string) { + c := m.Client() + if c == nil { + return + } + name := group + SubmitterGroupSuffix + if _, err := c.CoreV1().Pods(gangPod.Namespace).Get(ctx, name, metav1.GetOptions{}); err == nil { + return // already created (idempotent) + } + // Clean copy of the user's application: same containers (image, env, creds, + // the quantum resource request) and app volumes — none of the gang's gating + // or faux wiring. 
+ src := gangPod.DeepCopy() + submitter := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: gangPod.Namespace, + Labels: map[string]string{webhook.GroupLabel: name}, + Annotations: map[string]string{ + SubmitterAnnotation: "true", + GangGroupAnnotation: group, + }, + }, + Spec: corev1.PodSpec{ + SchedulerName: webhook.SchedulerName, + RestartPolicy: corev1.RestartPolicyNever, + Containers: src.Spec.Containers, + Volumes: src.Spec.Volumes, + }, + } + // Cascade GC: own the submitter by the gang's PodGroup (created moments ago by + // the caller). Best-effort — only set when the PodGroup UID is known (it is on + // a real cluster; the fake client in tests may leave it empty, in which case + // we skip the ref rather than emit an invalid one). + if pg, err := c.SchedulingV1alpha2().PodGroups(gangPod.Namespace).Get(ctx, group, metav1.GetOptions{}); err == nil && pg.UID != "" { + submitter.OwnerReferences = []metav1.OwnerReference{{ + APIVersion: "scheduling.k8s.io/v1alpha2", + Kind: "PodGroup", + Name: group, + UID: pg.UID, + }} + } + if _, err := c.CoreV1().Pods(gangPod.Namespace).Create(ctx, submitter, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] submitter pod %s/%s: %v", gangPod.Namespace, name, err) + } else { + log.Printf("[fluence-webhook] created submitter pod %s/%s for gang %s", gangPod.Namespace, name, group) + } } -// relinkGroupOps moves a worker pod into the worker group: it overwrites the -// group label and the schedulingGroup.podGroupName link to point at wg -// (-workers). This is what puts the worker into the size-(N-1) PodGroup -// instead of the leader's size-1 group. -func relinkGroupOps(pod *corev1.Pod, wg string) []spec.Op { +// linkGroupOps ensures the gang pod carries the group label (so the submitter's +// sidecar can list it) and is linked to the gang PodGroup via +// spec.schedulingGroup.podGroupName. Idempotent. 
+func linkGroupOps(pod *corev1.Pod, group string) []spec.Op { var ops []spec.Op - // label (the value the sidecar lists workers by) — escape "/" and "~" per JSON Pointer - labelPath := "/metadata/labels/" + escapeJSONPointer(webhook.GroupLabel) - ops = append(ops, spec.Op{Op: "add", Path: labelPath, Value: wg}) - // the native gang link - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGroup", - Value: map[string]string{"podGroupName": wg}}) + if webhook.GroupName(pod) != group { + if pod.Labels == nil { + ops = append(ops, spec.Op{Op: "add", Path: "/metadata/labels", + Value: map[string]string{webhook.GroupLabel: group}}) + } else { + ops = append(ops, spec.Op{Op: "add", + Path: "/metadata/labels/" + escapeJSONPointer(webhook.GroupLabel), + Value: group}) + } + } + if pod.Spec.SchedulingGroup == nil || pod.Spec.SchedulingGroup.PodGroupName == nil || + *pod.Spec.SchedulingGroup.PodGroupName != group { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGroup", + Value: map[string]string{"podGroupName": group}}) + } return ops } @@ -231,60 +325,256 @@ func escapeJSONPointer(s string) string { return s } -// workerCount returns N-1, the size of the worker subgroup in a quantum gang of -// full size N (N from the group-size annotation, else the owning Job's -// parallelism). Used for the worker PodGroup's gang minCount so the workers -// schedule atomically among themselves. (The sidecar's FLUENCE_EXPECTED_WORKERS -// is a SEPARATE value, copied from the expected-workers annotation — env vars -// cannot be patched dynamically, so the workload declares that count explicitly.) -// Minimum 1. 
-func workerCount(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { - n := int32(0) - if pod.Annotations != nil { - if v, err := strconv.Atoi(pod.Annotations[webhook.GroupSizeAnnotation]); err == nil && v > 0 { - n = int32(v) +const QuantumClassicalPriorityClass = "fluence-quantum-classical" + +// ── faux-submit (worker submit dedup) ─────────────────────────────────────────── +// +// Quantum-specific, and delivered through the SAME Python interceptor as the +// submitter — not a second mechanism. The submitter's interceptor tags the +// submit; the worker's interceptor (same staged code) no-ops the submit. Which +// behavior runs is selected at runtime by FLUENCE_FAUX_SUBMIT, set here on the +// worker. Workers run the submitter's image and may call submit, but by ungate +// time the task already exists, so resubmitting would duplicate it N times. + +const ( + // FauxSubmitEnv selects the interceptor's no-op (faux) mode on workers. + // install_interceptor (see python/fluence/providers/braket.py) reads it and + // patches the vendor submit to return the existing task instead of submitting. + FauxSubmitEnv = "FLUENCE_FAUX_SUBMIT" + + // QuantumJobIDAnnotation is the vendor-neutral task id the ungating sidecar + // stamps on each worker (mirrors python/fluence/ungate.py JOB_ID_ANNOTATION), + // BEFORE removing the gate. Surfaced into FLUENCE_QUANTUM_JOB_ID via the + // downward API so the faux interceptor can return a handle to that task. + QuantumJobIDAnnotation = "fluence.flux-framework.org/quantum-job-id" + + // QuantumJobIDEnv is the env the faux interceptor reads for the existing + // task's id. + QuantumJobIDEnv = "FLUENCE_QUANTUM_JOB_ID" +) + +// fauxSubmitEnvOps sets, on each non-sidecar worker container, the faux-mode +// marker (FLUENCE_FAUX_SUBMIT=true) and the existing task's id +// (FLUENCE_QUANTUM_JOB_ID, downward API from the annotation the ungating sidecar +// stamps). 
The interceptor is staged separately via the shared sidecar +// InterceptorOps path — these env vars only switch its mode and hand it the id. +func fauxSubmitEnvOps(pod *corev1.Pod) []spec.Op { + faux := corev1.EnvVar{Name: FauxSubmitEnv, Value: "true"} + jobID := spec.AnnotationEnv(QuantumJobIDEnv, QuantumJobIDAnnotation) + var ops []spec.Op + for i, c := range pod.Spec.Containers { + if c.Name == SidecarContainerName { + continue + } + if !spec.HasEnv(c, FauxSubmitEnv) { + if len(c.Env) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{faux}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{faux} + } else { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: faux}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, faux) + } + } + if !spec.HasEnv(c, QuantumJobIDEnv) { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: jobID}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, jobID) } } - if n == 0 { - n = ownerJobN(ctx, m, pod) + return ops +} + +// Sidecar implementation — quantum-owned, NOT core. +// +// The fluence coordination sidecar (its container, name, RBAC, image, and the +// Python interceptor staging) is specific to the quantum integration: it polls a +// vendor queue and ungates workers. None of this belongs on the webhook core, +// which stays domain-agnostic and only exposes generic primitives (Client, +// InjectedEnv, EnsurePodGroup). The core invokes each handler's generic Mutate; +// a handler does its own create/edit side-effects (here: RBAC, ConfigMaps, +// container injection) through the generic client. +// +// These are package-level functions (not methods on the core *Mutator) operating +// on the generic webhook.MutatorAPI. 
coreSidecar (see sidecar.go) delegates to +// them; a future non-quantum handler that needs a different sidecar supplies its +// own Sidecar implementation and its own container name/image. + +const ( + // SidecarContainerName is the injected sidecar container's name. Owned here + // (not a global core const) because the container is quantum-specific. + SidecarContainerName = "fluence-sidecar" + + // SidecarServiceAccount is the ServiceAccount (and Role/RoleBinding) name the + // sidecar uses to patch pods and read PodGroups. + SidecarServiceAccount = "fluence-sidecar" + + // defaultSidecarImage is used when FLUENCE_SIDECAR_IMAGE is not set. Owned by + // the quantum integration; the deployment may override it via the env var. + defaultSidecarImage = "ghcr.io/converged-computing/fluence-sidecar:latest" + + // StageVolumeName / StageMountPath: the shared emptyDir the init container + // stages the fluence Python package into, mounted into workload containers + // and prepended to PYTHONPATH (Model C delivery). + StageVolumeName = "fluence-pkg" + StageMountPath = "/opt/fluence-staged" +) + +// sidecarImage resolves the sidecar image: the FLUENCE_SIDECAR_IMAGE override +// (deployment config) or the quantum default. Read here so image config is owned +// by the integration that uses it, not the core. +func sidecarImage() string { + if v := os.Getenv("FLUENCE_SIDECAR_IMAGE"); v != "" { + return v } - if n > 1 { - return n - 1 + return defaultSidecarImage +} + +// ensureSidecarRBAC provisions the per-namespace ServiceAccount/Role/RoleBinding +// the sidecar uses to patch pods and read PodGroups. Idempotent (create-if-absent). 
+func ensureSidecarRBAC(ctx context.Context, m webhook.MutatorAPI, namespace string) { + c := m.Client() + if c == nil { + return + } + lbl := map[string]string{"app": SidecarServiceAccount} + + if _, err := c.CoreV1().ServiceAccounts(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}} + if _, err := c.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) + } + } + if _, err := c.RbacV1().Roles(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, + Rules: []rbacv1.PolicyRule{ + {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, + {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, + }, + } + if _, err := c.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create Role %s/%s: %v", namespace, SidecarServiceAccount, err) + } + } + if _, err := c.RbacV1().RoleBindings(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + rb := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, + Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: SidecarServiceAccount, Namespace: namespace}}, + RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, + } + if _, err := c.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create RoleBinding 
%s/%s: %v", namespace, SidecarServiceAccount, err) + } } - return 1 } -// gateOps adds the quantum scheduling gate (idempotent). -const QuantumClassicalPriorityClass = "fluence-quantum-classical" +// interceptorOps stages the fluence Python package (Model C): an init container +// copies it into a shared emptyDir, mounted into every workload container +// (skipping the sidecar) with PYTHONPATH + FLUENCE_POD_UID, so Python auto-imports +// the interceptor via sitecustomize. Broad mounting is safe (fail-soft when the +// vendor SDK is absent) and is required so a quantum WORKER — which runs the same +// image but does not request the resource — also gets the (faux-mode) interceptor. +func interceptorOps(pod *corev1.Pod) []spec.Op { + var ops []spec.Op + + vol := corev1.Volume{Name: StageVolumeName, VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} + if len(pod.Spec.Volumes) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes/-", Value: vol}) + } + + initc := corev1.Container{ + Name: "fluence-stage", + Image: sidecarImage(), + ImagePullPolicy: corev1.PullAlways, + Command: []string{"sh", "-c", + fmt.Sprintf("python -m fluence.stage %s || echo '[fluence] staging skipped (interceptor unavailable)'", StageMountPath)}, + VolumeMounts: []corev1.VolumeMount{{Name: StageVolumeName, MountPath: StageMountPath}}, + } + if len(pod.Spec.InitContainers) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers", Value: []corev1.Container{initc}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers/-", Value: initc}) + } -func gateOps(pod *corev1.Pod) []spec.Op { - for _, g := range pod.Spec.SchedulingGates { - if g.Name == QuantumGate { - return nil + mount := corev1.VolumeMount{Name: StageVolumeName, MountPath: StageMountPath, ReadOnly: true} + pythonpath := corev1.EnvVar{Name: "PYTHONPATH", 
Value: StageMountPath} + uid := spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid") + for i, c := range pod.Spec.Containers { + if c.Name == SidecarContainerName { + continue + } + if len(c.VolumeMounts) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) + } + if !spec.HasEnv(c, "PYTHONPATH") { + if len(c.Env) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{pythonpath}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonpath} + } else { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: pythonpath}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonpath) + } + } + if !spec.HasEnv(c, "FLUENCE_POD_UID") { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uid) } } + return ops +} + +// sidecarContainerOps adds the fluence sidecar container (pod identity env, the +// generic FLUXION_* contract from InjectedEnv, the observe flag, handler-supplied +// extraEnv, and the workload's secret/configMap-sourced credentials) and sets the +// sidecar ServiceAccount. observe=true selects observe-only telemetry mode. 
+func sidecarContainerOps(m webhook.MutatorAPI, pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { var ops []spec.Op - gate := corev1.PodSchedulingGate{Name: QuantumGate} - if len(pod.Spec.SchedulingGates) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}) + env := []corev1.EnvVar{ + spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid"), + spec.FieldEnv("FLUENCE_POD_NAME", "metadata.name"), + spec.FieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), + spec.FieldEnv("FLUENCE_GROUP", "metadata.labels['"+webhook.GroupLabel+"']"), + } + env = append(env, m.InjectedEnv()...) + if observe { + env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) + } + env = append(env, extraEnv...) + // Copy the workload container's secret/configMap-sourced env onto the sidecar + // so it can talk to the same backend (domain-agnostic: we propagate whatever + // the workload pulls from a secret/configMap; existing FLUENCE_/FLUXION_ names + // are not overwritten). 
+ if len(pod.Spec.Containers) > 0 { + have := map[string]bool{} + for _, e := range env { + have[e.Name] = true + } + for _, e := range pod.Spec.Containers[0].Env { + if have[e.Name] || e.ValueFrom == nil { + continue + } + if e.ValueFrom.SecretKeyRef != nil || e.ValueFrom.ConfigMapKeyRef != nil { + env = append(env, e) + } + } + } + sidecar := corev1.Container{ + Name: SidecarContainerName, Image: sidecarImage(), ImagePullPolicy: corev1.PullAlways, + Env: env, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), corev1.ResourceMemory: resource.MustParse("256Mi"), + }}, + } + if len(pod.Spec.Containers) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}) - } - // Give gated classical workers a raised priority so they schedule reliably - // once ungated. priorityClassName is immutable post-creation, so it MUST be - // set here at admission, not at ungate time. Only set it if the pod doesn't - // already declare one (don't overwrite a user's class). - if pod.Spec.PriorityClassName == "" { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/priorityClassName", Value: QuantumClassicalPriorityClass}) - // Clear spec.priority so the priority admission controller recomputes it - // from the class. The controller errors only when spec.priority is - // non-nil AND differs from the class value; setting it to null avoids - // that in every case. We use add-with-null (not remove): a JSON Patch - // "remove" of an absent path is a hard error, and whether the API has - // already defaulted spec.priority differs across clusters/k8s versions - // (it broke in CI but not on GKE, or vice versa). add-null is valid - // whether the field is absent, 0, or set. 
- ops = append(ops, spec.Op{Op: "add", Path: "/spec/priority", EmitNull: true}) + ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers/-", Value: sidecar}) + } + if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) } return ops } diff --git a/pkg/webhook/handlers/quantum_split_test.go b/pkg/webhook/handlers/quantum_split_test.go deleted file mode 100644 index 4bf1160..0000000 --- a/pkg/webhook/handlers/quantum_split_test.go +++ /dev/null @@ -1,117 +0,0 @@ -/* -Copyright 2024 Lawrence Livermore National Security, LLC - (c.f. AUTHORS, NOTICE.LLNS, COPYING) -SPDX-License-Identifier: Apache-2.0 -*/ - -// Two-group quantum split: a quantum gang of size N becomes a leader PodGroup -// (minCount 1) and a worker PodGroup -workers (minCount N-1). -// minCount is derived from the owning Job's parallelism (N). -package handlers - -import ( - "context" - "testing" - - "github.com/converged-computing/fluence/pkg/webhook" - - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" -) - -func qpuLeader(ns, group, name, job string) *corev1.Pod { - p := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, Namespace: ns, - Labels: map[string]string{webhook.GroupLabel: group}, - Annotations: map[string]string{webhook.RoleAnnotation: RoleLeader}, - OwnerReferences: []metav1.OwnerReference{{Kind: "Job", Name: job}}, - }, - Spec: corev1.PodSpec{ - SchedulerName: webhook.SchedulerName, - Containers: []corev1.Container{{Name: "app", Resources: corev1.ResourceRequirements{ - Requests: corev1.ResourceList{QuantumResource: *resource.NewQuantity(1, resource.DecimalSI)}}}}, - }, - } - return p -} - -func qpuWorker(ns, group, name, job string) *corev1.Pod { - p := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: 
name, Namespace: ns, - Labels: map[string]string{webhook.GroupLabel: group}, - Annotations: map[string]string{webhook.RoleAnnotation: RoleWorker}, - OwnerReferences: []metav1.OwnerReference{{Kind: "Job", Name: job}}, - }, - Spec: corev1.PodSpec{ - SchedulerName: webhook.SchedulerName, - Containers: []corev1.Container{{Name: "w"}}, - }, - } - return p -} - -func mincount(t *testing.T, cs *fake.Clientset, ns, group string) (int32, bool) { - t.Helper() - pg, err := cs.SchedulingV1alpha2().PodGroups(ns).Get(context.Background(), group, metav1.GetOptions{}) - if err != nil { - return 0, false - } - if pg.Spec.SchedulingPolicy.Gang == nil { - return 0, false - } - return pg.Spec.SchedulingPolicy.Gang.MinCount, true -} - -// Quantum gang of size N=4 owned by a Job(parallelism=4): leader group minCount -// 1, worker group -workers minCount 3. -func TestQuantumSplitLeaderOneWorkersNMinus1(t *testing.T) { - ns, group, job := "default", "qg", "qg-job" - par := int32(4) - jobObj := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, - Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}, - } - cs := fake.NewSimpleClientset(jobObj) - m := &webhook.Mutator{Clientset: cs} - - // leader admitted first - m.Mutate(context.Background(), qpuLeader(ns, group, "qg-0", job)) - // then a worker - m.Mutate(context.Background(), qpuWorker(ns, group, "qg-1", job)) - - if mc, ok := mincount(t, cs, ns, group); !ok || mc != 1 { - t.Errorf("leader group %q minCount=%d (ok=%v), want 1", group, mc, ok) - } - wg := group + WorkerGroupSuffix - if mc, ok := mincount(t, cs, ns, wg); !ok || mc != 3 { - t.Errorf("worker group %q minCount=%d (ok=%v), want 3 (N-1)", wg, mc, ok) - } -} - -// The worker is relinked into -workers (label + schedulingGroup op). 
-func TestQuantumWorkerRelinkedToWorkerGroup(t *testing.T) { - ns, group, job := "default", "qg2", "qg2-job" - par := int32(3) - cs := fake.NewSimpleClientset(&batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, - Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) - m := &webhook.Mutator{Clientset: cs} - m.Mutate(context.Background(), qpuLeader(ns, group, "qg2-0", job)) - - ops := m.Mutate(context.Background(), qpuWorker(ns, group, "qg2-1", job)) - wg := group + WorkerGroupSuffix - var relinked bool - for _, op := range ops { - if v, ok := op.Value.(map[string]string); ok && v["podGroupName"] == wg { - relinked = true - } - } - if !relinked { - t.Errorf("worker not relinked to %q (ops: %+v)", wg, ops) - } -} diff --git a/pkg/webhook/handlers/quantum_test.go b/pkg/webhook/handlers/quantum_test.go new file mode 100644 index 0000000..613724d --- /dev/null +++ b/pkg/webhook/handlers/quantum_test.go @@ -0,0 +1,448 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// quantum_test.go — all tests for the quantum handler: the gang + submitter +// model, faux-submit, the sidecar wiring, the Dependency primitive, and the +// standalone/observe paths. Shared fixtures (qpuPod, cpuPod, op helpers) live in +// handlers_test.go. 
+package handlers + +import ( + "context" + "testing" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// ── standalone / observe ──────────────────────────────────────────────────────── + +func TestSingleQuantumGetsInterceptorNoSidecar(t *testing.T) { + m := &webhook.Mutator{AttributeKeys: []string{"region"}} + ops := m.Mutate(context.Background(), qpuPod("fluence")) + names := opEnvNames(ops) + if !contains(names, "FLUXION_BACKEND") { + t.Errorf("want FLUXION_BACKEND, got %v", names) + } + if !contains(names, "PYTHONPATH") || !contains(names, "FLUENCE_POD_UID") { + t.Errorf("want interceptor env (PYTHONPATH, FLUENCE_POD_UID), got %v", names) + } + if hasSidecarOp(ops) { + t.Error("standalone quantum pod should not get a sidecar") + } + if hasGateOp(ops) { + t.Error("standalone quantum pod should not be gated") + } +} + +func TestObserveLabelInjectsSidecar(t *testing.T) { + m := &webhook.Mutator{} + pod := qpuPod("fluence") + pod.Labels = map[string]string{ObserveLabel: "true"} + ops := m.Mutate(context.Background(), pod) + if !hasSidecarOp(ops) { + t.Error("observe-labeled quantum pod should get the sidecar") + } + if hasGateOp(ops) { + t.Error("observe-only pod should not be gated") + } +} + +// ── gang + submitter ──────────────────────────────────────────────────────────── + +// gangQPUPod is a quantum workload pod (requests the resource) in a group, +// owned by a Job of parallelism N — the common real shape (a MiniCluster / +// indexed Job). No role annotation: the new model has no leader/worker. 
+func gangQPUPod(ns, group, name, job string) *corev1.Pod { + p := qpuPod("fluence") + p.Name = name + p.Namespace = ns + p.Labels = map[string]string{webhook.GroupLabel: group} + p.OwnerReferences = []metav1.OwnerReference{{Kind: "Job", Name: job}} + return p +} + +// mincount returns the gang minCount of the named PodGroup, or ok=false. +func mincount(t *testing.T, cs *fake.Clientset, ns, group string) (int32, bool) { + t.Helper() + pg, err := cs.SchedulingV1alpha2().PodGroups(ns).Get(context.Background(), group, metav1.GetOptions{}) + if err != nil || pg.Spec.SchedulingPolicy.Gang == nil { + return 0, false + } + return pg.Spec.SchedulingPolicy.Gang.MinCount, true +} + +// A quantum gang member (owned by Job parallelism=3) is gated + faux, its gang +// PodGroup is minCount 3 (full N — no N-1 split), and Fluence creates the +// separate -submitter pod. It gets NO sidecar (it is gated). +func TestQuantumGangGatedFauxAndSubmitterCreated(t *testing.T) { + ns, group, job := "default", "qg", "qg-job" + par := int32(3) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "qg-0", job)) + + if !hasGateOp(ops) { + t.Error("gang member must be gated") + } + if hasSidecarOp(ops) { + t.Error("gang member (gated) must NOT get a sidecar") + } + if e, ok := envOp(ops, FauxSubmitEnv); !ok || e.Value != "true" { + t.Errorf("gang member must get %s=true", FauxSubmitEnv) + } + if mc, ok := mincount(t, cs, ns, group); !ok || mc != 3 { + t.Errorf("gang PodGroup minCount=%d (ok=%v), want 3 (full N, no split)", mc, ok) + } + // No -workers subgroup in the new model. + if _, ok := mincount(t, cs, ns, group+"-workers"); ok { + t.Error("there must be no -workers subgroup in the gang+submitter model") + } + // Fluence created the submitter. 
+ sub, err := cs.CoreV1().Pods(ns).Get(context.Background(), group+SubmitterGroupSuffix, metav1.GetOptions{}) + if err != nil { + t.Fatalf("submitter pod not created: %v", err) + } + if sub.Annotations[SubmitterAnnotation] != "true" { + t.Error("submitter must carry the submitter marker") + } + if sub.Annotations[GangGroupAnnotation] != group { + t.Errorf("submitter gang-group=%q, want %q", sub.Annotations[GangGroupAnnotation], group) + } + if len(sub.Spec.SchedulingGates) != 0 { + t.Error("submitter must NOT be gated") + } +} + +// The submitter pod, on its own admission, is wired as the real coordinator: its +// own PodGroup minCount 1, the real sidecar (not faux), not gated, and told which +// gang to ungate via FLUENCE_GANG_GROUP. +func TestSubmitterWiredAsRealSidecar(t *testing.T) { + ns, group, job := "default", "qg2", "qg2-job" + par := int32(2) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + // First a gang member, which creates the submitter. + m.Mutate(context.Background(), gangQPUPod(ns, group, "qg2-0", job)) + sub, err := cs.CoreV1().Pods(ns).Get(context.Background(), group+SubmitterGroupSuffix, metav1.GetOptions{}) + if err != nil { + t.Fatalf("submitter not created: %v", err) + } + + ops := m.Mutate(context.Background(), sub) + if !hasSidecarOp(ops) { + t.Error("submitter must get the real sidecar") + } + if hasGateOp(ops) { + t.Error("submitter must NOT be gated") + } + if _, ok := envOp(ops, FauxSubmitEnv); ok { + t.Error("submitter must NOT be in faux mode") + } + // FLUENCE_GANG_GROUP is on the sidecar container itself. 
+ var sidecar *corev1.Container + for _, op := range ops { + if c, ok := op.Value.(corev1.Container); ok && c.Name == SidecarContainerName { + cc := c + sidecar = &cc + } + } + if sidecar == nil { + t.Fatal("no sidecar container on submitter") + } + var gotGang bool + for _, e := range sidecar.Env { + if e.Name == GangGroupEnv && e.Value == group { + gotGang = true + } + } + if !gotGang { + t.Errorf("submitter sidecar must get %s=%q", GangGroupEnv, group) + } + if mc, ok := mincount(t, cs, ns, group+SubmitterGroupSuffix); !ok || mc != 1 { + t.Errorf("submitter PodGroup minCount=%d (ok=%v), want 1", mc, ok) + } +} + +// A standalone quantum pod (no group, no owner → group of one) does its own real +// submit: interceptor staged, but no gating, no faux, and no separate submitter. +func TestStandaloneQuantumIsRealNoSubmitter(t *testing.T) { + ns := "default" + cs := fake.NewSimpleClientset() + m := &webhook.Mutator{Clientset: cs} + + pod := qpuPod("fluence") + pod.Name = "solo" + pod.Namespace = ns + + ops := m.Mutate(context.Background(), pod) + if hasGateOp(ops) { + t.Error("standalone quantum pod must not be gated") + } + if _, ok := envOp(ops, FauxSubmitEnv); ok { + t.Error("standalone quantum pod must not be faux") + } + pods, _ := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{}) + if len(pods.Items) != 0 { + t.Error("standalone quantum pod must not spawn a submitter") + } +} + +// ── faux-submit + dependency ──────────────────────────────────────────────────── + +// envValueFrom returns the env var op with the given name, if present (covers +// both single-EnvVar and []EnvVar op shapes). 
+func envOp(ops []spec.Op, name string) (corev1.EnvVar, bool) { + for _, op := range ops { + switch v := op.Value.(type) { + case corev1.EnvVar: + if v.Name == name { + return v, true + } + case []corev1.EnvVar: + for _, e := range v { + if e.Name == name { + return e, true + } + } + } + } + return corev1.EnvVar{}, false +} + +// annotationOps collects all annotation key=value pairs the ops would stamp. +func annotationOps(ops []spec.Op) map[string]string { + out := map[string]string{} + for _, op := range ops { + // whole-map add: /metadata/annotations + if op.Path == "/metadata/annotations" { + if m, ok := op.Value.(map[string]string); ok { + for k, v := range m { + out[k] = v + } + } + continue + } + // single-key add: /metadata/annotations/ -> string value + const pfx = "/metadata/annotations/" + if len(op.Path) > len(pfx) && op.Path[:len(pfx)] == pfx { + if s, ok := op.Value.(string); ok { + key := unescapeJSONPointer(op.Path[len(pfx):]) + out[key] = s + } + } + } + return out +} + +// unescapeJSONPointer reverses escapeJSONPointer for assertion readability. +func unescapeJSONPointer(s string) string { + // reverse order of escape: ~1 -> /, then ~0 -> ~ + out := "" + for i := 0; i < len(s); i++ { + if s[i] == '~' && i+1 < len(s) { + switch s[i+1] { + case '1': + out += "/" + i++ + continue + case '0': + out += "~" + i++ + continue + } + } + out += string(s[i]) + } + return out +} + +// A quantum worker (no group-size of its own) is expressed as a general +// Dependency: gated, stamped with depends-on-{kind,producer,gate}, and the +// producer is the base group. 
+func TestQuantumWorkerIsGeneralDependency(t *testing.T) { + ns, group, job := "default", "depq", "depq-job" + par := int32(3) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "depq-0", job)) + + if !hasGateOp(ops) { + t.Errorf("worker not gated by the dependency (ops: %+v)", ops) + } + ann := annotationOps(ops) + if ann[DependsOnKindAnnotation] != DependencyKindQuantumSubmit { + t.Errorf("depends-on-kind=%q, want %q", ann[DependsOnKindAnnotation], DependencyKindQuantumSubmit) + } + if ann[DependsOnProducerAnnotation] != group+SubmitterGroupSuffix { + t.Errorf("depends-on-producer=%q, want %q (the submitter group)", ann[DependsOnProducerAnnotation], group+SubmitterGroupSuffix) + } + if ann[DependsOnGateAnnotation] != QuantumGate { + t.Errorf("depends-on-gate=%q, want %q", ann[DependsOnGateAnnotation], QuantumGate) + } +} + +// DependencyOf round-trips the stamped annotations back into a Dependency, so a +// scheduler/sidecar observer can read what a gated pod waits for. 
+func TestDependencyOfRoundTrip(t *testing.T) { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + DependsOnKindAnnotation: DependencyKindQuantumSubmit, + DependsOnProducerAnnotation: "grp", + DependsOnGateAnnotation: QuantumGate, + }}} + d, ok := DependencyOf(pod) + if !ok || d.Kind != DependencyKindQuantumSubmit || d.Producer != "grp" || d.Gate != QuantumGate { + t.Errorf("DependencyOf=%+v ok=%v, want quantum-submit/grp/%s", d, ok, QuantumGate) + } + if _, ok := DependencyOf(&corev1.Pod{}); ok { + t.Errorf("DependencyOf on a pod with no dependency should be ok=false") + } +} + +// The worker is staged with the SAME interceptor as the submitter (PYTHONPATH + +// FLUENCE_POD_UID), put into faux mode (FLUENCE_FAUX_SUBMIT=true), and handed the +// existing task id via the FLUENCE_QUANTUM_JOB_ID downward-API env. One +// mechanism, two modes — no separate ConfigMap shim. The user sets nothing. +func TestQuantumWorkerStagedWithFauxSubmit(t *testing.T) { + ns, group, job := "default", "fauxq", "fauxq-job" + par := int32(2) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "fauxq-0", job)) + + // Same interceptor staging as the submitter (PYTHONPATH set on the worker). + if _, ok := envOp(ops, "PYTHONPATH"); !ok { + t.Errorf("worker not staged with the interceptor (no PYTHONPATH); ops: %+v", ops) + } + + // Faux mode selected. + if e, ok := envOp(ops, FauxSubmitEnv); !ok || e.Value != "true" { + t.Errorf("worker missing %s=true (got %+v, ok=%v)", FauxSubmitEnv, e, ok) + } + + // Existing task id sourced from the annotation the ungating sidecar stamps. 
+ e, ok := envOp(ops, QuantumJobIDEnv) + if !ok { + t.Fatalf("worker missing %s env", QuantumJobIDEnv) + } + if e.ValueFrom == nil || e.ValueFrom.FieldRef == nil || + e.ValueFrom.FieldRef.FieldPath != "metadata.annotations['"+QuantumJobIDAnnotation+"']" { + t.Errorf("%s should be a downward-API ref to %s, got %+v", QuantumJobIDEnv, QuantumJobIDAnnotation, e) + } +} + +// Classical override below the replica count: group-size=2 on a gang owned by a +// Job(parallelism=5) must yield minCount=2 (the override), not 5. With a cluster +// sized to 2, the gang reaches quorum and runs; if the override were dropped the +// gang would wait forever for 5 (the e2e hang that fails CI). +func TestClassicalOverrideBelowReplicaCount(t *testing.T) { + ns, group, job := "default", "ovr2", "ovr2-job" + pod := cpuPod("fluence") + pod.Namespace = ns + pod.Labels = map[string]string{webhook.GroupLabel: group} + pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: "2"} + ownedBy(pod, "Job", job) + + got := minCountWithClient(t, pod, jobWithParallelism(ns, job, 5)) + if got != 2 { + t.Errorf("override below replicas: minCount=%d, want 2 (override wins over Job=5)", got) + } +} + +// ── sidecar wiring ────────────────────────────────────────────────────────────── + +// The sidecar inherits the workload's secret/configMap-sourced credentials so it +// can talk to the same backend, but NOT plain-value env. (Moved from the core +// webhook package: sidecar construction is now quantum-owned.) 
+func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) { + m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()} + pod := &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "gang", + Env: []corev1.EnvVar{ + {Name: "GANG_ROLE", Value: "leader"}, // plain value: NOT copied + {Name: "AWS_ACCESS_KEY_ID", ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "aws-braket-credentials"}, + Key: "AWS_ACCESS_KEY_ID", + }}}, + }, + }}, + }, + } + ops := sidecarContainerOps(m, pod, false, nil) + var sidecar *corev1.Container + for _, op := range ops { + if c, ok := op.Value.(corev1.Container); ok && c.Name == SidecarContainerName { + sidecar = &c + } + } + if sidecar == nil { + t.Fatal("no sidecar container added") + } + var gotSecret, gotPlain bool + for _, e := range sidecar.Env { + if e.Name == "AWS_ACCESS_KEY_ID" && e.ValueFrom != nil && e.ValueFrom.SecretKeyRef != nil { + gotSecret = true + } + if e.Name == "GANG_ROLE" { + gotPlain = true + } + } + if !gotSecret { + t.Error("sidecar should inherit the workload's secret-sourced AWS creds") + } + if gotPlain { + t.Error("sidecar should NOT copy plain-value workload env like GANG_ROLE") + } +} + +// A plain quantum workload pod (no role, owned by a Job of N>1) is gated as a +// faux gang member AND triggers creation of the one-off submitter. The user +// authors no submitter and no roles. 
+func TestGangMemberTriggersSubmitter(t *testing.T) { + ns, group, job := "default", "qauto", "qauto-job" + par := int32(2) + cs := fake.NewSimpleClientset(&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns}, + Spec: batchv1.JobSpec{Parallelism: &par, Completions: &par}}) + m := &webhook.Mutator{Clientset: cs} + + workload := gangQPUPod(ns, group, "qauto-0", job) + ops := m.Mutate(context.Background(), workload) + + if !hasGateOp(ops) { + t.Error("gang member must be gated") + } + if _, ok := envOp(ops, FauxSubmitEnv); !ok { + t.Error("gang member must get FLUENCE_FAUX_SUBMIT") + } + sub, err := cs.CoreV1().Pods(ns).Get(context.Background(), group+SubmitterGroupSuffix, metav1.GetOptions{}) + if err != nil { + t.Fatalf("submitter pod not created: %v", err) + } + if !spec.PodRequestsResource(sub, QuantumResource) { + t.Error("submitter must request the quantum resource (it runs the real submit)") + } +} diff --git a/pkg/webhook/handlers/sidecar.go b/pkg/webhook/handlers/sidecar.go index 19b6569..d105a7c 100644 --- a/pkg/webhook/handlers/sidecar.go +++ b/pkg/webhook/handlers/sidecar.go @@ -35,19 +35,20 @@ type Sidecar interface { ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op } -// coreSidecar is the default Sidecar, delegating to the webhook core. It is the -// shared, generic staging path; the quantum handler uses it as-is today and a -// custom handler could wrap or replace it. +// coreSidecar is the default Sidecar. It delegates to the quantum-owned sidecar +// implementation (see sidecar_impl.go), which uses only the generic MutatorAPI +// (Client, InjectedEnv). The webhook core no longer carries any sidecar logic; a +// custom handler could supply its own Sidecar with a different container/image. 
type coreSidecar struct{ m webhook.MutatorAPI } func (s coreSidecar) EnsureRBAC(ctx context.Context, namespace string) { - s.m.EnsureSidecarRBAC(ctx, namespace) + ensureSidecarRBAC(ctx, s.m, namespace) } func (s coreSidecar) InterceptorOps(pod *corev1.Pod) []spec.Op { - return s.m.InterceptorOps(pod) + return interceptorOps(pod) } func (s coreSidecar) ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { - return s.m.SidecarContainerOps(pod, observe, extraEnv) + return sidecarContainerOps(s.m, pod, observe, extraEnv) } // sidecarFor returns the Sidecar a handler should use. Centralized so the choice diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index bce6e8a..b39bec1 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -1,11 +1,11 @@ // Package webhook is fluence's mutating admission webhook. // // The core here is domain-agnostic plumbing: it owns the Mutator, the handler -// dispatcher, per-namespace PodGroup/RBAC provisioning, the Model C package -// staging (init container + shared volume on PYTHONPATH), the HTTP entrypoint, -// and self-managed TLS. It knows nothing about quantum, Braket, gate names, or -// observe labels — that policy lives entirely in the handlers (pkg/webhook/ -// handlers), which self-register via Register(). +// dispatcher, per-namespace PodGroup provisioning, the HTTP entrypoint, and +// self-managed TLS. It knows nothing about quantum, Braket, gate names, sidecars, +// RBAC, or interceptor staging — that policy and machinery lives entirely in the +// handlers (pkg/webhook/handlers), which self-register via Register() and perform +// their own create/edit side-effects through the generic MutatorAPI. // // The webhook self-manages TLS via a self-signed CA patched into the // MutatingWebhookConfiguration caBundle at startup. 
@@ -32,9 +32,7 @@ import ( admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -52,38 +50,12 @@ const ( // meaning to it (a handler decides what a group means). GroupLabel = "fluence.flux-framework.org/group" - // RoleAnnotation, set by the workload on each pod, explicitly declares the - // pod's gang role ("leader" or "worker"). When present it is AUTHORITATIVE: - // the quantum handler gates workers and gives the leader the sidecar based - // on this value, instead of inferring the leader by admission order. The - // same value is injected into the container env as FLUENCE_ROLE so the - // application reads its role from the same source of truth Fluence used. - // When absent, role falls back to admission order (backwards compatible). - RoleAnnotation = "fluence.flux-framework.org/role" - - // ExpectedWorkersAnnotation, set by the workload on the leader pod, tells the - // sidecar how many gated workers to wait for before ungating. The count is - // known at admission (the workload declares it) even though worker names are - // not, so it travels as a static sidecar env var. The core treats it as an - // opaque string and ascribes no meaning to it beyond propagation. - ExpectedWorkersAnnotation = "fluence.flux-framework.org/expected-workers" - - // GroupSizeAnnotation is the FULL gang member count N (leader + workers), - // set by the workload on each pod. It drives the PodGroup gang minCount so the - // whole group schedules atomically. This is distinct from - // ExpectedWorkersAnnotation (N-1: the workers the sidecar ungates; the leader - // is not gated). For a classical gang with no leader/worker split, N = size. + // GroupSizeAnnotation is the gang member count N, set by the workload on each + // pod. 
It is the authoritative override for the PodGroup gang minCount when + // the size cannot (or should not) be derived from the owning controller — and + // for loose grouped pods where counting at admission is unreliable. The core + // treats it as an opaque integer string. GroupSizeAnnotation = "fluence.flux-framework.org/group-size" - - // Sidecar/staging infrastructure (generic — not quantum-specific). - SidecarImage = "ghcr.io/converged-computing/fluence-sidecar:latest" - SidecarServiceAccount = "fluence-sidecar" - - // StageVolumeName / StageMountPath: the shared emptyDir the init container - // stages the fluence Python package into, mounted into the user container and - // prepended to PYTHONPATH (Model C delivery). - StageVolumeName = "fluence-pkg" - StageMountPath = "/opt/fluence-staged" ) // ── Mutator ───────────────────────────────────────────────────────────────────── @@ -91,31 +63,14 @@ const ( type Mutator struct { AttributeKeys []string Clientset kubernetes.Interface - SidecarImage string } // compile-time check that *Mutator satisfies the handler capability interface. var _ MutatorAPI = (*Mutator)(nil) -func (m *Mutator) sidecarImage() string { - if m.SidecarImage != "" { - return m.SidecarImage - } - return SidecarImage -} - // GroupName returns the value of GroupLabel on the pod, or "". func GroupName(pod *corev1.Pod) string { return spec.Label(pod, GroupLabel) } -// Role returns the explicit gang role declared on the pod via RoleAnnotation -// ("leader"/"worker"), or "" if unset (caller falls back to admission order). -func Role(pod *corev1.Pod) string { return spec.Annotation(pod, RoleAnnotation) } - -func resourceQuantity(s string) *resource.Quantity { - q := resource.MustParse(s) - return &q -} - // ── MutatorAPI: capabilities exposed to handlers ──────────────────────────────── // Client implements MutatorAPI: returns the Kubernetes client (nil in tests). 
@@ -178,176 +133,6 @@ func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPo } } -// EnsureSidecarRBAC provisions the per-namespace ServiceAccount/Role/RoleBinding -// the sidecar uses to patch pods and read PodGroups. -func (m *Mutator) EnsureSidecarRBAC(ctx context.Context, namespace string) { - if m.Clientset == nil { - return - } - lbl := map[string]string{"app": "fluence-sidecar"} - - if _, err := m.Clientset.CoreV1().ServiceAccounts(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}} - if _, err := m.Clientset.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) - } - } - if _, err := m.Clientset.RbacV1().Roles(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - role := &rbacv1.Role{ - ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, - Rules: []rbacv1.PolicyRule{ - {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, - {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, - }, - } - if _, err := m.Clientset.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create Role %s/%s: %v", namespace, SidecarServiceAccount, err) - } - } - if _, err := m.Clientset.RbacV1().RoleBindings(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - rb := &rbacv1.RoleBinding{ - ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, - Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: SidecarServiceAccount, Namespace: namespace}}, - 
RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, - } - if _, err := m.Clientset.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", namespace, SidecarServiceAccount, err) - } - } -} - -// InterceptorOps implements Model C delivery. It injects an init container (the -// sidecar image) that stages the fluence Python package into a shared emptyDir, -// mounts that volume into every Fluxion-resource container, and prepends it to -// PYTHONPATH plus sets FLUENCE_POD_UID. Python auto-imports the staged -// sitecustomize on startup, which runs the interceptor — no user code changes, -// no PYTHONSTARTUP (which only fires interactively), no vendor SDK on our side. -func (m *Mutator) InterceptorOps(pod *corev1.Pod) []spec.Op { - var ops []spec.Op - - // Shared volume. - vol := corev1.Volume{Name: StageVolumeName, VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} - if len(pod.Spec.Volumes) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes/-", Value: vol}) - } - - // Init container that stages the package into the shared volume. - // - // Fail-soft: the interceptor is best-effort, so its delivery must be too. We - // wrap the stage command so a failure (bad image, missing python, package - // problem) leaves the shared volume empty and exits 0 rather than blocking - // the user's pod with Init:Error. An empty staged dir simply means the - // interceptor does not run — the user application is unaffected. (This also - // lets CI use a minimal placeholder sidecar image for placement-only tests.) 
- initc := corev1.Container{ - Name: "fluence-stage", - Image: m.sidecarImage(), - ImagePullPolicy: corev1.PullAlways, - Command: []string{"sh", "-c", - fmt.Sprintf("python -m fluence.stage %s || echo '[fluence] staging skipped (interceptor unavailable)'", StageMountPath)}, - VolumeMounts: []corev1.VolumeMount{{Name: StageVolumeName, MountPath: StageMountPath}}, - } - if len(pod.Spec.InitContainers) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers", Value: []corev1.Container{initc}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers/-", Value: initc}) - } - - // Mount the staged volume + set PYTHONPATH and FLUENCE_POD_UID on each - // Fluxion-resource container. - mount := corev1.VolumeMount{Name: StageVolumeName, MountPath: StageMountPath, ReadOnly: true} - pythonpath := corev1.EnvVar{Name: "PYTHONPATH", Value: StageMountPath} - uid := spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid") - for i, c := range pod.Spec.Containers { - if !spec.RequestsFluxionResource(c) { - continue - } - if len(c.VolumeMounts) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) - } - if !spec.HasEnv(c, "PYTHONPATH") { - if len(c.Env) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{pythonpath}}) - pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonpath} - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: pythonpath}) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonpath) - } - } - if !spec.HasEnv(c, "FLUENCE_POD_UID") { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) - pod.Spec.Containers[i].Env = 
append(pod.Spec.Containers[i].Env, uid) - } - } - return ops -} - -// SidecarContainerOps adds the fluence-sidecar container and sets its -// ServiceAccount. observe=true selects observe-only telemetry mode. -func (m *Mutator) SidecarContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { - var ops []spec.Op - // The sidecar resolves its vendor provider at runtime from the backend the - // scheduler chose. It gets the same FLUXION_* contract as the workload - // containers (FLUXION_BACKEND + attribute vars like FLUXION_VENDOR), sourced - // via the downward API from the scheduler's annotations — so the values - // resolve once the scheduler writes them, after admission. - env := []corev1.EnvVar{ - spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid"), - spec.FieldEnv("FLUENCE_POD_NAME", "metadata.name"), - spec.FieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), - spec.FieldEnv("FLUENCE_GROUP", "metadata.labels['"+GroupLabel+"']"), - } - env = append(env, m.InjectedEnv()...) - if observe { - env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) - } - // Handler-supplied, domain-specific env (e.g. quantum's FLUENCE_EXPECTED_WORKERS - // and FLUENCE_WORKER_GROUP_BASE). The core does not know what these mean; the - // handler that owns the gang shape computes and passes them. Appended before - // the credential copy so workload creds still win on name collisions below. - env = append(env, extraEnv...) - // The sidecar talks to the same backend the workload does (e.g. to find the - // task and read its queue position), so it needs the same credentials. Copy - // the workload container's secret/configmap-sourced env onto the sidecar. - // This stays domain-agnostic: we don't know or name the provider's creds, we - // just propagate whatever the workload pulls from a secret/configMap (e.g. - // AWS_*, IBM tokens). Existing FLUENCE_/FLUXION_ names are not overwritten. 
- if len(pod.Spec.Containers) > 0 { - have := map[string]bool{} - for _, e := range env { - have[e.Name] = true - } - for _, e := range pod.Spec.Containers[0].Env { - if have[e.Name] || e.ValueFrom == nil { - continue - } - if e.ValueFrom.SecretKeyRef != nil || e.ValueFrom.ConfigMapKeyRef != nil { - env = append(env, e) - } - } - } - sidecar := corev1.Container{ - Name: "fluence-sidecar", Image: m.sidecarImage(), ImagePullPolicy: corev1.PullAlways, - Env: env, - Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{ - corev1.ResourceCPU: *resourceQuantity("100m"), corev1.ResourceMemory: *resourceQuantity("256Mi"), - }}, - } - if len(pod.Spec.Containers) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers/-", Value: sidecar}) - } - if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) - } - return ops -} - // ── Dispatcher ────────────────────────────────────────────────────────────────── // Mutate dispatches the pod to every registered handler and concatenates the diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index dd32ac6..9af6c9c 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -2,8 +2,6 @@ package webhook import ( "testing" - - corev1 "k8s.io/api/core/v1" ) // EnvVarNames returns the FLUXION_* contract names (used by the scheduler plugin @@ -22,47 +20,3 @@ func TestEnvVarNames(t *testing.T) { } } } - -func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) { - m := &Mutator{} - pod := &corev1.Pod{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{{ - Name: "gang", - Env: []corev1.EnvVar{ - {Name: "GANG_ROLE", Value: "leader"}, // plain value: NOT copied - {Name: "AWS_ACCESS_KEY_ID", ValueFrom: &corev1.EnvVarSource{ - 
SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: "aws-braket-credentials"}, - Key: "AWS_ACCESS_KEY_ID", - }}}, - }, - }}, - }, - } - ops := m.SidecarContainerOps(pod, false, nil) - var sidecar *corev1.Container - for _, op := range ops { - if c, ok := op.Value.(corev1.Container); ok && c.Name == "fluence-sidecar" { - sidecar = &c - } - } - if sidecar == nil { - t.Fatal("no sidecar container added") - } - var gotSecret, gotPlain bool - for _, e := range sidecar.Env { - if e.Name == "AWS_ACCESS_KEY_ID" && e.ValueFrom != nil && e.ValueFrom.SecretKeyRef != nil { - gotSecret = true - } - if e.Name == "GANG_ROLE" { - gotPlain = true - } - } - if !gotSecret { - t.Error("sidecar should inherit the workload's secret-sourced AWS creds") - } - if gotPlain { - t.Error("sidecar should NOT copy plain-value workload env like GANG_ROLE") - } -} diff --git a/python/fluence/providers/base.py b/python/fluence/providers/base.py index dca4429..561bca2 100644 --- a/python/fluence/providers/base.py +++ b/python/fluence/providers/base.py @@ -80,7 +80,7 @@ def find_my_task(self, pod_uid: str, backend: str, timeout: int) -> "Task | None raise NotImplementedError def is_ready_to_ungate(self, task: "Task") -> bool: - """True when workers should be ungated — queue position == 1 or the task + """True when the gang should be ungated — queue position == 1 or the task is already RUNNING/terminal. 
Always implementable.""" raise NotImplementedError @@ -134,4 +134,4 @@ def resolve_from_env() -> "Provider | None": for k, v in os.environ.items(): if k.startswith("FLUXION_"): attrs[k[len("FLUXION_"):].lower()] = v - return resolve(attrs) + return resolve(attrs) \ No newline at end of file diff --git a/python/fluence/providers/braket.py b/python/fluence/providers/braket.py index 23bd9fc..33f1683 100644 --- a/python/fluence/providers/braket.py +++ b/python/fluence/providers/braket.py @@ -49,8 +49,26 @@ def install_interceptor(self, pod_uid: str) -> bool: return False # braket SDK not in this container — fail-soft original_run = AwsDevice.run + faux = os.environ.get("FLUENCE_FAUX_SUBMIT", "").lower() == "true" def patched_run(self, task_specification, *args, **kwargs): + # Two modes of the ONE interceptor: + # faux (worker): the one-off submitter already submitted this task + # before the worker was ungated, so submitting again would + # duplicate it N times. Return a handle to the EXISTING task (by + # ARN, handed over via FLUENCE_QUANTUM_JOB_ID) without submitting. + # tag (submitter): stamp the pod-uid tag so the sidecar can find the + # task in the queue, then submit for real. 
+ if faux: + arn = os.environ.get("FLUENCE_QUANTUM_JOB_ID", "") + if arn: + from braket.aws import AwsQuantumTask + log(f"faux-submit: returning existing task {arn} " + f"(no resubmission)") + return AwsQuantumTask(arn=arn) + log("faux-submit: no job id; suppressing submit " + "(worker consumes results by id)") + return None if pod_uid: tags = kwargs.get("tags", {}) tags[TAG_KEY] = pod_uid @@ -226,4 +244,4 @@ def job_id(self, task: BraketTask) -> str: PROVIDER = BraketProvider() -register(PROVIDER) +register(PROVIDER) \ No newline at end of file diff --git a/python/fluence/sidecar.py b/python/fluence/sidecar.py index 9e1184b..d0724e5 100644 --- a/python/fluence/sidecar.py +++ b/python/fluence/sidecar.py @@ -1,18 +1,19 @@ """ fluence.sidecar — provider-agnostic quantum coordination sidecar main loop. -Injected by the Fluence webhook into the quantum-submitting pod. Resolves its -vendor at runtime from the backend annotation, discovers the task the user -application submitted (tagged by the interceptor), polls readiness, and either -ungates gated workers (gang mode) or just logs the queue-position series -(observe-only mode). +Injected by the Fluence webhook into the one-off SUBMITTER pod (gang + submitter +model — there is no leader/worker split). Resolves its vendor at runtime from the +backend annotation, discovers the task the user application submitted (tagged by +the interceptor), polls readiness, and either ungates the gated GANG group (gang +mode) or just logs the queue-position series (observe-only mode). Entry point: `fluence-sidecar` console script (see pyproject.toml) -> main(). 
Environment (injected by the Fluence webhook): FLUENCE_POD_UID UID of this pod (matches interceptor tag) FLUENCE_NAMESPACE Kubernetes namespace - FLUENCE_GATED_PODS comma-separated gated worker names + FLUENCE_GANG_GROUP group label of the gated gang to ungate + FLUENCE_GATED_PODS optional explicit comma-separated gang pod names FLUENCE_OBSERVE "true" for observe-only telemetry mode FLUXION_BACKEND / FLUXION_VENDOR scheduler-chosen backend / vendor FLUENCE_TASK_DISCOVERY_TIMEOUT seconds to wait for discovery (default 300) @@ -29,11 +30,6 @@ from fluence.providers.base import log from fluence.ungate import ungate_pods, gated_pods_from_env, namespace_from_env, wait_for_gated_pods -# MUST match handlers.WorkerGroupSuffix in the Go webhook. A quantum gang of size -# N is split into the leader group (size 1) and the worker group -# -workers (size N-1, all gated). The sidecar runs in the leader and -# discovers/ungates workers in the WORKER group, not the leader's group. -WORKER_GROUP_SUFFIX = "-workers" def _poll(provider, task, poll_interval, ungate): @@ -58,25 +54,22 @@ def main(): pod_uid = os.environ.get("FLUENCE_POD_UID", "") pod_name = os.environ.get("FLUENCE_POD_NAME", "") group = os.environ.get("FLUENCE_GROUP", "") - # Two-group quantum split: the leader (where this sidecar runs) is in - # ; the gated workers were moved to -workers by the webhook. - # WORKER_GROUP_SUFFIX MUST match handlers.WorkerGroupSuffix in the Go webhook - # (pkg/webhook/handlers/quantum.go). The webhook also passes the base group - # via FLUENCE_WORKER_GROUP_BASE; prefer it, fall back to FLUENCE_GROUP. - worker_group_base = os.environ.get("FLUENCE_WORKER_GROUP_BASE", group) - worker_group = worker_group_base + WORKER_GROUP_SUFFIX if worker_group_base else "" + # Gang + submitter model: this sidecar runs in the one-off SUBMITTER pod + # (its own group-of-one, -submitter). The gated workload it must ungate + # is the GANG group, named by FLUENCE_GANG_GROUP (set by the webhook). 
There + # is no leader/worker split and no -workers subgroup. + gang_group = os.environ.get("FLUENCE_GANG_GROUP", "") backend = os.environ.get("FLUXION_BACKEND", "") observe = os.environ.get("FLUENCE_OBSERVE", "").lower() == "true" discovery_timeout = int(os.environ.get("FLUENCE_TASK_DISCOVERY_TIMEOUT", 300)) poll_interval = int(os.environ.get("FLUENCE_POLL_INTERVAL", 30)) - expected_workers = int(os.environ.get("FLUENCE_EXPECTED_WORKERS", 0)) ungate_timeout = int(os.environ.get("FLUENCE_UNGATE_TIMEOUT", 120)) namespace = namespace_from_env() - log("starting fluence quantum sidecar") + log("starting fluence quantum submitter sidecar") log(f" pod_uid={pod_uid} namespace={namespace} group={group} " - f"backend={backend} observe={observe} expected_workers={expected_workers} worker_group={worker_group}") + f"gang_group={gang_group} backend={backend} observe={observe}") provider = resolve_from_env() if provider is None: @@ -88,8 +81,9 @@ def main(): if task is None: log("ERROR: could not discover quantum task") if not observe: - ungate_pods(wait_for_gated_pods(namespace, worker_group, expected_workers, - exclude=pod_name, timeout=ungate_timeout), + # Fail open: ungate the gang so it is not stranded forever. + ungate_pods(wait_for_gated_pods(namespace, gang_group, exclude=pod_name, + timeout=ungate_timeout), "", namespace) sys.exit(1) @@ -102,19 +96,18 @@ def main(): log("observe-only run complete") return - # Wait until all expected gated workers are present (gang is submitted - # together), then ungate them. expected_workers is N-1, propagated by the - # webhook from the leader at admission; if unset we ungate whatever is found. + # Ungate the gang: discover the gated pods in the gang group and remove their + # gate, stamping the job-id so each can fetch results by id. The gang pods are + # created up front (Job/Deployment), so they are present by submit time. 
gated_pods = gated_pods_from_env() or wait_for_gated_pods( - namespace, worker_group, expected_workers, exclude=pod_name, - timeout=ungate_timeout) - log(f"ungating {len(gated_pods)} worker(s): {gated_pods}") + namespace, gang_group, exclude=pod_name, timeout=ungate_timeout) + log(f"ungating {len(gated_pods)} gang pod(s): {gated_pods}") n_ok = ungate_pods(gated_pods, job_id, namespace) if n_ok == len(gated_pods): - log(f"done — {n_ok} worker(s) ungated") + log(f"done — {n_ok} gang pod(s) ungated") else: - log(f"WARNING: ungated only {n_ok}/{len(gated_pods)} worker(s) — see errors above") + log(f"WARNING: ungated only {n_ok}/{len(gated_pods)} gang pod(s) — see errors above") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/python/fluence/ungate.py b/python/fluence/ungate.py index 1019ead..a40e662 100644 --- a/python/fluence/ungate.py +++ b/python/fluence/ungate.py @@ -84,10 +84,10 @@ def gated_pods_from_env(): def discover_gated_pods(namespace, group, exclude=""): """ Find the names of pods in the same group that still carry the quantum - scheduling gate (i.e. the workers this sidecar's leader must ungate). + scheduling gate (i.e. the gang pods this submitter must ungate). - The leader's sidecar is created before the workers are admitted, so the gated - set cannot be known at admission time and must be discovered at runtime. We + The submitter is created alongside the gang, so the gated set is discovered + at runtime rather than known at admission. We list pods by the group label and keep those with the QUANTUM_GATE_NAME gate still present, excluding the leader pod itself. 
""" @@ -114,31 +114,24 @@ def discover_gated_pods(namespace, group, exclude=""): return names -def wait_for_gated_pods(namespace, group, expected, exclude="", timeout=120, - interval=3): +def wait_for_gated_pods(namespace, group, exclude="", timeout=120, interval=3): """ - Wait until at least `expected` gated workers have been discovered in the - group, or `timeout` seconds elapse. The gang is submitted together, so all - workers appear quickly; the timeout is a backstop against a crashed/never- - admitted worker so the sidecar never hangs. Returns the discovered list - (which may be short of `expected` if the timeout fired). + Wait until at least one gated gang pod is discovered in the group (the gang + is created up front, so its pods appear quickly), then return the pods gated + at that moment (possibly a partial gang; no expected count is checked). The + timeout is a backstop so the submitter never hangs; on timeout the list is empty. """ deadline = time.time() + timeout found = [] while time.time() < deadline: found = discover_gated_pods(namespace, group, exclude=exclude) - if expected and len(found) >= expected: - log(f"all {expected} gated worker(s) present") + if found: return found - if not expected: - # No expected count known — return whatever is present now. 
- return found - log(f"waiting for gated workers: {len(found)}/{expected}") + log("waiting for gated gang pods to appear") time.sleep(interval) - log(f"WARNING: timed out waiting for gated workers " - f"({len(found)}/{expected}); ungating what is present") + log("WARNING: timed out waiting for gated gang pods; none found") return found def namespace_from_env(): - return os.environ.get("FLUENCE_NAMESPACE", "default") + return os.environ.get("FLUENCE_NAMESPACE", "default") \ No newline at end of file diff --git a/test/e2e/gang/01-classical-gang.sh b/test/e2e/gang/01-classical-gang.sh index a854663..1ebfc64 100755 --- a/test/e2e/gang/01-classical-gang.sh +++ b/test/e2e/gang/01-classical-gang.sh @@ -25,7 +25,7 @@ count="$(kubectl get pods -l app=training --no-headers | wc -l | tr -d ' ')" [ "$count" = "1" ] || fail "expected 2 training pods, got $count" log "PASS: classical gang placed all $count pods via fluence" -kubectl delete -f examples/single-podgroup.yaml --wait=false || true +kubectl delete -f examples/test/e2e/gang/single-podgroup.yaml --wait=false || true kubectl patch podgroup training --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true # Wait for the pods to actually be gone before the next test runs — otherwise a # terminating 'training' pod (same name/labels reused by other scenarios) can be diff --git a/test/e2e/gang/02-postfilter-rematch.sh b/test/e2e/gang/02-postfilter-rematch.sh index 6657a90..f74c87b 100755 --- a/test/e2e/gang/02-postfilter-rematch.sh +++ b/test/e2e/gang/02-postfilter-rematch.sh @@ -22,6 +22,11 @@ kubectl delete podgroup "$NAME" --ignore-not-found >/dev/null 2>&1 || true kubectl patch podgroup "$NAME" --type=merge \ -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1 || true kubectl wait --for=delete pod -l "$SEL" --timeout=60s >/dev/null 2>&1 || true +# Defensive: a prior test's workload left running would occupy the only +# untainted worker and make this test fail with a (correct) fluxion +# allocate -1 for 
lack of capacity. Ensure none lingers. +kubectl delete deployment training --ignore-not-found --wait=false >/dev/null 2>&1 || true +kubectl wait --for=delete pod -l app=training --timeout=60s >/dev/null 2>&1 || true TAINTED="$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' \ -o jsonpath='{.items[0].metadata.name}')" diff --git a/test/e2e/gang/03-multi-gang.sh b/test/e2e/gang/03-multi-gang.sh index 1301382..9f01ae5 100755 --- a/test/e2e/gang/03-multi-gang.sh +++ b/test/e2e/gang/03-multi-gang.sh @@ -1,35 +1,35 @@ #!/usr/bin/env bash # Multi-pod gang scheduling on real nodes. Guards the two failures that the # single-pod 01 test could NOT catch (and that shipped a minCount=1 bug): -# A) a 3-pod gang must place ALL 3 (minCount must equal the gang size, not 1) +# A) a multi-pod gang must place ALL of them (minCount must equal the gang size, not 1) # B) under contention, a gang that cannot fully fit stays ENTIRELY pending — # never partially placed (no stranded pods holding nodes). set -euo pipefail HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" # ---- A) all-or-nothing placement of a 3-pod gang ------------------------------- -log "TEST 6A: multi-pod gang (3) places all-or-nothing" +log "TEST 6A: multi-pod gang (2) places all-or-nothing" kubectl apply -f examples/test/e2e/gang/multi-gang.yaml -# the webhook must have created the PodGroup with minCount = 3 (the bug set it to 1) -log "checking PodGroup minCount == 3 (set by webhook from group-size)" +# the webhook must have created the PodGroup with minCount = 2 (the bug set it to 1) +log "checking PodGroup minCount == 2 (set by webhook from group-size)" for i in $(seq 1 30); do mc="$(kubectl get podgroup gang3 -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" [ -n "$mc" ] && break; sleep 2 done -[ "$mc" = "3" ] || fail "PodGroup gang3 minCount=$mc, want 3 (minCount=1 bug -> partial gangs)" +[ "$mc" = "2" ] || fail "PodGroup gang3 minCount=$mc, want 2 (minCount=1 bug -> partial gangs)" -log "waiting for all 3 gang pods to be Ready" -wait_pods_ready "app=gang3" 3 180 || fail "gang3 did not place all 3 pods (gang scheduling failed)" +log "waiting for all 2 gang pods to be Ready" +wait_pods_ready "app=gang3" 2 180 || fail "gang3 did not place all 2 pods (gang scheduling failed)" count="$(kubectl get pods -l app=gang3 --field-selector=status.phase=Running --no-headers | wc -l | tr -d ' ')" -[ "$count" = "3" ] || fail "expected 3 Running gang3 pods, got $count (partial placement)" +[ "$count" = "2" ] || fail "expected 2 Running gang3 pods, got $count (partial placement)" for p in $(kubectl get pods -l app=gang3 -o name); do pod="${p#pod/}" sched="$(kubectl get pod "$pod" -o jsonpath='{.spec.schedulerName}')" [ "$sched" = "fluence" ] || fail "$pod not scheduled by fluence (got: $sched)" done -log "PASS 6A: 3-pod gang placed atomically by fluence (minCount=3)" +log "PASS 6A: 2-pod gang placed atomically by fluence (minCount=2)" kubectl delete -f examples/test/e2e/gang/multi-gang.yaml --wait=false || 
true kubectl patch podgroup gang3 --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true diff --git a/test/e2e/lib.sh b/test/e2e/lib.sh index cad6a2e..13390c9 100644 --- a/test/e2e/lib.sh +++ b/test/e2e/lib.sh @@ -44,7 +44,7 @@ wait_fluence_ready() { show_webhook() { pod=$1 - echo "FAIL: QRMI_BACKEND mismatch" + echo "FAIL: FLUXION_BACKEND mismatch" kubectl get pod $pod -o jsonpath='{.spec.containers[0].env}'; echo kubectl get pod $pod -o jsonpath='{.metadata.annotations}'; echo kubectl -n kube-system logs deploy/fluence-webhook --tail=50 diff --git a/test/e2e/quantum/01-quantum-placement.sh b/test/e2e/quantum/01-quantum-placement.sh index cc1bfe2..8f5c475 100755 --- a/test/e2e/quantum/01-quantum-placement.sh +++ b/test/e2e/quantum/01-quantum-placement.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Quantum placement: a qpu pod is matched to a backend and the webhook injects QRMI_BACKEND. +# Quantum placement: a qpu pod is matched to a backend and the webhook injects FLUXION_BACKEND. set -euo pipefail HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" ANN="fluence.flux-framework.org/backend" @@ -14,10 +14,21 @@ backend="$(kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${AN [ -n "$backend" ] || (show_webhook sampler-mock && fail "backend annotation ($ANN) was not set by fluence") log "fluence chose backend: $backend" -# The webhook must have surfaced it as QRMI_BACKEND inside the container. +# The webhook must have surfaced it as FLUXION_BACKEND inside the container. out="$(kubectl logs sampler-mock || true)" -echo "$out" | grep -q "BACKEND=${backend}" \ - || (show_webhook sampler-mock && fail "QRMI_BACKEND in container ('$out') does not match annotation ($backend)") +if ! 
echo "$out" | grep -q "BACKEND=${backend}"; then + # Diagnostic (CI has no interactive shell): show whether the env var is ABSENT + # (not injected -> webhook issue) or PRESENT-BUT-EMPTY (annotation not resolved + # at container start -> delivery/timing issue), and what the container actually got. + log "--- diagnostic: container env spec ---" + kubectl get pod sampler-mock -o jsonpath='{.spec.containers[0].env}' ; echo + log "--- diagnostic: live value via exec ---" + kubectl exec sampler-mock -- sh -c 'echo "FLUXION_BACKEND=[$FLUXION_BACKEND]"' 2>&1 || true + log "--- diagnostic: backend annotation on pod ---" + kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" ; echo + show_webhook sampler-mock + fail "FLUXION_BACKEND in container ('$out') does not match annotation ($backend)" +fi -log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as QRMI_BACKEND" +log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as FLUXION_BACKEND" kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/quantum/02-sidecar-ungate.sh b/test/e2e/quantum/02-sidecar-ungate.sh index 61abcb0..88f047b 100755 --- a/test/e2e/quantum/02-sidecar-ungate.sh +++ b/test/e2e/quantum/02-sidecar-ungate.sh @@ -1,88 +1,82 @@ #!/usr/bin/env bash -# Sidecar webhook test. +# Gang + submitter webhook test (no leader/worker). # -# Verifies that when a PodGroup of size > 1 with QPU resources is submitted: -# 1. The webhook creates fluence-sidecar RBAC in the namespace automatically -# 2. The leader pod gets the sidecar container injected -# 3. The worker pod gets the quantum.braket/ready scheduling gate added -# 4. The worker pod gets fluence-quantum-classical priority class set +# When a quantum workload (a gang of N pods all requesting QPU, no roles) is +# submitted, the webhook must: +# 1. create the fluence-sidecar RBAC in the namespace automatically +# 2. 
gate every gang pod with quantum.braket/ready +# 3. raise every gang pod to the fluence-quantum-classical priority class +# 4. ADDITIONALLY create the one-off submitter pod -submitter +# 5. inject the fluence-stage init container + the sidecar container into the +# submitter (Model C staging + the real coordinator) # -# Does NOT test the sidecar itself (task discovery, interceptor, -# queue position polling). Those require real AWS credentials and are covered -# by sidecars/providers/braket/test/integration.sh which is run locally. +# Does NOT test the sidecar runtime (task discovery, interceptor, queue polling) +# — that needs real AWS creds (sidecars/providers/braket/test/integration.sh). set -euo pipefail HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" -log "TEST 4: sidecar webhook — RBAC creation, gate injection, sidecar injection" +GROUP=qgang +SUBMITTER=${GROUP}-submitter -kubectl apply -f examples/test/e2e/quantum/sidecar-mock-pods.yaml - -# Give webhook time to process the leader pod admission +log "TEST 4: gang+submitter webhook — RBAC, gating, priority, submitter creation" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml sleep 3 -# Print webhook logs — always show these so we can see what happened log "--- webhook logs ---" kubectl logs -n kube-system deployment/fluence-webhook --tail=50 || true log "--- end webhook logs ---" -# 1. Webhook should have created fluence-sidecar ServiceAccount -log "checking webhook created fluence-sidecar ServiceAccount..." +# 1. RBAC created by the webhook (idempotent, per-namespace). +log "checking webhook created fluence-sidecar RBAC..." 
for i in $(seq 1 30); do - kubectl get serviceaccount fluence-sidecar -n default > /dev/null 2>&1 && break + kubectl get serviceaccount fluence-sidecar -n default >/dev/null 2>&1 && break sleep 2 done -kubectl get serviceaccount fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar ServiceAccount" -log " fluence-sidecar ServiceAccount created" - -# 2. Webhook should have created fluence-sidecar Role -kubectl get role fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar Role" -log " fluence-sidecar Role created" - -# 3. Webhook should have created fluence-sidecar RoleBinding -kubectl get rolebinding fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar RoleBinding" -log " fluence-sidecar RoleBinding created" +kubectl get serviceaccount fluence-sidecar -n default || fail "no fluence-sidecar ServiceAccount" +kubectl get role fluence-sidecar -n default || fail "no fluence-sidecar Role" +kubectl get rolebinding fluence-sidecar -n default || fail "no fluence-sidecar RoleBinding" +log " RBAC present" -# 4. Leader pod should have the fluence-stage init container injected (Model C: -# it stages the fluence Python package into a shared volume on PYTHONPATH). -log "checking webhook injected the fluence-stage init container..." -wait_pod_phase sidecar-test-leader Running 120 \ - || { kubectl describe pod sidecar-test-leader; fail "sidecar-test-leader did not reach Running"; } -initc=$(kubectl get pod sidecar-test-leader \ - -o jsonpath='{.spec.initContainers[*].name}') -echo "$initc" | grep -q "fluence-stage" \ - || fail "fluence-stage init container not injected (initContainers: $initc)" -log " fluence-stage init container injected" - -# 5. Leader pod should have the sidecar container injected -log "checking sidecar injected into leader pod..." 
-containers=$(kubectl get pod sidecar-test-leader \ - -o jsonpath='{.spec.containers[*].name}') -echo "$containers" | grep -q "fluence-sidecar" \ - || fail "fluence-sidecar container not injected into leader (containers: $containers)" -log " fluence-sidecar container injected into leader" +# 2 + 3. Every gang pod is gated and at the preempting priority class. +for p in ${GROUP}-0 ${GROUP}-1; do + gate="$(kubectl get pod "$p" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" + [ "$gate" = "quantum.braket/ready" ] || fail "$p not gated (gate=$gate)" + pc="$(kubectl get pod "$p" -o jsonpath='{.spec.priorityClassName}' 2>/dev/null || true)" + [ "$pc" = "fluence-quantum-classical" ] || fail "$p priorityClass=$pc, want fluence-quantum-classical" +done +log " gang pods gated + fluence-quantum-classical priority" -# 6. Worker pod should have scheduling gate added by webhook -gate=$(kubectl get pod sidecar-test-worker \ - -o jsonpath='{.spec.schedulingGates[0].name}') -[ "$gate" = "quantum.braket/ready" ] \ - || fail "worker pod does not have quantum.braket/ready gate (got: $gate)" -log " quantum.braket/ready gate set on worker" +# 4. Fluence created the submitter pod. +log "checking webhook created the submitter pod $SUBMITTER..." +for i in $(seq 1 30); do + kubectl get pod "$SUBMITTER" -n default >/dev/null 2>&1 && break + sleep 2 +done +kubectl get pod "$SUBMITTER" -n default || fail "webhook did not create submitter pod $SUBMITTER" +sub_marker="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/submitter}' 2>/dev/null || true)" +[ "$sub_marker" = "true" ] || fail "submitter missing the submitter marker" +log " submitter pod created" -# 7. Worker pod should have the fluence-quantum-classical priority class set by -# the webhook at admission (so it schedules reliably once ungated). 
-pc=$(kubectl get pod sidecar-test-worker -o jsonpath='{.spec.priorityClassName}') -[ "$pc" = "fluence-quantum-classical" ] \ - || fail "worker pod missing fluence-quantum-classical priority class (got: $pc)" -log " fluence-quantum-classical priority class set on worker" +# 5. Submitter has the staging init container + the sidecar container, and is NOT gated. +wait_pod_phase "$SUBMITTER" Running 120 \ + || { kubectl describe pod "$SUBMITTER"; fail "$SUBMITTER did not reach Running"; } +initc="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.spec.initContainers[*].name}')" +echo "$initc" | grep -q fluence-stage || fail "fluence-stage init container not injected (init: $initc)" +conts="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.spec.containers[*].name}')" +echo "$conts" | grep -q fluence-sidecar || fail "fluence-sidecar container not injected (containers: $conts)" +sgate="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" +[ -z "$sgate" ] || fail "submitter must NOT be gated (gate=$sgate)" +log " submitter has fluence-stage + fluence-sidecar, not gated" -log "PASS: webhook correctly created RBAC, injected sidecar, gated worker" -log "NOTE: fluence-quantum-classical priority is set by the webhook at admission (immutable post-creation)" -log "NOTE: braket sidecar integration test (SDK intercept, tag discovery," -log " queue polling) is in sidecars/providers/braket/test/integration.sh" +log "PASS: webhook gated the gang, set priority, created RBAC + the submitter" +log "NOTE: priority is set at admission (immutable post-creation)" +log "NOTE: braket sidecar runtime (SDK intercept, tag discovery, queue polling)" +log " is in sidecars/providers/braket/test/integration.sh" -# Only clean up pods and PodGroup — RBAC is namespace infrastructure -# that persists for future quantum workflows in this namespace -kubectl delete -f examples/test/e2e/quantum/sidecar-mock-pods.yaml +# Clean up pods + PodGroups; RBAC is namespace infra 
and persists. +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +kubectl delete pod "$SUBMITTER" --wait=false 2>/dev/null || true +for g in "$GROUP" "$SUBMITTER"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/03-gang-submitter.sh b/test/e2e/quantum/03-gang-submitter.sh new file mode 100644 index 0000000..46905ca --- /dev/null +++ b/test/e2e/quantum/03-gang-submitter.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Gang + submitter structure (replaces the old leader/worker split). +# +# The structural guarantee the ungate path depends on: a quantum gang of size N +# is ONE fully-gated PodGroup (minCount N), and Fluence creates a +# SEPARATE submitter pod in its OWN group-of-one <group>-submitter (minCount 1, +# not gated) that does the real submit and ungates the gang. There is no +# <group>-workers subgroup and no leader among the user's pods. (The runtime +# ungate is covered by the braket integration test; here we prove the shape.) +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +SUBMITTER=${GROUP}-submitter + +log "TEST 7: gang(N, gated) + separate submitter(1) structure" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml + +# Gang PodGroup exists with minCount N=2 (full gang, no split). +log "checking gang group '$GROUP' minCount == 2 (full N)" +for i in $(seq 1 30); do + gc="$(kubectl get podgroup "$GROUP" -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$gc" ] && break; sleep 2 +done +[ "$gc" = "2" ] || fail "gang group $GROUP minCount=$gc, want 2 (full N)" + +# There must be NO <group>-workers subgroup (the old split is gone). 
+if kubectl get podgroup "${GROUP}-workers" >/dev/null 2>&1; then + fail "found ${GROUP}-workers PodGroup — the obsolete leader/worker split must not exist" +fi +log " gang group minCount=2, no -workers subgroup" + +# Submitter PodGroup <group>-submitter exists with minCount 1 (schedules alone). +log "checking submitter group '$SUBMITTER' minCount == 1" +for i in $(seq 1 30); do + sc="$(kubectl get podgroup "$SUBMITTER" -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$sc" ] && break; sleep 2 +done +[ "$sc" = "1" ] || fail "submitter group $SUBMITTER minCount=$sc, want 1" + +# Submitter pod records the gang group it ungates, and is its own group. +gg="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/gang-group}' 2>/dev/null || true)" +[ "$gg" = "$GROUP" ] || fail "submitter gang-group annotation=$gg, want $GROUP" +sl="$(kubectl get pod "$SUBMITTER" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" +[ "$sl" = "$SUBMITTER" ] || fail "submitter group label=$sl, want $SUBMITTER" +log " submitter group minCount=1, ungates gang '$GROUP'" + +# Gang pods stay in <group> (NOT relinked) and are gated. 
+for p in ${GROUP}-0 ${GROUP}-1; do + g="$(kubectl get pod "$p" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" + [ "$g" = "$GROUP" ] || fail "$p group label=$g, want $GROUP (gang pods must not be relinked)" + gate="$(kubectl get pod "$p" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" + [ "$gate" = "quantum.braket/ready" ] || fail "$p not gated (gate=$gate)" +done +log " gang pods remain in '$GROUP' and are gated" + +log "PASS 7: gang(N=2, gated) + submitter(1, ungates gang), no leader/worker split" +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +kubectl delete pod "$SUBMITTER" --wait=false 2>/dev/null || true +for g in "$GROUP" "$SUBMITTER"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/03-quantum-split.sh b/test/e2e/quantum/03-quantum-split.sh deleted file mode 100755 index c84f2ba..0000000 --- a/test/e2e/quantum/03-quantum-split.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -# Two-group quantum split: a quantum gang of size N is split into a LEADER -# PodGroup (minCount 1) and a WORKER PodGroup -workers -# (minCount N-1). Workers are relinked into the worker group and gated. This is -# the structural guarantee that, combined with the sidecar ungating the worker -# group, makes quantum gangs work. (The runtime ungate is covered by 04; here we -# prove the group SPLIT the ungate path depends on.) -set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" - -log "TEST 7: quantum two-group split (leader=1, workers=N-1)" -kubectl apply -f examples/test/e2e/quantum/quantum-split-pods.yaml - -# leader PodGroup must exist with minCount 1 -log "checking leader group 'qsplit' minCount == 1" -for i in $(seq 1 30); do - lc="$(kubectl get podgroup qsplit -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" - [ -n "$lc" ] && break; sleep 2 -done -[ "$lc" = "1" ] || fail "leader group qsplit minCount=$lc, want 1" - -# worker PodGroup -workers must exist with minCount N-1 = 2 -log "checking worker group 'qsplit-workers' minCount == 2 (N-1)" -for i in $(seq 1 30); do - wc="$(kubectl get podgroup qsplit-workers -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" - [ -n "$wc" ] && break; sleep 2 -done -[ "$wc" = "2" ] || fail "worker group qsplit-workers minCount=$wc, want 2 (N-1); the split did not happen" - -# workers must be RELINKED into the worker group (label rewritten by webhook) -log "checking workers were relinked into qsplit-workers" -for w in qsplit-worker-0 qsplit-worker-1; do - g="$(kubectl get pod "$w" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" - [ "$g" = "qsplit-workers" ] || fail "$w group label=$g, want qsplit-workers (relink failed)" -done - -# workers must be GATED (scheduling gate held until leader's task is ready) -log "checking workers carry the quantum scheduling gate" -for w in qsplit-worker-0 qsplit-worker-1; do - gate="$(kubectl get pod "$w" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" - [ "$gate" = "quantum.braket/ready" ] || fail "$w not gated (gate=$gate)" -done - -# leader's sidecar must know where to find workers: FLUENCE_WORKER_GROUP_BASE set -log "checking leader sidecar has the worker-group env" -base="$(kubectl get pod qsplit-leader -o jsonpath='{range .spec.containers[*]}{range .env[*]}{.name}={.value}{"\n"}{end}{end}' 2>/dev/null | grep 
FLUENCE_WORKER_GROUP_BASE || true)" -[ -n "$base" ] || fail "leader sidecar missing FLUENCE_WORKER_GROUP_BASE (sidecar would look in the wrong group and never ungate)" - -log "PASS 7: quantum gang split into leader(1) + workers(N-1), relinked + gated" -kubectl delete -f examples/test/e2e/quantum/quantum-split-pods.yaml --wait=false || true -for g in qsplit qsplit-workers; do - kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true -done -kubectl wait --for=delete pod -l app=qsplit --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/04-gang-env-contract.sh b/test/e2e/quantum/04-gang-env-contract.sh index ced3a3a..19f2439 100755 --- a/test/e2e/quantum/04-gang-env-contract.sh +++ b/test/e2e/quantum/04-gang-env-contract.sh @@ -1,61 +1,58 @@ #!/usr/bin/env bash -# Env-contract e2e: deploy a mock gang and verify the webhook injects the env the -# real gang workload (gang.py) depends on — IN-CLUSTER, on the real pod specs, -# with no Braket/AWS and WITHOUT requiring the pod to be scheduled. Guards the -# runtime seam that, if broken, makes a gang schedule fine then hang (a leader -# with no FLUENCE_ROLE defaults to worker -> no leader -> deadlock). +# Env-contract e2e (gang + submitter): verify the webhook injects, at admission, +# the env the runtime depends on — IN-CLUSTER, on the real pod specs, with no +# Braket/AWS and WITHOUT requiring scheduling. Guards the seam that, if broken, +# makes a gang schedule then hang or double-submit. # -# This checks the SPEC layer only: the env references the webhook wires onto the -# right container at admission. These are downward-API valueFrom refs (their -# VALUES resolve later, at placement), but their PRESENCE is deterministic at -# admission, so this test needs no scheduling, no qpu add-on, no logs — it cannot -# flake on capacity. 
Injection paths verified in code: -# FLUENCE_ROLE roleEnvOps (quantum handler) -> all workload containers -# FLUENCE_POD_UID, PYTHONPATH InterceptorOps (core) -> fluxion-resource containers -# FLUXION_BACKEND fluxion handler (InjectEnvOps) -> fluxion-resource containers -# The leader requests qpu (so it gets the full contract); the worker only needs -# FLUENCE_ROLE (it requests no fluxion resource, by design). +# Spec layer only (these are downward-API valueFrom refs whose VALUES resolve at +# placement, but whose PRESENCE is deterministic at admission), so no scheduling, +# no qpu capacity, no logs — it cannot flake on capacity. Contract: +# gang pod (faux): FLUENCE_FAUX_SUBMIT, FLUENCE_QUANTUM_JOB_ID, PYTHONPATH, FLUXION_BACKEND +# submitter: FLUENCE_GANG_GROUP on the sidecar (real submit, ungates the gang) set -euo pipefail HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" -log "TEST 8: gang env contract (webhook injects what gang.py reads) — spec layer" -kubectl apply -f examples/test/e2e/quantum/gang-env-mock.yaml +GROUP=qgang +SUBMITTER=${GROUP}-submitter -# does container 'app' of pod $1 have an env entry named $2 ? (spec-level only) +log "TEST 8: gang+submitter env contract — spec layer" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml + +# does container $2 of pod $1 have an env entry named $3 ? 
(spec-level only) has_env() { - kubectl get pod "$1" -o jsonpath="{.spec.containers[?(@.name=='app')].env[*].name}" \ - 2>/dev/null | tr ' ' '\n' | grep -qx "$2" + kubectl get pod "$1" -o jsonpath="{.spec.containers[?(@.name=='$2')].env[*].name}" \ + 2>/dev/null | tr ' ' '\n' | grep -qx "$3" } -# the webhook mutates at admission; poll briefly for the spec to appear -log "checking the webhook wired the contract onto the leader (qpu) container" -for i in $(seq 1 15); do has_env gangenv-leader FLUENCE_ROLE && break; sleep 2; done - -for v in FLUENCE_ROLE FLUENCE_POD_UID PYTHONPATH FLUXION_BACKEND; do - has_env gangenv-leader "$v" \ - || { kubectl get pod gangenv-leader -o yaml | sed -n '/containers:/,/status:/p'; \ - fail "leader container missing env '$v' (webhook did not inject the contract)"; } - log " leader has env: $v" +log "checking the webhook wired the faux contract onto a gang pod" +for i in $(seq 1 15); do has_env ${GROUP}-0 app FLUENCE_FAUX_SUBMIT && break; sleep 2; done +for v in FLUENCE_FAUX_SUBMIT FLUENCE_QUANTUM_JOB_ID PYTHONPATH FLUXION_BACKEND; do + has_env ${GROUP}-0 app "$v" \ + || { kubectl get pod ${GROUP}-0 -o yaml | sed -n '/containers:/,/status:/p'; \ + fail "gang pod 'app' container missing env '$v'"; } + log " gang pod has env: $v" done -# the worker carries FLUENCE_ROLE so gang.py selects 'worker' by contract, not luck -has_env gangenv-worker-0 FLUENCE_ROLE \ - || fail "worker container missing FLUENCE_ROLE (gang.py would default to worker by luck)" -log " worker has env: FLUENCE_ROLE" - -# and the role VALUE on the spec is correct per pod (downward-API ref to the -# role annotation, or a literal — either way the resolved fieldRef/value must -# encode leader vs worker). Assert the annotation the ref reads is right. 
-lr="$(kubectl get pod gangenv-leader -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/role}')" -wr="$(kubectl get pod gangenv-worker-0 -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/role}')" -[ "$lr" = "leader" ] || fail "leader role annotation=$lr, want leader" -[ "$wr" = "worker" ] || fail "worker role annotation=$wr, want worker" -log " role annotations correct (leader=$lr worker=$wr)" - -log "PASS 8: webhook injects the gang env contract at admission" - -kubectl delete -f examples/test/e2e/quantum/gang-env-mock.yaml --wait=false || true -for g in gangenv gangenv-workers; do - kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +# The submitter's sidecar must know which gang to ungate. +log "checking the submitter sidecar has FLUENCE_GANG_GROUP=$GROUP" +for i in $(seq 1 30); do kubectl get pod "$SUBMITTER" >/dev/null 2>&1 && break; sleep 2; done +gg="$(kubectl get pod "$SUBMITTER" \ + -o jsonpath="{.spec.containers[?(@.name=='fluence-sidecar')].env[?(@.name=='FLUENCE_GANG_GROUP')].value}" \ + 2>/dev/null || true)" +[ "$gg" = "$GROUP" ] || fail "submitter sidecar FLUENCE_GANG_GROUP=$gg, want $GROUP" +log " submitter sidecar has FLUENCE_GANG_GROUP=$gg" + +# And the submitter must NOT be in faux mode (it does the real submit). 
+if has_env "$SUBMITTER" app FLUENCE_FAUX_SUBMIT; then + fail "submitter must NOT carry FLUENCE_FAUX_SUBMIT (it submits for real)" +fi +log " submitter is not faux" + +log "PASS 8: webhook injects the gang(faux) + submitter(real) env contract at admission" + +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +kubectl delete pod "$SUBMITTER" --wait=false 2>/dev/null || true +for g in "$GROUP" "$SUBMITTER"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true done -kubectl wait --for=delete pod -l app=gangenv --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/setup.sh b/test/e2e/quantum/setup.sh new file mode 100644 index 0000000..cf35020 --- /dev/null +++ b/test/e2e/quantum/setup.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Quantum suite setup (run by the e2e-suite workflow before the NN-*.sh tests). +# +# Installs the qpu add-on so nodes advertise fluxion.flux-framework.org/qpu — +# without it every quantum pod stays Pending (fluence matches in its own graph, +# but the default NodeResourcesFit plugin rejects each node because the extended +# resource is not in allocatable, so the match is rolled back). The base deploy +# (deploy/fluence-test.yaml) does NOT include this; it is quantum-only. +# +# Also points the webhook-injected sidecar/stage image at the CI-loaded image: +# the default sidecar image (ghcr.io/.../fluence-sidecar:latest) is not loaded in +# kind, so the submitter's containers could not pull. The fluence-stage init is +# fail-soft (no python in this image -> it logs and exits 0), which is fine for +# the structural assertions; the submitter still schedules and runs. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" +IMAGE="${IMAGE:-vanessa/fluence:test}" + +log "quantum setup: installing the qpu add-on (resources ConfigMap + device plugin)" +kubectl apply -f deploy/fluence-resources-test.yaml + +# Run the device plugin from the CI-loaded image (its manifest ships a registry +# image that kind has not pulled). Container name is 'deviceplugin'. +kubectl -n kube-system set image daemonset/fluence-deviceplugin deviceplugin="$IMAGE" +kubectl -n kube-system patch daemonset/fluence-deviceplugin --type=json \ + -p '[{"op":"replace","path":"/spec/template/spec/containers/0/imagePullPolicy","value":"IfNotPresent"}]' \ + 2>/dev/null || true + +# Injected sidecar + stage init must use a present image too (see header). +kubectl -n kube-system set env deployment/fluence-webhook FLUENCE_SIDECAR_IMAGE="$IMAGE" +kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=180s + +# Scheduler re-reads the resources config now that the ConfigMap exists. +kubectl -n kube-system rollout restart deployment/fluence +kubectl -n kube-system rollout status deployment/fluence --timeout=180s + +log "waiting for the device plugin DaemonSet to be Ready" +kubectl -n kube-system rollout status daemonset/fluence-deviceplugin --timeout=180s + +# Block until at least one node advertises the qpu extended resource, so the +# tests do not race the kubelet's device registration. +log "waiting for nodes to advertise fluxion.flux-framework.org/qpu" +ok=0 +for i in $(seq 1 60); do + if kubectl get nodes -o jsonpath='{.items[*].status.allocatable}' 2>/dev/null \ + | grep -q 'fluxion.flux-framework.org/qpu'; then + ok=1; break + fi + sleep 3 +done +[ "$ok" = 1 ] || fail "no node advertised fluxion.flux-framework.org/qpu after the add-on (device plugin not registering)" +log "qpu advertised on at least one node" + +log "quantum setup complete: qpu add-on installed, scheduler restarted, sidecar image=$IMAGE" \ No newline at end of file